ruohguo committed (verified)
Commit b80ae90 · 1 Parent(s): 35d18ec

Upload 117 files

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +1 -0
  2. LICENSE +21 -0
  3. README.md +141 -3
  4. assets/teaser_figure.png +3 -0
  5. avism/__init__.py +12 -0
  6. avism/avism_model.py +460 -0
  7. avism/avism_model_coco.py +460 -0
  8. avism/config.py +56 -0
  9. avism/data/__init__.py +4 -0
  10. avism/data/augmentation.py +623 -0
  11. avism/data/avis_eval.py +203 -0
  12. avism/data/aviseval/__init__.py +5 -0
  13. avism/data/aviseval/_timing.py +65 -0
  14. avism/data/aviseval/datasets/__init__.py +1 -0
  15. avism/data/aviseval/datasets/_base_dataset.py +326 -0
  16. avism/data/aviseval/datasets/avis.py +367 -0
  17. avism/data/aviseval/eval.py +209 -0
  18. avism/data/aviseval/metrics/__init__.py +11 -0
  19. avism/data/aviseval/metrics/_base_metric.py +132 -0
  20. avism/data/aviseval/metrics/av_loc.py +191 -0
  21. avism/data/aviseval/metrics/avisa.py +190 -0
  22. avism/data/aviseval/metrics/clear.py +186 -0
  23. avism/data/aviseval/metrics/count.py +44 -0
  24. avism/data/aviseval/metrics/hota.py +202 -0
  25. avism/data/aviseval/metrics/identity.py +135 -0
  26. avism/data/aviseval/metrics/ideucl.py +135 -0
  27. avism/data/aviseval/metrics/j_and_f.py +310 -0
  28. avism/data/aviseval/metrics/track_map.py +462 -0
  29. avism/data/aviseval/metrics/vace.py +131 -0
  30. avism/data/aviseval/plotting.py +230 -0
  31. avism/data/aviseval/utils.py +146 -0
  32. avism/data/build.py +247 -0
  33. avism/data/dataset_mapper.py +272 -0
  34. avism/data/datasets/__init__.py +3 -0
  35. avism/data/datasets/avis.py +209 -0
  36. avism/data/datasets/avis_api/__init__.py +1 -0
  37. avism/data/datasets/avis_api/avos.py +277 -0
  38. avism/data/datasets/avis_api/avoseval.py +559 -0
  39. avism/data/datasets/builtin.py +29 -0
  40. avism/data/datasets/extract_audio_feat/audio_feature_extractor.py +77 -0
  41. avism/data/datasets/extract_audio_feat/mel_features.py +233 -0
  42. avism/data/datasets/extract_audio_feat/vggish_input.py +103 -0
  43. avism/data/datasets/extract_audio_feat/vggish_params.py +53 -0
  44. avism/data/datasets/extract_audio_feat/vggish_slim.py +134 -0
  45. avism/modeling/__init__.py +0 -0
  46. avism/modeling/avism_criterion.py +335 -0
  47. avism/modeling/avism_matcher.py +194 -0
  48. avism/modeling/transformer_decoder/__init__.py +1 -0
  49. avism/modeling/transformer_decoder/avism.py +675 -0
  50. avism/modeling/transformer_decoder/avism_coco.py +675 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/teaser_figure.png filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2022 GeWu-Lab
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,3 +1,141 @@
- ---
- license: mit
- ---
+ # Audio-Visual Instance Segmentation
+
+
+ [![AVIS](https://img.shields.io/badge/Paper-AVIS-2b9348.svg?logo=arXiv)](https://arxiv.org/abs/2412.03069)
+ [![Project Page](https://img.shields.io/badge/Project_page-Visualizations-blue)](https://ruohaoguo.github.io/avis/)
+ [![Dataset](https://img.shields.io/badge/Dataset-Download-yellow)](https://1drv.ms/u/c/3c9af704fb61931d/EVOs609SGMxLsbvVzVJHAa4Bmnu4GVZGjqYHQxDz0NKTew?e=WQU2Uf)
+
+ Ruohao Guo, Xianghua Ying*, Yaru Chen, Dantong Niu, Guangyao Li, Liao Qu, Yanyu Qi, Jinxing Zhou, Bowei Xing, Wenzhen Yue, Ji Shi, Qixun Wang, Peiliang Zhang, Buwen Liang
+
+ ## 📰 News
+
+ 🔥**2025.03.01**: Code and checkpoints are released!
+
+ 🔥**2025.02.27**: AVIS was accepted to **CVPR 2025**! 🎉🎉🎉
+
+ 🔥**2024.11.12**: Our [project page](https://ruohaoguo.github.io/avis/) is now available!
+
+ 🔥**2024.11.11**: The AVISeg dataset has been uploaded to [OneDrive](https://1drv.ms/u/c/3c9af704fb61931d/EVOs609SGMxLsbvVzVJHAa4Bmnu4GVZGjqYHQxDz0NKTew?e=WQU2Uf); feel free to download and use it!
+
+
+ ## 🌿 Introduction
+
+ In this paper, we propose a new multi-modal task, termed audio-visual instance segmentation (AVIS), which aims to simultaneously identify, segment, and track individual sounding object instances in audible videos. To facilitate this research, we introduce a high-quality benchmark named AVISeg, containing over 90K instance masks from 26 semantic categories in 926 long videos. Additionally, we propose a strong baseline model for this task. Our model first localizes sound sources within each frame and condenses object-specific contexts into concise tokens. It then builds long-range audio-visual dependencies between these tokens using window-based attention, and tracks sounding objects across the entire video sequence.
+
+ <div align='center'>
+ <img src="./assets/teaser_figure.png" class="interpolation-image" alt="Teaser figure" height="50%" width="100%" />
+ </div>
+
+
+
+ ## ⚙️ Installation
+
+ ```bash
+ conda create --name avism python=3.8 -y
+ conda activate avism
+
+ conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia
+ pip install -U opencv-python
+
+ cd ./AVISM
+ git clone https://github.com/facebookresearch/detectron2
+ cd detectron2
+ pip install -e .
+
+ cd ../
+ pip install -r requirements.txt
+ cd mask2former/modeling/pixel_decoder/ops
+ sh make.sh
+ ```
+
+ ## 🤗 Setup
+
+ ### Datasets
+
+ Download and unzip the datasets from [OneDrive](https://1drv.ms/u/c/3c9af704fb61931d/EVOs609SGMxLsbvVzVJHAa4Bmnu4GVZGjqYHQxDz0NKTew?e=WQU2Uf) and put them in `./datasets`.
+
+ ### Pretrained Backbones
+ Download and unzip the pre-trained backbones from [OneDrive](https://1drv.ms/u/c/3c9af704fb61931d/ETDDliQ8zZFGmYxlLVPyi3sBis_fdjX0w8mJhyQnYVSdXA?e=Wt7pUb) and put them in `./pre_models`.
+
+ ### Checkpoints
+
+ Download the following checkpoints and put them in `./checkpoints`.
+
+ <table>
+ <tr>
+ <th style="width: 150px;">Backbone</th>
+ <th>Pre-trained Datasets</th>
+ <th>FSLA</th>
+ <th>HOTA</th>
+ <th>mAP</th>
+ <th>Model Weight</th>
+ </tr>
+ <tr>
+ <td align="center">ResNet-50</td>
+ <td align="center">ImageNet</td>
+ <td align="center">42.78</td>
+ <td align="center">61.73</td>
+ <td align="center">40.57</td>
+ <td align="center"><a href="https://1drv.ms/u/c/3c9af704fb61931d/EYyAuCNpRjxDqEohJfoDLO0BYgw0lbwKqQ1lwVXe_kIPVQ?e=PeRlyx">AVISM_R50_IN.pth</a></td>
+ </tr>
+ <tr>
+ <td align="center">ResNet-50</td>
+ <td align="center">ImageNet & COCO</td>
+ <td align="center">44.42</td>
+ <td align="center">64.52</td>
+ <td align="center">45.04</td>
+ <td align="center"><a href="https://1drv.ms/u/c/3c9af704fb61931d/EX0snZsxQwdBswQFdG4sc9kBd-Bd7lw5zaTGR6FvrSxinQ?e=bdZF5G">AVISM_R50_COCO.pth</a></td>
+ </tr>
+ <tr>
+ <td align="center">Swin-L</td>
+ <td align="center">ImageNet</td>
+ <td align="center">49.15</td>
+ <td align="center">68.81</td>
+ <td align="center">49.06</td>
+ <td align="center"><a href="https://1drv.ms/u/c/3c9af704fb61931d/EV4V5Bh5AqVBhLVMM1ucdN0BuOZgHu17W3JDGjKDMLZ1bg?e=hF8umh">AVISM_SwinL_IN.pth</a></td>
+ </tr>
+ <tr>
+ <td align="center">Swin-L</td>
+ <td align="center">ImageNet & COCO</td>
+ <td align="center">52.49</td>
+ <td align="center">71.13</td>
+ <td align="center">53.46</td>
+ <td align="center"><a href="https://1drv.ms/u/c/3c9af704fb61931d/EXuM4cUxPTpEk1M7FoPqtNEBi47L7uR-ZlnqDCJscmNsiA?e=7prFiN">AVISM_SwinL_COCO.pth</a></td>
+ </tr>
+ </table>
+
+
+ ## 📌 Getting Started
+
+ ### Training
+ ```bash
+ python train_net.py --num-gpus 2 --config-file configs/avism/R50/avism_R50_IN.yaml
+ ```
+
+ ### Evaluation
+ ```bash
+ python train_net.py --config-file configs/avism/R50/avism_R50_IN.yaml --eval-only MODEL.WEIGHTS checkpoints/AVISM_R50_IN.pth
+ ```
+
+ ### Demo
+ ```bash
+ python demo_video/demo.py --config-file configs/avism/R50/avism_R50_IN.yaml --opts MODEL.WEIGHTS checkpoints/AVISM_R50_IN.pth
+ ```
+
+ ## Acknowledgement
+
+ We thank the authors of [Detectron2](https://github.com/facebookresearch/detectron2), [Mask2Former](https://github.com/facebookresearch/MaskFormer) and [VITA](https://github.com/sukjunhwang/VITA) for their great work.
+
+
+ ## 📄 Citation
+
+ If our work assists your research, feel free to give us a star ⭐ or cite us using:
+
+ ```bibtex
+ @article{guo2023audio,
+ title={Audio-Visual Instance Segmentation},
+ author={Guo, Ruohao and Ying, Xianghua and Chen, Yaru and Niu, Dantong and Li, Guangyao and Qu, Liao and Qi, Yanyu and Zhou, Jinxing and Xing, Bowei and Yue, Wenzhen and Shi, Ji and Wang, Qixun and Zhang, Peiliang and Liang, Buwen},
+ journal={arXiv preprint arXiv:2310.18709},
+ year={2023}
+ }
+ ```
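To make the README's description of the baseline concrete (object-specific tokens condensed per frame, fused with audio, then linked by window-based attention over time), here is a minimal, self-contained sketch of that mechanism. It is illustrative only: the tensor shapes, the 128-d VGGish-style audio features, and the plain `TransformerEncoderLayer` are assumptions, not the repository's actual modules (those live in `avism/modeling/transformer_decoder/avism.py`).

```python
# Illustrative sketch only, not the repository's implementation.
import torch
import torch.nn as nn

T, fQ, C, W = 24, 100, 256, 6            # frames, tokens per frame, channels, window size
frame_tokens = torch.randn(T, fQ, C)     # object-specific tokens condensed per frame
audio_feats = torch.randn(T, 128)        # per-frame audio embeddings (VGGish-style, assumed)

audio_proj = nn.Linear(128, C)
temporal_attn = nn.TransformerEncoderLayer(d_model=C, nhead=8, batch_first=True)

# Fuse audio into every token of its frame, then attend within windows of W frames,
# which gives the tokens long-range audio-visual context at modest cost.
tokens = frame_tokens + audio_proj(audio_feats)[:, None, :]   # (T, fQ, C)
windows = tokens.reshape(T // W, W * fQ, C)                   # (T/W, W*fQ, C)
linked = temporal_attn(windows).reshape(T, fQ, C)             # back to per-frame tokens
print(linked.shape)                                           # torch.Size([24, 100, 256])
```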
assets/teaser_figure.png ADDED

Git LFS Details

  • SHA256: 12ed508f8fa304c94b3ef327e7b3c938567c3f4b72375d7edd6c7bd4e2965035
  • Pointer size: 132 Bytes
  • Size of remote file: 8.13 MB
avism/__init__.py ADDED
@@ -0,0 +1,12 @@
+ # model code
+ from . import modeling
+
+ # config
+ from .config import add_avism_config
+
+ # models
+ from .avism_model import AVISM
+ from .avism_model_coco import AVISM_COCO
+
+ # video
+ from .data import *
avism/avism_model.py ADDED
@@ -0,0 +1,460 @@
1
+ from typing import Tuple
2
+ import math
3
+
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ from detectron2.config import configurable
9
+ from detectron2.data import MetadataCatalog
10
+ from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head
11
+ from detectron2.modeling.backbone import Backbone
12
+ from detectron2.structures import Boxes, ImageList, Instances, BitMasks
13
+ from detectron2.utils.memory import retry_if_cuda_oom
14
+
15
+ from mask2former.modeling.criterion import SetCriterion
16
+ from mask2former.modeling.matcher import HungarianMatcher
17
+ from .modeling.avism_criterion import AvismSetCriterion
18
+ from .modeling.avism_matcher import AvismHungarianMatcher
19
+ from .modeling.transformer_decoder.avism import Avism
20
+
21
+
22
+ @META_ARCH_REGISTRY.register()
23
+ class AVISM(nn.Module):
24
+ """
25
+ Main class for mask classification semantic segmentation architectures.
26
+ """
27
+
28
+ @configurable
29
+ def __init__(
30
+ self,
31
+ *,
32
+ backbone: Backbone,
33
+ sem_seg_head: nn.Module,
34
+ criterion: nn.Module,
35
+ num_queries: int,
36
+ object_mask_threshold: float,
37
+ overlap_threshold: float,
38
+ metadata,
39
+ size_divisibility: int,
40
+ pixel_mean: Tuple[float],
41
+ pixel_std: Tuple[float],
42
+ # inference
43
+ test_topk_per_image: int,
44
+ # avism
45
+ avism_module: nn.Module,
46
+ avism_criterion: nn.Module,
47
+ num_frames: int,
48
+ num_classes: int,
49
+ is_multi_cls: bool,
50
+ apply_cls_thres: float,
51
+ freeze_detector: bool,
52
+ test_run_chunk_size: int,
53
+ test_interpolate_chunk_size: int,
54
+ is_coco: bool,
55
+ ):
56
+ """
57
+ Args:
58
+ backbone: a backbone module, must follow detectron2's backbone interface
59
+ sem_seg_head: a module that predicts semantic segmentation from backbone features
60
+ criterion: a module that defines the loss
61
+ num_queries: int, number of queries
62
+ object_mask_threshold: float, threshold to filter query based on classification score
63
+ for panoptic segmentation inference
64
+ overlap_threshold: overlap threshold used in general inference for panoptic segmentation
65
+ metadata: dataset meta, get `thing` and `stuff` category names for panoptic
66
+ segmentation inference
67
+ size_divisibility: Some backbones require the input height and width to be divisible by a
68
+ specific integer. We can use this to override such requirement.
69
+ pixel_mean, pixel_std: list or tuple with #channels element, representing
70
+ the per-channel mean and std to be used to normalize the input image
71
+ test_topk_per_image: int, instance segmentation parameter, keep topk instances per image
72
+ """
73
+ super().__init__()
74
+ self.backbone = backbone
75
+ self.sem_seg_head = sem_seg_head
76
+ self.criterion = criterion
77
+ self.num_queries = num_queries
78
+ self.overlap_threshold = overlap_threshold
79
+ self.object_mask_threshold = object_mask_threshold
80
+ self.metadata = metadata
81
+ if size_divisibility < 0:
82
+ # use backbone size_divisibility if not set
83
+ size_divisibility = self.backbone.size_divisibility
84
+ self.size_divisibility = size_divisibility
85
+ self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
86
+ self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)
87
+
88
+ # additional args
89
+ self.test_topk_per_image = test_topk_per_image
90
+
91
+ # avism hyper-parameters
92
+ self.num_frames = num_frames
93
+ self.num_classes = num_classes
94
+ self.avism_module = avism_module
95
+ self.avism_criterion = avism_criterion
96
+ self.is_multi_cls = is_multi_cls
97
+ self.apply_cls_thres = apply_cls_thres
98
+
99
+ if freeze_detector:
100
+ for name, p in self.named_parameters():
101
+ if not "avism_module" in name:
102
+ p.requires_grad_(False)
103
+ self.test_run_chunk_size = test_run_chunk_size
104
+ self.test_interpolate_chunk_size = test_interpolate_chunk_size
105
+
106
+ self.is_coco = is_coco
107
+
108
+ @classmethod
109
+ def from_config(cls, cfg):
110
+ backbone = build_backbone(cfg)
111
+ sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())
112
+
113
+ # Loss parameters:
114
+ deep_supervision = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
115
+ no_object_weight = cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT
116
+ avism_deep_supervision = cfg.MODEL.AVISM.DEEP_SUPERVISION
117
+
118
+ # loss weights
119
+ class_weight = cfg.MODEL.MASK_FORMER.CLASS_WEIGHT
120
+ dice_weight = cfg.MODEL.MASK_FORMER.DICE_WEIGHT
121
+ mask_weight = cfg.MODEL.MASK_FORMER.MASK_WEIGHT
122
+ sim_weight = cfg.MODEL.AVISM.SIM_WEIGHT
123
+
124
+ # building criterion
125
+ matcher = HungarianMatcher(
126
+ cost_class=class_weight,
127
+ cost_mask=mask_weight,
128
+ cost_dice=dice_weight,
129
+ num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS,
130
+ )
131
+
132
+ weight_dict = {"loss_ce": class_weight, "loss_mask": mask_weight, "loss_dice": dice_weight}
133
+
134
+ if deep_supervision:
135
+ dec_layers = cfg.MODEL.MASK_FORMER.DEC_LAYERS
136
+ aux_weight_dict = {}
137
+ for i in range(dec_layers - 1):
138
+ aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
139
+ weight_dict.update(aux_weight_dict)
140
+
141
+ losses = ["labels", "masks"]
142
+
143
+ criterion = SetCriterion(
144
+ sem_seg_head.num_classes,
145
+ matcher=matcher,
146
+ weight_dict=weight_dict,
147
+ eos_coef=no_object_weight,
148
+ losses=losses,
149
+ num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS,
150
+ oversample_ratio=cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO,
151
+ importance_sample_ratio=cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO,
152
+ avism_last_layer_num=cfg.MODEL.AVISM.LAST_LAYER_NUM,
153
+ )
154
+
155
+ # Avism
156
+ num_classes = sem_seg_head.num_classes
157
+ hidden_dim = cfg.MODEL.MASK_FORMER.HIDDEN_DIM
158
+ avism_module = Avism(cfg=cfg, in_channels=hidden_dim, aux_loss=avism_deep_supervision)
159
+
160
+ # building criterion for avism inference
161
+ avism_matcher = AvismHungarianMatcher(
162
+ cost_class=class_weight,
163
+ cost_mask=mask_weight,
164
+ cost_dice=dice_weight,
165
+ num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS,
166
+ )
167
+ avism_weight_dict = {
168
+ "loss_avism_ce": class_weight, "loss_avism_mask": mask_weight, "loss_avism_dice": dice_weight
169
+ }
170
+ if sim_weight > 0.0:
171
+ avism_weight_dict["loss_avism_sim"] = sim_weight
172
+
173
+ if avism_deep_supervision:
174
+ avism_dec_layers = cfg.MODEL.AVISM.DEC_LAYERS
175
+ aux_weight_dict = {}
176
+ for i in range(avism_dec_layers - 1):
177
+ aux_weight_dict.update({k + f"_{i}": v for k, v in avism_weight_dict.items()})
178
+ avism_weight_dict.update(aux_weight_dict)
179
+ avism_losses = ["avism_labels", "avism_masks"]
180
+ if sim_weight > 0.0:
181
+ avism_losses.append("fg_sim")
182
+
183
+ avism_criterion = AvismSetCriterion(
184
+ num_classes,
185
+ matcher=avism_matcher,
186
+ weight_dict=avism_weight_dict,
187
+ eos_coef=cfg.MODEL.AVISM.NO_OBJECT_WEIGHT,
188
+ losses=avism_losses,
189
+ num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS,
190
+ oversample_ratio=cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO,
191
+ importance_sample_ratio=cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO,
192
+ sim_use_clip=cfg.MODEL.AVISM.SIM_USE_CLIP,
193
+ )
194
+
195
+ return {
196
+ "backbone": backbone,
197
+ "sem_seg_head": sem_seg_head,
198
+ "criterion": criterion,
199
+ "num_queries": cfg.MODEL.AVISM.NUM_OBJECT_QUERIES,
200
+ "object_mask_threshold": cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD,
201
+ "overlap_threshold": cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD,
202
+ "metadata": MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
203
+ "size_divisibility": cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY,
204
+ "pixel_mean": cfg.MODEL.PIXEL_MEAN,
205
+ "pixel_std": cfg.MODEL.PIXEL_STD,
206
+ # inference
207
+ "test_topk_per_image": cfg.TEST.DETECTIONS_PER_IMAGE,
208
+ # avism
209
+ "avism_module": avism_module,
210
+ "avism_criterion": avism_criterion,
211
+ "num_frames": cfg.INPUT.SAMPLING_FRAME_NUM,
212
+ "num_classes": num_classes,
213
+ "is_multi_cls": cfg.MODEL.AVISM.MULTI_CLS_ON,
214
+ "apply_cls_thres": cfg.MODEL.AVISM.APPLY_CLS_THRES,
215
+ "freeze_detector": cfg.MODEL.AVISM.FREEZE_DETECTOR,
216
+ "test_run_chunk_size": cfg.MODEL.AVISM.TEST_RUN_CHUNK_SIZE,
217
+ "test_interpolate_chunk_size": cfg.MODEL.AVISM.TEST_INTERPOLATE_CHUNK_SIZE,
218
+ "is_coco": cfg.DATASETS.TEST[0].startswith("coco"),
219
+ }
220
+
221
+ @property
222
+ def device(self):
223
+ return self.pixel_mean.device
224
+
225
+ def forward(self, batched_inputs):
226
+ """
227
+ Args:
228
+ batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
229
+ Each item in the list contains the inputs for one image.
230
+ For now, each item in the list is a dict that contains:
231
+ * "image": Tensor, image in (C, H, W) format.
232
+ * "instances": per-region ground truth
233
+ * Other information that's included in the original dicts, such as:
234
+ "height", "width" (int): the output resolution of the model (may be different
235
+ from input resolution), used in inference.
236
+ Returns:
237
+ list[dict]:
238
+ each dict has the results for one image. The dict contains the following keys:
239
+
240
+ * "sem_seg":
241
+ A Tensor that represents the
242
+ per-pixel segmentation predicted by the head.
243
+ The prediction has shape KxHxW that represents the logits of
244
+ each class for each pixel.
245
+ * "panoptic_seg":
246
+ A tuple that represents the panoptic output
247
+ panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
248
+ segments_info (list[dict]): Describe each segment in `panoptic_seg`.
249
+ Each dict contains keys "id", "category_id", "isthing".
250
+ """
251
+ if self.training:
252
+ return self.train_model(batched_inputs)
253
+ else:
254
+ # NOTE consider only B=1 case.
255
+ return self.inference(batched_inputs[0])
256
+
257
+ def train_model(self, batched_inputs):
258
+ images = []
259
+ audio_features = []
260
+ for video in batched_inputs:
261
+ for frame in video["image"]:
262
+ images.append(frame.to(self.device))
263
+ for audio_feat in video["audio"]:
264
+ audio_features.append(torch.tensor(audio_feat).to(self.device))
265
+
266
+ audio_features = torch.stack(audio_features)
267
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
268
+ images = ImageList.from_tensors(images, self.size_divisibility)
269
+ image_features = self.backbone(images.tensor)
270
+
271
+ BT = len(images)
272
+ T = self.num_frames if self.training else BT
273
+ B = BT // T
274
+
275
+ outputs, frame_queries, mask_features = self.sem_seg_head(image_features, audio_features)
276
+
277
+ mask_features = self.avism_module.avism_mask_features(mask_features)
278
+ mask_features = mask_features.view(B, self.num_frames, *mask_features.shape[-3:])
279
+
280
+ # mask classification target
281
+ frame_targets, clip_targets = self.prepare_targets(batched_inputs, images)
282
+
283
+ # bipartite matching-based loss
284
+ losses, fg_indices = self.criterion(outputs, frame_targets)
285
+
286
+ avism_outputs = self.avism_module(frame_queries, audio_features)
287
+ avism_outputs["pred_masks"] = torch.einsum("lbqc,btchw->lbqthw", avism_outputs["pred_mask_embed"], mask_features)
288
+ for out in avism_outputs["aux_outputs"]:
289
+ out["pred_masks"] = torch.einsum("lbqc,btchw->lbqthw", out["pred_mask_embed"], mask_features)
290
+
291
+ for k in list(losses.keys()):
292
+ if k in self.criterion.weight_dict:
293
+ losses[k] *= self.criterion.weight_dict[k]
294
+ else:
295
+ # remove this loss if not specified in `weight_dict`
296
+ losses.pop(k)
297
+ avism_loss_dict = self.avism_criterion(avism_outputs, clip_targets, frame_targets, fg_indices)
298
+ avism_weight_dict = self.avism_criterion.weight_dict
299
+
300
+ for k in avism_loss_dict.keys():
301
+ if k in avism_weight_dict:
302
+ avism_loss_dict[k] *= avism_weight_dict[k]
303
+ losses.update(avism_loss_dict)
304
+ return losses
305
+
306
+ def prepare_targets(self, targets, images):
307
+ h_pad, w_pad = images.tensor.shape[-2:]
308
+ frame_gt_instances = []
309
+ clip_gt_instances = []
310
+ for targets_per_video in targets:
311
+ _num_instance = len(targets_per_video["instances"][0])
312
+ mask_shape = [_num_instance, self.num_frames, h_pad, w_pad]
313
+ gt_masks_per_video = torch.zeros(mask_shape, dtype=torch.bool, device=self.device)
314
+
315
+ gt_classes_per_video = targets_per_video["instances"][0].gt_classes.to(self.device)
316
+ gt_ids_per_video = []
317
+ for f_i, targets_per_frame in enumerate(targets_per_video["instances"]):
318
+ targets_per_frame = targets_per_frame.to(self.device)
319
+ h, w = targets_per_frame.image_size
320
+
321
+ _update_cls = gt_classes_per_video == -1
322
+ gt_classes_per_video[_update_cls] = targets_per_frame.gt_classes[_update_cls]
323
+ gt_ids_per_video.append(targets_per_frame.gt_ids)
324
+ if isinstance(targets_per_frame.gt_masks, BitMasks):
325
+ gt_masks_per_video[:, f_i, :h, :w] = targets_per_frame.gt_masks.tensor
326
+ else: #polygon
327
+ gt_masks_per_video[:, f_i, :h, :w] = targets_per_frame.gt_masks
328
+
329
+ gt_ids_per_video = torch.stack(gt_ids_per_video, dim=1)
330
+ gt_ids_per_video[gt_masks_per_video.sum(dim=(2,3)) == 0] = -1
331
+ valid_bool_frame = (gt_ids_per_video != -1)
332
+ valid_bool_clip = valid_bool_frame.any(dim=-1)
333
+
334
+ gt_classes_per_video = gt_classes_per_video[valid_bool_clip].long() # N,
335
+ gt_ids_per_video = gt_ids_per_video[valid_bool_clip].long() # N, num_frames
336
+ gt_masks_per_video = gt_masks_per_video[valid_bool_clip].float() # N, num_frames, H, W
337
+ valid_bool_frame = valid_bool_frame[valid_bool_clip]
338
+
339
+ if len(gt_ids_per_video) > 0:
340
+ min_id = max(gt_ids_per_video[valid_bool_frame].min(), 0)
341
+ gt_ids_per_video[valid_bool_frame] -= min_id
342
+
343
+ clip_gt_instances.append(
344
+ {
345
+ "labels": gt_classes_per_video, "ids": gt_ids_per_video, "masks": gt_masks_per_video,
346
+ "video_len": targets_per_video["length"], "frame_idx": targets_per_video["frame_idx"],
347
+ }
348
+ )
349
+
350
+ for f_i in range(self.num_frames):
351
+ _cls = gt_classes_per_video.clone()
352
+ _ids = gt_ids_per_video[:, f_i].clone()
353
+ _mask = gt_masks_per_video[:, f_i].clone()
354
+
355
+ valid = _ids != -1
356
+ frame_gt_instances.append({
357
+ "labels": _cls[valid],
358
+ "ids": _ids[valid],
359
+ "masks": _mask[valid],
360
+ })
361
+
362
+ return frame_gt_instances, clip_gt_instances
363
+
364
+ def inference(self, batched_inputs):
365
+ frame_queries, mask_features = [], []
366
+ num_frames = len(batched_inputs["image"])
367
+ to_store = self.device if num_frames <= 36 else "cpu"
368
+
369
+ audio_features = torch.tensor(batched_inputs["audio"]).to(self.device)
370
+
371
+ with torch.no_grad():
372
+ for i in range(math.ceil(num_frames / self.test_run_chunk_size)):
373
+ images = batched_inputs["image"][i*self.test_run_chunk_size : (i+1)*self.test_run_chunk_size]
374
+ images = [(x.to(self.device) - self.pixel_mean) / self.pixel_std for x in images]
375
+ images = ImageList.from_tensors(images, self.size_divisibility)
376
+
377
+ audio_features_chunk = audio_features[i*self.test_run_chunk_size : (i+1)*self.test_run_chunk_size]
378
+
379
+ features = self.backbone(images.tensor)
380
+ outputs, _frame_queries, _mask_features = self.sem_seg_head(features, audio_features_chunk)
381
+
382
+ _mask_features = self.avism_module.avism_mask_features(_mask_features)
383
+
384
+ # BT is 1 as runs per frame
385
+ frame_queries.append(_frame_queries[-1]) # T', fQ, C
386
+ mask_features.append(_mask_features.to(to_store)) # T', C, H, W
387
+
388
+ interim_size = images.tensor.shape[-2:]
389
+ image_size = images.image_sizes[0] # image size without padding after data augmentation
390
+
391
+ out_height = batched_inputs.get("height", image_size[0]) # raw image size before data augmentation
392
+ out_width = batched_inputs.get("width", image_size[1])
393
+
394
+ del outputs, images, batched_inputs
395
+
396
+ frame_queries = torch.cat(frame_queries)[None] # 1, T, fQ, C
397
+ mask_features = torch.cat(mask_features) # T, C, H, W
398
+
399
+ avism_outputs = self.avism_module(frame_queries, audio_features)
400
+
401
+ mask_cls = avism_outputs["pred_logits"][-1, 0] # cQ, K+1
402
+ mask_embed = avism_outputs["pred_mask_embed"][-1, 0] # cQ, C
403
+
404
+ del avism_outputs
405
+
406
+ scores = F.softmax(mask_cls, dim=-1)[:, :-1]
407
+ labels = torch.arange(self.sem_seg_head.num_classes, device=self.device).unsqueeze(0).repeat(self.num_queries, 1).flatten(0, 1)
408
+
409
+ num_topk = self.test_topk_per_image
410
+ scores_per_video, topk_indices = scores.flatten(0, 1).topk(num_topk, sorted=False)
411
+ labels_per_video = labels[topk_indices]
412
+
413
+ topk_indices = torch.div(topk_indices, self.sem_seg_head.num_classes, rounding_mode='floor')
414
+ mask_embed = mask_embed[topk_indices]
415
+
416
+ masks_per_video = []
417
+ numerator = torch.zeros(len(mask_embed), dtype=torch.float, device=self.device)
418
+ denominator = torch.zeros(len(mask_embed), dtype=torch.float, device=self.device)
419
+ for i in range(math.ceil(len(mask_features) / self.test_interpolate_chunk_size)):
420
+ m_f = mask_features[i*self.test_interpolate_chunk_size : (i+1)*self.test_interpolate_chunk_size].to(self.device)
421
+
422
+ mask_pred = torch.einsum("qc,tchw->qthw", mask_embed, m_f)
423
+
424
+ # upsample masks
425
+ mask_pred = retry_if_cuda_oom(F.interpolate)(
426
+ mask_pred,
427
+ size=interim_size,
428
+ mode="bilinear",
429
+ align_corners=False,
430
+ ) # cQ, T, H, W
431
+
432
+ mask_pred = mask_pred[:, :, : image_size[0], : image_size[1]]
433
+
434
+ interim_mask_soft = mask_pred.sigmoid()
435
+ interim_mask_hard = interim_mask_soft > 0.5
436
+
437
+ numerator += (interim_mask_soft.flatten(1) * interim_mask_hard.flatten(1)).sum(1)
438
+ denominator += interim_mask_hard.flatten(1).sum(1)
439
+
440
+ mask_pred = F.interpolate(
441
+ mask_pred, size=(out_height, out_width), mode="bilinear", align_corners=False
442
+ ) > 0.
443
+ masks_per_video.append(mask_pred.to(to_store))
444
+ masks_per_video = torch.cat(masks_per_video, dim=1)
445
+ scores_per_video *= (numerator / (denominator + 1e-6))
446
+
447
+ confidence = 0.3
448
+ indices = torch.nonzero(scores_per_video > confidence).squeeze(-1)
449
+ scores_per_video = scores_per_video[indices]
450
+ labels_per_video = labels_per_video[indices]
451
+ masks_per_video = masks_per_video[indices]
452
+
453
+ processed_results = {
454
+ "image_size": (out_height, out_width),
455
+ "pred_scores": scores_per_video.tolist(),
456
+ "pred_labels": labels_per_video.tolist(),
457
+ "pred_masks": masks_per_video.cpu(),
458
+ }
459
+
460
+ return processed_results
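A detail of the `inference()` method above that is easy to miss: before the 0.3 confidence filter, the per-query class scores are rescaled by a mask-quality term accumulated over chunks, namely the mean sigmoid confidence inside each binarized mask. Below is a small self-contained illustration of that rescaling step; the random tensors stand in for real predictions.

```python
import torch

mask_logits = torch.randn(5, 4, 32, 32)    # (cQ, T, H, W): 5 query masks over 4 frames
soft = mask_logits.sigmoid()
hard = soft > 0.5                          # binarized masks

# Mean soft confidence inside each predicted mask region (the numerator/denominator
# accumulation in the code above, computed here in one shot).
mask_quality = (soft.flatten(1) * hard.flatten(1)).sum(1) / (hard.flatten(1).sum(1) + 1e-6)

class_scores = torch.rand(5)                # stand-in for the softmax classification scores
final_scores = class_scores * mask_quality  # values compared against the 0.3 threshold
```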
avism/avism_model_coco.py ADDED
@@ -0,0 +1,460 @@
1
+ from typing import Tuple
2
+ import math
3
+
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ from detectron2.config import configurable
9
+ from detectron2.data import MetadataCatalog
10
+ from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head
11
+ from detectron2.modeling.backbone import Backbone
12
+ from detectron2.structures import Boxes, ImageList, Instances, BitMasks
13
+ from detectron2.utils.memory import retry_if_cuda_oom
14
+
15
+ from mask2former.modeling.criterion import SetCriterion
16
+ from mask2former.modeling.matcher import HungarianMatcher
17
+ from .modeling.avism_criterion import AvismSetCriterion
18
+ from .modeling.avism_matcher import AvismHungarianMatcher
19
+ from .modeling.transformer_decoder.avism_coco import Avism_COCO
20
+
21
+
22
+ @META_ARCH_REGISTRY.register()
23
+ class AVISM_COCO(nn.Module):
24
+ """
25
+ Main class for mask classification semantic segmentation architectures.
26
+ """
27
+
28
+ @configurable
29
+ def __init__(
30
+ self,
31
+ *,
32
+ backbone: Backbone,
33
+ sem_seg_head: nn.Module,
34
+ criterion: nn.Module,
35
+ num_queries: int,
36
+ object_mask_threshold: float,
37
+ overlap_threshold: float,
38
+ metadata,
39
+ size_divisibility: int,
40
+ pixel_mean: Tuple[float],
41
+ pixel_std: Tuple[float],
42
+ # inference
43
+ test_topk_per_image: int,
44
+ # avism
45
+ avism_module: nn.Module,
46
+ avism_criterion: nn.Module,
47
+ num_frames: int,
48
+ num_classes: int,
49
+ is_multi_cls: bool,
50
+ apply_cls_thres: float,
51
+ freeze_detector: bool,
52
+ test_run_chunk_size: int,
53
+ test_interpolate_chunk_size: int,
54
+ is_coco: bool,
55
+ ):
56
+ """
57
+ Args:
58
+ backbone: a backbone module, must follow detectron2's backbone interface
59
+ sem_seg_head: a module that predicts semantic segmentation from backbone features
60
+ criterion: a module that defines the loss
61
+ num_queries: int, number of queries
62
+ object_mask_threshold: float, threshold to filter query based on classification score
63
+ for panoptic segmentation inference
64
+ overlap_threshold: overlap threshold used in general inference for panoptic segmentation
65
+ metadata: dataset meta, get `thing` and `stuff` category names for panoptic
66
+ segmentation inference
67
+ size_divisibility: Some backbones require the input height and width to be divisible by a
68
+ specific integer. We can use this to override such requirement.
69
+ pixel_mean, pixel_std: list or tuple with #channels element, representing
70
+ the per-channel mean and std to be used to normalize the input image
71
+ test_topk_per_image: int, instance segmentation parameter, keep topk instances per image
72
+ """
73
+ super().__init__()
74
+ self.backbone = backbone
75
+ self.sem_seg_head = sem_seg_head
76
+ self.criterion = criterion
77
+ self.num_queries = num_queries
78
+ self.overlap_threshold = overlap_threshold
79
+ self.object_mask_threshold = object_mask_threshold
80
+ self.metadata = metadata
81
+ if size_divisibility < 0:
82
+ # use backbone size_divisibility if not set
83
+ size_divisibility = self.backbone.size_divisibility
84
+ self.size_divisibility = size_divisibility
85
+ self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
86
+ self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)
87
+
88
+ # additional args
89
+ self.test_topk_per_image = test_topk_per_image
90
+
91
+ # avism hyper-parameters
92
+ self.num_frames = num_frames
93
+ self.num_classes = num_classes
94
+ self.vita_module = avism_module
95
+ self.vita_criterion = avism_criterion
96
+ self.is_multi_cls = is_multi_cls
97
+ self.apply_cls_thres = apply_cls_thres
98
+
99
+ if freeze_detector:
100
+ for name, p in self.named_parameters():
101
+ if not "vita_module" in name:
102
+ p.requires_grad_(False)
103
+ self.test_run_chunk_size = test_run_chunk_size
104
+ self.test_interpolate_chunk_size = test_interpolate_chunk_size
105
+
106
+ self.is_coco = is_coco
107
+
108
+ @classmethod
109
+ def from_config(cls, cfg):
110
+ backbone = build_backbone(cfg)
111
+ sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())
112
+
113
+ # Loss parameters:
114
+ deep_supervision = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
115
+ no_object_weight = cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT
116
+ avism_deep_supervision = cfg.MODEL.AVISM.DEEP_SUPERVISION
117
+
118
+ # loss weights
119
+ class_weight = cfg.MODEL.MASK_FORMER.CLASS_WEIGHT
120
+ dice_weight = cfg.MODEL.MASK_FORMER.DICE_WEIGHT
121
+ mask_weight = cfg.MODEL.MASK_FORMER.MASK_WEIGHT
122
+ sim_weight = cfg.MODEL.AVISM.SIM_WEIGHT
123
+
124
+ # building criterion
125
+ matcher = HungarianMatcher(
126
+ cost_class=class_weight,
127
+ cost_mask=mask_weight,
128
+ cost_dice=dice_weight,
129
+ num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS,
130
+ )
131
+
132
+ weight_dict = {"loss_ce": class_weight, "loss_mask": mask_weight, "loss_dice": dice_weight}
133
+
134
+ if deep_supervision:
135
+ dec_layers = cfg.MODEL.MASK_FORMER.DEC_LAYERS
136
+ aux_weight_dict = {}
137
+ for i in range(dec_layers - 1):
138
+ aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
139
+ weight_dict.update(aux_weight_dict)
140
+
141
+ losses = ["labels", "masks"]
142
+
143
+ criterion = SetCriterion(
144
+ sem_seg_head.num_classes,
145
+ matcher=matcher,
146
+ weight_dict=weight_dict,
147
+ eos_coef=no_object_weight,
148
+ losses=losses,
149
+ num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS,
150
+ oversample_ratio=cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO,
151
+ importance_sample_ratio=cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO,
152
+ avism_last_layer_num=cfg.MODEL.AVISM.LAST_LAYER_NUM,
153
+ )
154
+
155
+ # Avism
156
+ num_classes = sem_seg_head.num_classes
157
+ hidden_dim = cfg.MODEL.MASK_FORMER.HIDDEN_DIM
158
+ avism_module = Avism_COCO(cfg=cfg, in_channels=hidden_dim, aux_loss=avism_deep_supervision)
159
+
160
+ # building criterion for avism inference
161
+ avism_matcher = AvismHungarianMatcher(
162
+ cost_class=class_weight,
163
+ cost_mask=mask_weight,
164
+ cost_dice=dice_weight,
165
+ num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS,
166
+ )
167
+ avism_weight_dict = {
168
+ "loss_avism_ce": class_weight, "loss_avism_mask": mask_weight, "loss_avism_dice": dice_weight
169
+ }
170
+ if sim_weight > 0.0:
171
+ avism_weight_dict["loss_avism_sim"] = sim_weight
172
+
173
+ if avism_deep_supervision:
174
+ avism_dec_layers = cfg.MODEL.AVISM.DEC_LAYERS
175
+ aux_weight_dict = {}
176
+ for i in range(avism_dec_layers - 1):
177
+ aux_weight_dict.update({k + f"_{i}": v for k, v in avism_weight_dict.items()})
178
+ avism_weight_dict.update(aux_weight_dict)
179
+ avism_losses = ["avism_labels", "avism_masks"]
180
+ if sim_weight > 0.0:
181
+ avism_losses.append("fg_sim")
182
+
183
+ avism_criterion = AvismSetCriterion(
184
+ num_classes,
185
+ matcher=avism_matcher,
186
+ weight_dict=avism_weight_dict,
187
+ eos_coef=cfg.MODEL.AVISM.NO_OBJECT_WEIGHT,
188
+ losses=avism_losses,
189
+ num_points=cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS,
190
+ oversample_ratio=cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO,
191
+ importance_sample_ratio=cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO,
192
+ sim_use_clip=cfg.MODEL.AVISM.SIM_USE_CLIP,
193
+ )
194
+
195
+ return {
196
+ "backbone": backbone,
197
+ "sem_seg_head": sem_seg_head,
198
+ "criterion": criterion,
199
+ "num_queries": cfg.MODEL.AVISM.NUM_OBJECT_QUERIES,
200
+ "object_mask_threshold": cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD,
201
+ "overlap_threshold": cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD,
202
+ "metadata": MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
203
+ "size_divisibility": cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY,
204
+ "pixel_mean": cfg.MODEL.PIXEL_MEAN,
205
+ "pixel_std": cfg.MODEL.PIXEL_STD,
206
+ # inference
207
+ "test_topk_per_image": cfg.TEST.DETECTIONS_PER_IMAGE,
208
+ # avism
209
+ "avism_module": avism_module,
210
+ "avism_criterion": avism_criterion,
211
+ "num_frames": cfg.INPUT.SAMPLING_FRAME_NUM,
212
+ "num_classes": num_classes,
213
+ "is_multi_cls": cfg.MODEL.AVISM.MULTI_CLS_ON,
214
+ "apply_cls_thres": cfg.MODEL.AVISM.APPLY_CLS_THRES,
215
+ "freeze_detector": cfg.MODEL.AVISM.FREEZE_DETECTOR,
216
+ "test_run_chunk_size": cfg.MODEL.AVISM.TEST_RUN_CHUNK_SIZE,
217
+ "test_interpolate_chunk_size": cfg.MODEL.AVISM.TEST_INTERPOLATE_CHUNK_SIZE,
218
+ "is_coco": cfg.DATASETS.TEST[0].startswith("coco"),
219
+ }
220
+
221
+ @property
222
+ def device(self):
223
+ return self.pixel_mean.device
224
+
225
+ def forward(self, batched_inputs):
226
+ """
227
+ Args:
228
+ batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
229
+ Each item in the list contains the inputs for one image.
230
+ For now, each item in the list is a dict that contains:
231
+ * "image": Tensor, image in (C, H, W) format.
232
+ * "instances": per-region ground truth
233
+ * Other information that's included in the original dicts, such as:
234
+ "height", "width" (int): the output resolution of the model (may be different
235
+ from input resolution), used in inference.
236
+ Returns:
237
+ list[dict]:
238
+ each dict has the results for one image. The dict contains the following keys:
239
+
240
+ * "sem_seg":
241
+ A Tensor that represents the
242
+ per-pixel segmentation prediced by the head.
243
+ The prediction has shape KxHxW that represents the logits of
244
+ each class for each pixel.
245
+ * "panoptic_seg":
246
+ A tuple that represent panoptic output
247
+ panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
248
+ segments_info (list[dict]): Describe each segment in `panoptic_seg`.
249
+ Each dict contains keys "id", "category_id", "isthing".
250
+ """
251
+ if self.training:
252
+ return self.train_model(batched_inputs)
253
+ else:
254
+ # NOTE consider only B=1 case.
255
+ return self.inference(batched_inputs[0])
256
+
257
+ def train_model(self, batched_inputs):
258
+ images = []
259
+ audio_features = []
260
+ for video in batched_inputs:
261
+ for frame in video["image"]:
262
+ images.append(frame.to(self.device))
263
+ for audio_feat in video["audio"]:
264
+ audio_features.append(torch.tensor(audio_feat).to(self.device))
265
+
266
+ audio_features = torch.stack(audio_features)
267
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
268
+ images = ImageList.from_tensors(images, self.size_divisibility)
269
+ image_features = self.backbone(images.tensor)
270
+
271
+ BT = len(images)
272
+ T = self.num_frames if self.training else BT
273
+ B = BT // T
274
+
275
+ outputs, frame_queries, mask_features = self.sem_seg_head(image_features, audio_features)
276
+
277
+ mask_features = self.vita_module.vita_mask_features(mask_features)
278
+ mask_features = mask_features.view(B, self.num_frames, *mask_features.shape[-3:])
279
+
280
+ # mask classification target
281
+ frame_targets, clip_targets = self.prepare_targets(batched_inputs, images)
282
+
283
+ # bipartite matching-based loss
284
+ losses, fg_indices = self.criterion(outputs, frame_targets)
285
+
286
+ avism_outputs = self.vita_module(frame_queries, audio_features)
287
+ avism_outputs["pred_masks"] = torch.einsum("lbqc,btchw->lbqthw", avism_outputs["pred_mask_embed"], mask_features)
288
+ for out in avism_outputs["aux_outputs"]:
289
+ out["pred_masks"] = torch.einsum("lbqc,btchw->lbqthw", out["pred_mask_embed"], mask_features)
290
+
291
+ for k in list(losses.keys()):
292
+ if k in self.criterion.weight_dict:
293
+ losses[k] *= self.criterion.weight_dict[k]
294
+ else:
295
+ # remove this loss if not specified in `weight_dict`
296
+ losses.pop(k)
297
+ avism_loss_dict = self.vita_criterion(avism_outputs, clip_targets, frame_targets, fg_indices)
298
+ avism_weight_dict = self.vita_criterion.weight_dict
299
+
300
+ for k in avism_loss_dict.keys():
301
+ if k in avism_weight_dict:
302
+ avism_loss_dict[k] *= avism_weight_dict[k]
303
+ losses.update(avism_loss_dict)
304
+ return losses
305
+
306
+ def prepare_targets(self, targets, images):
307
+ h_pad, w_pad = images.tensor.shape[-2:]
308
+ frame_gt_instances = []
309
+ clip_gt_instances = []
310
+ for targets_per_video in targets:
311
+ _num_instance = len(targets_per_video["instances"][0])
312
+ mask_shape = [_num_instance, self.num_frames, h_pad, w_pad]
313
+ gt_masks_per_video = torch.zeros(mask_shape, dtype=torch.bool, device=self.device)
314
+
315
+ gt_classes_per_video = targets_per_video["instances"][0].gt_classes.to(self.device)
316
+ gt_ids_per_video = []
317
+ for f_i, targets_per_frame in enumerate(targets_per_video["instances"]):
318
+ targets_per_frame = targets_per_frame.to(self.device)
319
+ h, w = targets_per_frame.image_size
320
+
321
+ _update_cls = gt_classes_per_video == -1
322
+ gt_classes_per_video[_update_cls] = targets_per_frame.gt_classes[_update_cls]
323
+ gt_ids_per_video.append(targets_per_frame.gt_ids)
324
+ if isinstance(targets_per_frame.gt_masks, BitMasks):
325
+ gt_masks_per_video[:, f_i, :h, :w] = targets_per_frame.gt_masks.tensor
326
+ else: #polygon
327
+ gt_masks_per_video[:, f_i, :h, :w] = targets_per_frame.gt_masks
328
+
329
+ gt_ids_per_video = torch.stack(gt_ids_per_video, dim=1)
330
+ gt_ids_per_video[gt_masks_per_video.sum(dim=(2,3)) == 0] = -1
331
+ valid_bool_frame = (gt_ids_per_video != -1)
332
+ valid_bool_clip = valid_bool_frame.any(dim=-1)
333
+
334
+ gt_classes_per_video = gt_classes_per_video[valid_bool_clip].long() # N,
335
+ gt_ids_per_video = gt_ids_per_video[valid_bool_clip].long() # N, num_frames
336
+ gt_masks_per_video = gt_masks_per_video[valid_bool_clip].float() # N, num_frames, H, W
337
+ valid_bool_frame = valid_bool_frame[valid_bool_clip]
338
+
339
+ if len(gt_ids_per_video) > 0:
340
+ min_id = max(gt_ids_per_video[valid_bool_frame].min(), 0)
341
+ gt_ids_per_video[valid_bool_frame] -= min_id
342
+
343
+ clip_gt_instances.append(
344
+ {
345
+ "labels": gt_classes_per_video, "ids": gt_ids_per_video, "masks": gt_masks_per_video,
346
+ "video_len": targets_per_video["length"], "frame_idx": targets_per_video["frame_idx"],
347
+ }
348
+ )
349
+
350
+ for f_i in range(self.num_frames):
351
+ _cls = gt_classes_per_video.clone()
352
+ _ids = gt_ids_per_video[:, f_i].clone()
353
+ _mask = gt_masks_per_video[:, f_i].clone()
354
+
355
+ valid = _ids != -1
356
+ frame_gt_instances.append({
357
+ "labels": _cls[valid],
358
+ "ids": _ids[valid],
359
+ "masks": _mask[valid],
360
+ })
361
+
362
+ return frame_gt_instances, clip_gt_instances
363
+
364
+ def inference(self, batched_inputs):
365
+ frame_queries, mask_features = [], []
366
+ num_frames = len(batched_inputs["image"])
367
+ to_store = self.device if num_frames <= 36 else "cpu"
368
+
369
+ audio_features = torch.tensor(batched_inputs["audio"]).to(self.device)
370
+
371
+ with torch.no_grad():
372
+ for i in range(math.ceil(num_frames / self.test_run_chunk_size)):
373
+ images = batched_inputs["image"][i*self.test_run_chunk_size : (i+1)*self.test_run_chunk_size]
374
+ images = [(x.to(self.device) - self.pixel_mean) / self.pixel_std for x in images]
375
+ images = ImageList.from_tensors(images, self.size_divisibility)
376
+
377
+ audio_features_chunk = audio_features[i*self.test_run_chunk_size : (i+1)*self.test_run_chunk_size]
378
+
379
+ features = self.backbone(images.tensor)
380
+ outputs, _frame_queries, _mask_features = self.sem_seg_head(features, audio_features_chunk)
381
+
382
+ _mask_features = self.vita_module.vita_mask_features(_mask_features)
383
+
384
+ # BT is 1 as runs per frame
385
+ frame_queries.append(_frame_queries[-1]) # T', fQ, C
386
+ mask_features.append(_mask_features.to(to_store)) # T', C, H, W
387
+
388
+ interim_size = images.tensor.shape[-2:]
389
+ image_size = images.image_sizes[0] # image size without padding after data augmentation
390
+
391
+ out_height = batched_inputs.get("height", image_size[0]) # raw image size before data augmentation
392
+ out_width = batched_inputs.get("width", image_size[1])
393
+
394
+ del outputs, images, batched_inputs
395
+
396
+ frame_queries = torch.cat(frame_queries)[None] # 1, T, fQ, C
397
+ mask_features = torch.cat(mask_features) # T, C, H, W
398
+
399
+ avism_outputs = self.vita_module(frame_queries, audio_features)
400
+
401
+ mask_cls = avism_outputs["pred_logits"][-1, 0] # cQ, K+1
402
+ mask_embed = avism_outputs["pred_mask_embed"][-1, 0] # cQ, C
403
+
404
+ del avism_outputs
405
+
406
+ scores = F.softmax(mask_cls, dim=-1)[:, :-1]
407
+ labels = torch.arange(self.sem_seg_head.num_classes, device=self.device).unsqueeze(0).repeat(self.num_queries, 1).flatten(0, 1)
408
+
409
+ num_topk = self.test_topk_per_image
410
+ scores_per_video, topk_indices = scores.flatten(0, 1).topk(num_topk, sorted=False)
411
+ labels_per_video = labels[topk_indices]
412
+
413
+ topk_indices = torch.div(topk_indices, self.sem_seg_head.num_classes, rounding_mode='floor')
414
+ mask_embed = mask_embed[topk_indices]
415
+
416
+ masks_per_video = []
417
+ numerator = torch.zeros(len(mask_embed), dtype=torch.float, device=self.device)
418
+ denominator = torch.zeros(len(mask_embed), dtype=torch.float, device=self.device)
419
+ for i in range(math.ceil(len(mask_features) / self.test_interpolate_chunk_size)):
420
+ m_f = mask_features[i*self.test_interpolate_chunk_size : (i+1)*self.test_interpolate_chunk_size].to(self.device)
421
+
422
+ mask_pred = torch.einsum("qc,tchw->qthw", mask_embed, m_f)
423
+
424
+ # upsample masks
425
+ mask_pred = retry_if_cuda_oom(F.interpolate)(
426
+ mask_pred,
427
+ size=interim_size,
428
+ mode="bilinear",
429
+ align_corners=False,
430
+ ) # cQ, T, H, W
431
+
432
+ mask_pred = mask_pred[:, :, : image_size[0], : image_size[1]]
433
+
434
+ interim_mask_soft = mask_pred.sigmoid()
435
+ interim_mask_hard = interim_mask_soft > 0.5
436
+
437
+ numerator += (interim_mask_soft.flatten(1) * interim_mask_hard.flatten(1)).sum(1)
438
+ denominator += interim_mask_hard.flatten(1).sum(1)
439
+
440
+ mask_pred = F.interpolate(
441
+ mask_pred, size=(out_height, out_width), mode="bilinear", align_corners=False
442
+ ) > 0.
443
+ masks_per_video.append(mask_pred.to(to_store))
444
+ masks_per_video = torch.cat(masks_per_video, dim=1)
445
+ scores_per_video *= (numerator / (denominator + 1e-6))
446
+
447
+ confidence = 0.3
448
+ indices = torch.nonzero(scores_per_video > confidence).squeeze(-1)
449
+ scores_per_video = scores_per_video[indices]
450
+ labels_per_video = labels_per_video[indices]
451
+ masks_per_video = masks_per_video[indices]
452
+
453
+ processed_results = {
454
+ "image_size": (out_height, out_width),
455
+ "pred_scores": scores_per_video.tolist(),
456
+ "pred_labels": labels_per_video.tolist(),
457
+ "pred_masks": masks_per_video.cpu(),
458
+ }
459
+
460
+ return processed_results
avism/config.py ADDED
@@ -0,0 +1,56 @@
+ # -*- coding: utf-8 -*-
+ from detectron2.config import CfgNode as CN
+
+
+ def add_avism_config(cfg):
+     cfg.DATASETS.DATASET_RATIO = []
+
+     # DataLoader
+     cfg.INPUT.SAMPLING_FRAME_NUM = 2
+     cfg.INPUT.SAMPLING_FRAME_RANGE = 20
+     cfg.INPUT.SAMPLING_FRAME_SHUFFLE = False
+     cfg.INPUT.AUGMENTATIONS = []  # "brightness", "contrast", "saturation", "rotation"
+
+     # Pseudo Data Use
+     cfg.INPUT.PSEUDO = CN()
+     cfg.INPUT.PSEUDO.AUGMENTATIONS = ['rotation']
+     cfg.INPUT.PSEUDO.MIN_SIZE_TRAIN = (480, 512, 544, 576, 608, 640, 672, 704, 736, 768)
+     cfg.INPUT.PSEUDO.MAX_SIZE_TRAIN = 768
+     cfg.INPUT.PSEUDO.MIN_SIZE_TRAIN_SAMPLING = "choice_by_clip"
+     cfg.INPUT.PSEUDO.CROP = CN()
+     cfg.INPUT.PSEUDO.CROP.ENABLED = False
+     cfg.INPUT.PSEUDO.CROP.TYPE = "absolute_range"
+     cfg.INPUT.PSEUDO.CROP.SIZE = (384, 600)
+
+     # LSJ
+     cfg.INPUT.LSJ_AUG = CN()
+     cfg.INPUT.LSJ_AUG.ENABLED = False
+     cfg.INPUT.LSJ_AUG.IMAGE_SIZE = 1024
+     cfg.INPUT.LSJ_AUG.MIN_SCALE = 0.1
+     cfg.INPUT.LSJ_AUG.MAX_SCALE = 2.0
+
+     # AVISM
+     cfg.MODEL.AVISM = CN()
+     cfg.MODEL.AVISM.NHEADS = 8
+     cfg.MODEL.AVISM.DROPOUT = 0.0
+     cfg.MODEL.AVISM.DIM_FEEDFORWARD = 2048
+     cfg.MODEL.AVISM.ENC_LAYERS = 6
+     cfg.MODEL.AVISM.DEC_LAYERS = 3
+     cfg.MODEL.AVISM.ENC_WINDOW_SIZE = 0
+     cfg.MODEL.AVISM.PRE_NORM = False
+     cfg.MODEL.AVISM.HIDDEN_DIM = 256
+     cfg.MODEL.AVISM.NUM_OBJECT_QUERIES = 100
+     cfg.MODEL.AVISM.ENFORCE_INPUT_PROJ = True
+
+     cfg.MODEL.AVISM.NO_OBJECT_WEIGHT = 0.1
+     cfg.MODEL.AVISM.DEEP_SUPERVISION = True
+     cfg.MODEL.AVISM.LAST_LAYER_NUM = 3
+     cfg.MODEL.AVISM.MULTI_CLS_ON = True
+     cfg.MODEL.AVISM.APPLY_CLS_THRES = 0.01
+
+     cfg.MODEL.AVISM.SIM_USE_CLIP = True
+     cfg.MODEL.AVISM.SIM_WEIGHT = 0.5
+
+     cfg.MODEL.AVISM.FREEZE_DETECTOR = False
+     cfg.MODEL.AVISM.TEST_RUN_CHUNK_SIZE = 18
+     cfg.MODEL.AVISM.TEST_INTERPOLATE_CHUNK_SIZE = 5
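For context, a minimal sketch of how this hook is used: `add_avism_config` simply attaches AVISM's extra keys to a stock detectron2 `CfgNode` before a YAML from `configs/avism/` is merged in. This assumes a working detectron2 install; the actual `train_net.py` setup (not shown in this view) presumably also registers the Mask2Former config keys.

```python
# Minimal sketch, assuming this mirrors how the training script wires the config hook.
from detectron2.config import get_cfg
from avism.config import add_avism_config   # imported directly to avoid loading model code

cfg = get_cfg()        # stock detectron2 config
add_avism_config(cfg)  # attaches cfg.MODEL.AVISM, cfg.INPUT.SAMPLING_FRAME_NUM, etc.

print(cfg.MODEL.AVISM.NUM_OBJECT_QUERIES)  # 100
print(cfg.INPUT.SAMPLING_FRAME_NUM)        # 2
```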
avism/data/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .datasets import *
+ from .dataset_mapper import AVISDatasetMapper
+ from .build import *
+ from .avis_eval import AVISEvaluator
avism/data/augmentation.py ADDED
@@ -0,0 +1,623 @@
1
+ import copy
2
+ import numpy as np
3
+ import logging
4
+ import sys
5
+ from fvcore.transforms.transform import (
6
+ HFlipTransform,
7
+ NoOpTransform,
8
+ VFlipTransform,
9
+ )
10
+ from PIL import Image
11
+ from typing import Tuple
12
+ from fvcore.transforms.transform import (
13
+ BlendTransform,
14
+ CropTransform,
15
+ HFlipTransform,
16
+ NoOpTransform,
17
+ PadTransform,
18
+ Transform,
19
+ TransformList,
20
+ VFlipTransform,
21
+ )
22
+
23
+ from detectron2.data import transforms as T
24
+
25
+
26
+ class RandomApplyClip(T.Augmentation):
27
+ """
28
+ Randomly apply an augmentation with a given probability.
29
+ """
30
+
31
+ def __init__(self, tfm_or_aug, prob=0.5, clip_frame_cnt=1):
32
+ """
33
+ Args:
34
+ tfm_or_aug (Transform, Augmentation): the transform or augmentation
35
+ to be applied. It can either be a `Transform` or `Augmentation`
36
+ instance.
37
+ prob (float): probability between 0.0 and 1.0 that
38
+ the wrapper transformation is applied
39
+ """
40
+ super().__init__()
41
+ self.aug = T.augmentation._transform_to_aug(tfm_or_aug)
42
+ assert 0.0 <= prob <= 1.0, f"Probability must be between 0.0 and 1.0 (given: {prob})"
43
+ self.prob = prob
44
+ self._cnt = 0
45
+ self.clip_frame_cnt = clip_frame_cnt
46
+
47
+ def get_transform(self, *args):
48
+ if self._cnt % self.clip_frame_cnt == 0:
49
+ self.do = self._rand_range() < self.prob
50
+ self._cnt = 0 # avoiding overflow
51
+ self._cnt += 1
52
+
53
+ if self.do:
54
+ return self.aug.get_transform(*args)
55
+ else:
56
+ return NoOpTransform()
57
+
58
+ def __call__(self, aug_input):
59
+ if self._cnt % self.clip_frame_cnt == 0:
60
+ self.do = self._rand_range() < self.prob
61
+ self._cnt = 0 # avoiding overflow
62
+ self._cnt += 1
63
+
64
+ if self.do:
65
+ return self.aug(aug_input)
66
+ else:
67
+ return NoOpTransform()
68
+
69
+
70
+ class RandomRotationClip(T.Augmentation):
71
+ """
72
+ This method returns a copy of this image, rotated the given
73
+ number of degrees counter clockwise around the given center.
74
+ """
75
+
76
+ def __init__(self, angle, prob=0.5, expand=True, center=None, interp=None, clip_frame_cnt=1):
77
+ """
78
+ Args:
79
+ angle (list[float]): a [min, max] interval (in degrees); one angle per frame
+     is sampled from it and the angles are sorted so the rotation changes
+     monotonically across the clip.
+ prob (float): probability of reversing the sorted angle sequence.
+ expand (bool): choose if the image should be resized to fit the whole
+     rotated image (default), or simply cropped
+ center (list[[float, float]]): a [[minx, miny], [maxx, maxy]] relative interval
+     from which to sample one rotation center per clip, [0, 0] being the top
+     left of the image and [1, 1] the bottom right.
+     Default: None, which means that the center of rotation is the center of the image.
+     center has no effect if expand=True because it only affects shifting
+ clip_frame_cnt (int): number of consecutive frames that share one sampled rotation trajectory.
90
+ """
91
+ super().__init__()
92
+ if isinstance(angle, (float, int)):
93
+ angle = (angle, angle)
94
+ if center is not None and isinstance(center[0], (float, int)):
95
+ center = (center, center)
96
+ self.angle_save = None
97
+ self.center_save = None
98
+ self._cnt = 0
99
+ self._init(locals())
100
+
101
+ def get_transform(self, image):
102
+ h, w = image.shape[:2]
103
+ if self._cnt % self.clip_frame_cnt == 0:
104
+ center = None
105
+ angle = np.random.uniform(self.angle[0], self.angle[1], size=self.clip_frame_cnt)
106
+ if self.center is not None:
107
+ center = (
108
+ np.random.uniform(self.center[0][0], self.center[1][0]),
109
+ np.random.uniform(self.center[0][1], self.center[1][1]),
110
+ )
111
+ angle = np.sort(angle)
112
+ if self._rand_range() < self.prob:
113
+ angle = angle[::-1]
114
+ self.angle_save = angle
115
+ self.center_save = center
116
+
117
+ self._cnt = 0 # avoiding overflow
118
+
119
+ angle = self.angle_save[self._cnt]
120
+ center = self.center_save
121
+
122
+ self._cnt += 1
123
+
124
+ if center is not None:
125
+ center = (w * center[0], h * center[1]) # Convert to absolute coordinates
126
+
127
+ if angle % 360 == 0:
128
+ return NoOpTransform()
129
+
130
+ return T.RotationTransform(h, w, angle, expand=self.expand, center=center, interp=self.interp)
131
+
132
+
133
+ class ResizeScaleClip(T.Augmentation):
134
+ """
135
+ Takes target size as input and randomly scales the given target size between `min_scale`
136
+ and `max_scale`. It then scales the input image such that it fits inside the scaled target
137
+ box, keeping the aspect ratio constant.
138
+ This implements the resize part of the Google's 'resize_and_crop' data augmentation:
139
+ https://github.com/tensorflow/tpu/blob/master/models/official/detection/utils/input_utils.py#L127
140
+ """
141
+
142
+ def __init__(
143
+ self,
144
+ min_scale: float,
145
+ max_scale: float,
146
+ target_height: int,
147
+ target_width: int,
148
+ interp: int = Image.BILINEAR,
149
+ clip_frame_cnt=1,
150
+ ):
151
+ """
152
+ Args:
153
+ min_scale: minimum image scale range.
154
+ max_scale: maximum image scale range.
155
+ target_height: target image height.
156
+ target_width: target image width.
157
+ interp: image interpolation method.
158
+ """
159
+ super().__init__()
160
+ self._init(locals())
161
+ self._cnt = 0
162
+
163
+ def _get_resize(self, image: np.ndarray, scale: float):
164
+ input_size = image.shape[:2]
165
+
166
+ # Compute new target size given a scale.
167
+ target_size = (self.target_height, self.target_width)
168
+ target_scale_size = np.multiply(target_size, scale)
169
+
170
+ # Compute actual rescaling applied to input image and output size.
171
+ output_scale = np.minimum(
172
+ target_scale_size[0] / input_size[0], target_scale_size[1] / input_size[1]
173
+ )
174
+ output_size = np.round(np.multiply(input_size, output_scale)).astype(int)
175
+
176
+ return T.ResizeTransform(
177
+ input_size[0], input_size[1], output_size[0], output_size[1], self.interp
178
+ )
179
+
180
+ def get_transform(self, image: np.ndarray):
181
+ if self._cnt % self.clip_frame_cnt == 0:
182
+ random_scale = np.random.uniform(self.min_scale, self.max_scale)
183
+ self.random_scale_save = random_scale
184
+
185
+ self._cnt = 0 # avoiding overflow
186
+ self._cnt += 1
187
+ random_scale = self.random_scale_save
188
+
189
+ return self._get_resize(image, random_scale)
190
+
191
+
192
+ class RandomCropClip(T.Augmentation):
193
+ """
194
+ Randomly crop a rectangle region out of an image.
195
+ """
196
+
197
+ def __init__(self, crop_type: str, crop_size, clip_frame_cnt=1):
198
+ """
199
+ Args:
200
+ crop_type (str): one of "relative_range", "relative", "absolute", "absolute_range".
201
+ crop_size (tuple[float, float]): two floats, explained below.
202
+ - "relative": crop a (H * crop_size[0], W * crop_size[1]) region from an input image of
203
+ size (H, W). crop size should be in (0, 1]
204
+ - "relative_range": uniformly sample two values from [crop_size[0], 1]
205
+ and [crop_size[1], 1], and use them as in "relative" crop type.
206
+ - "absolute" crop a (crop_size[0], crop_size[1]) region from input image.
207
+ crop_size must be smaller than the input image size.
208
+ - "absolute_range", for an input of size (H, W), uniformly sample H_crop in
209
+ [crop_size[0], min(H, crop_size[1])] and W_crop in [crop_size[0], min(W, crop_size[1])].
210
+ Then crop a region (H_crop, W_crop).
211
+ """
212
+ # TODO style of relative_range and absolute_range are not consistent:
213
+ # one takes (h, w) but another takes (min, max)
214
+ super().__init__()
215
+ assert crop_type in ["relative_range", "relative", "absolute", "absolute_range"]
216
+ self._init(locals())
217
+ self._cnt = 0
218
+
219
+ def get_transform(self, image):
220
+ h, w = image.shape[:2]
221
+ if self._cnt % self.clip_frame_cnt == 0:
222
+ croph, cropw = self.get_crop_size((h, w))
223
+ assert h >= croph and w >= cropw, "Shape computation in {} has bugs.".format(self)
224
+
225
+ h0 = np.random.randint(h - croph + 1)
+ w0 = np.random.randint(w - cropw + 1)
227
+
228
+ h1 = np.random.randint(h0, h - croph + 1)
229
+ w1 = np.random.randint(w0, w - cropw + 1)
230
+
231
+ x = np.sort(np.random.rand(self.clip_frame_cnt))
232
+
233
+ h = h0 * x + h1 * (1-x)
234
+ w = w0 * x + w1 * (1-x)
235
+ h = np.round_(h).astype(int)
236
+ w = np.round_(w).astype(int)
237
+
238
+ if self._rand_range() < 0.5:
239
+ h = h[::-1]
240
+ w = w[::-1]
241
+
242
+ self.hw_save = (h, w)
243
+ self.crop_h_save, self.crop_w_save = croph, cropw
244
+ self._cnt = 0 # avoiding overflow
245
+ _h, _w = self.hw_save[0][self._cnt], self.hw_save[1][self._cnt]
246
+ self._cnt += 1
247
+
248
+ return T.CropTransform(_w, _h, self.crop_w_save, self.crop_h_save)
249
+
250
+ def get_crop_size(self, image_size):
251
+ """
252
+ Args:
253
+ image_size (tuple): height, width
254
+ Returns:
255
+ crop_size (tuple): height, width in absolute pixels
256
+ """
257
+ h, w = image_size
258
+ if self.crop_type == "relative":
259
+ ch, cw = self.crop_size
260
+ return int(h * ch + 0.5), int(w * cw + 0.5)
261
+ elif self.crop_type == "relative_range":
262
+ crop_size = np.asarray(self.crop_size, dtype=float)
263
+ ch, cw = crop_size + np.random.rand(2) * (1 - crop_size)
264
+ return int(h * ch + 0.5), int(w * cw + 0.5)
265
+ elif self.crop_type == "absolute":
266
+ return (min(self.crop_size[0], h), min(self.crop_size[1], w))
267
+ elif self.crop_type == "absolute_range":
268
+ assert self.crop_size[0] <= self.crop_size[1]
269
+ ch = np.random.randint(min(h, self.crop_size[0]), min(h, self.crop_size[1]) + 1)
270
+ cw = np.random.randint(min(w, self.crop_size[0]), min(w, self.crop_size[1]) + 1)
271
+ return ch, cw
272
+ else:
273
+ raise NotImplementedError("Unknown crop type {}".format(self.crop_type))
274
+
275
+
276
+ class FixedSizeCropClip(T.Augmentation):
277
+ """
278
+ If `crop_size` is smaller than the input image size, then it uses a random crop of
279
+ the crop size. If `crop_size` is larger than the input image size, then it pads
280
+ the right and the bottom of the image to the crop size if `pad` is True, otherwise
281
+ it returns the smaller image.
282
+ """
283
+
284
+ def __init__(self, crop_size: Tuple[int], pad: bool = True, pad_value: float = 128.0, clip_frame_cnt=1):
285
+ """
286
+ Args:
287
+ crop_size: target image (height, width).
288
+ pad: if True, will pad images smaller than `crop_size` up to `crop_size`
289
+ pad_value: the padding value.
290
+ """
291
+ super().__init__()
292
+ self._init(locals())
293
+ self._cnt = 0
294
+
295
+ def _get_crop(self, image: np.ndarray):
296
+ # Compute the image scale and scaled size.
297
+ input_size = image.shape[:2]
298
+ output_size = self.crop_size
299
+
300
+ # Add random crop if the image is scaled up.
301
+ max_offset = np.subtract(input_size, output_size)
302
+ max_offset = np.maximum(max_offset, 0)
303
+
304
+ if self._cnt % self.clip_frame_cnt == 0:
305
+ offset = np.multiply(max_offset, np.random.uniform(0.0, 1.0))
306
+ offset = np.round(offset).astype(int)
307
+ self.offset_save = offset
308
+ self._cnt = 0 # avoiding overflow
309
+ self._cnt += 1
310
+ offset = self.offset_save
311
+ return CropTransform(
312
+ offset[1], offset[0], output_size[1], output_size[0], input_size[1], input_size[0]
313
+ )
314
+
315
+ def _get_pad(self, image: np.ndarray):
316
+ # Compute the image scale and scaled size.
317
+ input_size = image.shape[:2]
318
+ output_size = self.crop_size
319
+
320
+ # Add padding if the image is scaled down.
321
+ pad_size = np.subtract(output_size, input_size)
322
+ pad_size = np.maximum(pad_size, 0)
323
+ original_size = np.minimum(input_size, output_size)
324
+ return PadTransform(
325
+ 0, 0, pad_size[1], pad_size[0], original_size[1], original_size[0], self.pad_value
326
+ )
327
+
328
+ def get_transform(self, image: np.ndarray):
329
+ transforms = [self._get_crop(image)]
330
+ if self.pad:
331
+ transforms.append(self._get_pad(image))
332
+ return TransformList(transforms)
333
+
334
+
335
+ class ResizeShortestEdgeClip(T.Augmentation):
336
+ """
337
+ Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge.
338
+ If `max_size` is reached, then downscale so that the longer edge does not exceed max_size.
339
+ """
340
+
341
+ def __init__(
342
+ self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR, clip_frame_cnt=1
343
+ ):
344
+ """
345
+ Args:
346
+ short_edge_length (list[int]): If ``sample_style=="range"``,
347
+ a [min, max] interval from which to sample the shortest edge length.
348
+ If ``sample_style=="choice"``, a list of shortest edge lengths to sample from.
349
+ max_size (int): maximum allowed longest edge length.
350
+ sample_style (str): one of "range", "choice", "range_by_clip" or "choice_by_clip".
351
+ """
352
+ super().__init__()
353
+ assert sample_style in ["range", "choice", "range_by_clip", "choice_by_clip"], sample_style
354
+
355
+ self.is_range = ("range" in sample_style)
356
+ if isinstance(short_edge_length, int):
357
+ short_edge_length = (short_edge_length, short_edge_length)
358
+ if self.is_range:
359
+ assert len(short_edge_length) == 2, (
360
+ "short_edge_length must be two values using 'range' sample style."
361
+ f" Got {short_edge_length}!"
362
+ )
363
+ self._cnt = 0
364
+ self._init(locals())
365
+
366
+ def get_transform(self, image):
367
+ if self._cnt % self.clip_frame_cnt == 0:
368
+ if self.is_range:
369
+ self.size = np.random.randint(self.short_edge_length[0], self.short_edge_length[1] + 1)
370
+ else:
371
+ self.size = np.random.choice(self.short_edge_length)
372
+ self._cnt = 0 # avoiding overflow
373
+
374
+ if self.size == 0:
375
+ return NoOpTransform()
376
+ self._cnt += 1
377
+
378
+ h, w = image.shape[:2]
379
+
380
+ scale = self.size * 1.0 / min(h, w)
381
+ if h < w:
382
+ newh, neww = self.size, scale * w
383
+ else:
384
+ newh, neww = scale * h, self.size
385
+ if max(newh, neww) > self.max_size:
386
+ scale = self.max_size * 1.0 / max(newh, neww)
387
+ newh = newh * scale
388
+ neww = neww * scale
389
+ neww = int(neww + 0.5)
390
+ newh = int(newh + 0.5)
391
+ return T.ResizeTransform(h, w, newh, neww, self.interp)
392
+
393
+
394
+ class RandomFlipClip(T.Augmentation):
395
+ """
396
+ Flip the image horizontally or vertically with the given probability.
397
+ """
398
+
399
+ def __init__(self, prob=0.5, *, horizontal=True, vertical=False, clip_frame_cnt=1):
400
+ """
401
+ Args:
402
+ prob (float): probability of flip.
403
+ horizontal (boolean): whether to apply horizontal flipping
404
+ vertical (boolean): whether to apply vertical flipping
405
+ """
406
+ super().__init__()
407
+
408
+ if horizontal and vertical:
409
+ raise ValueError("Cannot do both horiz and vert. Please use two Flip instead.")
410
+ if not horizontal and not vertical:
411
+ raise ValueError("At least one of horiz or vert has to be True!")
412
+ self._cnt = 0
413
+
414
+ self._init(locals())
415
+
416
+ def get_transform(self, image):
417
+ if self._cnt % self.clip_frame_cnt == 0:
418
+ self.do = self._rand_range() < self.prob
419
+ self._cnt = 0 # avoiding overflow
420
+ self._cnt += 1
421
+
422
+ h, w = image.shape[:2]
423
+
424
+ if self.do:
425
+ if self.horizontal:
426
+ return HFlipTransform(w)
427
+ elif self.vertical:
428
+ return VFlipTransform(h)
429
+ else:
430
+ return NoOpTransform()
431
+
432
+
433
+ def build_augmentation(cfg, is_train):
434
+ logger = logging.getLogger(__name__)
435
+ aug_list = []
436
+ if is_train:
437
+ use_lsj = cfg.INPUT.LSJ_AUG.ENABLED
438
+ if use_lsj:
439
+ image_size = cfg.INPUT.LSJ_AUG.IMAGE_SIZE
440
+ min_scale = cfg.INPUT.LSJ_AUG.MIN_SCALE
441
+ max_scale = cfg.INPUT.LSJ_AUG.MAX_SCALE
442
+
443
+ if cfg.INPUT.RANDOM_FLIP != "none":
444
+ if cfg.INPUT.RANDOM_FLIP == "flip_by_clip":
445
+ flip_clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM
446
+ else:
447
+ flip_clip_frame_cnt = 1
448
+
449
+ aug_list.append(
450
+ # NOTE using RandomFlip modified for the support of flip maintenance
451
+ RandomFlipClip(
452
+ horizontal=(cfg.INPUT.RANDOM_FLIP == "horizontal") or (cfg.INPUT.RANDOM_FLIP == "flip_by_clip"),
453
+ vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
454
+ clip_frame_cnt=flip_clip_frame_cnt,
455
+ )
456
+ )
457
+
458
+ aug_list.extend([
459
+ T.ResizeScale(
460
+ min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
461
+ ),
462
+ T.FixedSizeCrop(crop_size=(image_size, image_size)),
463
+ ])
464
+
465
+ else:
466
+ min_size = cfg.INPUT.MIN_SIZE_TRAIN
467
+ max_size = cfg.INPUT.MAX_SIZE_TRAIN
468
+ sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
469
+ clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM if "by_clip" in cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING else 1
470
+
471
+ # Crop
472
+ if cfg.INPUT.CROP.ENABLED:
473
+ crop_aug = RandomApplyClip(
474
+ T.AugmentationList([
475
+ ResizeShortestEdgeClip([400, 500, 600], 1333, sample_style, clip_frame_cnt=clip_frame_cnt),
476
+ RandomCropClip(cfg.INPUT.PSEUDO.CROP.TYPE, cfg.INPUT.PSEUDO.CROP.SIZE, clip_frame_cnt=clip_frame_cnt)
477
+ ]),
478
+ clip_frame_cnt=clip_frame_cnt
479
+ )
480
+ aug_list.append(crop_aug)
481
+
482
+ # Resize
483
+ aug_list.append(ResizeShortestEdgeClip(min_size, max_size, sample_style, clip_frame_cnt=clip_frame_cnt))
484
+
485
+ # Flip
486
+ if cfg.INPUT.RANDOM_FLIP != "none":
487
+ if cfg.INPUT.RANDOM_FLIP == "flip_by_clip":
488
+ flip_clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM
489
+ else:
490
+ flip_clip_frame_cnt = 1
491
+
492
+ aug_list.append(
493
+ # NOTE using RandomFlip modified for the support of flip maintenance
494
+ RandomFlipClip(
495
+ horizontal=(cfg.INPUT.RANDOM_FLIP == "horizontal") or (cfg.INPUT.RANDOM_FLIP == "flip_by_clip"),
496
+ vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
497
+ clip_frame_cnt=flip_clip_frame_cnt,
498
+ )
499
+ )
500
+
501
+ # Additional augmentations : brightness, contrast, saturation, rotation
502
+ augmentations = cfg.INPUT.AUGMENTATIONS
503
+ if "brightness" in augmentations:
504
+ aug_list.append(T.RandomBrightness(0.9, 1.1))
505
+ if "contrast" in augmentations:
506
+ aug_list.append(T.RandomContrast(0.9, 1.1))
507
+ if "saturation" in augmentations:
508
+ aug_list.append(T.RandomSaturation(0.9, 1.1))
509
+ if "rotation" in augmentations:
510
+ aug_list.append(
511
+ T.RandomRotation(
512
+ [-15, 15], expand=False, center=[(0.4, 0.4), (0.6, 0.6)], sample_style="range"
513
+ )
514
+ )
515
+ else:
516
+ # Resize
517
+ min_size = cfg.INPUT.MIN_SIZE_TEST
518
+ max_size = cfg.INPUT.MAX_SIZE_TEST
519
+ sample_style = "choice"
520
+ aug_list.append(T.ResizeShortestEdge(min_size, max_size, sample_style))
521
+
522
+ return aug_list
523
+
524
+
525
+ def build_pseudo_augmentation(cfg, is_train):
526
+ logger = logging.getLogger(__name__)
527
+ aug_list = []
528
+ if is_train:
529
+ use_lsj = cfg.INPUT.LSJ_AUG.ENABLED
530
+ if use_lsj:
531
+ image_size = cfg.INPUT.LSJ_AUG.IMAGE_SIZE
532
+ min_scale = cfg.INPUT.LSJ_AUG.MIN_SCALE
533
+ max_scale = cfg.INPUT.LSJ_AUG.MAX_SCALE
534
+
535
+ if cfg.INPUT.RANDOM_FLIP != "none":
536
+ if cfg.INPUT.RANDOM_FLIP == "flip_by_clip":
537
+ clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM
538
+ else:
539
+ clip_frame_cnt = 1
540
+
541
+ aug_list.append(
542
+ # NOTE using RandomFlip modified for the support of flip maintenance
543
+ RandomFlipClip(
544
+ horizontal=(cfg.INPUT.RANDOM_FLIP == "horizontal") or (cfg.INPUT.RANDOM_FLIP == "flip_by_clip"),
545
+ vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
546
+ clip_frame_cnt=clip_frame_cnt,
547
+ )
548
+ )
549
+
550
+ # Additional augmentations : brightness, contrast, saturation, rotation
551
+ augmentations = cfg.INPUT.PSEUDO.AUGMENTATIONS
552
+ if "brightness" in augmentations:
553
+ aug_list.append(T.RandomBrightness(0.9, 1.1))
554
+ if "contrast" in augmentations:
555
+ aug_list.append(T.RandomContrast(0.9, 1.1))
556
+ if "saturation" in augmentations:
557
+ aug_list.append(T.RandomSaturation(0.9, 1.1))
558
+ if "rotation" in augmentations:
559
+ aug_list.append(
560
+ RandomRotationClip(
561
+ [-15, 15], expand=False, center=[(0.4, 0.4), (0.6, 0.6)], clip_frame_cnt=clip_frame_cnt,
562
+ )
563
+ )
564
+
565
+ aug_list.extend([
566
+ ResizeScaleClip(
567
+ min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size,
568
+ clip_frame_cnt=clip_frame_cnt,
569
+ ),
570
+ FixedSizeCropClip(crop_size=(image_size, image_size), clip_frame_cnt=clip_frame_cnt),
571
+ ])
572
+ else:
573
+ min_size = cfg.INPUT.PSEUDO.MIN_SIZE_TRAIN
574
+ max_size = cfg.INPUT.PSEUDO.MAX_SIZE_TRAIN
575
+ sample_style = cfg.INPUT.PSEUDO.MIN_SIZE_TRAIN_SAMPLING
576
+ clip_frame_cnt = cfg.INPUT.SAMPLING_FRAME_NUM
577
+
578
+ # Crop
579
+ if cfg.INPUT.PSEUDO.CROP.ENABLED:
580
+ crop_aug = RandomApplyClip(
581
+ T.AugmentationList([
582
+ ResizeShortestEdgeClip([400, 500, 600], 1333, sample_style, clip_frame_cnt=clip_frame_cnt),
583
+ RandomCropClip(cfg.INPUT.PSEUDO.CROP.TYPE, cfg.INPUT.PSEUDO.CROP.SIZE, clip_frame_cnt=clip_frame_cnt)
584
+ ]),
585
+ clip_frame_cnt=clip_frame_cnt
586
+ )
587
+ aug_list.append(crop_aug)
588
+
589
+ # Resize
590
+ aug_list.append(ResizeShortestEdgeClip(min_size, max_size, sample_style, clip_frame_cnt=clip_frame_cnt))
591
+
592
+ # Flip
593
+ aug_list.append(
594
+ # NOTE using RandomFlip modified for the support of flip maintenance
595
+ RandomFlipClip(
596
+ horizontal=(cfg.INPUT.RANDOM_FLIP == "horizontal") or (cfg.INPUT.RANDOM_FLIP == "flip_by_clip"),
597
+ vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
598
+ clip_frame_cnt=clip_frame_cnt,
599
+ )
600
+ )
601
+
602
+ # Additional augmentations : brightness, contrast, saturation, rotation
603
+ augmentations = cfg.INPUT.PSEUDO.AUGMENTATIONS
604
+ if "brightness" in augmentations:
605
+ aug_list.append(T.RandomBrightness(0.9, 1.1))
606
+ if "contrast" in augmentations:
607
+ aug_list.append(T.RandomContrast(0.9, 1.1))
608
+ if "saturation" in augmentations:
609
+ aug_list.append(T.RandomSaturation(0.9, 1.1))
610
+ if "rotation" in augmentations:
611
+ aug_list.append(
612
+ RandomRotationClip(
613
+ [-15, 15], expand=False, center=[(0.4, 0.4), (0.6, 0.6)], clip_frame_cnt=clip_frame_cnt,
614
+ )
615
+ )
616
+ else:
617
+ # Resize
618
+ min_size = cfg.INPUT.MIN_SIZE_TEST
619
+ max_size = cfg.INPUT.MAX_SIZE_TEST
620
+ sample_style = "choice"
621
+ aug_list.append(T.ResizeShortestEdge(min_size, max_size, sample_style))
622
+
623
+ return aug_list
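Usage sketch (illustrative, not definitive): the *_Clip augmentations above keep one random draw alive for `clip_frame_cnt` consecutive calls, which is what makes every frame of a clip receive the same geometric decision. The snippet below assumes detectron2 and numpy are installed, that these classes are importable from `avism.data.augmentation`, and uses made-up frame sizes and clip length.

    import numpy as np
    from detectron2.data import transforms as T
    from avism.data.augmentation import ResizeShortestEdgeClip, RandomFlipClip

    clip_len = 3  # illustrative clip length
    frames = [np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8) for _ in range(clip_len)]

    augs = T.AugmentationList([
        # "range_by_clip": the shortest-edge size is drawn once and reused for the whole clip
        ResizeShortestEdgeClip([360, 480], 768, "range_by_clip", clip_frame_cnt=clip_len),
        RandomFlipClip(prob=0.5, horizontal=True, clip_frame_cnt=clip_len),
    ])

    out_frames = []
    for frame in frames:
        aug_input = T.AugInput(frame)
        augs(aug_input)                    # same flip / resize decision across the clip
        out_frames.append(aug_input.image)

This presumably mirrors how the list returned by build_augmentation is applied frame by frame in the dataset mapper.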
avism/data/avis_eval.py ADDED
@@ -0,0 +1,203 @@
1
+ import os
2
+ import io
3
+ import copy
4
+ import json
5
+ import logging
6
+ import contextlib
7
+ from collections import OrderedDict
8
+
9
+ import numpy as np
10
+ import torch
11
+
12
+ import pycocotools.mask as mask_util
13
+ from multiprocessing import freeze_support
15
+ from detectron2.data import MetadataCatalog
16
+ from detectron2.utils.file_io import PathManager
17
+ from detectron2.evaluation import DatasetEvaluator
18
+
19
+ from .datasets.avis_api.avos import AVOS
20
+
21
+ import sys
22
+ sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
23
+ import aviseval
24
+
25
+
26
+ def eval_track(out_dir, gt_file):
27
+ freeze_support()
28
+
29
+ # Command line interface:
30
+ default_eval_config = aviseval.Evaluator.get_default_eval_config()
31
+ default_dataset_config = aviseval.datasets.AVIS.get_default_dataset_config()
32
+ default_dataset_config['TRACKERS_FOLDER'] = out_dir
33
+ default_dataset_config['GT_File'] = gt_file
34
+ default_metrics_config = {'METRICS': ['TrackMAP', 'HOTA']} # 'CLEAR', 'Identity'
35
+ config = {**default_eval_config, **default_dataset_config, **default_metrics_config} # Merge default configs
36
+ eval_config = {k: v for k, v in config.items() if k in default_eval_config.keys()}
37
+ dataset_config = {k: v for k, v in config.items() if k in default_dataset_config.keys()}
38
+ metrics_config = {k: v for k, v in config.items() if k in default_metrics_config.keys()}
39
+
40
+ # Run code
41
+ evaluator = aviseval.Evaluator(eval_config)
42
+ dataset_list = [aviseval.datasets.AVIS(dataset_config)]
43
+ metrics_list = []
44
+ for metric in [aviseval.metrics.TrackMAP, aviseval.metrics.HOTA]:
45
+ if metric.get_name() in metrics_config['METRICS']:
46
+ if metric == aviseval.metrics.TrackMAP:
47
+ default_track_map_config = metric.get_default_metric_config()
48
+ default_track_map_config['USE_TIME_RANGES'] = False
49
+ default_track_map_config['AREA_RANGES'] = [[0 ** 2, 128 ** 2],
50
+ [128 ** 2, 256 ** 2],
51
+ [256 ** 2, 1e5 ** 2]]
52
+ metrics_list.append(metric(default_track_map_config))
53
+ else:
54
+ metrics_list.append(metric())
55
+ if len(metrics_list) == 0:
56
+ raise Exception('No metrics selected for evaluation')
57
+
58
+ output_res, output_msg = evaluator.evaluate(dataset_list, metrics_list)
59
+
60
+ return output_res
61
+
62
+
63
+ def instances_to_coco_json_video(inputs, outputs):
64
+ """
+ Convert the predictions for one video into the COCO/AVIS-style json entries used for evaluation.
+
+ Args:
+     inputs (list[dict]): a single-element list with the dataset dict of the video
+         (must provide "video_id" and "length").
+     outputs (dict): model outputs with keys "pred_scores", "pred_labels" and "pred_masks".
+
+ Returns:
+     list[dict]: list of json annotations in AVIS (video COCO) format.
+ """
74
+ assert len(inputs) == 1, "More than one input was loaded for inference!"
75
+
76
+ video_id = inputs[0]["video_id"]
77
+ video_length = inputs[0]["length"]
78
+
79
+ scores = outputs["pred_scores"]
80
+ labels = outputs["pred_labels"]
81
+ masks = outputs["pred_masks"]
82
+
83
+ avis_results = []
84
+ for instance_id, (s, l, m) in enumerate(zip(scores, labels, masks)):
85
+ segms = [
86
+ mask_util.encode(np.array(_mask[:, :, None], order="F", dtype="uint8"))[0]
87
+ for _mask in m
88
+ ]
89
+ for rle in segms:
90
+ rle["counts"] = rle["counts"].decode("utf-8")
91
+
92
+ res = {
93
+ "video_id": video_id,
94
+ "score": s,
95
+ "category_id": l,
96
+ "segmentations": segms,
97
+ }
98
+ avis_results.append(res)
99
+
100
+ return avis_results
101
+
102
+
103
+
104
+ class AVISEvaluator(DatasetEvaluator):
105
+ def __init__(
106
+ self,
107
+ dataset_name,
108
+ tasks=None,
109
+ distributed=True,
110
+ output_dir=None,
111
+ *,
112
+ use_fast_impl=True,
113
+ ):
114
+ self._logger = logging.getLogger(__name__)
115
+ self._distributed = distributed
116
+ self._output_dir = output_dir
117
+ self._use_fast_impl = use_fast_impl
118
+
119
+ self._cpu_device = torch.device("cpu")
120
+
121
+ self.dataset_name = dataset_name
122
+ self._metadata = MetadataCatalog.get(dataset_name)
123
+
124
+ json_file = PathManager.get_local_path(self._metadata.json_file)
125
+ with contextlib.redirect_stdout(io.StringIO()):
126
+ self._avis_api = AVOS(json_file)
127
+
128
+ self._do_evaluation = "annotations" in self._avis_api.dataset
129
+
130
+
131
+ def reset(self):
132
+ self._predictions = []
133
+
134
+
135
+ def process(self, inputs, outputs):
136
+ """
137
+ Args:
138
+ inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
139
+ It is a list of dict. Each dict corresponds to an image and
140
+ contains keys like "height", "width", "file_name", "image_id".
141
+ outputs: the outputs of a COCO model. It is a list of dicts with key
142
+ "instances" that contains :class:`Instances`.
143
+ """
144
+ prediction = instances_to_coco_json_video(inputs, outputs)
145
+ self._predictions.extend(prediction)
146
+
147
+
148
+ def evaluate(self):
149
+ """
150
+ Args:
151
+ img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset
152
+ """
153
+
154
+ predictions = self._predictions
155
+
156
+ self._results = OrderedDict()
157
+ self._eval_predictions(predictions)
158
+ # Copy so the caller can do whatever with results
159
+ return copy.deepcopy(self._results)
160
+
161
+
162
+ def _eval_predictions(self, predictions):
163
+ """
164
+ Evaluate predictions. Fill self._results with the metrics of the tasks.
165
+ """
166
+ self._logger.info("Preparing results for AVIS format ...")
167
+
168
+ # unmap the category ids for COCO
169
+ if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
170
+ dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id
171
+
172
+ all_contiguous_ids = list(dataset_id_to_contiguous_id.values())
173
+ num_classes = len(all_contiguous_ids)
174
+ assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1
175
+
176
+ reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()}
177
+ for result in predictions:
178
+ category_id = result["category_id"]
179
+ assert category_id < num_classes, (
180
+ f"A prediction has class={category_id}, "
181
+ f"but the dataset only has {num_classes} classes and "
182
+ f"predicted class id should be in [0, {num_classes - 1}]."
183
+ )
184
+ result["category_id"] = reverse_id_mapping[category_id]
185
+
186
+ o_d = None
187
+ if self._output_dir:
188
+ o_d = os.path.join(self._output_dir, "results")
189
+ os.makedirs(os.path.join(o_d, "model_final"), exist_ok=True)
190
+ file_path = os.path.join(o_d, "model_final", "results.json")
191
+
192
+ self._logger.info("Saving results to {}".format(file_path))
193
+ with PathManager.open(file_path, "w") as f:
194
+ f.write(json.dumps(predictions))
195
+ f.flush()
196
+
197
+ if not self._do_evaluation:
198
+ self._logger.info("Annotations are not available for evaluation.")
199
+ return
200
+
201
+ assert o_d is not None
202
+ output_res = eval_track(o_d, "test.json")
203
+ self._results["segm"] = output_res['AVIS']['model_final']
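Usage sketch (illustrative): eval_track treats out_dir as a TrackEval-style trackers folder, i.e. one sub-directory per tracker holding a single json of predictions, which is exactly the results/model_final/results.json layout written by _eval_predictions above. The paths below are placeholders and real RLE-encoded predictions are needed for the scores to be meaningful.

    import json, os
    from avism.data.avis_eval import eval_track

    out_dir = "./outputs/avism_R50_IN/inference/results"             # placeholder trackers folder
    pred_file = os.path.join(out_dir, "model_final", "results.json")
    os.makedirs(os.path.dirname(pred_file), exist_ok=True)

    predictions = []  # normally the list built by instances_to_coco_json_video(...)
    with open(pred_file, "w") as f:
        json.dump(predictions, f)

    # "test.json" is resolved against the dataset's GT_FOLDER ("./datasets/" by default)
    output_res = eval_track(out_dir, "test.json")
    print(output_res["AVIS"]["model_final"])  # per-metric results for this tracker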
avism/data/aviseval/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from .eval import Evaluator
+ from . import datasets
+ from . import metrics
+ from . import plotting
+ from . import utils
avism/data/aviseval/_timing.py ADDED
@@ -0,0 +1,65 @@
1
+ from functools import wraps
2
+ from time import perf_counter
3
+ import inspect
4
+
5
+ DO_TIMING = False
6
+ DISPLAY_LESS_PROGRESS = False
7
+ timer_dict = {}
8
+ counter = 0
9
+
10
+
11
+ def time(f):
12
+ @wraps(f)
13
+ def wrap(*args, **kw):
14
+ if DO_TIMING:
15
+ # Run function with timing
16
+ ts = perf_counter()
17
+ result = f(*args, **kw)
18
+ te = perf_counter()
19
+ tt = te-ts
20
+
21
+ # Get function name
22
+ arg_names = inspect.getfullargspec(f)[0]
23
+ if arg_names[0] == 'self' and DISPLAY_LESS_PROGRESS:
24
+ return result
25
+ elif arg_names[0] == 'self':
26
+ method_name = type(args[0]).__name__ + '.' + f.__name__
27
+ else:
28
+ method_name = f.__name__
29
+
30
+ # Record accumulative time in each function for analysis
31
+ if method_name in timer_dict.keys():
32
+ timer_dict[method_name] += tt
33
+ else:
34
+ timer_dict[method_name] = tt
35
+
36
+ # If code is finished, display timing summary
37
+ if method_name == "Evaluator.evaluate":
38
+ print("")
39
+ print("Timing analysis:")
40
+ for key, value in timer_dict.items():
41
+ print('%-70s %2.4f sec' % (key, value))
42
+ else:
43
+ # Get function argument values for printing special arguments of interest
44
+ arg_titles = ['tracker', 'seq', 'cls']
45
+ arg_vals = []
46
+ for i, a in enumerate(arg_names):
47
+ if a in arg_titles:
48
+ arg_vals.append(args[i])
49
+ arg_text = '(' + ', '.join(arg_vals) + ')'
50
+
51
+ # Display methods and functions with different indentation.
52
+ if arg_names[0] == 'self':
53
+ print('%-74s %2.4f sec' % (' '*4 + method_name + arg_text, tt))
54
+ elif arg_names[0] == 'test':
55
+ pass
56
+ else:
57
+ global counter
58
+ counter += 1
59
+ print('%i %-70s %2.4f sec' % (counter, method_name + arg_text, tt))
60
+
61
+ return result
62
+ else:
63
+ # If config["TIME_PROGRESS"] is false, or config["USE_PARALLEL"] is true, run functions normally without timing.
64
+ return f(*args, **kw)
65
+ return wrap
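Usage sketch (illustrative): the decorator is a no-op unless the module-level DO_TIMING flag is switched on (per the comment above, the evaluator only enables it when TIME_PROGRESS is true and USE_PARALLEL is false). The snippet assumes the module is importable as avism.data.aviseval._timing.

    from avism.data.aviseval import _timing

    _timing.DO_TIMING = True  # normally controlled by the evaluator's config

    @_timing.time
    def load_sequence(seq):
        return sum(i * i for i in range(100_000))  # stand-in for real work

    load_sequence("video_0001")
    # prints a line like "1 load_sequence(video_0001)  0.0xx sec", because "seq"
    # is one of the argument names ('tracker', 'seq', 'cls') the wrapper displays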
avism/data/aviseval/datasets/__init__.py ADDED
@@ -0,0 +1 @@
+ from .avis import AVIS
avism/data/aviseval/datasets/_base_dataset.py ADDED
@@ -0,0 +1,326 @@
1
+ import csv
2
+ import io
3
+ import zipfile
4
+ import os
5
+ import traceback
6
+ import numpy as np
7
+ from copy import deepcopy
8
+ from abc import ABC, abstractmethod
9
+ from .. import _timing
10
+ from ..utils import TrackEvalException
11
+
12
+
13
+ class _BaseDataset(ABC):
14
+ @abstractmethod
15
+ def __init__(self):
16
+ self.tracker_list = None
17
+ self.seq_list = None
18
+ self.class_list = None
19
+ self.output_fol = None
20
+ self.output_sub_fol = None
21
+ self.should_classes_combine = True
22
+ self.use_super_categories = False
23
+
24
+ # Functions to implement:
25
+
26
+ @staticmethod
27
+ @abstractmethod
28
+ def get_default_dataset_config():
29
+ ...
30
+
31
+ @abstractmethod
32
+ def _load_raw_file(self, tracker, seq, is_gt):
33
+ ...
34
+
35
+ @_timing.time
36
+ @abstractmethod
37
+ def get_preprocessed_seq_data(self, raw_data, cls):
38
+ ...
39
+
40
+ @abstractmethod
41
+ def _calculate_similarities(self, gt_dets_t, tracker_dets_t):
42
+ ...
43
+
44
+ # Helper functions for all datasets:
45
+
46
+ @classmethod
47
+ def get_class_name(cls):
48
+ return cls.__name__
49
+
50
+ def get_name(self):
51
+ return self.get_class_name()
52
+
53
+ def get_output_fol(self, tracker):
54
+ return os.path.join(self.output_fol, tracker, self.output_sub_fol)
55
+
56
+ def get_display_name(self, tracker):
57
+ """ Can be overwritten if the trackers name (in files) is different to how it should be displayed.
58
+ By default this method just returns the trackers name as is.
59
+ """
60
+ return tracker
61
+
62
+ def get_eval_info(self):
63
+ """Return info about the dataset needed for the Evaluator"""
64
+ return self.tracker_list, self.seq_list, self.class_list
65
+
66
+ @_timing.time
67
+ def get_raw_seq_data(self, tracker, seq):
68
+ """ Loads raw data (tracker and ground-truth) for a single tracker on a single sequence.
69
+ Raw data includes all of the information needed for both preprocessing and evaluation, for all classes.
70
+ A later function (get_processed_seq_data) will perform such preprocessing and extract relevant information for
71
+ the evaluation of each class.
72
+
73
+ This returns a dict which contains the fields:
74
+ [num_timesteps]: integer
75
+ [gt_ids, tracker_ids, gt_classes, tracker_classes, tracker_confidences]:
76
+ list (for each timestep) of 1D NDArrays (for each det).
77
+ [gt_dets, tracker_dets, gt_crowd_ignore_regions]: list (for each timestep) of lists of detections.
78
+ [similarity_scores]: list (for each timestep) of 2D NDArrays.
79
+ [gt_extras]: dict (for each extra) of lists (for each timestep) of 1D NDArrays (for each det).
80
+
81
+ gt_extras contains dataset specific information used for preprocessing such as occlusion and truncation levels.
82
+
83
+ Note that similarities are extracted as part of the dataset and not the metric, because almost all metrics are
84
+ independent of the exact method of calculating the similarity. However datasets are not (e.g. segmentation
85
+ masks vs 2D boxes vs 3D boxes).
86
+ We calculate the similarity before preprocessing because often both preprocessing and evaluation require it and
87
+ we don't wish to calculate this twice.
88
+ We calculate similarity between all gt and tracker classes (not just each class individually) to allow for
89
+ calculation of metrics such as class confusion matrices. Typically the impact of this on performance is low.
90
+ """
91
+ # Load raw data.
92
+ raw_gt_data = self._load_raw_file(tracker, seq, is_gt=True)
93
+ raw_tracker_data = self._load_raw_file(tracker, seq, is_gt=False)
94
+ raw_data = {**raw_tracker_data, **raw_gt_data} # Merges dictionaries
95
+
96
+ # Calculate similarities for each timestep.
97
+ similarity_scores = []
98
+ for t, (gt_dets_t, tracker_dets_t) in enumerate(zip(raw_data['gt_dets'], raw_data['tracker_dets'])):
99
+ ious = self._calculate_similarities(gt_dets_t, tracker_dets_t)
100
+ similarity_scores.append(ious)
101
+ raw_data['similarity_scores'] = similarity_scores
102
+ return raw_data
103
+
104
+ @staticmethod
105
+ def _load_simple_text_file(file, time_col=0, id_col=None, remove_negative_ids=False, valid_filter=None,
106
+ crowd_ignore_filter=None, convert_filter=None, is_zipped=False, zip_file=None,
107
+ force_delimiters=None):
108
+ """ Function that loads data which is in a commonly used text file format.
109
+ Assumes each det is given by one row of a text file.
110
+ There is no limit to the number or meaning of each column,
111
+ however one column needs to give the timestep of each det (time_col) which is default col 0.
112
+
113
+ The file dialect (deliminator, num cols, etc) is determined automatically.
114
+ This function automatically separates dets by timestep,
115
+ and is much faster than alternatives such as np.loadtext or pandas.
116
+
117
+ If remove_negative_ids is True and id_col is not None, dets with negative values in id_col are excluded.
118
+ These are not excluded from ignore data.
119
+
120
+ valid_filter can be used to only include certain classes.
121
+ It is a dict with ints as keys, and lists as values,
122
+ such that a row is included if "row[key].lower() is in value" for all key/value pairs in the dict.
123
+ If None, all classes are included.
124
+
125
+ crowd_ignore_filter can be used to read crowd_ignore regions separately. It has the same format as valid filter.
126
+
127
+ convert_filter can be used to convert value read to another format.
128
+ This is used most commonly to convert classes given as string to a class id.
129
+ This is a dict such that the key is the column to convert, and the value is another dict giving the mapping.
130
+
131
+ Optionally, input files could be a zip of multiple text files for storage efficiency.
132
+
133
+ Returns read_data and ignore_data.
134
+ Each is a dict (with keys as timesteps as strings) of lists (over dets) of lists (over column values).
135
+ Note that all data is returned as strings, and must be converted to float/int later if needed.
136
+ Note that timesteps will not be present in the returned dict keys if there are no dets for them
137
+ """
138
+
139
+ if remove_negative_ids and id_col is None:
140
+ raise TrackEvalException('remove_negative_ids is True, but id_col is not given.')
141
+ if crowd_ignore_filter is None:
142
+ crowd_ignore_filter = {}
143
+ if convert_filter is None:
144
+ convert_filter = {}
145
+ try:
146
+ if is_zipped: # Either open file directly or within a zip.
147
+ if zip_file is None:
148
+ raise TrackEvalException('is_zipped set to True, but no zip_file is given.')
149
+ archive = zipfile.ZipFile(os.path.join(zip_file), 'r')
150
+ fp = io.TextIOWrapper(archive.open(file, 'r'))
151
+ else:
152
+ fp = open(file)
153
+ read_data = {}
154
+ crowd_ignore_data = {}
155
+ fp.seek(0, os.SEEK_END)
156
+ # check if file is empty
157
+ if fp.tell():
158
+ fp.seek(0)
159
+ dialect = csv.Sniffer().sniff(fp.readline(), delimiters=force_delimiters) # Auto determine structure.
160
+ dialect.skipinitialspace = True # Deal with extra spaces between columns
161
+ fp.seek(0)
162
+ reader = csv.reader(fp, dialect)
163
+ for row in reader:
164
+ try:
165
+ # Deal with extra trailing spaces at the end of rows
166
+ if row[-1] in '':
167
+ row = row[:-1]
168
+ timestep = str(int(float(row[time_col])))
169
+ # Read ignore regions separately.
170
+ is_ignored = False
171
+ for ignore_key, ignore_value in crowd_ignore_filter.items():
172
+ if row[ignore_key].lower() in ignore_value:
173
+ # Convert values in one column (e.g. string to id)
174
+ for convert_key, convert_value in convert_filter.items():
175
+ row[convert_key] = convert_value[row[convert_key].lower()]
176
+ # Save data separated by timestep.
177
+ if timestep in crowd_ignore_data.keys():
178
+ crowd_ignore_data[timestep].append(row)
179
+ else:
180
+ crowd_ignore_data[timestep] = [row]
181
+ is_ignored = True
182
+ if is_ignored: # if det is an ignore region, it cannot be a normal det.
183
+ continue
184
+ # Exclude some dets if not valid.
185
+ if valid_filter is not None:
186
+ for key, value in valid_filter.items():
187
+ if row[key].lower() not in value:
188
+ continue
189
+ if remove_negative_ids:
190
+ if int(float(row[id_col])) < 0:
191
+ continue
192
+ # Convert values in one column (e.g. string to id)
193
+ for convert_key, convert_value in convert_filter.items():
194
+ row[convert_key] = convert_value[row[convert_key].lower()]
195
+ # Save data separated by timestep.
196
+ if timestep in read_data.keys():
197
+ read_data[timestep].append(row)
198
+ else:
199
+ read_data[timestep] = [row]
200
+ except Exception:
201
+ exc_str_init = 'In file %s the following line cannot be read correctly: \n' % os.path.basename(
202
+ file)
203
+ exc_str = ' '.join([exc_str_init]+row)
204
+ raise TrackEvalException(exc_str)
205
+ fp.close()
206
+ except Exception:
207
+ print('Error loading file: %s, printing traceback.' % file)
208
+ traceback.print_exc()
209
+ raise TrackEvalException(
210
+ 'File %s cannot be read because it is either not present or invalidly formatted' % os.path.basename(
211
+ file))
212
+ return read_data, crowd_ignore_data
213
+
214
+ @staticmethod
215
+ def _calculate_mask_ious(masks1, masks2, is_encoded=False, do_ioa=False):
216
+ """ Calculates the IOU (intersection over union) between two arrays of segmentation masks.
217
+ If is_encoded a run length encoding with pycocotools is assumed as input format, otherwise an input of numpy
218
+ arrays of the shape (num_masks, height, width) is assumed and the encoding is performed.
219
+ If do_ioa (intersection over area) , then calculates the intersection over the area of masks1 - this is commonly
220
+ used to determine if detections are within crowd ignore region.
221
+ :param masks1: first set of masks (numpy array of shape (num_masks, height, width) if not encoded,
222
+ else pycocotools rle encoded format)
223
+ :param masks2: second set of masks (numpy array of shape (num_masks, height, width) if not encoded,
224
+ else pycocotools rle encoded format)
225
+ :param is_encoded: whether the input is in pycocotools rle encoded format
226
+ :param do_ioa: whether to perform IoA computation
227
+ :return: the IoU/IoA scores
228
+ """
229
+
230
+ # Only loaded when run to reduce minimum requirements
231
+ from pycocotools import mask as mask_utils
232
+
233
+ # use pycocotools for run length encoding of masks
234
+ if not is_encoded:
235
+ masks1 = mask_utils.encode(np.array(np.transpose(masks1, (1, 2, 0)), order='F'))
236
+ masks2 = mask_utils.encode(np.array(np.transpose(masks2, (1, 2, 0)), order='F'))
237
+
238
+ # use pycocotools for iou computation of rle encoded masks
239
+ ious = mask_utils.iou(masks1, masks2, [do_ioa]*len(masks2))
240
+ if len(masks1) == 0 or len(masks2) == 0:
241
+ ious = np.asarray(ious).reshape(len(masks1), len(masks2))
242
+ assert (ious >= 0 - np.finfo('float').eps).all()
243
+ assert (ious <= 1 + np.finfo('float').eps).all()
244
+
245
+ return ious
246
+
247
+ @staticmethod
248
+ def _calculate_box_ious(bboxes1, bboxes2, box_format='xywh', do_ioa=False):
249
+ """ Calculates the IOU (intersection over union) between two arrays of boxes.
250
+ Allows variable box formats ('xywh' and 'x0y0x1y1').
251
+ If do_ioa (intersection over area) , then calculates the intersection over the area of boxes1 - this is commonly
252
+ used to determine if detections are within crowd ignore region.
253
+ """
254
+ if box_format in 'xywh':
255
+ # layout: (x0, y0, w, h)
256
+ bboxes1 = deepcopy(bboxes1)
257
+ bboxes2 = deepcopy(bboxes2)
258
+
259
+ bboxes1[:, 2] = bboxes1[:, 0] + bboxes1[:, 2]
260
+ bboxes1[:, 3] = bboxes1[:, 1] + bboxes1[:, 3]
261
+ bboxes2[:, 2] = bboxes2[:, 0] + bboxes2[:, 2]
262
+ bboxes2[:, 3] = bboxes2[:, 1] + bboxes2[:, 3]
263
+ elif box_format not in 'x0y0x1y1':
264
+ raise (TrackEvalException('box_format %s is not implemented' % box_format))
265
+
266
+ # layout: (x0, y0, x1, y1)
267
+ min_ = np.minimum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :])
268
+ max_ = np.maximum(bboxes1[:, np.newaxis, :], bboxes2[np.newaxis, :, :])
269
+ intersection = np.maximum(min_[..., 2] - max_[..., 0], 0) * np.maximum(min_[..., 3] - max_[..., 1], 0)
270
+ area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (bboxes1[..., 3] - bboxes1[..., 1])
271
+
272
+ if do_ioa:
273
+ ioas = np.zeros_like(intersection)
274
+ valid_mask = area1 > 0 + np.finfo('float').eps
275
+ ioas[valid_mask, :] = intersection[valid_mask, :] / area1[valid_mask][:, np.newaxis]
276
+
277
+ return ioas
278
+ else:
279
+ area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (bboxes2[..., 3] - bboxes2[..., 1])
280
+ union = area1[:, np.newaxis] + area2[np.newaxis, :] - intersection
281
+ intersection[area1 <= 0 + np.finfo('float').eps, :] = 0
282
+ intersection[:, area2 <= 0 + np.finfo('float').eps] = 0
283
+ intersection[union <= 0 + np.finfo('float').eps] = 0
284
+ union[union <= 0 + np.finfo('float').eps] = 1
285
+ ious = intersection / union
286
+ return ious
287
+
288
+ @staticmethod
289
+ def _calculate_euclidean_similarity(dets1, dets2, zero_distance=2.0):
290
+ """ Calculates the euclidean distance between two sets of detections, and then converts this into a similarity
291
+ measure with values between 0 and 1 using the following formula: sim = max(0, 1 - dist/zero_distance).
292
+ The default zero_distance of 2.0, corresponds to the default used in MOT15_3D, such that a 0.5 similarity
293
+ threshold corresponds to a 1m distance threshold for TPs.
294
+ """
295
+ dist = np.linalg.norm(dets1[:, np.newaxis]-dets2[np.newaxis, :], axis=2)
296
+ sim = np.maximum(0, 1 - dist/zero_distance)
297
+ return sim
298
+
299
+ @staticmethod
300
+ def _check_unique_ids(data, after_preproc=False):
301
+ """Check the requirement that the tracker_ids and gt_ids are unique per timestep"""
302
+ gt_ids = data['gt_ids']
303
+ tracker_ids = data['tracker_ids']
304
+ for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(gt_ids, tracker_ids)):
305
+ if len(tracker_ids_t) > 0:
306
+ unique_ids, counts = np.unique(tracker_ids_t, return_counts=True)
307
+ if np.max(counts) != 1:
308
+ duplicate_ids = unique_ids[counts > 1]
309
+ exc_str_init = 'Tracker predicts the same ID more than once in a single timestep ' \
310
+ '(seq: %s, frame: %i, ids:' % (data['seq'], t+1)
311
+ exc_str = ' '.join([exc_str_init] + [str(d) for d in duplicate_ids]) + ')'
312
+ if after_preproc:
313
+ exc_str_init += '\n Note that this error occurred after preprocessing (but not before), ' \
314
+ 'so ids may not be as in file, and something seems wrong with preproc.'
315
+ raise TrackEvalException(exc_str)
316
+ if len(gt_ids_t) > 0:
317
+ unique_ids, counts = np.unique(gt_ids_t, return_counts=True)
318
+ if np.max(counts) != 1:
319
+ duplicate_ids = unique_ids[counts > 1]
320
+ exc_str_init = 'Ground-truth has the same ID more than once in a single timestep ' \
321
+ '(seq: %s, frame: %i, ids:' % (data['seq'], t+1)
322
+ exc_str = ' '.join([exc_str_init] + [str(d) for d in duplicate_ids]) + ')'
323
+ if after_preproc:
324
+ exc_str_init += '\n Note that this error occurred after preprocessing (but not before), ' \
325
+ 'so ids may not be as in file, and something seems wrong with preproc.'
326
+ raise TrackEvalException(exc_str)
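Usage sketch (illustrative): the _calculate_* helpers are plain static methods, so they can be sanity-checked on toy data without constructing a dataset. The hand-computed expectations below assume numpy and pycocotools are installed; the import path follows the file location above.

    import numpy as np
    from avism.data.aviseval.datasets._base_dataset import _BaseDataset

    # Box IoU, boxes given as (x0, y0, w, h)
    boxes1 = np.array([[0.0, 0.0, 10.0, 10.0]])
    boxes2 = np.array([[5.0, 5.0, 10.0, 10.0]])
    box_ious = _BaseDataset._calculate_box_ious(boxes1, boxes2, box_format='xywh')
    # 25 px intersection / 175 px union -> ~0.143

    # Mask IoU on binary (num_masks, H, W) arrays; RLE encoding happens internally
    masks1 = np.zeros((1, 16, 16), dtype=np.uint8); masks1[0, :8, :8] = 1
    masks2 = np.zeros((1, 16, 16), dtype=np.uint8); masks2[0, 4:12, :8] = 1
    mask_ious = _BaseDataset._calculate_mask_ious(masks1, masks2, is_encoded=False, do_ioa=False)
    # 32 px intersection / 96 px union -> ~0.333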
avism/data/aviseval/datasets/avis.py ADDED
@@ -0,0 +1,367 @@
1
+ import os
2
+ import numpy as np
3
+ import json
4
+ from ._base_dataset import _BaseDataset
5
+ from ..utils import TrackEvalException
6
+ from .. import utils
7
+ from .. import _timing
8
+
9
+
10
+ class AVIS(_BaseDataset):
11
+ """Dataset class for AVIS tracking"""
12
+
13
+ @staticmethod
14
+ def get_default_dataset_config():
15
+ """Default class config values"""
16
+ default_config = {
17
+ 'GT_FOLDER': "./datasets/", # Location of GT data
18
+ 'TRACKERS_FOLDER': "./outputs/avism_R50_IN/inference/",
19
+ 'GT_File': "test.json",
20
+ # Trackers location
21
+ 'OUTPUT_FOLDER': None, # Where to save eval results (if None, same as TRACKERS_FOLDER)
22
+ 'TRACKERS_TO_EVAL': None, # Filenames of trackers to eval (if None, all in folder)
23
+ 'CLASSES_TO_EVAL': None, # Classes to eval (if None, all classes)
24
+ 'SPLIT_TO_EVAL': None, # Valid: 'train', 'val', 'train_sub_split'
25
+ 'PRINT_CONFIG': False, # Whether to print current config
26
+ 'OUTPUT_SUB_FOLDER': '', # Output files are saved in OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
27
+ 'TRACKER_DISPLAY_NAMES': None, # Names of trackers to display, if None: TRACKERS_TO_EVAL
28
+ }
29
+ return default_config
30
+
31
+ def __init__(self, config=None):
32
+ """Initialise dataset, checking that all required files are present"""
33
+ super().__init__()
34
+ # Fill non-given config values with defaults
35
+ self.config = utils.init_config(config, self.get_default_dataset_config(), self.get_name())
36
+ self.gt_fol = self.config['GT_FOLDER']
37
+ self.tracker_fol = self.config['TRACKERS_FOLDER']
38
+ self.use_super_categories = False
39
+ self.should_classes_combine = True
40
+
41
+ self.output_fol = self.config['OUTPUT_FOLDER']
42
+ if self.output_fol is None:
43
+ self.output_fol = self.tracker_fol
44
+ self.output_sub_fol = self.config['OUTPUT_SUB_FOLDER']
45
+
46
+ if not os.path.exists(self.gt_fol):
47
+ print("GT folder not found: " + self.gt_fol)
48
+ raise TrackEvalException("GT folder not found: " + os.path.basename(self.gt_fol))
49
+ gt_dir_files = [self.config['GT_File']]
50
+ if len(gt_dir_files) != 1:
51
+ raise TrackEvalException(self.gt_fol + ' does not contain exactly one json file.')
52
+
53
+ with open(os.path.join(self.gt_fol, gt_dir_files[0])) as f:
54
+ self.gt_data = json.load(f)
55
+
56
+ # Get classes to eval
57
+ self.valid_classes = [cls['name'] for cls in self.gt_data['categories']]
58
+ cls_name_to_cls_id_map = {cls['name']: cls['id'] for cls in self.gt_data['categories']}
59
+
60
+ if self.config['CLASSES_TO_EVAL']:
61
+ self.class_list = [cls.lower() if cls.lower() in self.valid_classes else None
62
+ for cls in self.config['CLASSES_TO_EVAL']]
63
+ if not all(self.class_list):
64
+ raise TrackEvalException('Attempted to evaluate an invalid class. Only classes ' +
65
+ ', '.join(self.valid_classes) + ' are valid.')
66
+ else:
67
+ self.class_list = [cls['name'] for cls in self.gt_data['categories']]
68
+ self.class_name_to_class_id = {k: v for k, v in cls_name_to_cls_id_map.items() if k in self.class_list}
69
+
70
+ # Get sequences to eval and check gt files exist
71
+ self.seq_list = [vid['file_names'][0].split('/')[0] for vid in self.gt_data['videos']]
72
+ self.seq_name_to_seq_id = {vid['file_names'][0].split('/')[0]: vid['id'] for vid in self.gt_data['videos']}
73
+ self.seq_lengths = {vid['id']: len(vid['file_names']) for vid in self.gt_data['videos']}
74
+
75
+ # encode masks and compute track areas
76
+ self._prepare_gt_annotations()
77
+
78
+ # Get trackers to eval
79
+ if self.config['TRACKERS_TO_EVAL'] is None:
80
+ self.tracker_list = os.listdir(self.tracker_fol)
81
+ else:
82
+ self.tracker_list = self.config['TRACKERS_TO_EVAL']
83
+
84
+ if self.config['TRACKER_DISPLAY_NAMES'] is None:
85
+ self.tracker_to_disp = dict(zip(self.tracker_list, self.tracker_list))
86
+ elif (self.config['TRACKERS_TO_EVAL'] is not None) and (
87
+ len(self.config['TRACKER_DISPLAY_NAMES']) == len(self.tracker_list)):
88
+ self.tracker_to_disp = dict(zip(self.tracker_list, self.config['TRACKER_DISPLAY_NAMES']))
89
+ else:
90
+ raise TrackEvalException('List of tracker files and tracker display names do not match.')
91
+
92
+ # counter for globally unique track IDs
93
+ self.global_tid_counter = 0
94
+
95
+ self.tracker_data = dict()
96
+ for tracker in self.tracker_list:
97
+ tracker_dir_path = os.path.join(self.tracker_fol, tracker)
98
+ tr_dir_files = [file for file in os.listdir(tracker_dir_path) if file.endswith('.json')]
99
+ if len(tr_dir_files) != 1:
100
+ raise TrackEvalException(tracker_dir_path + ' does not contain exactly one json file.')
101
+
102
+ with open(os.path.join(tracker_dir_path, tr_dir_files[0])) as f:
103
+ curr_data = json.load(f)
104
+
105
+ self.tracker_data[tracker] = curr_data
106
+
107
+ def get_display_name(self, tracker):
108
+ return self.tracker_to_disp[tracker]
109
+
110
+ def _load_raw_file(self, tracker, seq, is_gt):
111
+ """Load a file (gt or tracker) in the YouTubeVIS format
112
+ If is_gt, this returns a dict which contains the fields:
113
+ [gt_ids, gt_classes] : list (for each timestep) of 1D NDArrays (for each det).
114
+ [gt_dets]: list (for each timestep) of lists of detections.
115
+ [classes_to_gt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
116
+ keys and corresponding segmentations as values) for each track
117
+ [classes_to_gt_track_ids, classes_to_gt_track_areas, classes_to_gt_track_iscrowd]: dictionary with class values
118
+ as keys and lists (for each track) as values
119
+
120
+ if not is_gt, this returns a dict which contains the fields:
121
+ [tracker_ids, tracker_classes, tracker_confidences] : list (for each timestep) of 1D NDArrays (for each det).
122
+ [tracker_dets]: list (for each timestep) of lists of detections.
123
+ [classes_to_dt_tracks]: dictionary with class values as keys and list of dictionaries (with frame indices as
124
+ keys and corresponding segmentations as values) for each track
125
+ [classes_to_dt_track_ids, classes_to_dt_track_areas]: dictionary with class values as keys and lists as values
126
+ [classes_to_dt_track_scores]: dictionary with class values as keys and 1D numpy arrays as values
127
+ """
128
+ # select sequence tracks
129
+ seq_id = self.seq_name_to_seq_id[seq]
130
+ if is_gt:
131
+ tracks = [ann for ann in self.gt_data['annotations'] if ann['video_id'] == seq_id]
132
+ else:
133
+ tracks = self._get_tracker_seq_tracks(tracker, seq_id)
134
+
135
+ # Convert data to required format
136
+ num_timesteps = self.seq_lengths[seq_id]
137
+ data_keys = ['ids', 'classes', 'dets']
138
+ if not is_gt:
139
+ data_keys += ['tracker_confidences']
140
+ raw_data = {key: [None] * num_timesteps for key in data_keys}
141
+ raw_data['raw_dets'] = [None] * num_timesteps
142
+ raw_data['raw_classes'] = [None] * num_timesteps
143
+
144
+ for t in range(num_timesteps):
145
+ raw_data['raw_dets'][t] = [track['segmentations'][t] for track in tracks]
146
+ raw_data['raw_classes'][t] = np.atleast_1d([track['category_id'] for track in tracks]).astype(int)
147
+
148
+ raw_data['dets'][t] = [track['segmentations'][t] for track in tracks if track['segmentations'][t]]
149
+ raw_data['ids'][t] = np.atleast_1d([track['id'] for track in tracks if track['segmentations'][t]]).astype(int)
150
+ raw_data['classes'][t] = np.atleast_1d([track['category_id'] for track in tracks if track['segmentations'][t]]).astype(int)
151
+ if not is_gt:
152
+ raw_data['tracker_confidences'][t] = np.atleast_1d([track['score'] for track in tracks if track['segmentations'][t]]).astype(float)
153
+
154
+ if is_gt:
155
+ key_map = {'ids': 'gt_ids',
156
+ 'classes': 'gt_classes',
157
+ 'dets': 'gt_dets'}
158
+ else:
159
+ key_map = {'ids': 'tracker_ids',
160
+ 'classes': 'tracker_classes',
161
+ 'dets': 'tracker_dets'}
162
+ for k, v in key_map.items():
163
+ raw_data[v] = raw_data.pop(k)
164
+
165
+ all_cls_ids = {self.class_name_to_class_id[cls] for cls in self.class_list}
166
+ classes_to_tracks = {cls: [track for track in tracks if track['category_id'] == cls] for cls in all_cls_ids}
167
+
168
+ # mapping from classes to track representations and track information
169
+ raw_data['classes_to_tracks'] = {cls: [{i: track['segmentations'][i]
170
+ for i in range(len(track['segmentations']))} for track in tracks]
171
+ for cls, tracks in classes_to_tracks.items()}
172
+ raw_data['classes_to_track_ids'] = {cls: [track['id'] for track in tracks]
173
+ for cls, tracks in classes_to_tracks.items()}
174
+ raw_data['classes_to_track_areas'] = {cls: [track['area'] for track in tracks]
175
+ for cls, tracks in classes_to_tracks.items()}
176
+
177
+ if is_gt:
178
+ raw_data['classes_to_gt_track_iscrowd'] = {cls: [track['iscrowd'] for track in tracks]
179
+ for cls, tracks in classes_to_tracks.items()}
180
+ else:
181
+ raw_data['classes_to_dt_track_scores'] = {cls: np.array([track['score'] for track in tracks])
182
+ for cls, tracks in classes_to_tracks.items()}
183
+
184
+ if is_gt:
185
+ key_map = {'classes_to_tracks': 'classes_to_gt_tracks',
186
+ 'classes_to_track_ids': 'classes_to_gt_track_ids',
187
+ 'classes_to_track_areas': 'classes_to_gt_track_areas'}
188
+ else:
189
+ key_map = {'classes_to_tracks': 'classes_to_dt_tracks',
190
+ 'classes_to_track_ids': 'classes_to_dt_track_ids',
191
+ 'classes_to_track_areas': 'classes_to_dt_track_areas'}
192
+ for k, v in key_map.items():
193
+ raw_data[v] = raw_data.pop(k)
194
+
195
+ raw_data['num_timesteps'] = num_timesteps
196
+ raw_data['seq'] = seq
197
+ return raw_data
198
+
199
+ @_timing.time
200
+ def get_preprocessed_seq_data(self, raw_data, cls):
201
+ """ Preprocess data for a single sequence for a single class ready for evaluation.
202
+ Inputs:
203
+ - raw_data is a dict containing the data for the sequence already read in by get_raw_seq_data().
204
+ - cls is the class to be evaluated.
205
+ Outputs:
206
+ - data is a dict containing all of the information that metrics need to perform evaluation.
207
+ It contains the following fields:
208
+ [num_timesteps, num_gt_ids, num_tracker_ids, num_gt_dets, num_tracker_dets] : integers.
209
+ [gt_ids, tracker_ids, tracker_confidences]: list (for each timestep) of 1D NDArrays (for each det).
210
+ [gt_dets, tracker_dets]: list (for each timestep) of lists of detections.
211
+ [similarity_scores]: list (for each timestep) of 2D NDArrays.
212
+ Notes:
213
+ General preprocessing (preproc) occurs in 4 steps. Some datasets may not use all of these steps.
214
+ 1) Extract only detections relevant for the class to be evaluated (including distractor detections).
215
+ 2) Match gt dets and tracker dets. Remove tracker dets that are matched to a gt det that is of a
216
+ distractor class, or otherwise marked as to be removed.
217
+ 3) Remove unmatched tracker dets if they fall within a crowd ignore region or don't meet certain
218
+ other criteria (e.g. are too small).
219
+ 4) Remove gt dets that were only useful for preprocessing and not for actual evaluation.
220
+ After the above preprocessing steps, this function also calculates the number of gt and tracker detections
221
+ and unique track ids. It also relabels gt and tracker ids to be contiguous and checks that ids are
222
+ unique within each timestep.
223
+ YouTubeVIS:
224
+ In YouTubeVIS, the 4 preproc steps are as follows:
225
+ 1) There are 40 classes which are evaluated separately.
226
+ 2) No matched tracker dets are removed.
227
+ 3) No unmatched tracker dets are removed.
228
+ 4) No gt dets are removed.
229
+ Further, for TrackMAP computation track representations for the given class are accessed from a dictionary
230
+ and the tracks from the tracker data are sorted according to the tracker confidence.
231
+ """
232
+ cls_id = self.class_name_to_class_id[cls]
233
+
234
+ data_keys = ['gt_ids', 'tracker_ids', 'gt_dets', 'tracker_dets', 'similarity_scores']
235
+ data = {key: [None] * raw_data['num_timesteps'] for key in data_keys}
236
+ unique_gt_ids = []
237
+ unique_tracker_ids = []
238
+ num_gt_dets = 0
239
+ num_tracker_dets = 0
240
+
241
+ for t in range(raw_data['num_timesteps']):
242
+
243
+ # Only extract relevant dets for this class for eval (cls)
244
+ gt_class_mask = np.atleast_1d(raw_data['gt_classes'][t] == cls_id)
245
+ gt_class_mask = gt_class_mask.astype(bool)
246
+ gt_ids = raw_data['gt_ids'][t][gt_class_mask]
247
+ gt_dets = [raw_data['gt_dets'][t][ind] for ind in range(len(gt_class_mask)) if gt_class_mask[ind]]
248
+
249
+ tracker_class_mask = np.atleast_1d(raw_data['tracker_classes'][t] == cls_id)
250
+ tracker_class_mask = tracker_class_mask.astype(bool)
251
+ tracker_ids = raw_data['tracker_ids'][t][tracker_class_mask]
252
+ tracker_dets = [raw_data['tracker_dets'][t][ind] for ind in range(len(tracker_class_mask)) if
253
+ tracker_class_mask[ind]]
254
+ similarity_scores = raw_data['similarity_scores'][t][gt_class_mask, :][:, tracker_class_mask]
255
+
256
+ data['tracker_ids'][t] = tracker_ids
257
+ data['tracker_dets'][t] = tracker_dets
258
+ data['gt_ids'][t] = gt_ids
259
+ data['gt_dets'][t] = gt_dets
260
+ data['similarity_scores'][t] = similarity_scores
261
+
262
+ unique_gt_ids += list(np.unique(data['gt_ids'][t]))
263
+ unique_tracker_ids += list(np.unique(data['tracker_ids'][t]))
264
+ num_tracker_dets += len(data['tracker_ids'][t])
265
+ num_gt_dets += len(data['gt_ids'][t])
266
+
267
+ # Re-label IDs such that there are no empty IDs
268
+ if len(unique_gt_ids) > 0:
269
+ unique_gt_ids = np.unique(unique_gt_ids)
270
+ gt_id_map = np.nan * np.ones((np.max(unique_gt_ids) + 1))
271
+ gt_id_map[unique_gt_ids] = np.arange(len(unique_gt_ids))
272
+ for t in range(raw_data['num_timesteps']):
273
+ if len(data['gt_ids'][t]) > 0:
274
+ data['gt_ids'][t] = gt_id_map[data['gt_ids'][t]].astype(int)
275
+ if len(unique_tracker_ids) > 0:
276
+ unique_tracker_ids = np.unique(unique_tracker_ids)
277
+ tracker_id_map = np.nan * np.ones((np.max(unique_tracker_ids) + 1))
278
+ tracker_id_map[unique_tracker_ids] = np.arange(len(unique_tracker_ids))
279
+ for t in range(raw_data['num_timesteps']):
280
+ if len(data['tracker_ids'][t]) > 0:
281
+ data['tracker_ids'][t] = tracker_id_map[data['tracker_ids'][t]].astype(int)
282
+
283
+ # Ensure that ids are unique per timestep.
284
+ self._check_unique_ids(data)
285
+
286
+ # Record overview statistics.
287
+ data['num_tracker_dets'] = num_tracker_dets
288
+ data['num_gt_dets'] = num_gt_dets
289
+ data['num_tracker_ids'] = len(unique_tracker_ids)
290
+ data['num_gt_ids'] = len(unique_gt_ids)
291
+ data['num_timesteps'] = raw_data['num_timesteps']
292
+ data['seq'] = raw_data['seq']
293
+
294
+ # get track representations
295
+ data['gt_tracks'] = raw_data['classes_to_gt_tracks'][cls_id]
296
+ data['gt_track_ids'] = raw_data['classes_to_gt_track_ids'][cls_id]
297
+ data['gt_track_areas'] = raw_data['classes_to_gt_track_areas'][cls_id]
298
+ data['gt_track_iscrowd'] = raw_data['classes_to_gt_track_iscrowd'][cls_id]
299
+ data['dt_tracks'] = raw_data['classes_to_dt_tracks'][cls_id]
300
+ data['dt_track_ids'] = raw_data['classes_to_dt_track_ids'][cls_id]
301
+ data['dt_track_areas'] = raw_data['classes_to_dt_track_areas'][cls_id]
302
+ data['dt_track_scores'] = raw_data['classes_to_dt_track_scores'][cls_id]
303
+ data['iou_type'] = 'mask'
304
+
305
+ # sort tracker data tracks by tracker confidence scores
306
+ if data['dt_tracks']:
307
+ idx = np.argsort([-score for score in data['dt_track_scores']], kind="mergesort")
308
+ data['dt_track_scores'] = [data['dt_track_scores'][i] for i in idx]
309
+ data['dt_tracks'] = [data['dt_tracks'][i] for i in idx]
310
+ data['dt_track_ids'] = [data['dt_track_ids'][i] for i in idx]
311
+ data['dt_track_areas'] = [data['dt_track_areas'][i] for i in idx]
312
+
313
+ return data
314
+
315
+ def _calculate_similarities(self, gt_dets_t, tracker_dets_t):
316
+ similarity_scores = self._calculate_mask_ious(gt_dets_t, tracker_dets_t, is_encoded=True, do_ioa=False)
317
+ return similarity_scores
318
+
319
+ def _prepare_gt_annotations(self):
320
+ """
321
+ Prepares GT data by rle encoding segmentations and computing the average track area.
322
+ :return: None
323
+ """
324
+ # only loaded when needed to reduce minimum requirements
325
+ from pycocotools import mask as mask_utils
326
+
327
+ for track in self.gt_data['annotations']:
328
+ h = track['height']
329
+ w = track['width']
330
+ for i, seg in enumerate(track['segmentations']):
331
+ if seg:
332
+ masks = mask_utils.frPyObjects(seg, h, w)
333
+ track['segmentations'][i] = mask_utils.merge(masks)
334
+ # track['segmentations'][i] = mask_utils.frPyObjects(seg, h, w)
335
+ areas = [a for a in track['areas'] if a]
336
+ if len(areas) == 0:
337
+ track['area'] = 0
338
+ else:
339
+ track['area'] = np.array(areas).mean()
340
+
341
+ def _get_tracker_seq_tracks(self, tracker, seq_id):
342
+ """
343
+ Prepares tracker data for a given sequence. Extracts all annotations for given sequence ID, computes
344
+ average track area and assigns a track ID.
345
+ :param tracker: the given tracker
346
+ :param seq_id: the sequence ID
347
+ :return: the extracted tracks
348
+ """
349
+ # only loaded when needed to reduce minimum requirements
350
+ from pycocotools import mask as mask_utils
351
+
352
+ tracks = [ann for ann in self.tracker_data[tracker] if ann['video_id'] == seq_id]
353
+ for track in tracks:
354
+ track['areas'] = []
355
+ for seg in track['segmentations']:
356
+ if seg:
357
+ track['areas'].append(mask_utils.area(seg))
358
+ else:
359
+ track['areas'].append(None)
360
+ areas = [a for a in track['areas'] if a]
361
+ if len(areas) == 0:
362
+ track['area'] = 0
363
+ else:
364
+ track['area'] = np.array(areas).mean()
365
+ track['id'] = self.global_tid_counter
366
+ self.global_tid_counter += 1
367
+ return tracks
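Before moving on to the evaluator, it helps to see the shape of the data this dataset class hands to the metrics. Below is a hedged sketch (not part of the upload) of how the per-class dict returned by `get_preprocessed_seq_data()` can be consumed; `dataset`, `tracker`, `seq` and the class name are placeholders for values configured elsewhere in `aviseval`.

```python
import numpy as np

def recall_at_alpha(data, alpha=0.5):
    """Fraction of GT detections whose best mask IoU with any predicted det
    reaches `alpha` -- a rough recall proxy, not HOTA's Hungarian-matched TP."""
    hit, total = 0, 0
    for t in range(data['num_timesteps']):
        sim = data['similarity_scores'][t]   # shape (num_gt_t, num_pred_t), mask IoUs
        total += sim.shape[0]
        if sim.size:
            hit += int(np.sum(sim.max(axis=1) >= alpha))
    return hit / max(1, total)

# Placeholders: a configured dataset/tracker pair, one sequence name and one class name.
# raw_data = dataset.get_raw_seq_data(tracker, seq)
# data = dataset.get_preprocessed_seq_data(raw_data, cls)
# print(recall_at_alpha(data, alpha=0.5))
```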
avism/data/aviseval/eval.py ADDED
@@ -0,0 +1,209 @@
1
+ import os
2
+ import tqdm
3
+ import traceback
4
+ import numpy as np
5
+
6
+ from . import utils
7
+ from . import _timing
8
+ from .metrics import Count
9
+ from .utils import TrackEvalException
10
+ from .metrics import compute_av_loc, combine_av_loc_sequences
11
+
12
+
13
+ class Evaluator:
14
+ """Evaluator class for evaluating different metrics for different datasets"""
15
+
16
+ @staticmethod
17
+ def get_default_eval_config():
18
+ """Returns the default config values for evaluation"""
19
+ code_path = utils.get_code_path()
20
+ default_config = {
21
+ 'USE_PARALLEL': False,
22
+ 'NUM_PARALLEL_CORES': 8,
23
+ 'BREAK_ON_ERROR': True, # Raises exception and exits with error
24
+ 'RETURN_ON_ERROR': False, # if not BREAK_ON_ERROR, then returns from function on error
25
+ 'LOG_ON_ERROR': os.path.join(code_path, 'error_log.txt'), # if not None, save any errors into a log file.
26
+
27
+ 'PRINT_RESULTS': False,
28
+ 'PRINT_ONLY_COMBINED': False,
29
+ 'PRINT_CONFIG': False,
30
+ 'TIME_PROGRESS': False,
31
+ 'DISPLAY_LESS_PROGRESS': True,
32
+
33
+ 'OUTPUT_SUMMARY': False,
34
+ 'OUTPUT_EMPTY_CLASSES': False,
35
+ 'OUTPUT_DETAILED': False,
36
+ 'PLOT_CURVES': False,
37
+ }
38
+ return default_config
39
+
40
+ def __init__(self, config=None):
41
+ """Initialise the evaluator with a config file"""
42
+ self.config = utils.init_config(config, self.get_default_eval_config(), 'Eval')
43
+ # Only run timing analysis if not run in parallel.
44
+ if self.config['TIME_PROGRESS'] and not self.config['USE_PARALLEL']:
45
+ _timing.DO_TIMING = True
46
+ if self.config['DISPLAY_LESS_PROGRESS']:
47
+ _timing.DISPLAY_LESS_PROGRESS = True
48
+
49
+ @_timing.time
50
+ def evaluate(self, dataset_list, metrics_list):
51
+ """Evaluate a set of metrics on a set of datasets"""
52
+ config = self.config
53
+ metrics_list = metrics_list + [Count()] # Count metrics are always run
54
+ metric_names = utils.validate_metrics_list(metrics_list)
55
+ dataset_names = [dataset.get_name() for dataset in dataset_list]
56
+ output_res = {}
57
+ output_msg = {}
58
+
59
+ for dataset, dataset_name in zip(dataset_list, dataset_names):
60
+ # Get dataset info about what to evaluate
61
+ output_res[dataset_name] = {}
62
+ output_msg[dataset_name] = {}
63
+ tracker_list, seq_list, class_list = dataset.get_eval_info()
64
+
65
+ # Evaluate each tracker
66
+ for tracker in tracker_list:
67
+ # if not config['BREAK_ON_ERROR'] then go to next tracker without breaking
68
+ try:
69
+ print('\nEvaluating model ...... \n')
70
+ res = {}
71
+ res_av_loc = {}
72
+
73
+ seq_list_sorted = sorted(seq_list)
74
+ for curr_seq in tqdm.tqdm(seq_list_sorted):
75
+ res[curr_seq] = eval_sequence(curr_seq, dataset, tracker, class_list, metrics_list, metric_names)
76
+ res_av_loc[curr_seq] = eval_av_loc_sequence(curr_seq, dataset, tracker)
77
+
78
+ # Combine results over all sequences and then over all classes
79
+ res_av_loc_all = combine_av_loc_sequences(res_av_loc)
80
+
81
+ # collecting combined cls keys (cls averaged, det averaged, super classes)
82
+ combined_cls_keys = []
83
+ res['COMBINED_SEQ'] = {}
84
+ # combine sequences for each class
85
+ for c_cls in class_list:
86
+ res['COMBINED_SEQ'][c_cls] = {}
87
+ for metric, metric_name in zip(metrics_list, metric_names):
88
+ curr_res = {seq_key: seq_value[c_cls][metric_name] for seq_key, seq_value in res.items() if
89
+ seq_key != 'COMBINED_SEQ'}
90
+ res['COMBINED_SEQ'][c_cls][metric_name] = metric.combine_sequences(curr_res)
91
+ # combine classes
92
+ if dataset.should_classes_combine:
93
+ combined_cls_keys += ['cls_comb_cls_av', 'cls_comb_det_av', 'all']
94
+ res['COMBINED_SEQ']['cls_comb_cls_av'] = {}
95
+ res['COMBINED_SEQ']['cls_comb_det_av'] = {}
96
+ for metric, metric_name in zip(metrics_list, metric_names):
97
+ cls_res = {cls_key: cls_value[metric_name] for cls_key, cls_value in
98
+ res['COMBINED_SEQ'].items() if cls_key not in combined_cls_keys}
99
+ res['COMBINED_SEQ']['cls_comb_cls_av'][metric_name] = \
100
+ metric.combine_classes_class_averaged(cls_res)
101
+ res['COMBINED_SEQ']['cls_comb_det_av'][metric_name] = \
102
+ metric.combine_classes_det_averaged(cls_res)
103
+ # combine classes to super classes
104
+ if dataset.use_super_categories:
105
+ for cat, sub_cats in dataset.super_categories.items():
106
+ combined_cls_keys.append(cat)
107
+ res['COMBINED_SEQ'][cat] = {}
108
+ for metric, metric_name in zip(metrics_list, metric_names):
109
+ cat_res = {cls_key: cls_value[metric_name] for cls_key, cls_value in
110
+ res['COMBINED_SEQ'].items() if cls_key in sub_cats}
111
+ res['COMBINED_SEQ'][cat][metric_name] = metric.combine_classes_det_averaged(cat_res)
112
+
113
+ # Print and output results in various formats
114
+ output_fol = dataset.get_output_fol(tracker)
115
+ tracker_display_name = dataset.get_display_name(tracker)
116
+ for c_cls in res['COMBINED_SEQ'].keys(): # class_list + combined classes if calculated
117
+ summaries = []
118
+ details = []
119
+ num_dets = res['COMBINED_SEQ'][c_cls]['Count']['Dets']
120
+ if config['OUTPUT_EMPTY_CLASSES'] or num_dets > 0:
121
+ for metric, metric_name in zip(metrics_list, metric_names):
122
+ # for combined classes there is no per sequence evaluation
123
+ if c_cls in combined_cls_keys:
124
+ table_res = {'COMBINED_SEQ': res['COMBINED_SEQ'][c_cls][metric_name]}
125
+ else:
126
+ table_res = {seq_key: seq_value[c_cls][metric_name] for seq_key, seq_value in res.items()}
127
+ if config['PLOT_CURVES']:
128
+ metric.plot_single_tracker_results(table_res, tracker_display_name, c_cls, output_fol)
129
+ if config['OUTPUT_SUMMARY']:
130
+ utils.write_summary_results(summaries, c_cls, output_fol)
131
+ if config['OUTPUT_DETAILED']:
132
+ utils.write_detailed_results(details, c_cls, output_fol)
133
+
134
+ # Output for returning from function
135
+ res_output = {}
136
+
137
+ res_output["AP_all"] = round(100 * np.mean(res['COMBINED_SEQ']['cls_comb_cls_av']['TrackMAP']['AP_all']), 2)
138
+ res_output["AP_s"] = round(100 * np.mean(res['COMBINED_SEQ']['cls_comb_cls_av']['TrackMAP']['AP_area_s']), 2)
139
+ res_output["AP_m"] = round(100 * np.mean(res['COMBINED_SEQ']['cls_comb_cls_av']['TrackMAP']['AP_area_m']), 2)
140
+ res_output["AP_l"] = round(100 * np.mean(res['COMBINED_SEQ']['cls_comb_cls_av']['TrackMAP']['AP_area_l']), 2)
141
+ res_output["AR_all"] = round(100 * np.mean(res['COMBINED_SEQ']['cls_comb_cls_av']['TrackMAP']['AR_all']), 2)
142
+
143
+ res_output["HOTA"] = round(100 * np.mean(res['COMBINED_SEQ']['cls_comb_cls_av']['HOTA']['HOTA']), 2)
144
+ res_output["DetA"] = round(100 * np.mean(res['COMBINED_SEQ']['cls_comb_cls_av']['HOTA']['DetA']), 2)
145
+ res_output["DetRe"] = round(100 * np.mean(res['COMBINED_SEQ']['cls_comb_cls_av']['HOTA']['DetRe']), 2)
146
+ res_output["DetPr"] = round(100 * np.mean(res['COMBINED_SEQ']['cls_comb_cls_av']['HOTA']['DetPr']), 2)
147
+ res_output["AssA"] = round(100 * np.mean(res['COMBINED_SEQ']['cls_comb_cls_av']['HOTA']['AssA']), 2)
148
+ res_output["AssRe"] = round(100 * np.mean(res['COMBINED_SEQ']['cls_comb_cls_av']['HOTA']['AssRe']), 2)
149
+ res_output["AssPr"] = round(100 * np.mean(res['COMBINED_SEQ']['cls_comb_cls_av']['HOTA']['AssPr']), 2)
150
+ res_output["LocA"] = round(100 * np.mean(res['COMBINED_SEQ']['cls_comb_cls_av']['HOTA']['LocA']), 2)
151
+
152
+ res_output["FA"] = round(100 * np.mean(res_av_loc_all['FA']), 2)
153
+ res_output["FAn"] = round(100 * np.mean(res_av_loc_all['FAn']), 2)
154
+ res_output['FAn_count'] = int(np.mean(res_av_loc_all['FAn_count']))
155
+ res_output['FAn_all'] = int(np.mean(res_av_loc_all['FAn_all']))
156
+ res_output["FAs"] = round(100 * np.mean(res_av_loc_all['FAs']), 2)
157
+ res_output['FAs_count'] = int(np.mean(res_av_loc_all['FAs_count']))
158
+ res_output['FAs_all'] = int(np.mean(res_av_loc_all['FAs_all']))
159
+ res_output["FAm"] = round(100 * np.mean(res_av_loc_all['FAm']), 2)
160
+ res_output['FAm_count'] = int(np.mean(res_av_loc_all['FAm_count']))
161
+ res_output['FAm_all'] = int(np.mean(res_av_loc_all['FAm_all']))
162
+
163
+ output_res[dataset_name][tracker] = res_output
164
+ output_msg[dataset_name][tracker] = 'Success'
165
+
166
+ except Exception as err:
167
+ output_res[dataset_name][tracker] = None
168
+ if type(err) == TrackEvalException:
169
+ output_msg[dataset_name][tracker] = str(err)
170
+ else:
171
+ output_msg[dataset_name][tracker] = 'Unknown error occurred.'
172
+ print('Tracker %s was unable to be evaluated.' % tracker)
173
+ print(err)
174
+ traceback.print_exc()
175
+ if config['LOG_ON_ERROR'] is not None:
176
+ with open(config['LOG_ON_ERROR'], 'a') as f:
177
+ print(dataset_name, file=f)
178
+ print(tracker, file=f)
179
+ print(traceback.format_exc(), file=f)
180
+ print('\n\n\n', file=f)
181
+ if config['BREAK_ON_ERROR']:
182
+ raise err
183
+ elif config['RETURN_ON_ERROR']:
184
+ return output_res, output_msg
185
+
186
+ return output_res, output_msg
187
+
188
+
189
+ @_timing.time
190
+ def eval_sequence(seq, dataset, tracker, class_list, metrics_list, metric_names):
191
+ """Function for evaluating a single sequence"""
192
+
193
+ raw_data = dataset.get_raw_seq_data(tracker, seq)
194
+ seq_res = {}
195
+ for cls in class_list:
196
+ seq_res[cls] = {}
197
+ data = dataset.get_preprocessed_seq_data(raw_data, cls)
198
+ for metric, met_name in zip(metrics_list, metric_names):
199
+ seq_res[cls][met_name] = metric.eval_sequence(data)
200
+ return seq_res
201
+
202
+
203
+ def eval_av_loc_sequence(seq, dataset, tracker):
204
+ """Function for evaluating a single sequence"""
205
+
206
+ raw_data = dataset.get_raw_seq_data(tracker, seq)
207
+ av_loc_res = compute_av_loc(raw_data)
208
+
209
+ return av_loc_res
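A hedged sketch of driving this evaluator end to end. The metric classes are the ones exported by `aviseval.metrics`; the dataset construction is left as a placeholder because its class name and config keys live in `datasets/avis.py`, which is not shown in this excerpt.

```python
from avism.data.aviseval.eval import Evaluator
from avism.data.aviseval.metrics import HOTA, TrackMAP

eval_config = Evaluator.get_default_eval_config()
eval_config['PRINT_RESULTS'] = True          # the defaults keep all printing/output switched off

evaluator = Evaluator(eval_config)
# dataset = ...  # an aviseval dataset instance (see datasets/avis.py); placeholder here
# output_res, output_msg = evaluator.evaluate([dataset], [HOTA(), TrackMAP()])
# output_res[dataset_name][tracker] is the flat dict built above
# (AP_all, HOTA, DetA, AssA, FA, FAn, FAs, FAm, ...), with scores already in percent.
```

Note that a `Count()` metric is always appended internally, so it never needs to be passed explicitly.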
avism/data/aviseval/metrics/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ from .hota import HOTA
2
+ from .clear import CLEAR
3
+ from .identity import Identity
4
+ from .count import Count
5
+ from .j_and_f import JAndF
6
+ from .track_map import TrackMAP
7
+ from .vace import VACE
8
+ from .ideucl import IDEucl
9
+
10
+ from .avisa import avisA
11
+ from .av_loc import compute_av_loc, combine_av_loc_sequences
avism/data/aviseval/metrics/_base_metric.py ADDED
@@ -0,0 +1,132 @@
1
+ import numpy as np
2
+ from abc import ABC, abstractmethod
3
+ from .. import _timing
4
+ from ..utils import TrackEvalException
5
+
6
+
7
+ class _BaseMetric(ABC):
8
+ @abstractmethod
9
+ def __init__(self):
10
+ self.plottable = False
11
+ self.integer_fields = []
12
+ self.float_fields = []
13
+ self.array_labels = []
14
+ self.integer_array_fields = []
15
+ self.float_array_fields = []
16
+ self.fields = []
17
+ self.summary_fields = []
18
+ self.registered = False
19
+
20
+ #####################################################################
21
+ # Abstract functions for subclasses to implement
22
+
23
+ @_timing.time
24
+ @abstractmethod
25
+ def eval_sequence(self, data):
26
+ ...
27
+
28
+ @abstractmethod
29
+ def combine_sequences(self, all_res):
30
+ ...
31
+
32
+ @abstractmethod
33
+ def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False):
34
+ ...
35
+
36
+ @abstractmethod
37
+ def combine_classes_det_averaged(self, all_res):
38
+ ...
39
+
40
+ def plot_single_tracker_results(self, all_res, tracker, output_folder, cls):
41
+ """Plot results of metrics, only valid for metrics with self.plottable"""
42
+ if self.plottable:
43
+ raise NotImplementedError('plot_results is not implemented for metric %s' % self.get_name())
44
+ else:
45
+ pass
46
+
47
+ #####################################################################
48
+ # Helper functions which are useful for all metrics:
49
+
50
+ @classmethod
51
+ def get_name(cls):
52
+ return cls.__name__
53
+
54
+ @staticmethod
55
+ def _combine_sum(all_res, field):
56
+ """Combine sequence results via sum"""
57
+ return sum([all_res[k][field] for k in all_res.keys()])
58
+
59
+ @staticmethod
60
+ def _combine_weighted_av(all_res, field, comb_res, weight_field):
61
+ """Combine sequence results via weighted average"""
62
+ return sum([all_res[k][field] * all_res[k][weight_field] for k in all_res.keys()]) / np.maximum(1.0, comb_res[
63
+ weight_field])
64
+
65
+ def print_table(self, table_res, tracker, cls):
66
+ """Prints table of results for all sequences"""
67
+ print('')
68
+ metric_name = self.get_name()
69
+ self._row_print([metric_name + ': ' + tracker + '-' + cls] + self.summary_fields)
70
+ for seq, results in sorted(table_res.items()):
71
+ if seq == 'COMBINED_SEQ':
72
+ continue
73
+ summary_res = self._summary_row(results)
74
+ self._row_print([seq] + summary_res)
75
+ summary_res = self._summary_row(table_res['COMBINED_SEQ'])
76
+ self._row_print(['COMBINED'] + summary_res)
77
+
78
+ def _summary_row(self, results_):
79
+ vals = []
80
+ for h in self.summary_fields:
81
+ if h in self.float_array_fields:
82
+ vals.append("{0:1.5g}".format(100 * np.mean(results_[h])))
83
+ elif h in self.float_fields:
84
+ vals.append("{0:1.5g}".format(100 * float(results_[h])))
85
+ elif h in self.integer_fields:
86
+ vals.append("{0:d}".format(int(results_[h])))
87
+ else:
88
+ raise NotImplementedError("Summary function not implemented for this field type.")
89
+ return vals
90
+
91
+ @staticmethod
92
+ def _row_print(*argv):
93
+ """Prints results in an evenly spaced rows, with more space in first row"""
94
+ if len(argv) == 1:
95
+ argv = argv[0]
96
+ to_print = '%-35s' % argv[0]
97
+ for v in argv[1:]:
98
+ to_print += '%-10s' % str(v)
99
+ print(to_print)
100
+
101
+ def summary_results(self, table_res):
102
+ """Returns a simple summary of final results for a tracker"""
103
+ return dict(zip(self.summary_fields, self._summary_row(table_res['COMBINED_SEQ'])))
104
+
105
+ def detailed_results(self, table_res):
106
+ """Returns detailed final results for a tracker"""
107
+ # Get detailed field information
108
+ detailed_fields = self.float_fields + self.integer_fields
109
+ for h in self.float_array_fields + self.integer_array_fields:
110
+ for alpha in [int(100*x) for x in self.array_labels]:
111
+ detailed_fields.append(h + '___' + str(alpha))
112
+ detailed_fields.append(h + '___AUC')
113
+
114
+ # Get detailed results
115
+ detailed_results = {}
116
+ for seq, res in table_res.items():
117
+ detailed_row = self._detailed_row(res)
118
+ if len(detailed_row) != len(detailed_fields):
119
+ raise TrackEvalException(
120
+ 'Field names and data have different sizes (%i and %i)' % (len(detailed_row), len(detailed_fields)))
121
+ detailed_results[seq] = dict(zip(detailed_fields, detailed_row))
122
+ return detailed_results
123
+
124
+ def _detailed_row(self, res):
125
+ detailed_row = []
126
+ for h in self.float_fields + self.integer_fields:
127
+ detailed_row.append(res[h])
128
+ for h in self.float_array_fields + self.integer_array_fields:
129
+ for i, alpha in enumerate([int(100 * x) for x in self.array_labels]):
130
+ detailed_row.append(res[h][i])
131
+ detailed_row.append(np.mean(res[h]))
132
+ return detailed_row
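A hedged sketch of the minimal surface a new metric needs in order to plug into this base class. The metric below is illustrative only (it is not part of the toolkit) and mirrors the simple Count-style pattern; only the abstract methods plus `__init__` are overridden, while `print_table`, `summary_results` and `detailed_results` come from the base class.

```python
import numpy as np
from avism.data.aviseval.metrics._base_metric import _BaseMetric

class GTFrameCoverage(_BaseMetric):
    """Illustrative metric: fraction of timesteps containing at least one GT det."""

    def __init__(self, config=None):
        super().__init__()
        self.float_fields = ['GT_Coverage']
        self.fields = self.float_fields
        self.summary_fields = self.fields

    def eval_sequence(self, data):
        covered = sum(1 for t in range(data['num_timesteps']) if len(data['gt_ids'][t]) > 0)
        return {'GT_Coverage': covered / max(1, data['num_timesteps'])}

    def combine_sequences(self, all_res):
        return {'GT_Coverage': float(np.mean([v['GT_Coverage'] for v in all_res.values()]))}

    def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False):
        return self.combine_sequences(all_res)

    def combine_classes_det_averaged(self, all_res):
        return self.combine_sequences(all_res)
```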
avism/data/aviseval/metrics/av_loc.py ADDED
@@ -0,0 +1,191 @@
1
+ import numpy as np
2
+ from pycocotools import mask as mask_utils
3
+ from scipy.optimize import linear_sum_assignment
4
+
5
+
6
+ def compute_av_loc(data):
7
+ alphas = np.arange(0.05, 0.99, 0.05)
8
+ res = {}
9
+ res['FA'] = np.zeros((len(alphas)), dtype=float)
10
+ res['FAn'] = np.array([None] * len(alphas)) # frame accuracy on frames with no sound source
11
+ res['FAs'] = np.array([None] * len(alphas)) # frame accuracy on frames with a single sound source
12
+ res['FAm'] = np.array([None] * len(alphas)) # frame accuracy on frames with multiple sound sources
13
+
14
+ res['frame_num_n_all'] = np.zeros((len(alphas)), dtype=int)
15
+ res['frame_num_n_tp'] = np.zeros((len(alphas)), dtype=int)
16
+ res['frame_num_s_all'] = np.zeros((len(alphas)), dtype=int)
17
+ res['frame_num_s_tp'] = np.zeros((len(alphas)), dtype=int)
18
+ res['frame_num_m_all'] = np.zeros((len(alphas)), dtype=int)
19
+ res['frame_num_m_tp'] = np.zeros((len(alphas)), dtype=int)
20
+
21
+ frame_num_all = data['num_timesteps']
22
+ gt_classes = data['gt_classes']
23
+ gt_dets = data['gt_dets']
24
+ raw_classes = data['raw_classes']
25
+ raw_dets = data['raw_dets']
26
+ pred_classes = data['tracker_classes']
27
+ pred_dets = data['tracker_dets']
28
+
29
+ # 1. Find the best trajectory between gt and pred
30
+ unique_gt_ids = []
31
+ unique_tracker_ids = []
32
+ for t in range(data['num_timesteps']):
33
+ unique_gt_ids += list(np.unique(data['gt_ids'][t]))
34
+ unique_tracker_ids += list(np.unique(data['tracker_ids'][t]))
35
+ # Re-label IDs such that there are no empty IDs
36
+ if len(unique_gt_ids) > 0:
37
+ unique_gt_ids = np.unique(unique_gt_ids)
38
+ gt_id_map = np.nan * np.ones((np.max(unique_gt_ids) + 1))
39
+ gt_id_map[unique_gt_ids] = np.arange(len(unique_gt_ids))
40
+ for t in range(data['num_timesteps']):
41
+ if len(data['gt_ids'][t]) > 0:
42
+ data['gt_ids'][t] = gt_id_map[data['gt_ids'][t]].astype(int)
43
+ if len(unique_tracker_ids) > 0:
44
+ unique_tracker_ids = np.unique(unique_tracker_ids)
45
+ tracker_id_map = np.nan * np.ones((np.max(unique_tracker_ids) + 1))
46
+ tracker_id_map[unique_tracker_ids] = np.arange(len(unique_tracker_ids))
47
+ for t in range(data['num_timesteps']):
48
+ if len(data['tracker_ids'][t]) > 0:
49
+ data['tracker_ids'][t] = tracker_id_map[data['tracker_ids'][t]].astype(int)
50
+ data['num_tracker_ids'] = len(unique_tracker_ids)
51
+ data['num_gt_ids'] = len(unique_gt_ids)
52
+ # Variables counting global association
53
+ potential_matches_count = np.zeros((data['num_gt_ids'], data['num_tracker_ids']))
54
+ gt_id_count = np.zeros((data['num_gt_ids'], 1))
55
+ tracker_id_count = np.zeros((1, data['num_tracker_ids']))
56
+ # First loop through each timestep and accumulate global track information.
57
+ for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(data['gt_ids'], data['tracker_ids'])):
58
+ # Count the potential matches between ids in each timestep
59
+ # These are normalised, weighted by the match similarity.
60
+ similarity = data['similarity_scores'][t]
61
+ sim_iou_denom = similarity.sum(0)[np.newaxis, :] + similarity.sum(1)[:, np.newaxis] - similarity
62
+ sim_iou = np.zeros_like(similarity)
63
+ sim_iou_mask = sim_iou_denom > 0 + np.finfo('float').eps
64
+ sim_iou[sim_iou_mask] = similarity[sim_iou_mask] / sim_iou_denom[sim_iou_mask]
65
+ potential_matches_count[gt_ids_t[:, np.newaxis], tracker_ids_t[np.newaxis, :]] += sim_iou
66
+ # Calculate the total number of dets for each gt_id and tracker_id.
67
+ gt_id_count[gt_ids_t] += 1
68
+ tracker_id_count[0, tracker_ids_t] += 1
69
+ # Calculate overall jaccard alignment score (before unique matching) between IDs
70
+ global_alignment_score = potential_matches_count / (gt_id_count + tracker_id_count - potential_matches_count)
71
+ # Hungarian algorithm to find best matches
72
+ match_rows, match_cols = linear_sum_assignment(-global_alignment_score)
73
+
74
+ # 2. Compute FSLA
75
+ for a, alpha in enumerate(alphas):
76
+ frame_num_n_all = 0 # total frames with no sound source
77
+ frame_num_s_all = 0 # total frames with a single sound source
78
+ frame_num_m_all = 0 # total frames with multiple sound sources
79
+ frame_num_n_tp = 0 # true positive frames with no sound source
80
+ frame_num_s_tp = 0 # true positive frames with a single sound source
81
+ frame_num_m_tp = 0 # true positive frames with multiple sound sources
82
+
83
+ for frame_id in range(frame_num_all):
84
+ # classes
85
+ gt_classes_per_frame = gt_classes[frame_id]
86
+ raw_classes_per_frame = raw_classes[frame_id]
87
+ pred_classes_per_frame = pred_classes[frame_id]
88
+ # masks
89
+ gt_dets_per_frame = gt_dets[frame_id]
90
+ raw_dets_per_frame = raw_dets[frame_id]
91
+ pred_dets_per_frame = pred_dets[frame_id]
92
+
93
+ if len(pred_dets_per_frame) > 0:
94
+ pred_dets_per_frame_f = [di for di in pred_dets_per_frame if di['counts'] != 'PPTl0']
95
+ else:
96
+ pred_dets_per_frame_f = pred_dets_per_frame
97
+
98
+ # Masks must have the same class and number
99
+ if (set(gt_classes_per_frame) == set(pred_classes_per_frame)) and (len(gt_dets_per_frame) == len(pred_dets_per_frame_f)):
100
+ # 1) no sound source
101
+ if len(gt_dets_per_frame) == 0:
102
+ frame_num_n_all += 1
103
+ frame_num_n_tp += 1
104
+ # 2) single sound source
105
+ elif len(gt_dets_per_frame) == 1:
106
+ frame_num_s_all += 1
107
+ index_gt = [index for index, value in enumerate(raw_dets_per_frame) if value is not None][0]
108
+ index_pred = [index for index, element in enumerate(match_cols) if element == index_gt]
109
+ if index_pred != []:
110
+ ious = mask_utils.iou(gt_dets_per_frame, [pred_dets_per_frame[index_pred[0]]], [False])
111
+ if np.all(ious > alpha):
112
+ frame_num_s_tp += 1
113
+ # 3) multi sound source
114
+ else:
115
+ frame_num_m_all += 1
116
+ flags = [0] * len(match_rows)
117
+ for tr in range(len(match_rows)):
118
+ if (raw_classes_per_frame[match_rows[tr]] == pred_classes_per_frame[match_cols[tr]]):
119
+ if raw_dets_per_frame[match_rows[tr]] is None:
120
+ if pred_dets_per_frame[match_cols[tr]]['counts'] == 'PPTl0':
121
+ flags[tr] = 1
122
+ else:
123
+ iou = mask_utils.iou([raw_dets_per_frame[match_rows[tr]]],
124
+ [pred_dets_per_frame[match_cols[tr]]], [False])
125
+ if np.all(iou > alpha):
126
+ flags[tr] = 1
127
+ if all(ff == 1 for ff in flags):
128
+ frame_num_m_tp += 1
129
+ else:
130
+ if len(gt_dets_per_frame) == 0:
131
+ frame_num_n_all += 1
132
+ elif len(gt_dets_per_frame) == 1:
133
+ frame_num_s_all += 1
134
+ else:
135
+ frame_num_m_all += 1
136
+
137
+ assert frame_num_all == (frame_num_n_all + frame_num_s_all + frame_num_m_all)
138
+
139
+ if frame_num_n_all > 0:
140
+ res['FAn'][a] = frame_num_n_tp / frame_num_n_all
141
+ res['frame_num_n_all'][a] = frame_num_n_all
142
+ res['frame_num_n_tp'][a] = frame_num_n_tp
143
+ else:
144
+ res['FAn'][a] = None
145
+ res['frame_num_n_all'][a] = 0
146
+ res['frame_num_n_tp'][a] = 0
147
+
148
+ if frame_num_s_all > 0:
149
+ res['FAs'][a] = frame_num_s_tp / frame_num_s_all
150
+ res['frame_num_s_all'][a] = frame_num_s_all
151
+ res['frame_num_s_tp'][a] = frame_num_s_tp
152
+ else:
153
+ res['FAs'][a] = None
154
+ res['frame_num_s_all'][a] = 0
155
+ res['frame_num_s_tp'][a] = 0
156
+
157
+ if frame_num_m_all > 0:
158
+ res['FAm'][a] = frame_num_m_tp / frame_num_m_all
159
+ res['frame_num_m_all'][a] = frame_num_m_all
160
+ res['frame_num_m_tp'][a] = frame_num_m_tp
161
+ else:
162
+ res['FAm'][a] = None
163
+ res['frame_num_m_all'][a] = 0
164
+ res['frame_num_m_tp'][a] = 0
165
+
166
+ res['FA'][a] = (frame_num_n_tp + frame_num_s_tp + frame_num_m_tp) / frame_num_all
167
+
168
+ return res
169
+
170
+
171
+ def combine_av_loc_sequences(all_res):
172
+ """Combines metrics across all sequences"""
173
+ res = {}
174
+ fields_num = ['frame_num_n_all', 'frame_num_s_all', 'frame_num_m_all', 'frame_num_n_tp', 'frame_num_s_tp', 'frame_num_m_tp']
175
+ for field in fields_num:
176
+ res[field] = sum([all_res[k][field] for k in all_res.keys()])
177
+
178
+ res_final = {}
179
+
180
+ res_final['FAn'] = res['frame_num_n_tp'] / res['frame_num_n_all']
181
+ res_final['FAn_count'] = res['frame_num_n_tp']
182
+ res_final['FAn_all'] = res['frame_num_n_all']
183
+ res_final['FAs'] = res['frame_num_s_tp'] / res['frame_num_s_all']
184
+ res_final['FAs_count'] = res['frame_num_s_tp']
185
+ res_final['FAs_all'] = res['frame_num_s_all']
186
+ res_final['FAm'] = res['frame_num_m_tp'] / res['frame_num_m_all']
187
+ res_final['FAm_count'] = res['frame_num_m_tp']
188
+ res_final['FAm_all'] = res['frame_num_m_all']
189
+ res_final['FA'] = (res['frame_num_n_tp'] + res['frame_num_s_tp'] + res['frame_num_m_tp']) / (res['frame_num_n_all'] + res['frame_num_s_all'] + res['frame_num_m_all'])
190
+
191
+ return res_final
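Note the aggregation order in `combine_av_loc_sequences`: the raw per-category frame counts are summed across sequences first and only then divided, so the combined frame accuracy is count-weighted rather than a mean of per-video ratios. A tiny numeric illustration with made-up counts:

```python
# Hypothetical counts at one alpha for two sequences (no-sound-source frames only):
# seq A: 10 frames, 8 correct;  seq B: 2 frames, 0 correct.
fa_n_pooled = (8 + 0) / (10 + 2)            # 0.667 -> what combine_av_loc_sequences reports
fa_n_mean_of_ratios = (8 / 10 + 0 / 2) / 2  # 0.400 -> what naive per-sequence averaging would give
```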
avism/data/aviseval/metrics/avisa.py ADDED
@@ -0,0 +1,190 @@
1
+ import os
2
+ import numpy as np
3
+ from scipy.optimize import linear_sum_assignment
4
+ from ._base_metric import _BaseMetric
5
+ from .. import _timing
6
+
7
+
8
+ class avisA(_BaseMetric):
9
+ def __init__(self, config=None):
10
+ super().__init__()
11
+ self.plottable = True
12
+ self.array_labels = np.arange(0.05, 0.99, 0.05)
13
+ self.integer_array_fields = ['HOTA_TP', 'HOTA_FN', 'HOTA_FP']
14
+ self.float_array_fields = ['AssA', 'AssRe', 'AssPr', 'SegA']
15
+ self.float_fields = ['SegA(0)']
16
+ self.fields = self.float_array_fields + self.integer_array_fields + self.float_fields
17
+ self.summary_fields = self.float_array_fields + self.float_fields
18
+
19
+ @_timing.time
20
+ def eval_sequence(self, data):
21
+ """Calculates the AssA and SegA metrics for one sequence"""
22
+
23
+ # Initialise results
24
+ res = {}
25
+ for field in self.float_array_fields + self.integer_array_fields:
26
+ res[field] = np.zeros((len(self.array_labels)), dtype=float)
27
+ for field in self.float_fields:
28
+ res[field] = 0
29
+
30
+ # Return result quickly if tracker or gt sequence is empty
31
+ if data['num_tracker_dets'] == 0:
32
+ res['HOTA_FN'] = data['num_gt_dets'] * np.ones((len(self.array_labels)), dtype=float)
33
+ res['SegA'] = np.ones((len(self.array_labels)), dtype=float)
34
+ res['SegA(0)'] = 1.0
35
+ return res
36
+ if data['num_gt_dets'] == 0:
37
+ res['HOTA_FP'] = data['num_tracker_dets'] * np.ones((len(self.array_labels)), dtype=float)
38
+ res['SegA'] = np.ones((len(self.array_labels)), dtype=float)
39
+ res['SegA(0)'] = 1.0
40
+ return res
41
+
42
+ # Variables counting global association
43
+ potential_matches_count = np.zeros((data['num_gt_ids'], data['num_tracker_ids']))
44
+ gt_id_count = np.zeros((data['num_gt_ids'], 1))
45
+ tracker_id_count = np.zeros((1, data['num_tracker_ids']))
46
+
47
+ # First loop through each timestep and accumulate global track information.
48
+ for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(data['gt_ids'], data['tracker_ids'])):
49
+ # Count the potential matches between ids in each timestep
50
+ # These are normalised, weighted by the match similarity.
51
+ similarity = data['similarity_scores'][t]
52
+ sim_iou_denom = similarity.sum(0)[np.newaxis, :] + similarity.sum(1)[:, np.newaxis] - similarity
53
+ sim_iou = np.zeros_like(similarity)
54
+ sim_iou_mask = sim_iou_denom > 0 + np.finfo('float').eps
55
+ sim_iou[sim_iou_mask] = similarity[sim_iou_mask] / sim_iou_denom[sim_iou_mask]
56
+ potential_matches_count[gt_ids_t[:, np.newaxis], tracker_ids_t[np.newaxis, :]] += sim_iou
57
+
58
+ # Calculate the total number of dets for each gt_id and tracker_id.
59
+ gt_id_count[gt_ids_t] += 1
60
+ tracker_id_count[0, tracker_ids_t] += 1
61
+
62
+ # Calculate overall jaccard alignment score (before unique matching) between IDs
63
+ global_alignment_score = potential_matches_count / (gt_id_count + tracker_id_count - potential_matches_count)
64
+ matches_counts = [np.zeros_like(potential_matches_count) for _ in self.array_labels]
65
+
66
+ # Calculate scores for each timestep
67
+ for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(data['gt_ids'], data['tracker_ids'])):
68
+ # Deal with the case that there are no gt_det/tracker_det in a timestep.
69
+ if len(gt_ids_t) == 0:
70
+ for a, alpha in enumerate(self.array_labels):
71
+ res['HOTA_FP'][a] += len(tracker_ids_t)
72
+ continue
73
+ if len(tracker_ids_t) == 0:
74
+ for a, alpha in enumerate(self.array_labels):
75
+ res['HOTA_FN'][a] += len(gt_ids_t)
76
+ continue
77
+
78
+ # Get matching scores between pairs of dets for optimizing HOTA
79
+ similarity = data['similarity_scores'][t]
80
+ score_mat = global_alignment_score[gt_ids_t[:, np.newaxis], tracker_ids_t[np.newaxis, :]] * similarity
81
+
82
+ # Hungarian algorithm to find the best matches (i.e. the optimal GT-to-prediction track assignment)
83
+ match_rows, match_cols = linear_sum_assignment(-score_mat)
84
+
85
+ # Calculate and accumulate basic statistics
86
+ for a, alpha in enumerate(self.array_labels):
87
+ actually_matched_mask = similarity[match_rows, match_cols] >= alpha - np.finfo('float').eps
88
+ alpha_match_rows = match_rows[actually_matched_mask]
89
+ alpha_match_cols = match_cols[actually_matched_mask]
90
+ num_matches = len(alpha_match_rows)
91
+ res['HOTA_TP'][a] += num_matches
92
+ res['HOTA_FN'][a] += len(gt_ids_t) - num_matches
93
+ res['HOTA_FP'][a] += len(tracker_ids_t) - num_matches
94
+ if num_matches > 0:
95
+ res['SegA'][a] += sum(similarity[alpha_match_rows, alpha_match_cols])
96
+ matches_counts[a][gt_ids_t[alpha_match_rows], tracker_ids_t[alpha_match_cols]] += 1
97
+
98
+ # Calculate association scores (AssA, AssRe, AssPr) for the alpha value.
99
+ # First calculate scores per gt_id/tracker_id combo and then average over the number of detections.
100
+ for a, alpha in enumerate(self.array_labels):
101
+ matches_count = matches_counts[a]
102
+ ass_a = matches_count / np.maximum(1, gt_id_count + tracker_id_count - matches_count)
103
+ res['AssA'][a] = np.sum(matches_count * ass_a) / np.maximum(1, res['HOTA_TP'][a])
104
+ ass_re = matches_count / np.maximum(1, gt_id_count)
105
+ res['AssRe'][a] = np.sum(matches_count * ass_re) / np.maximum(1, res['HOTA_TP'][a])
106
+ ass_pr = matches_count / np.maximum(1, tracker_id_count)
107
+ res['AssPr'][a] = np.sum(matches_count * ass_pr) / np.maximum(1, res['HOTA_TP'][a])
108
+
109
+ # Calculate final scores
110
+ res['SegA'] = np.maximum(1e-10, res['SegA']) / np.maximum(1e-10, res['HOTA_TP'])
111
+ res = self._compute_final_fields(res)
112
+ return res
113
+
114
+ def combine_sequences(self, all_res):
115
+ """Combines metrics across all sequences"""
116
+ res = {}
117
+ for field in self.integer_array_fields:
118
+ res[field] = self._combine_sum(all_res, field)
119
+ for field in ['AssRe', 'AssPr', 'AssA']:
120
+ res[field] = self._combine_weighted_av(all_res, field, res, weight_field='HOTA_TP')
121
+ sega_weighted_sum = sum([all_res[k]['SegA'] * all_res[k]['HOTA_TP'] for k in all_res.keys()])
122
+ res['SegA'] = np.maximum(1e-10, sega_weighted_sum) / np.maximum(1e-10, res['HOTA_TP'])
123
+ res = self._compute_final_fields(res)
124
+ return res
125
+
126
+ def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False):
127
+ """Combines metrics across all classes by averaging over the class values.
128
+ If 'ignore_empty_classes' is True, then it only sums over classes with at least one gt or predicted detection.
129
+ """
130
+ res = {}
131
+ for field in self.integer_array_fields:
132
+ if ignore_empty_classes:
133
+ res[field] = self._combine_sum(
134
+ {k: v for k, v in all_res.items()
135
+ if (v['HOTA_TP'] + v['HOTA_FN'] + v['HOTA_FP'] > 0 + np.finfo('float').eps).any()}, field)
136
+ else:
137
+ res[field] = self._combine_sum({k: v for k, v in all_res.items()}, field)
138
+
139
+ for field in self.float_fields + self.float_array_fields:
140
+ if ignore_empty_classes:
141
+ res[field] = np.mean([v[field] for v in all_res.values() if
142
+ (v['HOTA_TP'] + v['HOTA_FN'] + v['HOTA_FP'] > 0 + np.finfo('float').eps).any()],
143
+ axis=0)
144
+ else:
145
+ res[field] = np.mean([v[field] for v in all_res.values()], axis=0)
146
+ return res
147
+
148
+ def combine_classes_det_averaged(self, all_res):
149
+ """Combines metrics across all classes by averaging over the detection values"""
150
+ res = {}
151
+ for field in self.integer_array_fields:
152
+ res[field] = self._combine_sum(all_res, field)
153
+ for field in ['AssRe', 'AssPr', 'AssA']:
154
+ res[field] = self._combine_weighted_av(all_res, field, res, weight_field='HOTA_TP')
155
+ sega_weighted_sum = sum([all_res[k]['SegA'] * all_res[k]['HOTA_TP'] for k in all_res.keys()])
156
+ res['SegA'] = np.maximum(1e-10, sega_weighted_sum) / np.maximum(1e-10, res['HOTA_TP'])
157
+ res = self._compute_final_fields(res)
158
+ return res
159
+
160
+ @staticmethod
161
+ def _compute_final_fields(res):
162
+ """Calculate sub-metric ('field') values which only depend on other sub-metric values.
163
+ This function is used both for per-sequence calculation and for combining values across sequences.
164
+ """
165
+ res['SegA(0)'] = res['SegA'][0]
166
+ return res
167
+
168
+ def plot_single_tracker_results(self, table_res, tracker, cls, output_folder):
169
+ """Create plot of results"""
170
+
171
+ # Only loaded when run to reduce minimum requirements
172
+ from matplotlib import pyplot as plt
173
+
174
+ res = table_res['COMBINED_SEQ']
175
+ styles_to_plot = ['r', 'b', 'g', 'b--', 'b:', 'g--', 'g:', 'm']
176
+ for name, style in zip(self.float_array_fields, styles_to_plot):
177
+ plt.plot(self.array_labels, res[name], style)
178
+ plt.xlabel('alpha')
179
+ plt.ylabel('score')
180
+ plt.title(tracker + ' - ' + cls)
181
+ plt.axis([0, 1, 0, 1])
182
+ legend = []
183
+ for name in self.float_array_fields:
184
+ legend += [name + ' (' + str(np.round(np.mean(res[name]), 2)) + ')']
185
+ plt.legend(legend, loc='lower left')
186
+ out_file = os.path.join(output_folder, cls + '_plot.pdf')
187
+ os.makedirs(os.path.dirname(out_file), exist_ok=True)
188
+ plt.savefig(out_file)
189
+ plt.savefig(out_file.replace('.pdf', '.png'))
190
+ plt.clf()
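As with HOTA, every array-valued field here is indexed by `self.array_labels = np.arange(0.05, 0.99, 0.05)`, i.e. 19 IoU thresholds from 0.05 to 0.95, so the scalar `SegA(0)` produced by `_compute_final_fields` is simply SegA at index 0, the loosest threshold. A quick check:

```python
import numpy as np

alphas = np.arange(0.05, 0.99, 0.05)
print(len(alphas), alphas[0])   # -> 19 0.05  (so 'SegA(0)' is SegA at alpha = 0.05)
```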
avism/data/aviseval/metrics/clear.py ADDED
@@ -0,0 +1,186 @@
1
+
2
+ import numpy as np
3
+ from scipy.optimize import linear_sum_assignment
4
+ from ._base_metric import _BaseMetric
5
+ from .. import _timing
6
+ from .. import utils
7
+
8
+ class CLEAR(_BaseMetric):
9
+ """Class which implements the CLEAR metrics"""
10
+
11
+ @staticmethod
12
+ def get_default_config():
13
+ """Default class config values"""
14
+ default_config = {
15
+ 'THRESHOLD': 0.5, # Similarity score threshold required for a TP match. Default 0.5.
16
+ 'PRINT_CONFIG': True, # Whether to print the config information on init. Default: False.
17
+ }
18
+ return default_config
19
+
20
+ def __init__(self, config=None):
21
+ super().__init__()
22
+ main_integer_fields = ['CLR_TP', 'CLR_FN', 'CLR_FP', 'IDSW', 'MT', 'PT', 'ML', 'Frag']
23
+ extra_integer_fields = ['CLR_Frames']
24
+ self.integer_fields = main_integer_fields + extra_integer_fields
25
+ main_float_fields = ['MOTA', 'MOTP', 'MODA', 'CLR_Re', 'CLR_Pr', 'MTR', 'PTR', 'MLR', 'sMOTA']
26
+ extra_float_fields = ['CLR_F1', 'FP_per_frame', 'MOTAL', 'MOTP_sum']
27
+ self.float_fields = main_float_fields + extra_float_fields
28
+ self.fields = self.float_fields + self.integer_fields
29
+ self.summed_fields = self.integer_fields + ['MOTP_sum']
30
+ self.summary_fields = main_float_fields + main_integer_fields
31
+
32
+ # Configuration options:
33
+ self.config = utils.init_config(config, self.get_default_config(), self.get_name())
34
+ self.threshold = float(self.config['THRESHOLD'])
35
+
36
+
37
+ @_timing.time
38
+ def eval_sequence(self, data):
39
+ """Calculates CLEAR metrics for one sequence"""
40
+ # Initialise results
41
+ res = {}
42
+ for field in self.fields:
43
+ res[field] = 0
44
+
45
+ # Return result quickly if tracker or gt sequence is empty
46
+ if data['num_tracker_dets'] == 0:
47
+ res['CLR_FN'] = data['num_gt_dets']
48
+ res['ML'] = data['num_gt_ids']
49
+ res['MLR'] = 1.0
50
+ return res
51
+ if data['num_gt_dets'] == 0:
52
+ res['CLR_FP'] = data['num_tracker_dets']
53
+ res['MLR'] = 1.0
54
+ return res
55
+
56
+ # Variables counting global association
57
+ num_gt_ids = data['num_gt_ids']
58
+ gt_id_count = np.zeros(num_gt_ids) # For MT/ML/PT
59
+ gt_matched_count = np.zeros(num_gt_ids) # For MT/ML/PT
60
+ gt_frag_count = np.zeros(num_gt_ids) # For Frag
61
+
62
+ # Note that IDSWs are counted based on the last time each gt_id was present (any number of frames previously),
63
+ # but are only used in matching to continue current tracks based on the gt_id in the single previous timestep.
64
+ prev_tracker_id = np.nan * np.zeros(num_gt_ids) # For scoring IDSW
65
+ prev_timestep_tracker_id = np.nan * np.zeros(num_gt_ids) # For matching IDSW
66
+
67
+ # Calculate scores for each timestep
68
+ for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(data['gt_ids'], data['tracker_ids'])):
69
+ # Deal with the case that there are no gt_det/tracker_det in a timestep.
70
+ if len(gt_ids_t) == 0:
71
+ res['CLR_FP'] += len(tracker_ids_t)
72
+ continue
73
+ if len(tracker_ids_t) == 0:
74
+ res['CLR_FN'] += len(gt_ids_t)
75
+ gt_id_count[gt_ids_t] += 1
76
+ continue
77
+
78
+ # Calc score matrix to first minimise IDSWs from previous frame, and then maximise MOTP secondarily
79
+ similarity = data['similarity_scores'][t]
80
+ score_mat = (tracker_ids_t[np.newaxis, :] == prev_timestep_tracker_id[gt_ids_t[:, np.newaxis]])
81
+ score_mat = 1000 * score_mat + similarity
82
+ score_mat[similarity < self.threshold - np.finfo('float').eps] = 0
83
+
84
+ # Hungarian algorithm to find best matches
85
+ match_rows, match_cols = linear_sum_assignment(-score_mat)
86
+ actually_matched_mask = score_mat[match_rows, match_cols] > 0 + np.finfo('float').eps
87
+ match_rows = match_rows[actually_matched_mask]
88
+ match_cols = match_cols[actually_matched_mask]
89
+
90
+ matched_gt_ids = gt_ids_t[match_rows]
91
+ matched_tracker_ids = tracker_ids_t[match_cols]
92
+
93
+ # Calc IDSW for MOTA
94
+ prev_matched_tracker_ids = prev_tracker_id[matched_gt_ids]
95
+ is_idsw = (np.logical_not(np.isnan(prev_matched_tracker_ids))) & (
96
+ np.not_equal(matched_tracker_ids, prev_matched_tracker_ids))
97
+ res['IDSW'] += np.sum(is_idsw)
98
+
99
+ # Update counters for MT/ML/PT/Frag and record for IDSW/Frag for next timestep
100
+ gt_id_count[gt_ids_t] += 1
101
+ gt_matched_count[matched_gt_ids] += 1
102
+ not_previously_tracked = np.isnan(prev_timestep_tracker_id)
103
+ prev_tracker_id[matched_gt_ids] = matched_tracker_ids
104
+ prev_timestep_tracker_id[:] = np.nan
105
+ prev_timestep_tracker_id[matched_gt_ids] = matched_tracker_ids
106
+ currently_tracked = np.logical_not(np.isnan(prev_timestep_tracker_id))
107
+ gt_frag_count += np.logical_and(not_previously_tracked, currently_tracked)
108
+
109
+ # Calculate and accumulate basic statistics
110
+ num_matches = len(matched_gt_ids)
111
+ res['CLR_TP'] += num_matches
112
+ res['CLR_FN'] += len(gt_ids_t) - num_matches
113
+ res['CLR_FP'] += len(tracker_ids_t) - num_matches
114
+ if num_matches > 0:
115
+ res['MOTP_sum'] += sum(similarity[match_rows, match_cols])
116
+
117
+ # Calculate MT/ML/PT/Frag/MOTP
118
+ tracked_ratio = gt_matched_count[gt_id_count > 0] / gt_id_count[gt_id_count > 0]
119
+ res['MT'] = np.sum(np.greater(tracked_ratio, 0.8))
120
+ res['PT'] = np.sum(np.greater_equal(tracked_ratio, 0.2)) - res['MT']
121
+ res['ML'] = num_gt_ids - res['MT'] - res['PT']
122
+ res['Frag'] = np.sum(np.subtract(gt_frag_count[gt_frag_count > 0], 1))
123
+ res['MOTP'] = res['MOTP_sum'] / np.maximum(1.0, res['CLR_TP'])
124
+
125
+ res['CLR_Frames'] = data['num_timesteps']
126
+
127
+ # Calculate final CLEAR scores
128
+ res = self._compute_final_fields(res)
129
+ return res
130
+
131
+ def combine_sequences(self, all_res):
132
+ """Combines metrics across all sequences"""
133
+ res = {}
134
+ for field in self.summed_fields:
135
+ res[field] = self._combine_sum(all_res, field)
136
+ res = self._compute_final_fields(res)
137
+ return res
138
+
139
+ def combine_classes_det_averaged(self, all_res):
140
+ """Combines metrics across all classes by averaging over the detection values"""
141
+ res = {}
142
+ for field in self.summed_fields:
143
+ res[field] = self._combine_sum(all_res, field)
144
+ res = self._compute_final_fields(res)
145
+ return res
146
+
147
+ def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False):
148
+ """Combines metrics across all classes by averaging over the class values.
149
+ If 'ignore_empty_classes' is True, then it only sums over classes with at least one gt or predicted detection.
150
+ """
151
+ res = {}
152
+ for field in self.integer_fields:
153
+ if ignore_empty_classes:
154
+ res[field] = self._combine_sum(
155
+ {k: v for k, v in all_res.items() if v['CLR_TP'] + v['CLR_FN'] + v['CLR_FP'] > 0}, field)
156
+ else:
157
+ res[field] = self._combine_sum({k: v for k, v in all_res.items()}, field)
158
+ for field in self.float_fields:
159
+ if ignore_empty_classes:
160
+ res[field] = np.mean(
161
+ [v[field] for v in all_res.values() if v['CLR_TP'] + v['CLR_FN'] + v['CLR_FP'] > 0], axis=0)
162
+ else:
163
+ res[field] = np.mean([v[field] for v in all_res.values()], axis=0)
164
+ return res
165
+
166
+ @staticmethod
167
+ def _compute_final_fields(res):
168
+ """Calculate sub-metric ('field') values which only depend on other sub-metric values.
169
+ This function is used both for per-sequence calculation and for combining values across sequences.
170
+ """
171
+ num_gt_ids = res['MT'] + res['ML'] + res['PT']
172
+ res['MTR'] = res['MT'] / np.maximum(1.0, num_gt_ids)
173
+ res['MLR'] = res['ML'] / np.maximum(1.0, num_gt_ids)
174
+ res['PTR'] = res['PT'] / np.maximum(1.0, num_gt_ids)
175
+ res['CLR_Re'] = res['CLR_TP'] / np.maximum(1.0, res['CLR_TP'] + res['CLR_FN'])
176
+ res['CLR_Pr'] = res['CLR_TP'] / np.maximum(1.0, res['CLR_TP'] + res['CLR_FP'])
177
+ res['MODA'] = (res['CLR_TP'] - res['CLR_FP']) / np.maximum(1.0, res['CLR_TP'] + res['CLR_FN'])
178
+ res['MOTA'] = (res['CLR_TP'] - res['CLR_FP'] - res['IDSW']) / np.maximum(1.0, res['CLR_TP'] + res['CLR_FN'])
179
+ res['MOTP'] = res['MOTP_sum'] / np.maximum(1.0, res['CLR_TP'])
180
+ res['sMOTA'] = (res['MOTP_sum'] - res['CLR_FP'] - res['IDSW']) / np.maximum(1.0, res['CLR_TP'] + res['CLR_FN'])
181
+
182
+ res['CLR_F1'] = res['CLR_TP'] / np.maximum(1.0, res['CLR_TP'] + 0.5*res['CLR_FN'] + 0.5*res['CLR_FP'])
183
+ res['FP_per_frame'] = res['CLR_FP'] / np.maximum(1.0, res['CLR_Frames'])
184
+ safe_log_idsw = np.log10(res['IDSW']) if res['IDSW'] > 0 else res['IDSW']
185
+ res['MOTAL'] = (res['CLR_TP'] - res['CLR_FP'] - safe_log_idsw) / np.maximum(1.0, res['CLR_TP'] + res['CLR_FN'])
186
+ return res
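Because `_compute_final_fields` is a pure function of the summed counts, the CLEAR scores can be sanity-checked by hand. A worked example with hypothetical per-class totals:

```python
# Hypothetical totals after combine_sequences: 90 TP, 10 FN, 5 FP, 2 ID switches.
clr_tp, clr_fn, clr_fp, idsw = 90, 10, 5, 2
mota   = (clr_tp - clr_fp - idsw) / max(1.0, clr_tp + clr_fn)   # (90 - 5 - 2) / 100 = 0.83
clr_re = clr_tp / max(1.0, clr_tp + clr_fn)                     # 0.90 recall
clr_pr = clr_tp / max(1.0, clr_tp + clr_fp)                     # ~0.947 precision
```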
avism/data/aviseval/metrics/count.py ADDED
@@ -0,0 +1,44 @@
1
+
2
+ from ._base_metric import _BaseMetric
3
+ from .. import _timing
4
+
5
+
6
+ class Count(_BaseMetric):
7
+ """Class which simply counts the number of tracker and gt detections and ids."""
8
+ def __init__(self, config=None):
9
+ super().__init__()
10
+ self.integer_fields = ['Dets', 'GT_Dets', 'IDs', 'GT_IDs']
11
+ self.fields = self.integer_fields
12
+ self.summary_fields = self.fields
13
+
14
+ @_timing.time
15
+ def eval_sequence(self, data):
16
+ """Returns counts for one sequence"""
17
+ # Get results
18
+ res = {'Dets': data['num_tracker_dets'],
19
+ 'GT_Dets': data['num_gt_dets'],
20
+ 'IDs': data['num_tracker_ids'],
21
+ 'GT_IDs': data['num_gt_ids'],
22
+ 'Frames': data['num_timesteps']}
23
+ return res
24
+
25
+ def combine_sequences(self, all_res):
26
+ """Combines metrics across all sequences"""
27
+ res = {}
28
+ for field in self.integer_fields:
29
+ res[field] = self._combine_sum(all_res, field)
30
+ return res
31
+
32
+ def combine_classes_class_averaged(self, all_res, ignore_empty_classes=None):
33
+ """Combines metrics across all classes by averaging over the class values"""
34
+ res = {}
35
+ for field in self.integer_fields:
36
+ res[field] = self._combine_sum(all_res, field)
37
+ return res
38
+
39
+ def combine_classes_det_averaged(self, all_res):
40
+ """Combines metrics across all classes by averaging over the detection values"""
41
+ res = {}
42
+ for field in self.integer_fields:
43
+ res[field] = self._combine_sum(all_res, field)
44
+ return res
avism/data/aviseval/metrics/hota.py ADDED
@@ -0,0 +1,202 @@
1
+ import os
2
+ import numpy as np
3
+ from scipy.optimize import linear_sum_assignment
4
+ from ._base_metric import _BaseMetric
5
+ from .. import _timing
6
+
7
+
8
+ class HOTA(_BaseMetric):
9
+ """Class which implements the HOTA metrics.
10
+ See: https://link.springer.com/article/10.1007/s11263-020-01375-2
11
+ """
12
+
13
+ def __init__(self, config=None):
14
+ super().__init__()
15
+ self.plottable = True
16
+ self.array_labels = np.arange(0.05, 0.99, 0.05)
17
+ self.integer_array_fields = ['HOTA_TP', 'HOTA_FN', 'HOTA_FP']
18
+ self.float_array_fields = ['HOTA', 'DetA', 'AssA', 'DetRe', 'DetPr', 'AssRe', 'AssPr', 'LocA', 'OWTA']
19
+ self.float_fields = ['HOTA(0)', 'LocA(0)', 'HOTALocA(0)']
20
+ self.fields = self.float_array_fields + self.integer_array_fields + self.float_fields
21
+ self.summary_fields = self.float_array_fields + self.float_fields
22
+
23
+ @_timing.time
24
+ def eval_sequence(self, data):
25
+ """Calculates the HOTA metrics for one sequence"""
26
+
27
+ # Initialise results
28
+ res = {}
29
+ for field in self.float_array_fields + self.integer_array_fields:
30
+ res[field] = np.zeros((len(self.array_labels)), dtype=float)
31
+ for field in self.float_fields:
32
+ res[field] = 0
33
+
34
+ # Return result quickly if tracker or gt sequence is empty
35
+ if data['num_tracker_dets'] == 0:
36
+ res['HOTA_FN'] = data['num_gt_dets'] * np.ones((len(self.array_labels)), dtype=float)
37
+ res['LocA'] = np.ones((len(self.array_labels)), dtype=float)
38
+ res['LocA(0)'] = 1.0
39
+ return res
40
+ if data['num_gt_dets'] == 0:
41
+ res['HOTA_FP'] = data['num_tracker_dets'] * np.ones((len(self.array_labels)), dtype=float)
42
+ res['LocA'] = np.ones((len(self.array_labels)), dtype=float)
43
+ res['LocA(0)'] = 1.0
44
+ return res
45
+
46
+ # Variables counting global association
47
+ potential_matches_count = np.zeros((data['num_gt_ids'], data['num_tracker_ids']))
48
+ gt_id_count = np.zeros((data['num_gt_ids'], 1))
49
+ tracker_id_count = np.zeros((1, data['num_tracker_ids']))
50
+
51
+ # First loop through each timestep and accumulate global track information.
52
+ for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(data['gt_ids'], data['tracker_ids'])):
53
+ # Count the potential matches between ids in each timestep
54
+ # These are normalised, weighted by the match similarity.
55
+ similarity = data['similarity_scores'][t]
56
+ sim_iou_denom = similarity.sum(0)[np.newaxis, :] + similarity.sum(1)[:, np.newaxis] - similarity
57
+ sim_iou = np.zeros_like(similarity)
58
+ sim_iou_mask = sim_iou_denom > 0 + np.finfo('float').eps
59
+ sim_iou[sim_iou_mask] = similarity[sim_iou_mask] / sim_iou_denom[sim_iou_mask]
60
+ potential_matches_count[gt_ids_t[:, np.newaxis], tracker_ids_t[np.newaxis, :]] += sim_iou
61
+
62
+ # Calculate the total number of dets for each gt_id and tracker_id.
63
+ gt_id_count[gt_ids_t] += 1
64
+ tracker_id_count[0, tracker_ids_t] += 1
65
+
66
+ # Calculate overall jaccard alignment score (before unique matching) between IDs
67
+ global_alignment_score = potential_matches_count / (gt_id_count + tracker_id_count - potential_matches_count)
68
+ matches_counts = [np.zeros_like(potential_matches_count) for _ in self.array_labels]
69
+
70
+ # Calculate scores for each timestep
71
+ for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(data['gt_ids'], data['tracker_ids'])):
72
+ # Deal with the case that there are no gt_det/tracker_det in a timestep.
73
+ if len(gt_ids_t) == 0:
74
+ for a, alpha in enumerate(self.array_labels):
75
+ res['HOTA_FP'][a] += len(tracker_ids_t)
76
+ continue
77
+ if len(tracker_ids_t) == 0:
78
+ for a, alpha in enumerate(self.array_labels):
79
+ res['HOTA_FN'][a] += len(gt_ids_t)
80
+ continue
81
+
82
+ # Get matching scores between pairs of dets for optimizing HOTA
83
+ similarity = data['similarity_scores'][t]
84
+ score_mat = global_alignment_score[gt_ids_t[:, np.newaxis], tracker_ids_t[np.newaxis, :]] * similarity
85
+
86
+ # Hungarian algorithm to find the best matches (i.e. the optimal GT-to-prediction track assignment)
87
+ match_rows, match_cols = linear_sum_assignment(-score_mat)
88
+
89
+ # Calculate and accumulate basic statistics
90
+ for a, alpha in enumerate(self.array_labels):
91
+ actually_matched_mask = similarity[match_rows, match_cols] >= alpha - np.finfo('float').eps
92
+ alpha_match_rows = match_rows[actually_matched_mask]
93
+ alpha_match_cols = match_cols[actually_matched_mask]
94
+ num_matches = len(alpha_match_rows)
95
+ res['HOTA_TP'][a] += num_matches
96
+ res['HOTA_FN'][a] += len(gt_ids_t) - num_matches
97
+ res['HOTA_FP'][a] += len(tracker_ids_t) - num_matches
98
+ if num_matches > 0:
99
+ res['LocA'][a] += sum(similarity[alpha_match_rows, alpha_match_cols])
100
+ matches_counts[a][gt_ids_t[alpha_match_rows], tracker_ids_t[alpha_match_cols]] += 1
101
+
102
+ # Calculate association scores (AssA, AssRe, AssPr) for the alpha value.
103
+ # First calculate scores per gt_id/tracker_id combo and then average over the number of detections.
104
+ for a, alpha in enumerate(self.array_labels):
105
+ matches_count = matches_counts[a]
106
+ ass_a = matches_count / np.maximum(1, gt_id_count + tracker_id_count - matches_count)
107
+ res['AssA'][a] = np.sum(matches_count * ass_a) / np.maximum(1, res['HOTA_TP'][a])
108
+ ass_re = matches_count / np.maximum(1, gt_id_count)
109
+ res['AssRe'][a] = np.sum(matches_count * ass_re) / np.maximum(1, res['HOTA_TP'][a])
110
+ ass_pr = matches_count / np.maximum(1, tracker_id_count)
111
+ res['AssPr'][a] = np.sum(matches_count * ass_pr) / np.maximum(1, res['HOTA_TP'][a])
112
+
113
+ # Calculate final scores
114
+ res['LocA'] = np.maximum(1e-10, res['LocA']) / np.maximum(1e-10, res['HOTA_TP'])
115
+ res = self._compute_final_fields(res)
116
+ return res
117
+
118
+ def combine_sequences(self, all_res):
119
+ """Combines metrics across all sequences"""
120
+ res = {}
121
+ for field in self.integer_array_fields:
122
+ res[field] = self._combine_sum(all_res, field)
123
+ for field in ['AssRe', 'AssPr', 'AssA']:
124
+ res[field] = self._combine_weighted_av(all_res, field, res, weight_field='HOTA_TP')
125
+ loca_weighted_sum = sum([all_res[k]['LocA'] * all_res[k]['HOTA_TP'] for k in all_res.keys()])
126
+ res['LocA'] = np.maximum(1e-10, loca_weighted_sum) / np.maximum(1e-10, res['HOTA_TP'])
127
+ res = self._compute_final_fields(res)
128
+ return res
129
+
130
+ def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False):
131
+ """Combines metrics across all classes by averaging over the class values.
132
+ If 'ignore_empty_classes' is True, then it only sums over classes with at least one gt or predicted detection.
133
+ """
134
+ res = {}
135
+ for field in self.integer_array_fields:
136
+ if ignore_empty_classes:
137
+ res[field] = self._combine_sum(
138
+ {k: v for k, v in all_res.items()
139
+ if (v['HOTA_TP'] + v['HOTA_FN'] + v['HOTA_FP'] > 0 + np.finfo('float').eps).any()}, field)
140
+ else:
141
+ res[field] = self._combine_sum({k: v for k, v in all_res.items()}, field)
142
+
143
+ for field in self.float_fields + self.float_array_fields:
144
+ if ignore_empty_classes:
145
+ res[field] = np.mean([v[field] for v in all_res.values() if
146
+ (v['HOTA_TP'] + v['HOTA_FN'] + v['HOTA_FP'] > 0 + np.finfo('float').eps).any()],
147
+ axis=0)
148
+ else:
149
+ res[field] = np.mean([v[field] for v in all_res.values()], axis=0)
150
+ return res
151
+
152
+ def combine_classes_det_averaged(self, all_res):
153
+ """Combines metrics across all classes by averaging over the detection values"""
154
+ res = {}
155
+ for field in self.integer_array_fields:
156
+ res[field] = self._combine_sum(all_res, field)
157
+ for field in ['AssRe', 'AssPr', 'AssA']:
158
+ res[field] = self._combine_weighted_av(all_res, field, res, weight_field='HOTA_TP')
159
+ loca_weighted_sum = sum([all_res[k]['LocA'] * all_res[k]['HOTA_TP'] for k in all_res.keys()])
160
+ res['LocA'] = np.maximum(1e-10, loca_weighted_sum) / np.maximum(1e-10, res['HOTA_TP'])
161
+ res = self._compute_final_fields(res)
162
+ return res
163
+
164
+ @staticmethod
165
+ def _compute_final_fields(res):
166
+ """Calculate sub-metric ('field') values which only depend on other sub-metric values.
167
+ This function is used both for per-sequence calculation and for combining values across sequences.
168
+ """
169
+ res['DetRe'] = res['HOTA_TP'] / np.maximum(1, res['HOTA_TP'] + res['HOTA_FN'])
170
+ res['DetPr'] = res['HOTA_TP'] / np.maximum(1, res['HOTA_TP'] + res['HOTA_FP'])
171
+ res['DetA'] = res['HOTA_TP'] / np.maximum(1, res['HOTA_TP'] + res['HOTA_FN'] + res['HOTA_FP'])
172
+ res['HOTA'] = np.sqrt(res['DetA'] * res['AssA'])
173
+ res['OWTA'] = np.sqrt(res['DetRe'] * res['AssA'])
174
+
175
+ res['HOTA(0)'] = res['HOTA'][0]
176
+ res['LocA(0)'] = res['LocA'][0]
177
+ res['HOTALocA(0)'] = res['HOTA(0)']*res['LocA(0)']
178
+ return res
179
+
180
+ def plot_single_tracker_results(self, table_res, tracker, cls, output_folder):
181
+ """Create plot of results"""
182
+
183
+ # Only loaded when run to reduce minimum requirements
184
+ from matplotlib import pyplot as plt
185
+
186
+ res = table_res['COMBINED_SEQ']
187
+ styles_to_plot = ['r', 'b', 'g', 'b--', 'b:', 'g--', 'g:', 'm']
188
+ for name, style in zip(self.float_array_fields, styles_to_plot):
189
+ plt.plot(self.array_labels, res[name], style)
190
+ plt.xlabel('alpha')
191
+ plt.ylabel('score')
192
+ plt.title(tracker + ' - ' + cls)
193
+ plt.axis([0, 1, 0, 1])
194
+ legend = []
195
+ for name in self.float_array_fields:
196
+ legend += [name + ' (' + str(np.round(np.mean(res[name]), 2)) + ')']
197
+ plt.legend(legend, loc='lower left')
198
+ out_file = os.path.join(output_folder, cls + '_plot.pdf')
199
+ os.makedirs(os.path.dirname(out_file), exist_ok=True)
200
+ plt.savefig(out_file)
201
+ plt.savefig(out_file.replace('.pdf', '.png'))
202
+ plt.clf()
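Note: the final HOTA fields above are pure arithmetic on the accumulated counts, so they can be sanity-checked in isolation. The snippet below uses made-up counts and an assumed AssA value at a single alpha threshold; it mirrors the formulas in `_compute_final_fields` rather than calling the class.

    import numpy as np

    # Hypothetical counts at one localisation threshold (alpha).
    hota_tp = np.array([80.0])
    hota_fn = np.array([20.0])
    hota_fp = np.array([10.0])
    ass_a = np.array([0.6])  # assumed association accuracy at this alpha

    det_re = hota_tp / np.maximum(1, hota_tp + hota_fn)           # 0.800
    det_pr = hota_tp / np.maximum(1, hota_tp + hota_fp)           # ~0.889
    det_a = hota_tp / np.maximum(1, hota_tp + hota_fn + hota_fp)  # ~0.727
    hota = np.sqrt(det_a * ass_a)                                 # ~0.661
    print(det_re, det_pr, det_a, hota)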
avism/data/aviseval/metrics/identity.py ADDED
@@ -0,0 +1,135 @@
1
+ import numpy as np
2
+ from scipy.optimize import linear_sum_assignment
3
+ from ._base_metric import _BaseMetric
4
+ from .. import _timing
5
+ from .. import utils
6
+
7
+
8
+ class Identity(_BaseMetric):
9
+ """Class which implements the ID metrics"""
10
+
11
+ @staticmethod
12
+ def get_default_config():
13
+ """Default class config values"""
14
+ default_config = {
15
+ 'THRESHOLD': 0.5, # Similarity score threshold required for an IDTP match. Default 0.5.
16
+ 'PRINT_CONFIG': True, # Whether to print the config information on init. Default: True.
17
+ }
18
+ return default_config
19
+
20
+ def __init__(self, config=None):
21
+ super().__init__()
22
+ self.integer_fields = ['IDTP', 'IDFN', 'IDFP']
23
+ self.float_fields = ['IDF1', 'IDR', 'IDP']
24
+ self.fields = self.float_fields + self.integer_fields
25
+ self.summary_fields = self.fields
26
+
27
+ # Configuration options:
28
+ self.config = utils.init_config(config, self.get_default_config(), self.get_name())
29
+ self.threshold = float(self.config['THRESHOLD'])
30
+
31
+ @_timing.time
32
+ def eval_sequence(self, data):
33
+ """Calculates ID metrics for one sequence"""
34
+ # Initialise results
35
+ res = {}
36
+ for field in self.fields:
37
+ res[field] = 0
38
+
39
+ # Return result quickly if tracker or gt sequence is empty
40
+ if data['num_tracker_dets'] == 0:
41
+ res['IDFN'] = data['num_gt_dets']
42
+ return res
43
+ if data['num_gt_dets'] == 0:
44
+ res['IDFP'] = data['num_tracker_dets']
45
+ return res
46
+
47
+ # Variables counting global association
48
+ potential_matches_count = np.zeros((data['num_gt_ids'], data['num_tracker_ids']))
49
+ gt_id_count = np.zeros(data['num_gt_ids'])
50
+ tracker_id_count = np.zeros(data['num_tracker_ids'])
51
+
52
+ # First loop through each timestep and accumulate global track information.
53
+ for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(data['gt_ids'], data['tracker_ids'])):
54
+ # Count the potential matches between ids in each timestep
55
+ matches_mask = np.greater_equal(data['similarity_scores'][t], self.threshold)
56
+ match_idx_gt, match_idx_tracker = np.nonzero(matches_mask)
57
+ potential_matches_count[gt_ids_t[match_idx_gt], tracker_ids_t[match_idx_tracker]] += 1
58
+
59
+ # Calculate the total number of dets for each gt_id and tracker_id.
60
+ gt_id_count[gt_ids_t] += 1
61
+ tracker_id_count[tracker_ids_t] += 1
62
+
63
+ # Calculate optimal assignment cost matrix for ID metrics
64
+ num_gt_ids = data['num_gt_ids']
65
+ num_tracker_ids = data['num_tracker_ids']
66
+ fp_mat = np.zeros((num_gt_ids + num_tracker_ids, num_gt_ids + num_tracker_ids))
67
+ fn_mat = np.zeros((num_gt_ids + num_tracker_ids, num_gt_ids + num_tracker_ids))
68
+ fp_mat[num_gt_ids:, :num_tracker_ids] = 1e10
69
+ fn_mat[:num_gt_ids, num_tracker_ids:] = 1e10
70
+ for gt_id in range(num_gt_ids):
71
+ fn_mat[gt_id, :num_tracker_ids] = gt_id_count[gt_id]
72
+ fn_mat[gt_id, num_tracker_ids + gt_id] = gt_id_count[gt_id]
73
+ for tracker_id in range(num_tracker_ids):
74
+ fp_mat[:num_gt_ids, tracker_id] = tracker_id_count[tracker_id]
75
+ fp_mat[tracker_id + num_gt_ids, tracker_id] = tracker_id_count[tracker_id]
76
+ fn_mat[:num_gt_ids, :num_tracker_ids] -= potential_matches_count
77
+ fp_mat[:num_gt_ids, :num_tracker_ids] -= potential_matches_count
78
+
79
+ # Hungarian algorithm
80
+ match_rows, match_cols = linear_sum_assignment(fn_mat + fp_mat)
81
+
82
+ # Accumulate basic statistics
83
+ res['IDFN'] = fn_mat[match_rows, match_cols].sum().astype(int)
84
+ res['IDFP'] = fp_mat[match_rows, match_cols].sum().astype(int)
85
+ res['IDTP'] = (gt_id_count.sum() - res['IDFN']).astype(int)
86
+
87
+ # Calculate final ID scores
88
+ res = self._compute_final_fields(res)
89
+ return res
90
+
91
+ def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False):
92
+ """Combines metrics across all classes by averaging over the class values.
93
+ If 'ignore_empty_classes' is True, then it only sums over classes with at least one gt or predicted detection.
94
+ """
95
+ res = {}
96
+ for field in self.integer_fields:
97
+ if ignore_empty_classes:
98
+ res[field] = self._combine_sum({k: v for k, v in all_res.items()
99
+ if v['IDTP'] + v['IDFN'] + v['IDFP'] > 0 + np.finfo('float').eps},
100
+ field)
101
+ else:
102
+ res[field] = self._combine_sum({k: v for k, v in all_res.items()}, field)
103
+ for field in self.float_fields:
104
+ if ignore_empty_classes:
105
+ res[field] = np.mean([v[field] for v in all_res.values()
106
+ if v['IDTP'] + v['IDFN'] + v['IDFP'] > 0 + np.finfo('float').eps], axis=0)
107
+ else:
108
+ res[field] = np.mean([v[field] for v in all_res.values()], axis=0)
109
+ return res
110
+
111
+ def combine_classes_det_averaged(self, all_res):
112
+ """Combines metrics across all classes by averaging over the detection values"""
113
+ res = {}
114
+ for field in self.integer_fields:
115
+ res[field] = self._combine_sum(all_res, field)
116
+ res = self._compute_final_fields(res)
117
+ return res
118
+
119
+ def combine_sequences(self, all_res):
120
+ """Combines metrics across all sequences"""
121
+ res = {}
122
+ for field in self.integer_fields:
123
+ res[field] = self._combine_sum(all_res, field)
124
+ res = self._compute_final_fields(res)
125
+ return res
126
+
127
+ @staticmethod
128
+ def _compute_final_fields(res):
129
+ """Calculate sub-metric ('field') values which only depend on other sub-metric values.
130
+ This function is used both for per-sequence calculation and for combining values across sequences.
131
+ """
132
+ res['IDR'] = res['IDTP'] / np.maximum(1.0, res['IDTP'] + res['IDFN'])
133
+ res['IDP'] = res['IDTP'] / np.maximum(1.0, res['IDTP'] + res['IDFP'])
134
+ res['IDF1'] = res['IDTP'] / np.maximum(1.0, res['IDTP'] + 0.5 * res['IDFP'] + 0.5 * res['IDFN'])
135
+ return res
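Note: as a quick check of the final ID fields, the standalone snippet below plugs made-up IDTP / IDFN / IDFP counts into the same IDR / IDP / IDF1 relations used by `_compute_final_fields`; it does not exercise the Hungarian matching itself.

    import numpy as np

    # Hypothetical identity counts for one sequence.
    res = {'IDTP': 70, 'IDFN': 30, 'IDFP': 20}
    res['IDR'] = res['IDTP'] / np.maximum(1.0, res['IDTP'] + res['IDFN'])  # 0.70
    res['IDP'] = res['IDTP'] / np.maximum(1.0, res['IDTP'] + res['IDFP'])  # ~0.78
    res['IDF1'] = res['IDTP'] / np.maximum(1.0, res['IDTP'] + 0.5 * res['IDFP'] + 0.5 * res['IDFN'])  # ~0.74
    print(res)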
avism/data/aviseval/metrics/ideucl.py ADDED
@@ -0,0 +1,135 @@
1
+ import numpy as np
2
+ from scipy.optimize import linear_sum_assignment
3
+ from ._base_metric import _BaseMetric
4
+ from .. import _timing
5
+ from collections import defaultdict
6
+ from .. import utils
7
+
8
+
9
+ class IDEucl(_BaseMetric):
10
+ """Class which implements the ID metrics"""
11
+
12
+ @staticmethod
13
+ def get_default_config():
14
+ """Default class config values"""
15
+ default_config = {
16
+ 'THRESHOLD': 0.4, # Similarity score threshold required for an IDTP match. 0.4 for IDEucl.
17
+ 'PRINT_CONFIG': True, # Whether to print the config information on init. Default: True.
18
+ }
19
+ return default_config
20
+
21
+ def __init__(self, config=None):
22
+ super().__init__()
23
+ self.fields = ['IDEucl']
24
+ self.float_fields = self.fields
25
+ self.summary_fields = self.fields
26
+
27
+ # Configuration options:
28
+ self.config = utils.init_config(config, self.get_default_config(), self.get_name())
29
+ self.threshold = float(self.config['THRESHOLD'])
30
+
31
+
32
+ @_timing.time
33
+ def eval_sequence(self, data):
34
+ """Calculates IDEucl metrics for all frames"""
35
+ # Initialise results
36
+ res = {'IDEucl' : 0}
37
+
38
+ # Return result quickly if tracker or gt sequence is empty
39
+ if data['num_tracker_dets'] == 0 or data['num_gt_dets'] == 0.:
40
+ return res
41
+
42
+ data['centroid'] = []
43
+ for t, gt_det in enumerate(data['gt_dets']):
44
+ # import pdb;pdb.set_trace()
45
+ data['centroid'].append(self._compute_centroid(gt_det))
46
+
47
+ oid_hid_cent = defaultdict(list)
48
+ oid_cent = defaultdict(list)
49
+ for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(data['gt_ids'], data['tracker_ids'])):
50
+ matches_mask = np.greater_equal(data['similarity_scores'][t], self.threshold)
51
+
52
+ # The order of ids and boxes is assumed to be consistent in `data`
53
+ for ind, gid in enumerate(gt_ids_t):
54
+ oid_cent[gid].append(data['centroid'][t][ind])
55
+
56
+ match_idx_gt, match_idx_tracker = np.nonzero(matches_mask)
57
+ for m_gid, m_tid in zip(match_idx_gt, match_idx_tracker):
58
+ oid_hid_cent[gt_ids_t[m_gid], tracker_ids_t[m_tid]].append(data['centroid'][t][m_gid])
59
+
60
+ oid_hid_dist = {k : np.sum(np.linalg.norm(np.diff(np.array(v), axis=0), axis=1)) for k, v in oid_hid_cent.items()}
61
+ oid_dist = {int(k) : np.sum(np.linalg.norm(np.diff(np.array(v), axis=0), axis=1)) for k, v in oid_cent.items()}
62
+
63
+ unique_oid = np.unique([i[0] for i in oid_hid_dist.keys()]).tolist()
64
+ unique_hid = np.unique([i[1] for i in oid_hid_dist.keys()]).tolist()
65
+ o_len = len(unique_oid)
66
+ h_len = len(unique_hid)
67
+ dist_matrix = np.zeros((o_len, h_len))
68
+ for ((oid, hid), dist) in oid_hid_dist.items():
69
+ oid_ind = unique_oid.index(oid)
70
+ hid_ind = unique_hid.index(hid)
71
+ dist_matrix[oid_ind, hid_ind] = dist
72
+
73
+ # opt_hyp_dist contains GT ID : max dist covered by track
74
+ opt_hyp_dist = dict.fromkeys(oid_dist.keys(), 0.)
75
+ cost_matrix = np.max(dist_matrix) - dist_matrix
76
+ rows, cols = linear_sum_assignment(cost_matrix)
77
+ for (row, col) in zip(rows, cols):
78
+ value = dist_matrix[row, col]
79
+ opt_hyp_dist[int(unique_oid[row])] = value
80
+
81
+ assert len(opt_hyp_dist.keys()) == len(oid_dist.keys())
82
+ hyp_length = np.sum(list(opt_hyp_dist.values()))
83
+ gt_length = np.sum(list(oid_dist.values()))
84
+ id_eucl = np.mean([np.divide(a, b, out=np.zeros_like(a), where=b != 0) for a, b in zip(opt_hyp_dist.values(), oid_dist.values())])
85
+ res['IDEucl'] = np.divide(hyp_length, gt_length, out=np.zeros_like(hyp_length), where=gt_length!=0)
86
+ return res
87
+
88
+ def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False):
89
+ """Combines metrics across all classes by averaging over the class values.
90
+ If 'ignore_empty_classes' is True, then it only sums over classes with at least one gt or predicted detection.
91
+ """
92
+ res = {}
93
+
94
+ for field in self.float_fields:
95
+ if ignore_empty_classes:
96
+ res[field] = np.mean([v[field] for v in all_res.values()
97
+ if v['IDEucl'] > 0 + np.finfo('float').eps], axis=0)
98
+ else:
99
+ res[field] = np.mean([v[field] for v in all_res.values()], axis=0)
100
+ return res
101
+
102
+ def combine_classes_det_averaged(self, all_res):
103
+ """Combines metrics across all classes by averaging over the detection values"""
104
+ res = {}
105
+ for field in self.float_fields:
106
+ res[field] = self._combine_sum(all_res, field)
107
+ res = self._compute_final_fields(res, len(all_res))
108
+ return res
109
+
110
+ def combine_sequences(self, all_res):
111
+ """Combines metrics across all sequences"""
112
+ res = {}
113
+ for field in self.float_fields:
114
+ res[field] = self._combine_sum(all_res, field)
115
+ res = self._compute_final_fields(res, len(all_res))
116
+ return res
117
+
118
+
119
+ @staticmethod
120
+ def _compute_centroid(box):
121
+ box = np.array(box)
122
+ if len(box.shape) == 1:
123
+ centroid = (box[0:2] + box[2:4])/2
124
+ else:
125
+ centroid = (box[:, 0:2] + box[:, 2:4])/2
126
+ return np.flip(centroid, axis=1)
127
+
128
+
129
+ @staticmethod
130
+ def _compute_final_fields(res, res_len):
131
+ """
132
+ Exists only to match the signature of the original Identity class.
133
+
134
+ """
135
+ return {k:v/res_len for k,v in res.items()}
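Note: the core quantity behind IDEucl is the Euclidean path length traced by a track's centroids (the `np.linalg.norm(np.diff(...))` expression in `eval_sequence`). The self-contained sketch below uses hypothetical centroid coordinates to show how the covered-distance ratio is formed.

    import numpy as np

    # Hypothetical (y, x) centroids of one gt track over four frames.
    gt_centroids = np.array([[10.0, 10.0], [10.0, 14.0], [13.0, 14.0], [13.0, 20.0]])
    # Centroids of the frames in which a hypothesis track matched this gt track.
    matched_centroids = np.array([[10.0, 10.0], [10.0, 14.0], [13.0, 14.0]])

    def path_length(centroids):
        # Sum of frame-to-frame Euclidean steps.
        return np.sum(np.linalg.norm(np.diff(centroids, axis=0), axis=1))

    gt_len = path_length(gt_centroids)        # 4 + 3 + 6 = 13
    hyp_len = path_length(matched_centroids)  # 4 + 3 = 7
    print(hyp_len / gt_len)                   # fraction of the gt trajectory covered, ~0.54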
avism/data/aviseval/metrics/j_and_f.py ADDED
@@ -0,0 +1,310 @@
1
+
2
+ import numpy as np
3
+ import math
4
+ from scipy.optimize import linear_sum_assignment
5
+ from ..utils import TrackEvalException
6
+ from ._base_metric import _BaseMetric
7
+ from .. import _timing
8
+
9
+
10
+ class JAndF(_BaseMetric):
11
+ """Class which implements the J&F metrics"""
12
+ def __init__(self, config=None):
13
+ super().__init__()
14
+ self.integer_fields = ['num_gt_tracks']
15
+ self.float_fields = ['J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay', 'J&F']
16
+ self.fields = self.float_fields + self.integer_fields
17
+ self.summary_fields = self.float_fields
18
+ self.optim_type = 'J' # possible values J, J&F
19
+
20
+ @_timing.time
21
+ def eval_sequence(self, data):
22
+ """Returns J&F metrics for one sequence"""
23
+
24
+ # Only loaded when run to reduce minimum requirements
25
+ from pycocotools import mask as mask_utils
26
+
27
+ num_timesteps = data['num_timesteps']
28
+ num_tracker_ids = data['num_tracker_ids']
29
+ num_gt_ids = data['num_gt_ids']
30
+ gt_dets = data['gt_dets']
31
+ tracker_dets = data['tracker_dets']
32
+ gt_ids = data['gt_ids']
33
+ tracker_ids = data['tracker_ids']
34
+
35
+ # get shape of frames
36
+ frame_shape = None
37
+ if num_gt_ids > 0:
38
+ for t in range(num_timesteps):
39
+ if len(gt_ids[t]) > 0:
40
+ frame_shape = gt_dets[t][0]['size']
41
+ break
42
+ elif num_tracker_ids > 0:
43
+ for t in range(num_timesteps):
44
+ if len(tracker_ids[t]) > 0:
45
+ frame_shape = tracker_dets[t][0]['size']
46
+ break
47
+
48
+ if frame_shape:
49
+ # append all zero masks for timesteps in which tracks do not have a detection
50
+ zero_padding = np.zeros(frame_shape, order='F').astype(np.uint8)
51
+ padding_mask = mask_utils.encode(zero_padding)
52
+ for t in range(num_timesteps):
53
+ gt_id_det_mapping = {gt_ids[t][i]: gt_dets[t][i] for i in range(len(gt_ids[t]))}
54
+ gt_dets[t] = [gt_id_det_mapping[index] if index in gt_ids[t] else padding_mask for index
55
+ in range(num_gt_ids)]
56
+ tracker_id_det_mapping = {tracker_ids[t][i]: tracker_dets[t][i] for i in range(len(tracker_ids[t]))}
57
+ tracker_dets[t] = [tracker_id_det_mapping[index] if index in tracker_ids[t] else padding_mask for index
58
+ in range(num_tracker_ids)]
59
+ # also perform zero padding if number of tracker IDs < number of ground truth IDs
60
+ if num_tracker_ids < num_gt_ids:
61
+ diff = num_gt_ids - num_tracker_ids
62
+ for t in range(num_timesteps):
63
+ tracker_dets[t] = tracker_dets[t] + [padding_mask for _ in range(diff)]
64
+ num_tracker_ids += diff
65
+
66
+ j = self._compute_j(gt_dets, tracker_dets, num_gt_ids, num_tracker_ids, num_timesteps)
67
+
68
+ # boundary threshold for F computation
69
+ bound_th = 0.008
70
+
71
+ # perform matching
72
+ if self.optim_type == 'J&F':
73
+ f = np.zeros_like(j)
74
+ for k in range(num_tracker_ids):
75
+ for i in range(num_gt_ids):
76
+ f[k, i, :] = self._compute_f(gt_dets, tracker_dets, k, i, bound_th)
77
+ optim_metrics = (np.mean(j, axis=2) + np.mean(f, axis=2)) / 2
78
+ row_ind, col_ind = linear_sum_assignment(- optim_metrics)
79
+ j_m = j[row_ind, col_ind, :]
80
+ f_m = f[row_ind, col_ind, :]
81
+ elif self.optim_type == 'J':
82
+ optim_metrics = np.mean(j, axis=2)
83
+ row_ind, col_ind = linear_sum_assignment(- optim_metrics)
84
+ j_m = j[row_ind, col_ind, :]
85
+ f_m = np.zeros_like(j_m)
86
+ for i, (tr_ind, gt_ind) in enumerate(zip(row_ind, col_ind)):
87
+ f_m[i] = self._compute_f(gt_dets, tracker_dets, tr_ind, gt_ind, bound_th)
88
+ else:
89
+ raise TrackEvalException('Unsupported optimization type %s for J&F metric.' % self.optim_type)
90
+
91
+ # append zeros for false negatives
92
+ if j_m.shape[0] < data['num_gt_ids']:
93
+ diff = data['num_gt_ids'] - j_m.shape[0]
94
+ j_m = np.concatenate((j_m, np.zeros((diff, j_m.shape[1]))), axis=0)
95
+ f_m = np.concatenate((f_m, np.zeros((diff, f_m.shape[1]))), axis=0)
96
+
97
+ # compute the metrics for each ground truth track
98
+ res = {
99
+ 'J-Mean': [np.nanmean(j_m[i, :]) for i in range(j_m.shape[0])],
100
+ 'J-Recall': [np.nanmean(j_m[i, :] > 0.5 + np.finfo('float').eps) for i in range(j_m.shape[0])],
101
+ 'F-Mean': [np.nanmean(f_m[i, :]) for i in range(f_m.shape[0])],
102
+ 'F-Recall': [np.nanmean(f_m[i, :] > 0.5 + np.finfo('float').eps) for i in range(f_m.shape[0])],
103
+ 'J-Decay': [],
104
+ 'F-Decay': []
105
+ }
106
+ n_bins = 4
107
+ ids = np.round(np.linspace(1, data['num_timesteps'], n_bins + 1) + 1e-10) - 1
108
+ ids = ids.astype(np.uint8)
109
+
110
+ for k in range(j_m.shape[0]):
111
+ d_bins_j = [j_m[k][ids[i]:ids[i + 1] + 1] for i in range(0, n_bins)]
112
+ res['J-Decay'].append(np.nanmean(d_bins_j[0]) - np.nanmean(d_bins_j[3]))
113
+ for k in range(f_m.shape[0]):
114
+ d_bins_f = [f_m[k][ids[i]:ids[i + 1] + 1] for i in range(0, n_bins)]
115
+ res['F-Decay'].append(np.nanmean(d_bins_f[0]) - np.nanmean(d_bins_f[3]))
116
+
117
+ # count number of tracks for weighting of the result
118
+ res['num_gt_tracks'] = len(res['J-Mean'])
119
+ for field in ['J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay']:
120
+ res[field] = np.mean(res[field])
121
+ res['J&F'] = (res['J-Mean'] + res['F-Mean']) / 2
122
+ return res
123
+
124
+ def combine_sequences(self, all_res):
125
+ """Combines metrics across all sequences"""
126
+ res = {'num_gt_tracks': self._combine_sum(all_res, 'num_gt_tracks')}
127
+ for field in self.summary_fields:
128
+ res[field] = self._combine_weighted_av(all_res, field, res, weight_field='num_gt_tracks')
129
+ return res
130
+
131
+ def combine_classes_class_averaged(self, all_res, ignore_empty_classes=False):
132
+ """Combines metrics across all classes by averaging over the class values
133
+ 'ignore empty classes' is not yet implemented here.
134
+ """
135
+ res = {'num_gt_tracks': self._combine_sum(all_res, 'num_gt_tracks')}
136
+ for field in self.float_fields:
137
+ res[field] = np.mean([v[field] for v in all_res.values()])
138
+ return res
139
+
140
+ def combine_classes_det_averaged(self, all_res):
141
+ """Combines metrics across all classes by averaging over the detection values"""
142
+ res = {'num_gt_tracks': self._combine_sum(all_res, 'num_gt_tracks')}
143
+ for field in self.float_fields:
144
+ res[field] = np.mean([v[field] for v in all_res.values()])
145
+ return res
146
+
147
+ @staticmethod
148
+ def _seg2bmap(seg, width=None, height=None):
149
+ """
150
+ From a segmentation, compute a binary boundary map with 1 pixel wide
151
+ boundaries. The boundary pixels are offset by 1/2 pixel towards the
152
+ origin from the actual segment boundary.
153
+ Arguments:
154
+ seg : Segments labeled from 1..k.
155
+ width : Width of desired bmap <= seg.shape[1]
156
+ height : Height of desired bmap <= seg.shape[0]
157
+ Returns:
158
+ bmap (ndarray): Binary boundary map.
159
+ David Martin <[email protected]>
160
+ January 2003
161
+ """
162
+
163
+ seg = seg.astype(bool)
164
+ seg[seg > 0] = 1
165
+
166
+ assert np.atleast_3d(seg).shape[2] == 1
167
+
168
+ width = seg.shape[1] if width is None else width
169
+ height = seg.shape[0] if height is None else height
170
+
171
+ h, w = seg.shape[:2]
172
+
173
+ ar1 = float(width) / float(height)
174
+ ar2 = float(w) / float(h)
175
+
176
+ assert not (
177
+ width > w | height > h | abs(ar1 - ar2) > 0.01
178
+ ), "Can" "t convert %dx%d seg to %dx%d bmap." % (w, h, width, height)
179
+
180
+ e = np.zeros_like(seg)
181
+ s = np.zeros_like(seg)
182
+ se = np.zeros_like(seg)
183
+
184
+ e[:, :-1] = seg[:, 1:]
185
+ s[:-1, :] = seg[1:, :]
186
+ se[:-1, :-1] = seg[1:, 1:]
187
+
188
+ b = seg ^ e | seg ^ s | seg ^ se
189
+ b[-1, :] = seg[-1, :] ^ e[-1, :]
190
+ b[:, -1] = seg[:, -1] ^ s[:, -1]
191
+ b[-1, -1] = 0
192
+
193
+ if w == width and h == height:
194
+ bmap = b
195
+ else:
196
+ bmap = np.zeros((height, width))
197
+ for x in range(w):
198
+ for y in range(h):
199
+ if b[y, x]:
200
+ j = 1 + math.floor((y - 1) + height / h)
201
+ i = 1 + math.floor((x - 1) + width / h)
202
+ bmap[j, i] = 1
203
+
204
+ return bmap
205
+
206
+ @staticmethod
207
+ def _compute_f(gt_data, tracker_data, tracker_data_id, gt_id, bound_th):
208
+ """
209
+ Perform F computation for a given gt and a given tracker ID. Adapted from
210
+ https://github.com/davisvideochallenge/davis2017-evaluation
211
+ :param gt_data: the encoded gt masks
212
+ :param tracker_data: the encoded tracker masks
213
+ :param tracker_data_id: the tracker ID
214
+ :param gt_id: the ground truth ID
215
+ :param bound_th: boundary threshold parameter
216
+ :return: the F value for the given tracker and gt ID
217
+ """
218
+
219
+ # Only loaded when run to reduce minimum requirements
220
+ from pycocotools import mask as mask_utils
221
+ from skimage.morphology import disk
222
+ import cv2
223
+
224
+ f = np.zeros(len(gt_data))
225
+
226
+ for t, (gt_masks, tracker_masks) in enumerate(zip(gt_data, tracker_data)):
227
+ curr_tracker_mask = mask_utils.decode(tracker_masks[tracker_data_id])
228
+ curr_gt_mask = mask_utils.decode(gt_masks[gt_id])
229
+
230
+ bound_pix = bound_th if bound_th >= 1 - np.finfo('float').eps else \
231
+ np.ceil(bound_th * np.linalg.norm(curr_tracker_mask.shape))
232
+
233
+ # Get the pixel boundaries of both masks
234
+ fg_boundary = JAndF._seg2bmap(curr_tracker_mask)
235
+ gt_boundary = JAndF._seg2bmap(curr_gt_mask)
236
+
237
+ # fg_dil = binary_dilation(fg_boundary, disk(bound_pix))
238
+ fg_dil = cv2.dilate(fg_boundary.astype(np.uint8), disk(bound_pix).astype(np.uint8))
239
+ # gt_dil = binary_dilation(gt_boundary, disk(bound_pix))
240
+ gt_dil = cv2.dilate(gt_boundary.astype(np.uint8), disk(bound_pix).astype(np.uint8))
241
+
242
+ # Get the intersection
243
+ gt_match = gt_boundary * fg_dil
244
+ fg_match = fg_boundary * gt_dil
245
+
246
+ # Area of the intersection
247
+ n_fg = np.sum(fg_boundary)
248
+ n_gt = np.sum(gt_boundary)
249
+
250
+ # % Compute precision and recall
251
+ if n_fg == 0 and n_gt > 0:
252
+ precision = 1
253
+ recall = 0
254
+ elif n_fg > 0 and n_gt == 0:
255
+ precision = 0
256
+ recall = 1
257
+ elif n_fg == 0 and n_gt == 0:
258
+ precision = 1
259
+ recall = 1
260
+ else:
261
+ precision = np.sum(fg_match) / float(n_fg)
262
+ recall = np.sum(gt_match) / float(n_gt)
263
+
264
+ # Compute F measure
265
+ if precision + recall == 0:
266
+ f_val = 0
267
+ else:
268
+ f_val = 2 * precision * recall / (precision + recall)
269
+
270
+ f[t] = f_val
271
+
272
+ return f
273
+
274
+ @staticmethod
275
+ def _compute_j(gt_data, tracker_data, num_gt_ids, num_tracker_ids, num_timesteps):
276
+ """
277
+ Computation of J value for all ground truth IDs and all tracker IDs in the given sequence. Adapted from
278
+ https://github.com/davisvideochallenge/davis2017-evaluation
279
+ :param gt_data: the ground truth masks
280
+ :param tracker_data: the tracker masks
281
+ :param num_gt_ids: the number of ground truth IDs
282
+ :param num_tracker_ids: the number of tracker IDs
283
+ :param num_timesteps: the number of timesteps
284
+ :return: the J values
285
+ """
286
+
287
+ # Only loaded when run to reduce minimum requirements
288
+ from pycocotools import mask as mask_utils
289
+
290
+ j = np.zeros((num_tracker_ids, num_gt_ids, num_timesteps))
291
+
292
+ for t, (time_gt, time_data) in enumerate(zip(gt_data, tracker_data)):
293
+ # run length encoded masks with pycocotools
294
+ area_gt = mask_utils.area(time_gt)
295
+ time_data = list(time_data)
296
+ area_tr = mask_utils.area(time_data)
297
+
298
+ area_tr = np.repeat(area_tr[:, np.newaxis], len(area_gt), axis=1)
299
+ area_gt = np.repeat(area_gt[np.newaxis, :], len(area_tr), axis=0)
300
+
301
+ # mask iou computation with pycocotools
302
+ ious = np.atleast_2d(mask_utils.iou(time_data, time_gt, [0]*len(time_gt)))
303
+ # set iou to 1 if both masks are close to 0 (no ground truth and no predicted mask in timestep)
304
+ ious[np.isclose(area_tr, 0) & np.isclose(area_gt, 0)] = 1
305
+ assert (ious >= 0 - np.finfo('float').eps).all()
306
+ assert (ious <= 1 + np.finfo('float').eps).all()
307
+
308
+ j[..., t] = ious
309
+
310
+ return j
avism/data/aviseval/metrics/track_map.py ADDED
@@ -0,0 +1,462 @@
 
1
+ import numpy as np
2
+ from ._base_metric import _BaseMetric
3
+ from .. import _timing
4
+ from functools import partial
5
+ from .. import utils
6
+ from ..utils import TrackEvalException
7
+
8
+
9
+ class TrackMAP(_BaseMetric):
10
+ """Class which implements the TrackMAP metrics"""
11
+
12
+ @staticmethod
13
+ def get_default_metric_config():
14
+ """Default class config values"""
15
+ default_config = {
16
+ 'USE_AREA_RANGES': True, # whether to evaluate for certain area ranges
17
+ 'AREA_RANGES': [[0 ** 2, 32 ** 2], # additional area range sets for which TrackMAP is evaluated
18
+ [32 ** 2, 96 ** 2], # (all area range always included), default values for TAO
19
+ [96 ** 2, 1e5 ** 2]], # evaluation
20
+ 'AREA_RANGE_LABELS': ["area_s", "area_m", "area_l"], # the labels for the area ranges
21
+ 'USE_TIME_RANGES': True, # whether to evaluate for certain time ranges (length of tracks)
22
+ 'TIME_RANGES': [[0, 3], [3, 10], [10, 1e5]], # additional time range sets for which TrackMAP is evaluated
23
+ # (all time range always included) , default values for TAO evaluation
24
+ 'TIME_RANGE_LABELS': ["time_s", "time_m", "time_l"], # the labels for the time ranges
25
+ 'IOU_THRESHOLDS': np.arange(0.5, 0.96, 0.05), # the IoU thresholds
26
+ 'RECALL_THRESHOLDS': np.linspace(0.0, 1.00, int(np.round((1.00 - 0.0) / 0.01) + 1), endpoint=True),
27
+ # recall thresholds at which precision is evaluated
28
+ 'MAX_DETECTIONS': 0, # limit the maximum number of considered tracks per sequence (0 for unlimited)
29
+ 'PRINT_CONFIG': False
30
+ }
31
+ return default_config
32
+
33
+ def __init__(self, config=None):
34
+ super().__init__()
35
+ self.config = utils.init_config(config, self.get_default_metric_config(), self.get_name())
36
+
37
+ self.num_ig_masks = 1
38
+ self.lbls = ['all']
39
+ self.use_area_rngs = self.config['USE_AREA_RANGES']
40
+ if self.use_area_rngs:
41
+ self.area_rngs = self.config['AREA_RANGES']
42
+ self.area_rng_lbls = self.config['AREA_RANGE_LABELS']
43
+ self.num_ig_masks += len(self.area_rng_lbls)
44
+ self.lbls += self.area_rng_lbls
45
+
46
+ self.use_time_rngs = self.config['USE_TIME_RANGES']
47
+ if self.use_time_rngs:
48
+ self.time_rngs = self.config['TIME_RANGES']
49
+ self.time_rng_lbls = self.config['TIME_RANGE_LABELS']
50
+ self.num_ig_masks += len(self.time_rng_lbls)
51
+ self.lbls += self.time_rng_lbls
52
+
53
+ self.array_labels = self.config['IOU_THRESHOLDS']
54
+ self.rec_thrs = self.config['RECALL_THRESHOLDS']
55
+
56
+ self.maxDet = self.config['MAX_DETECTIONS']
57
+ self.float_array_fields = ['AP_' + lbl for lbl in self.lbls] + ['AR_' + lbl for lbl in self.lbls]
58
+ self.fields = self.float_array_fields
59
+ self.summary_fields = self.float_array_fields
60
+
61
+ @_timing.time
62
+ def eval_sequence(self, data):
63
+ """Calculates GT and Tracker matches for one sequence for TrackMAP metrics. Adapted from
64
+ https://github.com/TAO-Dataset/"""
65
+
66
+ # Initialise results to zero for each sequence as the fields are only defined over the set of all sequences
67
+ res = {}
68
+ for field in self.fields:
69
+ res[field] = [0 for _ in self.array_labels]
70
+
71
+ gt_ids, dt_ids = data['gt_track_ids'], data['dt_track_ids']
72
+
73
+ if len(gt_ids) == 0 and len(dt_ids) == 0:
74
+ for idx in range(self.num_ig_masks):
75
+ res[idx] = None
76
+ return res
77
+
78
+ # get track data
79
+ gt_tr_areas = data.get('gt_track_areas', None) if self.use_area_rngs else None
80
+ gt_tr_lengths = data.get('gt_track_lengths', None) if self.use_time_rngs else None
81
+ gt_tr_iscrowd = data.get('gt_track_iscrowd', None)
82
+ dt_tr_areas = data.get('dt_track_areas', None) if self.use_area_rngs else None
83
+ dt_tr_lengths = data.get('dt_track_lengths', None) if self.use_time_rngs else None
84
+ is_nel = data.get('not_exhaustively_labeled', False)
85
+
86
+ # compute ignore masks for different track sets to eval
87
+ gt_ig_masks = self._compute_track_ig_masks(len(gt_ids), track_lengths=gt_tr_lengths, track_areas=gt_tr_areas,
88
+ iscrowd=gt_tr_iscrowd)
89
+ dt_ig_masks = self._compute_track_ig_masks(len(dt_ids), track_lengths=dt_tr_lengths, track_areas=dt_tr_areas,
90
+ is_not_exhaustively_labeled=is_nel, is_gt=False)
91
+
92
+ boxformat = data.get('boxformat', 'xywh')
93
+ ious = self._compute_track_ious(data['dt_tracks'], data['gt_tracks'], iou_function=data['iou_type'],
94
+ boxformat=boxformat)
95
+
96
+ for mask_idx in range(self.num_ig_masks):
97
+ gt_ig_mask = gt_ig_masks[mask_idx]
98
+
99
+ # Sort gt ignore last
100
+ gt_idx = np.argsort([g for g in gt_ig_mask], kind="mergesort")
101
+ gt_ids = [gt_ids[i] for i in gt_idx]
102
+
103
+ ious_sorted = ious[:, gt_idx] if len(ious) > 0 else ious
104
+
105
+ num_thrs = len(self.array_labels)
106
+ num_gt = len(gt_ids)
107
+ num_dt = len(dt_ids)
108
+
109
+ # Array to store the "id" of the matched dt/gt
110
+ gt_m = np.zeros((num_thrs, num_gt)) - 1
111
+ dt_m = np.zeros((num_thrs, num_dt)) - 1
112
+
113
+ gt_ig = np.array([gt_ig_mask[idx] for idx in gt_idx])
114
+ dt_ig = np.zeros((num_thrs, num_dt))
115
+
116
+ for iou_thr_idx, iou_thr in enumerate(self.array_labels):
117
+ if len(ious_sorted) == 0:
118
+ break
119
+
120
+ for dt_idx, _dt in enumerate(dt_ids):
121
+ iou = min([iou_thr, 1 - 1e-10])
122
+ # information about best match so far (m=-1 -> unmatched)
123
+ # store the gt_idx which matched for _dt
124
+ m = -1
125
+ for gt_idx, _ in enumerate(gt_ids):
126
+ # if this gt already matched continue
127
+ if gt_m[iou_thr_idx, gt_idx] > 0:
128
+ continue
129
+ # if _dt matched to reg gt, and on ignore gt, stop
130
+ if m > -1 and gt_ig[m] == 0 and gt_ig[gt_idx] == 1:
131
+ break
132
+ # continue to next gt unless better match made
133
+ if ious_sorted[dt_idx, gt_idx] < iou - np.finfo('float').eps:
134
+ continue
135
+ # if match successful and best so far, store appropriately
136
+ iou = ious_sorted[dt_idx, gt_idx]
137
+ m = gt_idx
138
+
139
+ # No match found for _dt, go to next _dt
140
+ if m == -1:
141
+ continue
142
+
143
+ # if gt to ignore for some reason update dt_ig.
144
+ # Should not be used in evaluation.
145
+ dt_ig[iou_thr_idx, dt_idx] = gt_ig[m]
146
+ # _dt match found, update gt_m, and dt_m with "id"
147
+ dt_m[iou_thr_idx, dt_idx] = gt_ids[m]
148
+ gt_m[iou_thr_idx, m] = _dt
149
+
150
+ dt_ig_mask = dt_ig_masks[mask_idx]
151
+
152
+ dt_ig_mask = np.array(dt_ig_mask).reshape((1, num_dt)) # 1 X num_dt
153
+ dt_ig_mask = np.repeat(dt_ig_mask, num_thrs, 0) # num_thrs X num_dt
154
+
155
+ # Based on dt_ig_mask ignore any unmatched detection by updating dt_ig
156
+ dt_ig = np.logical_or(dt_ig, np.logical_and(dt_m == -1, dt_ig_mask))
157
+ # store results for given video and category
158
+ res[mask_idx] = {
159
+ "dt_ids": dt_ids,
160
+ "gt_ids": gt_ids,
161
+ "dt_matches": dt_m,
162
+ "gt_matches": gt_m,
163
+ "dt_scores": data['dt_track_scores'],
164
+ "gt_ignore": gt_ig,
165
+ "dt_ignore": dt_ig,
166
+ }
167
+
168
+ return res
169
+
170
+ def combine_sequences(self, all_res):
171
+ """Combines metrics across all sequences. Computes precision and recall values based on track matches.
172
+ Adapted from https://github.com/TAO-Dataset/
173
+ """
174
+ num_thrs = len(self.array_labels)
175
+ num_recalls = len(self.rec_thrs)
176
+
177
+ # -1 for absent categories
178
+ precision = -np.ones(
179
+ (num_thrs, num_recalls, self.num_ig_masks)
180
+ )
181
+ recall = -np.ones((num_thrs, self.num_ig_masks))
182
+
183
+ for ig_idx in range(self.num_ig_masks):
184
+ ig_idx_results = [res[ig_idx] for res in all_res.values() if res[ig_idx] is not None]
185
+
186
+ # Remove elements which are None
187
+ if len(ig_idx_results) == 0:
188
+ continue
189
+
190
+ # Append all scores: shape (N,)
191
+ # limit considered tracks for each sequence if maxDet > 0
192
+ if self.maxDet == 0:
193
+ dt_scores = np.concatenate([res["dt_scores"] for res in ig_idx_results], axis=0)
194
+
195
+ dt_idx = np.argsort(-dt_scores, kind="mergesort")
196
+
197
+ dt_m = np.concatenate([e["dt_matches"] for e in ig_idx_results],
198
+ axis=1)[:, dt_idx]
199
+ dt_ig = np.concatenate([e["dt_ignore"] for e in ig_idx_results],
200
+ axis=1)[:, dt_idx]
201
+ elif self.maxDet > 0:
202
+ dt_scores = np.concatenate([res["dt_scores"][0:self.maxDet] for res in ig_idx_results], axis=0)
203
+
204
+ dt_idx = np.argsort(-dt_scores, kind="mergesort")
205
+
206
+ dt_m = np.concatenate([e["dt_matches"][:, 0:self.maxDet] for e in ig_idx_results],
207
+ axis=1)[:, dt_idx]
208
+ dt_ig = np.concatenate([e["dt_ignore"][:, 0:self.maxDet] for e in ig_idx_results],
209
+ axis=1)[:, dt_idx]
210
+ else:
211
+ raise Exception("Number of maximum detections must be >= 0, but is set to %i" % self.maxDet)
212
+
213
+ gt_ig = np.concatenate([res["gt_ignore"] for res in ig_idx_results])
214
+ # num gt anns to consider
215
+ num_gt = np.count_nonzero(gt_ig == 0)
216
+
217
+ if num_gt == 0:
218
+ continue
219
+
220
+ tps = np.logical_and(dt_m != -1, np.logical_not(dt_ig))
221
+ fps = np.logical_and(dt_m == -1, np.logical_not(dt_ig))
222
+
223
+ tp_sum = np.cumsum(tps, axis=1).astype(dtype=float)
224
+ fp_sum = np.cumsum(fps, axis=1).astype(dtype=float)
225
+
226
+ for iou_thr_idx, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
227
+ tp = np.array(tp)
228
+ fp = np.array(fp)
229
+ num_tp = len(tp)
230
+ rc = tp / num_gt
231
+ if num_tp:
232
+ recall[iou_thr_idx, ig_idx] = rc[-1]
233
+ else:
234
+ recall[iou_thr_idx, ig_idx] = 0
235
+
236
+ # np.spacing(1) ~= eps
237
+ pr = tp / (fp + tp + np.spacing(1))
238
+ pr = pr.tolist()
239
+
240
+ # Ensure precision values are monotonically decreasing
241
+ for i in range(num_tp - 1, 0, -1):
242
+ if pr[i] > pr[i - 1]:
243
+ pr[i - 1] = pr[i]
244
+
245
+ # find indices at the predefined recall values
246
+ rec_thrs_insert_idx = np.searchsorted(rc, self.rec_thrs, side="left")
247
+
248
+ pr_at_recall = [0.0] * num_recalls
249
+
250
+ try:
251
+ for _idx, pr_idx in enumerate(rec_thrs_insert_idx):
252
+ pr_at_recall[_idx] = pr[pr_idx]
253
+ except IndexError:
254
+ pass
255
+
256
+ precision[iou_thr_idx, :, ig_idx] = (np.array(pr_at_recall))
257
+
258
+ res = {'precision': precision, 'recall': recall}
259
+
260
+ # compute the precision and recall averages for the respective alpha thresholds and ignore masks
261
+ for lbl in self.lbls:
262
+ res['AP_' + lbl] = np.zeros((len(self.array_labels)), dtype=float)
263
+ res['AR_' + lbl] = np.zeros((len(self.array_labels)), dtype=float)
264
+
265
+ for a_id, alpha in enumerate(self.array_labels):
266
+ for lbl_idx, lbl in enumerate(self.lbls):
267
+ p = precision[a_id, :, lbl_idx]
268
+ if len(p[p > -1]) == 0:
269
+ mean_p = -1
270
+ else:
271
+ mean_p = np.mean(p[p > -1])
272
+ res['AP_' + lbl][a_id] = mean_p
273
+ res['AR_' + lbl][a_id] = recall[a_id, lbl_idx]
274
+
275
+ return res
276
+
277
+ def combine_classes_class_averaged(self, all_res, ignore_empty_classes=True):
278
+ """Combines metrics across all classes by averaging over the class values
279
+ Note mAP is not well defined for 'empty classes' so 'ignore empty classes' is always true here.
280
+ """
281
+ res = {}
282
+ for field in self.fields:
283
+ res[field] = np.zeros((len(self.array_labels)), dtype=float)
284
+ field_stacked = np.array([res[field] for res in all_res.values()])
285
+
286
+ for a_id, alpha in enumerate(self.array_labels):
287
+ values = field_stacked[:, a_id]
288
+ if len(values[values > -1]) == 0:
289
+ mean = -1
290
+ else:
291
+ mean = np.mean(values[values > -1])
292
+ res[field][a_id] = mean
293
+ return res
294
+
295
+ def combine_classes_det_averaged(self, all_res):
296
+ """Combines metrics across all classes by averaging over the detection values"""
297
+
298
+ res = {}
299
+ for field in self.fields:
300
+ res[field] = np.zeros((len(self.array_labels)), dtype=float)
301
+ field_stacked = np.array([res[field] for res in all_res.values()])
302
+
303
+ for a_id, alpha in enumerate(self.array_labels):
304
+ values = field_stacked[:, a_id]
305
+ if len(values[values > -1]) == 0:
306
+ mean = -1
307
+ else:
308
+ mean = np.mean(values[values > -1])
309
+ res[field][a_id] = mean
310
+ return res
311
+
312
+ def _compute_track_ig_masks(self, num_ids, track_lengths=None, track_areas=None, iscrowd=None,
313
+ is_not_exhaustively_labeled=False, is_gt=True):
314
+ """
315
+ Computes ignore masks for different track sets to evaluate
316
+ :param num_ids: the number of track IDs
317
+ :param track_lengths: the lengths of the tracks (number of timesteps)
318
+ :param track_areas: the average area of a track
319
+ :param iscrowd: whether a track is marked as crowd
320
+ :param is_not_exhaustively_labeled: whether the track category is not exhaustively labeled
321
+ :param is_gt: whether it is gt
322
+ :return: the track ignore masks
323
+ """
324
+ # for TAO tracks for classes which are not exhaustively labeled are not evaluated
325
+ if not is_gt and is_not_exhaustively_labeled:
326
+ track_ig_masks = [[1 for _ in range(num_ids)] for i in range(self.num_ig_masks)]
327
+ else:
328
+ # consider all tracks
329
+ track_ig_masks = [[0 for _ in range(num_ids)]]
330
+
331
+ # consider tracks with certain area
332
+ if self.use_area_rngs:
333
+ for rng in self.area_rngs:
334
+ track_ig_masks.append([0 if rng[0] - np.finfo('float').eps <= area <= rng[1] + np.finfo('float').eps
335
+ else 1 for area in track_areas])
336
+
337
+ # consider tracks with certain duration
338
+ if self.use_time_rngs:
339
+ for rng in self.time_rngs:
340
+ track_ig_masks.append([0 if rng[0] - np.finfo('float').eps <= length
341
+ <= rng[1] + np.finfo('float').eps else 1 for length in track_lengths])
342
+
343
+ # for YouTubeVIS evaluation tracks with crowd tag are not evaluated
344
+ if is_gt and iscrowd:
345
+ track_ig_masks = [np.logical_or(mask, iscrowd) for mask in track_ig_masks]
346
+
347
+ return track_ig_masks
348
+
349
+ @staticmethod
350
+ def _compute_bb_track_iou(dt_track, gt_track, boxformat='xywh'):
351
+ """
352
+ Calculates the track IoU for one detected track and one ground truth track for bounding boxes
353
+ :param dt_track: the detected track (format: dictionary with frame index as keys and
354
+ numpy arrays as values)
355
+ :param gt_track: the ground truth track (format: dictionary with frame index as keys and
356
+ numpy array as values)
357
+ :param boxformat: the format of the boxes
358
+ :return: the track IoU
359
+ """
360
+ intersect = 0
361
+ union = 0
362
+ image_ids = set(gt_track.keys()) | set(dt_track.keys())
363
+ for image in image_ids:
364
+ g = gt_track.get(image, None)
365
+ d = dt_track.get(image, None)
366
+ if boxformat == 'xywh':
367
+ if d is not None and g is not None:
368
+ dx, dy, dw, dh = d
369
+ gx, gy, gw, gh = g
370
+ w = max(min(dx + dw, gx + gw) - max(dx, gx), 0)
371
+ h = max(min(dy + dh, gy + gh) - max(dy, gy), 0)
372
+ i = w * h
373
+ u = dw * dh + gw * gh - i
374
+ intersect += i
375
+ union += u
376
+ elif d is None and g is not None:
377
+ union += g[2] * g[3]
378
+ elif d is not None and g is None:
379
+ union += d[2] * d[3]
380
+ elif boxformat == 'x0y0x1y1':
381
+ if d is not None and g is not None:
382
+ dx0, dy0, dx1, dy1 = d
383
+ gx0, gy0, gx1, gy1 = g
384
+ w = max(min(dx1, gx1) - max(dx0, gx0), 0)
385
+ h = max(min(dy1, gy1) - max(dy0, gy0), 0)
386
+ i = w * h
387
+ u = (dx1 - dx0) * (dy1 - dy0) + (gx1 - gx0) * (gy1 - gy0) - i
388
+ intersect += i
389
+ union += u
390
+ elif d is None and g is not None:
391
+ union += (g[2] - g[0]) * (g[3] - g[1])
392
+ elif d is not None and g is None:
393
+ union += (d[2] - d[0]) * (d[3] - d[1])
394
+ else:
395
+ raise TrackEvalException('BoxFormat not implemented')
396
+ if intersect > union:
397
+ raise TrackEvalException("Intersection value > union value. Are the box values corrupted?")
398
+ return intersect / union if union > 0 else 0
399
+
400
+ @staticmethod
401
+ def _compute_mask_track_iou(dt_track, gt_track):
402
+ """
403
+ Calculates the track IoU for one detected track and one ground truth track for segmentation masks
404
+ :param dt_track: the detected track (format: dictionary with frame index as keys and
405
+ pycocotools rle encoded masks as values)
406
+ :param gt_track: the ground truth track (format: dictionary with frame index as keys and
407
+ pycocotools rle encoded masks as values)
408
+ :return: the track IoU
409
+ """
410
+ # only loaded when needed to reduce minimum requirements
411
+ from pycocotools import mask as mask_utils
412
+
413
+ intersect = .0
414
+ union = .0
415
+ image_ids = set(gt_track.keys()) | set(dt_track.keys())
416
+ for image in image_ids:
417
+ g = gt_track.get(image, None)
418
+ d = dt_track.get(image, None)
419
+ if d and g:
420
+ intersect += mask_utils.area(mask_utils.merge([d, g], True))
421
+ union += mask_utils.area(mask_utils.merge([d, g], False))
422
+ elif not d and g:
423
+ union += mask_utils.area(g)
424
+ elif d and not g:
425
+ union += mask_utils.area(d)
426
+ if union < 0.0 - np.finfo('float').eps:
427
+ raise TrackEvalException("Union value < 0. Are the segmentations corrupted?")
428
+ if intersect > union:
429
+ raise TrackEvalException("Intersection value > union value. Are the segmentations corrupted?")
430
+ iou = intersect / union if union > 0.0 + np.finfo('float').eps else 0.0
431
+ return iou
432
+
433
+ @staticmethod
434
+ def _compute_track_ious(dt, gt, iou_function='bbox', boxformat='xywh'):
435
+ """
436
+ Calculate track IoUs for a set of ground truth tracks and a set of detected tracks
437
+ """
438
+
439
+ if len(gt) == 0 and len(dt) == 0:
440
+ return []
441
+
442
+ if iou_function == 'bbox':
443
+ track_iou_function = partial(TrackMAP._compute_bb_track_iou, boxformat=boxformat)
444
+ elif iou_function == 'mask':
445
+ track_iou_function = partial(TrackMAP._compute_mask_track_iou)
446
+ else:
447
+ raise Exception('IoU function not implemented')
448
+
449
+ ious = np.zeros([len(dt), len(gt)])
450
+ for i, j in np.ndindex(ious.shape):
451
+ ious[i, j] = track_iou_function(dt[i], gt[j])
452
+ return ious
453
+
454
+ @staticmethod
455
+ def _row_print(*argv):
456
+ """Prints results in evenly spaced rows, with more space in the first row"""
457
+ if len(argv) == 1:
458
+ argv = argv[0]
459
+ to_print = '%-40s' % argv[0]
460
+ for v in argv[1:]:
461
+ to_print += '%-12s' % str(v)
462
+ print(to_print)
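Note: the track IoU used by TrackMAP accumulates per-frame intersections and unions over the union of frames in which either track appears, so missed frames still enlarge the denominator. The sketch below feeds two tiny xywh tracks to `_compute_bb_track_iou`; the import path assumes this repository is on PYTHONPATH and may need adjusting.

    from avism.data.aviseval.metrics.track_map import TrackMAP

    # Tracks are dicts mapping frame index -> box. Frame 2 has no detection,
    # so its gt area only contributes to the union.
    gt_track = {0: [0, 0, 10, 10], 1: [0, 0, 10, 10], 2: [0, 0, 10, 10]}
    dt_track = {0: [0, 0, 10, 10], 1: [5, 0, 10, 10]}

    iou = TrackMAP._compute_bb_track_iou(dt_track, gt_track, boxformat='xywh')
    print(iou)  # (100 + 50) / (100 + 150 + 100) = 150 / 350 ~= 0.43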
avism/data/aviseval/metrics/vace.py ADDED
@@ -0,0 +1,131 @@
1
+ import numpy as np
2
+ from scipy.optimize import linear_sum_assignment
3
+ from ._base_metric import _BaseMetric
4
+ from .. import _timing
5
+
6
+
7
+ class VACE(_BaseMetric):
8
+ """Class which implements the VACE metrics.
9
+
10
+ The metrics are described in:
11
+ Manohar et al. (2006) "Performance Evaluation of Object Detection and Tracking in Video"
12
+ https://link.springer.com/chapter/10.1007/11612704_16
13
+
14
+ This implementation uses the "relaxed" variant of the metrics,
15
+ where an overlap threshold is applied in each frame.
16
+ """
17
+
18
+ def __init__(self, config=None):
19
+ super().__init__()
20
+ self.integer_fields = ['VACE_IDs', 'VACE_GT_IDs', 'num_non_empty_timesteps']
21
+ self.float_fields = ['STDA', 'ATA', 'FDA', 'SFDA']
22
+ self.fields = self.integer_fields + self.float_fields
23
+ self.summary_fields = ['SFDA', 'ATA']
24
+
25
+ # Fields that are accumulated over multiple videos.
26
+ self._additive_fields = self.integer_fields + ['STDA', 'FDA']
27
+
28
+ self.threshold = 0.5
29
+
30
+ @_timing.time
31
+ def eval_sequence(self, data):
32
+ """Calculates VACE metrics for one sequence.
33
+
34
+ Depends on the fields:
35
+ data['num_gt_ids']
36
+ data['num_tracker_ids']
37
+ data['gt_ids']
38
+ data['tracker_ids']
39
+ data['similarity_scores']
40
+ """
41
+ res = {}
42
+
43
+ # Obtain Average Tracking Accuracy (ATA) using track correspondence.
44
+ # Obtain counts necessary to compute temporal IOU.
45
+ # Assume that integer counts can be represented exactly as floats.
46
+ potential_matches_count = np.zeros((data['num_gt_ids'], data['num_tracker_ids']))
47
+ gt_id_count = np.zeros(data['num_gt_ids'])
48
+ tracker_id_count = np.zeros(data['num_tracker_ids'])
49
+ both_present_count = np.zeros((data['num_gt_ids'], data['num_tracker_ids']))
50
+ for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(data['gt_ids'], data['tracker_ids'])):
51
+ # Count the number of frames in which two tracks satisfy the overlap criterion.
52
+ matches_mask = np.greater_equal(data['similarity_scores'][t], self.threshold)
53
+ match_idx_gt, match_idx_tracker = np.nonzero(matches_mask)
54
+ potential_matches_count[gt_ids_t[match_idx_gt], tracker_ids_t[match_idx_tracker]] += 1
55
+ # Count the number of frames in which the tracks are present.
56
+ gt_id_count[gt_ids_t] += 1
57
+ tracker_id_count[tracker_ids_t] += 1
58
+ both_present_count[gt_ids_t[:, np.newaxis], tracker_ids_t[np.newaxis, :]] += 1
59
+ # Number of frames in which either track is present (union of the two sets of frames).
60
+ union_count = (gt_id_count[:, np.newaxis]
61
+ + tracker_id_count[np.newaxis, :]
62
+ - both_present_count)
63
+ # The denominator should always be non-zero if all tracks are non-empty.
64
+ with np.errstate(divide='raise', invalid='raise'):
65
+ temporal_iou = potential_matches_count / union_count
66
+ # Find assignment that maximizes temporal IOU.
67
+ match_rows, match_cols = linear_sum_assignment(-temporal_iou)
68
+ res['STDA'] = temporal_iou[match_rows, match_cols].sum()
69
+ res['VACE_IDs'] = data['num_tracker_ids']
70
+ res['VACE_GT_IDs'] = data['num_gt_ids']
71
+
72
+ # Obtain Frame Detection Accuracy (FDA) using per-frame correspondence.
73
+ non_empty_count = 0
74
+ fda = 0
75
+ for t, (gt_ids_t, tracker_ids_t) in enumerate(zip(data['gt_ids'], data['tracker_ids'])):
76
+ n_g = len(gt_ids_t)
77
+ n_d = len(tracker_ids_t)
78
+ if not (n_g or n_d):
79
+ continue
80
+ # n_g > 0 or n_d > 0
81
+ non_empty_count += 1
82
+ if not (n_g and n_d):
83
+ continue
84
+ # n_g > 0 and n_d > 0
85
+ spatial_overlap = data['similarity_scores'][t]
86
+ match_rows, match_cols = linear_sum_assignment(-spatial_overlap)
87
+ overlap_ratio = spatial_overlap[match_rows, match_cols].sum()
88
+ fda += overlap_ratio / (0.5 * (n_g + n_d))
89
+ res['FDA'] = fda
90
+ res['num_non_empty_timesteps'] = non_empty_count
91
+
92
+ res.update(self._compute_final_fields(res))
93
+ return res
94
+
95
+ def combine_classes_class_averaged(self, all_res, ignore_empty_classes=True):
96
+ """Combines metrics across all classes by averaging over the class values.
97
+ If 'ignore_empty_classes' is True, then it only sums over classes with at least one gt or predicted detection.
98
+ """
99
+ res = {}
100
+ for field in self.fields:
101
+ if ignore_empty_classes:
102
+ res[field] = np.mean([v[field] for v in all_res.values()
103
+ if v['VACE_GT_IDs'] > 0 or v['VACE_IDs'] > 0], axis=0)
104
+ else:
105
+ res[field] = np.mean([v[field] for v in all_res.values()], axis=0)
106
+ return res
107
+
108
+ def combine_classes_det_averaged(self, all_res):
109
+ """Combines metrics across all classes by averaging over the detection values"""
110
+ res = {}
111
+ for field in self._additive_fields:
112
+ res[field] = _BaseMetric._combine_sum(all_res, field)
113
+ res = self._compute_final_fields(res)
114
+ return res
115
+
116
+ def combine_sequences(self, all_res):
117
+ """Combines metrics across all sequences"""
118
+ res = {}
119
+ for header in self._additive_fields:
120
+ res[header] = _BaseMetric._combine_sum(all_res, header)
121
+ res.update(self._compute_final_fields(res))
122
+ return res
123
+
124
+ @staticmethod
125
+ def _compute_final_fields(additive):
126
+ final = {}
127
+ with np.errstate(invalid='ignore'): # Permit nan results.
128
+ final['ATA'] = (additive['STDA'] /
129
+ (0.5 * (additive['VACE_IDs'] + additive['VACE_GT_IDs'])))
130
+ final['SFDA'] = additive['FDA'] / additive['num_non_empty_timesteps']
131
+ return final
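Note: the combined ATA and SFDA values are plain ratios of the accumulated fields, so they can be checked with made-up numbers; the snippet below mirrors `_compute_final_fields` without touching the evaluator.

    additive = {'STDA': 1.5, 'VACE_IDs': 3, 'VACE_GT_IDs': 2,
                'FDA': 7.2, 'num_non_empty_timesteps': 10}

    ata = additive['STDA'] / (0.5 * (additive['VACE_IDs'] + additive['VACE_GT_IDs']))  # 1.5 / 2.5 = 0.6
    sfda = additive['FDA'] / additive['num_non_empty_timesteps']                       # 0.72
    print(ata, sfda)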
avism/data/aviseval/plotting.py ADDED
@@ -0,0 +1,230 @@
1
+
2
+ import os
3
+ import numpy as np
4
+ from .utils import TrackEvalException
5
+
6
+
7
+ def plot_compare_trackers(tracker_folder, tracker_list, cls, output_folder, plots_list=None):
8
+ """Create plots which compare metrics across different trackers."""
9
+ # Define what to plot
10
+ if plots_list is None:
11
+ plots_list = get_default_plots_list()
12
+
13
+ # Load data
14
+ data = load_multiple_tracker_summaries(tracker_folder, tracker_list, cls)
15
+ out_loc = os.path.join(output_folder, cls)
16
+
17
+ # Plot
18
+ for args in plots_list:
19
+ create_comparison_plot(data, out_loc, *args)
20
+
21
+
22
+ def get_default_plots_list():
23
+ # y_label, x_label, sort_label, bg_label, bg_function
24
+ plots_list = [
25
+ ['AssA', 'DetA', 'HOTA', 'HOTA', 'geometric_mean'],
26
+ ['AssPr', 'AssRe', 'HOTA', 'AssA', 'jaccard'],
27
+ ['DetPr', 'DetRe', 'HOTA', 'DetA', 'jaccard'],
28
+ ['HOTA(0)', 'LocA(0)', 'HOTA', 'HOTALocA(0)', 'multiplication'],
29
+ ['HOTA', 'LocA', 'HOTA', None, None],
30
+
31
+ ['HOTA', 'MOTA', 'HOTA', None, None],
32
+ ['HOTA', 'IDF1', 'HOTA', None, None],
33
+ ['IDF1', 'MOTA', 'HOTA', None, None],
34
+ ]
35
+ return plots_list
36
+
37
+
38
+ def load_multiple_tracker_summaries(tracker_folder, tracker_list, cls):
39
+ """Loads summary data for multiple trackers."""
40
+ data = {}
41
+ for tracker in tracker_list:
42
+ with open(os.path.join(tracker_folder, tracker, cls + '_summary.txt')) as f:
43
+ keys = next(f).split(' ')
44
+ done = False
45
+ while not done:
46
+ values = next(f).split(' ')
47
+ if len(values) == len(keys):
48
+ done = True
49
+ data[tracker] = dict(zip(keys, map(float, values)))
50
+ return data
51
+
52
+
53
+ def create_comparison_plot(data, out_loc, y_label, x_label, sort_label, bg_label=None, bg_function=None, settings=None):
54
+ """ Creates a scatter plot comparing multiple trackers between two metric fields, with one on the x-axis and the
55
+ other on the y-axis. Adds Pareto optimal lines and (optionally) a background contour.
56
+
57
+ Inputs:
58
+ data: dict of dicts such that data[tracker_name][metric_field_name] = float
59
+ y_label: the metric_field_name to be plotted on the y-axis
60
+ x_label: the metric_field_name to be plotted on the x-axis
61
+ sort_label: the metric_field_name by which trackers are ordered and ranked
62
+ bg_label: the metric_field_name by which (optional) background contours are plotted
63
+ bg_function: the (optional) function bg_function(x,y) which converts the x_label / y_label values into bg_label.
64
+ settings: dict of plot settings with keys:
65
+ 'gap_val': gap between axis ticks and bg curves.
66
+ 'num_to_plot': maximum number of trackers to plot
67
+ """
68
+
69
+ # Only loaded when run to reduce minimum requirements
70
+ from matplotlib import pyplot as plt
71
+
72
+ # Get plot settings
73
+ if settings is None:
74
+ gap_val = 2
75
+ num_to_plot = 20
76
+ else:
77
+ gap_val = settings['gap_val']
78
+ num_to_plot = settings['num_to_plot']
79
+
80
+ if (bg_label is None) != (bg_function is None):
81
+ raise TrackEvalException('bg_function and bg_label must either be both given or neither given.')
82
+
83
+ # Extract data
84
+ tracker_names = np.array(list(data.keys()))
85
+ sort_index = np.array([data[t][sort_label] for t in tracker_names]).argsort()[::-1]
86
+ x_values = np.array([data[t][x_label] for t in tracker_names])[sort_index][:num_to_plot]
87
+ y_values = np.array([data[t][y_label] for t in tracker_names])[sort_index][:num_to_plot]
88
+
89
+ # Print info on what is being plotted
90
+ tracker_names = tracker_names[sort_index][:num_to_plot]
91
+ print('\nPlotting %s vs %s, for the following (ordered) trackers:' % (y_label, x_label))
92
+ for i, name in enumerate(tracker_names):
93
+ print('%i: %s' % (i+1, name))
94
+
95
+ # Find best fitting boundaries for data
96
+ boundaries = _get_boundaries(x_values, y_values, round_val=gap_val/2)
97
+
98
+ fig = plt.figure()
99
+
100
+ # Plot background contour
101
+ if bg_function is not None:
102
+ _plot_bg_contour(bg_function, boundaries, gap_val)
103
+
104
+ # Plot pareto optimal lines
105
+ _plot_pareto_optimal_lines(x_values, y_values)
106
+
107
+ # Plot data points with number labels
108
+ labels = np.arange(len(y_values)) + 1
109
+ plt.plot(x_values, y_values, 'b.', markersize=15)
110
+ for xx, yy, l in zip(x_values, y_values, labels):
111
+ plt.text(xx, yy, str(l), color="red", fontsize=15)
112
+
113
+ # Add extra explanatory text to plots
114
+ plt.text(0, -0.11, 'label order:\nHOTA', horizontalalignment='left', verticalalignment='center',
115
+ transform=fig.axes[0].transAxes, color="red", fontsize=12)
116
+ if bg_label is not None:
117
+ plt.text(1, -0.11, 'curve values:\n' + bg_label, horizontalalignment='right', verticalalignment='center',
118
+ transform=fig.axes[0].transAxes, color="grey", fontsize=12)
119
+
120
+ plt.xlabel(x_label, fontsize=15)
121
+ plt.ylabel(y_label, fontsize=15)
122
+ title = y_label + ' vs ' + x_label
123
+ if bg_label is not None:
124
+ title += ' (' + bg_label + ')'
125
+ plt.title(title, fontsize=17)
126
+ plt.xticks(np.arange(0, 100, gap_val))
127
+ plt.yticks(np.arange(0, 100, gap_val))
128
+ min_x, max_x, min_y, max_y = boundaries
129
+ plt.xlim(min_x, max_x)
130
+ plt.ylim(min_y, max_y)
131
+ plt.gca().set_aspect('equal', adjustable='box')
132
+ plt.tight_layout()
133
+
134
+ os.makedirs(out_loc, exist_ok=True)
135
+ filename = os.path.join(out_loc, title.replace(' ', '_'))
136
+ plt.savefig(filename + '.pdf', bbox_inches='tight', pad_inches=0.05)
137
+ plt.savefig(filename + '.png', bbox_inches='tight', pad_inches=0.05)
138
+
139
+
140
+ def _get_boundaries(x_values, y_values, round_val):
141
+ x1 = np.min(np.floor((x_values - 0.5) / round_val) * round_val)
142
+ x2 = np.max(np.ceil((x_values + 0.5) / round_val) * round_val)
143
+ y1 = np.min(np.floor((y_values - 0.5) / round_val) * round_val)
144
+ y2 = np.max(np.ceil((y_values + 0.5) / round_val) * round_val)
145
+ x_range = x2 - x1
146
+ y_range = y2 - y1
147
+ max_range = max(x_range, y_range)
148
+ x_center = (x1 + x2) / 2
149
+ y_center = (y1 + y2) / 2
150
+ min_x = max(x_center - max_range / 2, 0)
151
+ max_x = min(x_center + max_range / 2, 100)
152
+ min_y = max(y_center - max_range / 2, 0)
153
+ max_y = min(y_center + max_range / 2, 100)
154
+ return min_x, max_x, min_y, max_y
155
+
156
+
157
+ def geometric_mean(x, y):
158
+ return np.sqrt(x * y)
159
+
160
+
161
+ def jaccard(x, y):
162
+ x = x / 100
163
+ y = y / 100
164
+ return 100 * (x * y) / (x + y - x * y)
165
+
166
+
167
+ def multiplication(x, y):
168
+ return x * y / 100
169
+
170
+
171
+ bg_function_dict = {
172
+ "geometric_mean": geometric_mean,
173
+ "jaccard": jaccard,
174
+ "multiplication": multiplication,
175
+ }
176
+
177
+
178
+ def _plot_bg_contour(bg_function, plot_boundaries, gap_val):
179
+ """ Plot background contour. """
180
+
181
+ # Only loaded when run to reduce minimum requirements
182
+ from matplotlib import pyplot as plt
183
+
184
+ # Plot background contour
185
+ min_x, max_x, min_y, max_y = plot_boundaries
186
+ x = np.arange(min_x, max_x, 0.1)
187
+ y = np.arange(min_y, max_y, 0.1)
188
+ x_grid, y_grid = np.meshgrid(x, y)
189
+ if bg_function in bg_function_dict.keys():
190
+ z_grid = bg_function_dict[bg_function](x_grid, y_grid)
191
+ else:
192
+ raise TrackEvalException("background plotting function '%s' is not defined." % bg_function)
193
+ levels = np.arange(0, 100, gap_val)
194
+ con = plt.contour(x_grid, y_grid, z_grid, levels, colors='grey')
195
+
196
+ def bg_format(val):
197
+ s = '{:.1f}'.format(val)  # one decimal place; the decimal is dropped below when it is zero
198
+ return '{:.0f}'.format(val) if s[-1] == '0' else s
199
+
200
+ con.levels = [bg_format(val) for val in con.levels]
201
+ plt.clabel(con, con.levels, inline=True, fmt='%r', fontsize=8)
202
+
203
+
204
+ def _plot_pareto_optimal_lines(x_values, y_values):
205
+ """ Plot pareto optimal lines """
206
+
207
+ # Only loaded when run to reduce minimum requirements
208
+ from matplotlib import pyplot as plt
209
+
210
+ # Plot pareto optimal lines
211
+ cxs = x_values
212
+ cys = y_values
213
+ best_y = np.argmax(cys)
214
+ x_pareto = [0, cxs[best_y]]
215
+ y_pareto = [cys[best_y], cys[best_y]]
216
+ t = 2
217
+ remaining = cxs > x_pareto[t - 1]
218
+ cys = cys[remaining]
219
+ cxs = cxs[remaining]
220
+ while len(cxs) > 0 and len(cys) > 0:
221
+ best_y = np.argmax(cys)
222
+ x_pareto += [x_pareto[t - 1], cxs[best_y]]
223
+ y_pareto += [cys[best_y], cys[best_y]]
224
+ t += 2
225
+ remaining = cxs > x_pareto[t - 1]
226
+ cys = cys[remaining]
227
+ cxs = cxs[remaining]
228
+ x_pareto.append(x_pareto[t - 1])
229
+ y_pareto.append(0)
230
+ plt.plot(np.array(x_pareto), np.array(y_pareto), '--r')
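
A hypothetical usage sketch for the comparison plots above. The tracker folder layout, tracker names and class name are assumptions for illustration; the only hard requirement in the code is that each tracker directory contains a `<cls>_summary.txt` file (as written by `write_summary_results` in `utils.py` below).

```python
# Hypothetical invocation; paths and names are placeholders, not repository defaults.
from avism.data.aviseval.plotting import plot_compare_trackers

plot_compare_trackers(
    tracker_folder="output/trackers",            # contains one sub-folder per tracker run
    tracker_list=["avism_r50", "avism_swin"],    # assumed run names
    cls="combined",                              # summaries are read from <run>/combined_summary.txt
    output_folder="output/plots",                # figures are written to output/plots/combined/
)
```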
avism/data/aviseval/utils.py ADDED
@@ -0,0 +1,146 @@
1
+
2
+ import os
3
+ import csv
4
+ import argparse
5
+ from collections import OrderedDict
6
+
7
+
8
+ def init_config(config, default_config, name=None):
9
+ """Initialise non-given config values with defaults"""
10
+ if config is None:
11
+ config = default_config
12
+ else:
13
+ for k in default_config.keys():
14
+ if k not in config.keys():
15
+ config[k] = default_config[k]
16
+ if name and config['PRINT_CONFIG']:
17
+ print('\n%s Config:' % name)
18
+ for c in config.keys():
19
+ print('%-20s : %-30s' % (c, config[c]))
20
+ return config
21
+
22
+
23
+ def update_config(config):
24
+ """
25
+ Parses the arguments of a script and updates the config values for any setting specified on the command line.
26
+ :param config: the config to update
27
+ :return: the updated config
28
+ """
29
+ parser = argparse.ArgumentParser()
30
+ for setting in config.keys():
31
+ if type(config[setting]) == list or type(config[setting]) == type(None):
32
+ parser.add_argument("--" + setting, nargs='+')
33
+ else:
34
+ parser.add_argument("--" + setting)
35
+ args = parser.parse_args().__dict__
36
+ for setting in args.keys():
37
+ if args[setting] is not None:
38
+ if type(config[setting]) == type(True):
39
+ if args[setting] == 'True':
40
+ x = True
41
+ elif args[setting] == 'False':
42
+ x = False
43
+ else:
44
+ raise Exception('Command line parameter ' + setting + ' must be True or False')
45
+ elif type(config[setting]) == type(1):
46
+ x = int(args[setting])
47
+ elif type(args[setting]) == type(None):
48
+ x = None
49
+ else:
50
+ x = args[setting]
51
+ config[setting] = x
52
+ return config
53
+
54
+
55
+ def get_code_path():
56
+ """Get base path where code is"""
57
+ return os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
58
+
59
+
60
+ def validate_metrics_list(metrics_list):
61
+ """Gets the names of the metric classes and ensures they are unique; further checks that the fields within each metric class
62
+ do not have overlapping names.
63
+ """
64
+ metric_names = [metric.get_name() for metric in metrics_list]
65
+ # check metric names are unique
66
+ if len(metric_names) != len(set(metric_names)):
67
+ raise TrackEvalException('Code being run with multiple metrics of the same name')
68
+ fields = []
69
+ for m in metrics_list:
70
+ fields += m.fields
71
+ # check metric fields are unique
72
+ if len(fields) != len(set(fields)):
73
+ raise TrackEvalException('Code being run with multiple metrics with fields of the same name')
74
+ return metric_names
75
+
76
+
77
+ def write_summary_results(summaries, cls, output_folder):
78
+ """Write summary results to file"""
79
+
80
+ fields = sum([list(s.keys()) for s in summaries], [])
81
+ values = sum([list(s.values()) for s in summaries], [])
82
+
83
+ # In order to remain consistent when new fields are added, for each of the following fields, if they are present
84
+ # they will be output in the summary first in the order below. Any further fields will be output in the order each
85
+ # metric family is called, and within each family either in the order they were added to the dict (python >= 3.6) or
86
+ # randomly (python < 3.6).
87
+ default_order = ['HOTA', 'DetA', 'AssA', 'DetRe', 'DetPr', 'AssRe', 'AssPr', 'LocA', 'OWTA', 'HOTA(0)', 'LocA(0)',
88
+ 'HOTALocA(0)', 'MOTA', 'MOTP', 'MODA', 'CLR_Re', 'CLR_Pr', 'MTR', 'PTR', 'MLR', 'CLR_TP', 'CLR_FN',
89
+ 'CLR_FP', 'IDSW', 'MT', 'PT', 'ML', 'Frag', 'sMOTA', 'IDF1', 'IDR', 'IDP', 'IDTP', 'IDFN', 'IDFP',
90
+ 'Dets', 'GT_Dets', 'IDs', 'GT_IDs']
91
+ default_ordered_dict = OrderedDict(zip(default_order, [None for _ in default_order]))
92
+ for f, v in zip(fields, values):
93
+ default_ordered_dict[f] = v
94
+ for df in default_order:
95
+ if default_ordered_dict[df] is None:
96
+ del default_ordered_dict[df]
97
+ fields = list(default_ordered_dict.keys())
98
+ values = list(default_ordered_dict.values())
99
+
100
+ out_file = os.path.join(output_folder, cls + '_summary.txt')
101
+ os.makedirs(os.path.dirname(out_file), exist_ok=True)
102
+ with open(out_file, 'w', newline='') as f:
103
+ writer = csv.writer(f, delimiter=' ')
104
+ writer.writerow(fields)
105
+ writer.writerow(values)
106
+
107
+
108
+ def write_detailed_results(details, cls, output_folder):
109
+ """Write detailed results to file"""
110
+ sequences = details[0].keys()
111
+ fields = ['seq'] + sum([list(s['COMBINED_SEQ'].keys()) for s in details], [])
112
+ out_file = os.path.join(output_folder, cls + '_detailed.csv')
113
+ os.makedirs(os.path.dirname(out_file), exist_ok=True)
114
+ with open(out_file, 'w', newline='') as f:
115
+ writer = csv.writer(f)
116
+ writer.writerow(fields)
117
+ for seq in sorted(sequences):
118
+ if seq == 'COMBINED_SEQ':
119
+ continue
120
+ writer.writerow([seq] + sum([list(s[seq].values()) for s in details], []))
121
+ writer.writerow(['COMBINED'] + sum([list(s['COMBINED_SEQ'].values()) for s in details], []))
122
+
123
+
124
+ def load_detail(file):
125
+ """Loads detailed data for a tracker."""
126
+ data = {}
127
+ with open(file) as f:
128
+ for i, row_text in enumerate(f):
129
+ row = row_text.replace('\r', '').replace('\n', '').split(',')
130
+ if i == 0:
131
+ keys = row[1:]
132
+ continue
133
+ current_values = row[1:]
134
+ seq = row[0]
135
+ if seq == 'COMBINED':
136
+ seq = 'COMBINED_SEQ'
137
+ if (len(current_values) == len(keys)) and seq != '':
138
+ data[seq] = {}
139
+ for key, value in zip(keys, current_values):
140
+ data[seq][key] = float(value)
141
+ return data
142
+
143
+
144
+ class TrackEvalException(Exception):
145
+ """Custom exception for catching expected errors."""
146
+ ...
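
A minimal sketch of the config pattern used by these helpers. The `THREADS` and `OUTPUT_FOLDER` keys are invented for the example; only `PRINT_CONFIG` is actually referenced by `init_config` above. `update_config` builds an argparse parser over the config keys, so it is meant to be called from a script entry point.

```python
# Sketch only: demonstrates init_config() defaults and command-line overrides.
from avism.data.aviseval.utils import init_config, update_config

default_config = {
    'PRINT_CONFIG': True,   # checked by init_config before printing the table
    'THREADS': 1,           # invented key for illustration
    'OUTPUT_FOLDER': None,  # invented key for illustration
}

config = init_config({'THREADS': 4}, default_config, name='Demo')
# -> {'THREADS': 4, 'PRINT_CONFIG': True, 'OUTPUT_FOLDER': None}, printed because PRINT_CONFIG is True

# In a script, `python demo.py --THREADS 8` would now override the value:
config = update_config(config)
```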
avism/data/build.py ADDED
@@ -0,0 +1,247 @@
1
+ import itertools
2
+ import logging
3
+ import torch.utils.data
4
+
5
+ from detectron2.config import CfgNode, configurable
6
+ from detectron2.data.build import (
7
+ build_batch_data_loader,
8
+ load_proposals_into_dataset,
9
+ trivial_batch_collator,
10
+ )
11
+ from detectron2.data.catalog import DatasetCatalog
12
+ from detectron2.data.common import DatasetFromList, MapDataset
13
+ from detectron2.data.dataset_mapper import DatasetMapper
14
+ from detectron2.data.samplers import InferenceSampler, TrainingSampler
15
+ from detectron2.utils.comm import get_world_size
16
+
17
+
18
+ def _compute_num_images_per_worker(cfg: CfgNode):
19
+ num_workers = get_world_size()
20
+ images_per_batch = cfg.SOLVER.IMS_PER_BATCH
21
+ assert (
22
+ images_per_batch % num_workers == 0
23
+ ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
24
+ images_per_batch, num_workers
25
+ )
26
+ assert (
27
+ images_per_batch >= num_workers
28
+ ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
29
+ images_per_batch, num_workers
30
+ )
31
+ images_per_worker = images_per_batch // num_workers
32
+ return images_per_worker
33
+
34
+
35
+ def filter_images_with_only_crowd_annotations(dataset_dicts, dataset_names):
36
+ """
37
+ Filter out images with no annotations or with only crowd annotations
38
+ (i.e., images without non-crowd annotations).
39
+ A common training-time preprocessing on COCO dataset.
40
+
41
+ Args:
42
+ dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
43
+
44
+ Returns:
45
+ list[dict]: the same format, but filtered.
46
+ """
47
+ num_before = len(dataset_dicts)
48
+
49
+ def valid(anns):
50
+ for ann in anns:
51
+ if isinstance(ann, list):
52
+ for instance in ann:
53
+ if instance.get("iscrowd", 0) == 0:
54
+ return True
55
+ else:
56
+ if ann.get("iscrowd", 0) == 0:
57
+ return True
58
+ return False
59
+
60
+ dataset_dicts = [x for x in dataset_dicts if valid(x["annotations"])]
61
+ num_after = len(dataset_dicts)
62
+ logger = logging.getLogger(__name__)
63
+ logger.info(
64
+ "Removed {} images with no usable annotations. {} images left.".format(
65
+ num_before - num_after, num_after
66
+ )
67
+ )
68
+ return dataset_dicts
69
+
70
+
71
+ def get_detection_dataset_dicts(
72
+ dataset_names, filter_empty=True, proposal_files=None
73
+ ):
74
+ """
75
+ Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation.
76
+
77
+ Args:
78
+ dataset_names (str or list[str]): a dataset name or a list of dataset names
79
+ filter_empty (bool): whether to filter out images without instance annotations
80
+ proposal_files (list[str]): if given, a list of object proposal files
81
+ that match each dataset in `dataset_names`.
82
+
83
+ Returns:
84
+ list[dict]: a list of dicts following the standard dataset dict format.
85
+ """
86
+ if isinstance(dataset_names, str):
87
+ dataset_names = [dataset_names]
88
+ assert len(dataset_names)
89
+ dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in dataset_names]
90
+ for dataset_name, dicts in zip(dataset_names, dataset_dicts):
91
+ assert len(dicts), "Dataset '{}' is empty!".format(dataset_name)
92
+
93
+ if proposal_files is not None:
94
+ assert len(dataset_names) == len(proposal_files)
95
+ # load precomputed proposals from proposal files
96
+ dataset_dicts = [
97
+ load_proposals_into_dataset(dataset_i_dicts, proposal_file)
98
+ for dataset_i_dicts, proposal_file in zip(dataset_dicts, proposal_files)
99
+ ]
100
+
101
+ dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts))
102
+
103
+ has_instances = "annotations" in dataset_dicts[0]
104
+ if filter_empty and has_instances:
105
+ dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts, dataset_names)
106
+
107
+ assert len(dataset_dicts), "No valid data found in {}.".format(",".join(dataset_names))
108
+ return dataset_dicts
109
+
110
+
111
+ def _train_loader_from_config(cfg, mapper, dataset_name=None, *, dataset=None, sampler=None):
112
+ if dataset is None:
113
+ dataset = get_detection_dataset_dicts(
114
+ dataset_name,
115
+ filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
116
+ proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
117
+ )
118
+
119
+ if mapper is None:
120
+ mapper = DatasetMapper(cfg, True)
121
+
122
+ if sampler is None:
123
+ sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
124
+ logger = logging.getLogger(__name__)
125
+ logger.info("Using training sampler {}".format(sampler_name))
126
+ sampler = TrainingSampler(len(dataset))
127
+
128
+ return {
129
+ "dataset": dataset,
130
+ "sampler": sampler,
131
+ "mapper": mapper,
132
+ "total_batch_size": cfg.SOLVER.IMS_PER_BATCH,
133
+ "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING,
134
+ "num_workers": cfg.DATALOADER.NUM_WORKERS,
135
+ }
136
+
137
+
138
+ # TODO can allow dataset as an iterable or IterableDataset to make this function more general
139
+ @configurable(from_config=_train_loader_from_config)
140
+ def build_detection_train_loader(
141
+ dataset, *, mapper, sampler=None, total_batch_size, aspect_ratio_grouping=True, num_workers=0
142
+ ):
143
+ """
144
+ Build a dataloader for object detection with some default features.
145
+ This interface is experimental.
146
+
147
+ Args:
148
+ dataset (list or torch.utils.data.Dataset): a list of dataset dicts,
149
+ or a map-style pytorch dataset. They can be obtained by using
150
+ :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
151
+ mapper (callable): a callable which takes a sample (dict) from dataset and
152
+ returns the format to be consumed by the model.
153
+ When using cfg, the default choice is ``DatasetMapper(cfg, is_train=True)``.
154
+ sampler (torch.utils.data.sampler.Sampler or None): a sampler that
155
+ produces indices to be applied on ``dataset``.
156
+ Default to :class:`TrainingSampler`, which coordinates a random shuffle
157
+ sequence across all workers.
158
+ total_batch_size (int): total batch size across all workers. Batching
159
+ simply puts data into a list.
160
+ aspect_ratio_grouping (bool): whether to group images with similar
161
+ aspect ratio for efficiency. When enabled, it requires each
162
+ element in dataset be a dict with keys "width" and "height".
163
+ num_workers (int): number of parallel data loading workers
164
+
165
+ Returns:
166
+ torch.utils.data.DataLoader: a dataloader. Each output from it is a
167
+ ``list[mapped_element]`` of length ``total_batch_size / num_workers``,
168
+ where ``mapped_element`` is produced by the ``mapper``.
169
+ """
170
+ if isinstance(dataset, list):
171
+ dataset = DatasetFromList(dataset, copy=False)
172
+ if mapper is not None:
173
+ dataset = MapDataset(dataset, mapper)
174
+ if sampler is None:
175
+ sampler = TrainingSampler(len(dataset))
176
+ assert isinstance(sampler, torch.utils.data.sampler.Sampler)
177
+ return build_batch_data_loader(
178
+ dataset,
179
+ sampler,
180
+ total_batch_size,
181
+ aspect_ratio_grouping=aspect_ratio_grouping,
182
+ num_workers=num_workers,
183
+ )
184
+
185
+
186
+ def _test_loader_from_config(cfg, dataset_name, mapper=None):
187
+ """
188
+ Uses the given `dataset_name` argument (instead of the names in cfg), because the
189
+ standard practice is to evaluate each test set individually (not combining them).
190
+ """
191
+ dataset = get_detection_dataset_dicts(
192
+ [dataset_name],
193
+ filter_empty=False,
194
+ proposal_files=[
195
+ cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(dataset_name)]
196
+ ]
197
+ if cfg.MODEL.LOAD_PROPOSALS
198
+ else None,
199
+ )
200
+ if mapper is None:
201
+ mapper = DatasetMapper(cfg, False)
202
+ return {"dataset": dataset, "mapper": mapper, "num_workers": cfg.DATALOADER.NUM_WORKERS}
203
+
204
+
205
+ @configurable(from_config=_test_loader_from_config)
206
+ def build_detection_test_loader(dataset, *, mapper, num_workers=0):
207
+ """
208
+ Similar to `build_detection_train_loader`, but uses a batch size of 1.
209
+ This interface is experimental.
210
+
211
+ Args:
212
+ dataset (list or torch.utils.data.Dataset): a list of dataset dicts,
213
+ or a map-style pytorch dataset. They can be obtained by using
214
+ :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
215
+ mapper (callable): a callable which takes a sample (dict) from dataset
216
+ and returns the format to be consumed by the model.
217
+ When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``.
218
+ num_workers (int): number of parallel data loading workers
219
+
220
+ Returns:
221
+ DataLoader: a torch DataLoader, that loads the given detection
222
+ dataset, with test-time transformation and batching.
223
+
224
+ Examples:
225
+ ::
226
+ data_loader = build_detection_test_loader(
227
+ DatasetRegistry.get("my_test"),
228
+ mapper=DatasetMapper(...))
229
+
230
+ # or, instantiate with a CfgNode:
231
+ data_loader = build_detection_test_loader(cfg, "my_test")
232
+ """
233
+ if isinstance(dataset, list):
234
+ dataset = DatasetFromList(dataset, copy=False)
235
+ if mapper is not None:
236
+ dataset = MapDataset(dataset, mapper)
237
+ sampler = InferenceSampler(len(dataset))
238
+ # Always use 1 image per worker during inference since this is the
239
+ # standard when reporting inference time in papers.
240
+ batch_sampler = torch.utils.data.sampler.BatchSampler(sampler, 1, drop_last=False)
241
+ data_loader = torch.utils.data.DataLoader(
242
+ dataset,
243
+ num_workers=num_workers,
244
+ batch_sampler=batch_sampler,
245
+ collate_fn=trivial_batch_collator,
246
+ )
247
+ return data_loader
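
A sketch of how these loaders are typically wired together, assuming a registered dataset named `avis_dataset_val` (a placeholder; the real names are registered in `avism/data/datasets/builtin.py`) and the `AVISDatasetMapper` defined in `dataset_mapper.py` below. The base detectron2 config would also need the project-specific keys (e.g. `INPUT.SAMPLING_FRAME_NUM`) added before the mapper can be built from it.

```python
# Sketch under the stated assumptions; the dataset name is a placeholder.
from detectron2.config import get_cfg
from avism.data.build import build_detection_test_loader
from avism.data.dataset_mapper import AVISDatasetMapper

cfg = get_cfg()
# ... project-specific config keys and a model config file would normally be merged here ...

mapper = AVISDatasetMapper(cfg, is_train=False)                 # test-time mapper (no annotations)
data_loader = build_detection_test_loader(cfg, "avis_dataset_val", mapper=mapper)

for batch in data_loader:
    video = batch[0]              # batch size is fixed to 1 at test time
    print(sorted(video.keys()))   # e.g. image, audio, file_names, video_id, ...
    break
```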
avism/data/dataset_mapper.py ADDED
@@ -0,0 +1,272 @@
1
+ import copy
2
+ import logging
3
+ import random
4
+ import numpy as np
5
+ from typing import List, Union
6
+ import torch
7
+
8
+ from detectron2.config import configurable
9
+ from detectron2.structures import (
10
+ BitMasks,
11
+ Boxes,
12
+ BoxMode,
13
+ Instances,
14
+ )
15
+
16
+ from detectron2.data import detection_utils as utils
17
+ from detectron2.data import transforms as T
18
+
19
+ from .augmentation import build_augmentation
20
+
21
+ __all__ = ["AVISDatasetMapper"]
22
+
23
+
24
+ def filter_empty_instances(instances, by_box=True, by_mask=True, box_threshold=1e-5):
25
+ """
26
+ Filter out empty instances in an `Instances` object.
27
+
28
+ Args:
29
+ instances (Instances):
30
+ by_box (bool): whether to filter out instances with empty boxes
31
+ by_mask (bool): whether to filter out instances with empty masks
32
+ box_threshold (float): minimum width and height to be considered non-empty
33
+
34
+ Returns:
35
+ Instances: the filtered instances.
36
+ """
37
+ assert by_box or by_mask
38
+ r = []
39
+ if by_box:
40
+ r.append(instances.gt_boxes.nonempty(threshold=box_threshold))
41
+ if instances.has("gt_masks") and by_mask:
42
+ r.append(instances.gt_masks.nonempty())
43
+ r.append(instances.gt_classes != -1)
44
+
45
+ if not r:
46
+ return instances
47
+ m = r[0]
48
+ for x in r[1:]:
49
+ m = m & x
50
+
51
+ instances.gt_ids[~m] = -1
52
+ return instances
53
+
54
+
55
+ def _get_dummy_anno(num_classes):
56
+ return {
57
+ "iscrowd": 0,
58
+ "category_id": num_classes,
59
+ "id": -1,
60
+ "bbox": np.array([0, 0, 0, 0]),
61
+ "bbox_mode": BoxMode.XYXY_ABS,
62
+ "segmentation": [np.array([0.0] * 6)]
63
+ }
64
+
65
+
66
+ def avis_annotations_to_instances(annos, image_size):
67
+ """
68
+ Create an :class:`Instances` object used by the models,
69
+ from instance annotations in the dataset dict.
70
+
71
+ Args:
72
+ annos (list[dict]): a list of instance annotations in one image, each
73
+ element for one instance.
74
+ image_size (tuple): height, width
75
+
76
+ Returns:
77
+ Instances:
78
+ It will contain fields "gt_boxes", "gt_classes", "gt_ids",
79
+ "gt_masks", if they can be obtained from `annos`.
80
+ This is the format that builtin models expect.
81
+ """
82
+ boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos]
83
+ target = Instances(image_size)
84
+ target.gt_boxes = Boxes(boxes)
85
+
86
+ classes = [int(obj["category_id"]) for obj in annos]
87
+ classes = torch.tensor(classes, dtype=torch.int64)
88
+ target.gt_classes = classes
89
+
90
+ ids = [int(obj["id"]) for obj in annos]
91
+ ids = torch.tensor(ids, dtype=torch.int64)
92
+ target.gt_ids = ids
93
+
94
+ if len(annos) and "segmentation" in annos[0]:
95
+ segms = [obj["segmentation"] for obj in annos]
96
+ masks = []
97
+ for segm in segms:
98
+ assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
99
+ segm.ndim
100
+ )
101
+ # mask array
102
+ masks.append(segm)
103
+ # torch.from_numpy does not support array with negative stride.
104
+ masks = BitMasks(
105
+ torch.stack([torch.from_numpy(np.ascontiguousarray(x)) for x in masks])
106
+ )
107
+ target.gt_masks = masks
108
+
109
+ return target
110
+
111
+
112
+ class AVISDatasetMapper:
113
+ """
114
+ A callable which takes a dataset dict in AVIS Dataset format,
115
+ and maps it into a format used by the model.
116
+ """
117
+
118
+ @configurable
119
+ def __init__(
120
+ self,
121
+ is_train: bool,
122
+ *,
123
+ augmentations: List[Union[T.Augmentation, T.Transform]],
124
+ image_format: str,
125
+ use_instance_mask: bool = False,
126
+ sampling_frame_num: int = 2,
127
+ sampling_frame_range: int = 5,
128
+ sampling_frame_shuffle: bool = False,
129
+ num_classes: int = 26,
130
+ ):
131
+ """
132
+ NOTE: this interface is experimental.
133
+ Args:
134
+ is_train: whether it's used in training or inference
135
+ augmentations: a list of augmentations or deterministic transforms to apply
136
+ image_format: an image format supported by :func:`detection_utils.read_image`.
137
+ use_instance_mask: whether to process instance segmentation annotations, if available
138
+ """
139
+ # fmt: off
140
+ self.is_train = is_train
141
+ self.augmentations = T.AugmentationList(augmentations)
142
+ self.image_format = image_format
143
+ self.use_instance_mask = use_instance_mask
144
+ self.sampling_frame_num = sampling_frame_num
145
+ self.sampling_frame_range = sampling_frame_range
146
+ self.sampling_frame_shuffle = sampling_frame_shuffle
147
+ self.num_classes = num_classes
148
+ # fmt: on
149
+ logger = logging.getLogger(__name__)
150
+ mode = "training" if is_train else "inference"
151
+ logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}")
152
+
153
+ @classmethod
154
+ def from_config(cls, cfg, is_train: bool = True):
155
+ augs = build_augmentation(cfg, is_train)
156
+
157
+ sampling_frame_num = cfg.INPUT.SAMPLING_FRAME_NUM
158
+ sampling_frame_range = cfg.INPUT.SAMPLING_FRAME_RANGE
159
+ sampling_frame_shuffle = cfg.INPUT.SAMPLING_FRAME_SHUFFLE
160
+
161
+ ret = {
162
+ "is_train": is_train,
163
+ "augmentations": augs,
164
+ "image_format": cfg.INPUT.FORMAT,
165
+ "use_instance_mask": cfg.MODEL.MASK_ON,
166
+ "sampling_frame_num": sampling_frame_num,
167
+ "sampling_frame_range": sampling_frame_range,
168
+ "sampling_frame_shuffle": sampling_frame_shuffle,
169
+ "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
170
+ }
171
+
172
+ return ret
173
+
174
+ def __call__(self, dataset_dict):
175
+ """
176
+ Args:
177
+ dataset_dict (dict): Metadata of one video, in AVIS Dataset format.
178
+
179
+ Returns:
180
+ dict: a format that builtin models in detectron2 accept
181
+ """
182
+ # TODO consider examining below deepcopy as it costs huge amount of computations.
183
+ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
184
+
185
+ video_length = dataset_dict["length"]
186
+ if self.is_train:
187
+ ref_frame = random.randrange(video_length)
188
+
189
+ start_idx = max(0, ref_frame-self.sampling_frame_range)
190
+ end_idx = min(video_length, ref_frame+self.sampling_frame_range + 1)
191
+
192
+ selected_idx = np.random.choice(
193
+ np.array(list(range(start_idx, ref_frame)) + list(range(ref_frame+1, end_idx))),
194
+ self.sampling_frame_num - 1,
195
+ )
196
+ selected_idx = selected_idx.tolist() + [ref_frame]
197
+ selected_idx = sorted(selected_idx)
198
+ if self.sampling_frame_shuffle:
199
+ random.shuffle(selected_idx)
200
+ else:
201
+ selected_idx = range(video_length)
202
+
203
+ video_annos = dataset_dict.pop("annotations", None)
204
+ file_names = dataset_dict.pop("file_names", None)
205
+ audio_feats = dataset_dict.pop("audio", None)
206
+
207
+ if self.is_train:
208
+ _ids = set()
209
+ for frame_idx in selected_idx:
210
+ _ids.update([anno["id"] for anno in video_annos[frame_idx]])
211
+ ids = dict()
212
+ for i, _id in enumerate(_ids):
213
+ ids[_id] = i
214
+
215
+ dataset_dict["image"] = []
216
+ dataset_dict["instances"] = []
217
+ dataset_dict["file_names"] = []
218
+ dataset_dict["audio"] = []
219
+ dataset_dict["frame_idx"] = list(selected_idx)
220
+ for frame_idx in selected_idx:
221
+ dataset_dict["file_names"].append(file_names[frame_idx])
222
+ dataset_dict["audio"].append(audio_feats[frame_idx])
223
+
224
+ # Read image
225
+ image = utils.read_image(file_names[frame_idx], format=self.image_format)
226
+ utils.check_image_size(dataset_dict, image)
227
+
228
+ aug_input = T.AugInput(image)
229
+ transforms = self.augmentations(aug_input)
230
+ image = aug_input.image
231
+
232
+ image_shape = image.shape[:2] # h, w
233
+ # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
234
+ # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
235
+ # Therefore it's important to use torch.Tensor.
236
+ dataset_dict["image"].append(torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))))
237
+
238
+ if (video_annos is None) or (not self.is_train):
239
+ continue
240
+
241
+ # NOTE copy() is to prevent annotations getting changed from applying augmentations
242
+ _frame_annos = []
243
+ for anno in video_annos[frame_idx]:
244
+ _anno = {}
245
+ for k, v in anno.items():
246
+ _anno[k] = copy.deepcopy(v)
247
+ _frame_annos.append(_anno)
248
+
249
+ # USER: Implement additional transformations if you have other types of data
250
+ annos = [
251
+ utils.transform_instance_annotations(obj, transforms, image_shape)
252
+ for obj in _frame_annos
253
+ if obj.get("iscrowd", 0) == 0
254
+ ]
255
+ sorted_annos = [_get_dummy_anno(self.num_classes) for _ in range(len(ids))]
256
+
257
+ for _anno in annos:
258
+ idx = ids[_anno["id"]]
259
+ sorted_annos[idx] = _anno
260
+ _gt_ids = [_anno["id"] for _anno in sorted_annos]
261
+
262
+ instances = utils.annotations_to_instances(sorted_annos, image_shape, mask_format="bitmask")
263
+ instances.gt_ids = torch.tensor(_gt_ids)
264
+ if instances.has("gt_masks"):
265
+ instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
266
+ instances = filter_empty_instances(instances)
267
+ else:
268
+ instances.gt_masks = BitMasks(torch.empty((0, *image_shape)))
269
+ dataset_dict["instances"].append(instances)
270
+
271
+ return dataset_dict
272
+
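
The training branch of `__call__` above samples a small clip rather than the whole video: it draws a reference frame, then picks `sampling_frame_num - 1` further frames from a window of `sampling_frame_range` frames on either side. A standalone sketch with made-up values, mirroring that logic:

```python
# Mirrors the frame-sampling logic of AVISDatasetMapper.__call__ (training branch).
import random
import numpy as np

video_length = 60          # made-up video length
sampling_frame_num = 2     # total frames per training sample (cfg.INPUT.SAMPLING_FRAME_NUM)
sampling_frame_range = 5   # window radius around the reference frame

ref_frame = random.randrange(video_length)
start_idx = max(0, ref_frame - sampling_frame_range)
end_idx = min(video_length, ref_frame + sampling_frame_range + 1)

# Draw the remaining frames from the window, excluding the reference frame itself.
selected_idx = np.random.choice(
    np.array(list(range(start_idx, ref_frame)) + list(range(ref_frame + 1, end_idx))),
    sampling_frame_num - 1,
)
selected_idx = sorted(selected_idx.tolist() + [ref_frame])
print(ref_frame, selected_idx)   # e.g. 17 [15, 17]
```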
avism/data/datasets/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from . import builtin # ensure the builtin datasets are registered
2
+
3
+ __all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")]
avism/data/datasets/avis.py ADDED
@@ -0,0 +1,209 @@
1
+ import contextlib
2
+ import io
3
+ import logging
4
+ import numpy as np
5
+ import os
6
+ import pycocotools.mask as mask_util
7
+ from fvcore.common.file_io import PathManager
8
+ from fvcore.common.timer import Timer
9
+
10
+ from detectron2.structures import Boxes, BoxMode, PolygonMasks
11
+ from detectron2.data import DatasetCatalog, MetadataCatalog
12
+
13
+ from .avis_api.avos import AVOS
14
+
15
+
16
+ """
17
+ This file contains functions to parse AVIS dataset of
18
+ COCO-format annotations into dicts in "Detectron2 format".
19
+ """
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ __all__ = ["load_avis_json", "register_avis_instances"]
24
+
25
+
26
+ AVIS_CATEGORIES = [
27
+ {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"},
28
+ {"color": [0, 82, 0], "isthing": 1, "id": 2, "name": "violin"},
29
+ {"color": [119, 11, 32], "isthing": 1, "id": 3, "name": "guitar"},
30
+ {"color": [165, 42, 42], "isthing": 1, "id": 4, "name": "cello"},
31
+ {"color": [134, 134, 103], "isthing": 1, "id": 5, "name": "flute"},
32
+ {"color": [0, 0, 142], "isthing": 1, "id": 6, "name": "piano"},
33
+ {"color": [255, 109, 65], "isthing": 1, "id": 7, "name": "ukulele"},
34
+ {"color": [0, 226, 252], "isthing": 1, "id": 8, "name": "accordion"},
35
+ {"color": [5, 121, 0], "isthing": 1, "id": 9, "name": "guzheng"},
36
+ {"color": [0, 60, 100], "isthing": 1, "id": 10, "name": "clarinet"},
37
+ {"color": [250, 170, 30], "isthing": 1, "id": 11, "name": "cat"},
38
+ {"color": [100, 170, 30], "isthing": 1, "id": 12, "name": "car"},
39
+ {"color": [179, 0, 194], "isthing": 1, "id": 13, "name": "saxophone"},
40
+ {"color": [255, 77, 255], "isthing": 1, "id": 14, "name": "dog"},
41
+ {"color": [120, 166, 157], "isthing": 1, "id": 15, "name": "lawn_mover"},
42
+ {"color": [73, 77, 174], "isthing": 1, "id": 16, "name": "tuba"},
43
+ {"color": [0, 80, 100], "isthing": 1, "id": 17, "name": "banjo"},
44
+ {"color": [182, 182, 255], "isthing": 1, "id": 18, "name": "pipa"},
45
+ {"color": [0, 143, 149], "isthing": 1, "id": 19, "name": "bassoon"},
46
+ {"color": [174, 57, 255], "isthing": 1, "id": 20, "name": "airplane"},
47
+ {"color": [0, 0, 230], "isthing": 1, "id": 21, "name": "tree_harvester"},
48
+ {"color": [72, 0, 118], "isthing": 1, "id": 22, "name": "trumpet"},
49
+ {"color": [255, 179, 240], "isthing": 1, "id": 23, "name": "lion"},
50
+ {"color": [0, 125, 92], "isthing": 1, "id": 24, "name": "bass"},
51
+ {"color": [209, 0, 151], "isthing": 1, "id": 25, "name": "erhu"},
52
+ {"color": [188, 208, 182], "isthing": 1, "id": 26, "name": "horse"}]
53
+
54
+
55
+ def _get_avis_instances_meta():
56
+ thing_ids = [k["id"] for k in AVIS_CATEGORIES if k["isthing"] == 1]
57
+ thing_colors = [k["color"] for k in AVIS_CATEGORIES if k["isthing"] == 1]
58
+ assert len(thing_ids) == 26, len(thing_ids)
59
+ # Mapping from the incontiguous AVIS category id to an id in [0, 25]
60
+ thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)}
61
+ thing_classes = [k["name"] for k in AVIS_CATEGORIES if k["isthing"] == 1]
62
+ ret = {
63
+ "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
64
+ "thing_classes": thing_classes,
65
+ "thing_colors": thing_colors,
66
+ }
67
+ return ret
68
+
69
+
70
+ def load_avis_json(json_file, image_root, dataset_name=None, extra_annotation_keys=None):
71
+
72
+ timer = Timer()
73
+ json_file = PathManager.get_local_path(json_file)
74
+ with contextlib.redirect_stdout(io.StringIO()):
75
+ avis_api = AVOS(json_file)
76
+ if timer.seconds() > 1:
77
+ logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
78
+
79
+ id_map = None
80
+ if dataset_name is not None:
81
+ meta = MetadataCatalog.get(dataset_name)
82
+ cat_ids = sorted(avis_api.getCatIds())
83
+ cats = avis_api.loadCats(cat_ids)
84
+ # The categories in a custom json file may not be sorted.
85
+ thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])]
86
+ meta.thing_classes = thing_classes
87
+
88
+ # It works by looking at the "categories" field in the json, therefore
89
+ # if users' own json also have incontiguous ids, we'll
90
+ # apply this mapping as well but print a warning.
91
+ if not (min(cat_ids) == 1 and max(cat_ids) == len(cat_ids)):
92
+ if "coco" not in dataset_name:
93
+ logger.warning(
94
+ """
95
+ Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you.
96
+ """
97
+ )
98
+ id_map = {v: i for i, v in enumerate(cat_ids)}
99
+ meta.thing_dataset_id_to_contiguous_id = id_map
100
+
101
+ # sort indices for reproducible results
102
+ vid_ids = sorted(avis_api.vids.keys())
103
+ vids = avis_api.loadVids(vid_ids)
104
+
105
+ anns = [avis_api.vidToAnns[vid_id] for vid_id in vid_ids]
106
+ total_num_valid_anns = sum([len(x) for x in anns])
107
+ total_num_anns = len(avis_api.anns)
108
+ if total_num_valid_anns < total_num_anns:
109
+ logger.warning(
110
+ f"{json_file} contains {total_num_anns} annotations, but only "
111
+ f"{total_num_valid_anns} of them match to images in the file."
112
+ )
113
+
114
+ vids_anns = list(zip(vids, anns))
115
+ logger.info("Loaded {} videos in AVIS format from {}".format(len(vids_anns), json_file))
116
+
117
+ dataset_dicts = []
118
+
119
+ ann_keys = ["iscrowd", "category_id", "id"] + (extra_annotation_keys or [])
120
+
121
+ num_instances_without_valid_segmentation = 0
122
+
123
+ for (vid_dict, anno_dict_list) in vids_anns:
124
+ record = {}
125
+ record["file_names"] = [os.path.join(image_root, vid_dict["file_names"][i]) for i in range(vid_dict["length"])]
126
+ record["height"] = vid_dict["height"]
127
+ record["width"] = vid_dict["width"]
128
+ record["length"] = vid_dict["length"]
129
+ video_id = record["video_id"] = vid_dict["id"]
130
+
131
+ video_objs = []
132
+ for frame_idx in range(record["length"]):
133
+ frame_objs = []
134
+ for anno in anno_dict_list:
135
+ assert anno["video_id"] == video_id
136
+
137
+ obj = {key: anno[key] for key in ann_keys if key in anno}
138
+
139
+ _bboxes = anno.get("bboxes", None)
140
+ _segm = anno.get("segmentations", None)
141
+
142
+ if not (_bboxes and _segm and _bboxes[frame_idx] and _segm[frame_idx]):
143
+ continue
144
+
145
+ bbox = _bboxes[frame_idx]
146
+ segm = _segm[frame_idx]
147
+
148
+ obj["bbox"] = bbox
149
+ obj["bbox_mode"] = BoxMode.XYWH_ABS
150
+
151
+ if isinstance(segm, dict):
152
+ if isinstance(segm["counts"], list):
153
+ # convert to compressed RLE
154
+ segm = mask_util.frPyObjects(segm, *segm["size"])
155
+ elif segm:
156
+ # filter out invalid polygons (< 3 points)
157
+ segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
158
+ if len(segm) == 0:
159
+ num_instances_without_valid_segmentation += 1
160
+ continue # ignore this instance
161
+ obj["segmentation"] = segm
162
+
163
+ if id_map:
164
+ obj["category_id"] = id_map[obj["category_id"]]
165
+ frame_objs.append(obj)
166
+ video_objs.append(frame_objs)
167
+ record["annotations"] = video_objs
168
+
169
+ # audio:
170
+ audio_feats_pth = os.path.join(image_root[:-10], "FEATAudios", vid_dict['file_names'][0].split("/")[0] + '.npy')
171
+ record["audio"] = np.load(audio_feats_pth)
172
+
173
+ dataset_dicts.append(record)
174
+
175
+ if num_instances_without_valid_segmentation > 0:
176
+ logger.warning(
177
+ "Filtered out {} instances without valid segmentation. ".format(
178
+ num_instances_without_valid_segmentation
179
+ )
180
+ + "There might be issues in your dataset generation process. "
181
+ "A valid polygon should be a list[float] with even length >= 6."
182
+ )
183
+ return dataset_dicts
184
+
185
+
186
+ def register_avis_instances(name, metadata, json_file, image_root):
187
+ """
188
+ Register a dataset in AVIS's json annotation format for
189
+ instance tracking.
190
+
191
+ Args:
192
+ name (str): the name that identifies a dataset, e.g. "avis_train".
193
+ metadata (dict): extra metadata associated with this dataset. You can
194
+ leave it as an empty dict.
195
+ json_file (str): path to the json instance annotation file.
196
+ image_root (str or path-like): directory which contains all the images.
197
+ """
198
+ assert isinstance(name, str), name
199
+ assert isinstance(json_file, (str, os.PathLike)), json_file
200
+ assert isinstance(image_root, (str, os.PathLike)), image_root
201
+ # 1. register a function which returns dicts
202
+ DatasetCatalog.register(name, lambda: load_avis_json(json_file, image_root, name))
203
+
204
+ # 2. Optionally, add metadata about this dataset,
205
+ # since they might be useful in evaluation, visualization or logging
206
+ MetadataCatalog.get(name).set(
207
+ json_file=json_file, image_root=image_root, evaluator_type="avis", **metadata
208
+ )
209
+
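
A hypothetical registration sketch for the function above. The dataset name and paths are placeholders (the real registrations live in `avism/data/datasets/builtin.py`); note that `load_avis_json` derives the audio-feature path from the image root, so the image root is expected to end in `JPEGImages` with a sibling `FEATAudios` directory.

```python
# Placeholder name and paths; calling DatasetCatalog.get() triggers load_avis_json().
from detectron2.data import DatasetCatalog, MetadataCatalog
from avism.data.datasets.avis import register_avis_instances, _get_avis_instances_meta

register_avis_instances(
    "avis_demo_train",                       # placeholder dataset name
    _get_avis_instances_meta(),
    "datasets/AVIS/train.json",              # placeholder annotation file
    "datasets/AVIS/train/JPEGImages",        # placeholder image root (sibling FEATAudios expected)
)

dicts = DatasetCatalog.get("avis_demo_train")
print(len(dicts), dicts[0]["length"], dicts[0]["audio"].shape)
print(MetadataCatalog.get("avis_demo_train").thing_classes[:5])
```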
avism/data/datasets/avis_api/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
avism/data/datasets/avis_api/avos.py ADDED
@@ -0,0 +1,277 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+
3
+ # The following API functions are defined:
4
+ # AVOS - AVOS api class that loads AVIS annotation file and prepare data structures.
5
+ # decodeMask - Decode binary mask M encoded via run-length encoding.
6
+ # encodeMask - Encode binary mask M using run-length encoding.
7
+ # getAnnIds - Get ann ids that satisfy given filter conditions.
8
+ # getCatIds - Get cat ids that satisfy given filter conditions.
9
+ # getImgIds - Get img ids that satisfy given filter conditions.
10
+ # loadAnns - Load anns with the specified ids.
11
+ # loadCats - Load cats with the specified ids.
12
+ # loadImgs - Load imgs with the specified ids.
13
+ # annToMask - Convert segmentation in an annotation to binary mask.
14
+ # loadRes - Load algorithm results and create API for accessing them.
15
+
16
+ import json
17
+ import time
18
+ import numpy as np
19
+ import copy
20
+ import itertools
21
+ from pycocotools import mask as maskUtils
22
+ from collections import defaultdict
23
+ import sys
24
+ PYTHON_VERSION = sys.version_info[0]
25
+ if PYTHON_VERSION == 2:
26
+ from urllib import urlretrieve
27
+ elif PYTHON_VERSION == 3:
28
+ from urllib.request import urlretrieve
29
+
30
+
31
+ def _isArrayLike(obj):
32
+ return hasattr(obj, '__iter__') and hasattr(obj, '__len__')
33
+
34
+
35
+ class AVOS:
36
+ def __init__(self, annotation_file=None):
37
+ """
38
+ Constructor of the AVOS helper class (a COCO-style API) for reading and visualizing annotations.
39
+ :param annotation_file (str): location of annotation file
40
+ :param image_folder (str): location to the folder that hosts images.
41
+ :return:
42
+ """
43
+ # load dataset
44
+ self.dataset,self.anns,self.cats,self.vids = dict(),dict(),dict(),dict()
45
+ self.vidToAnns, self.catToVids = defaultdict(list), defaultdict(list)
46
+ if not annotation_file == None:
47
+ print('loading annotations into memory...')
48
+ tic = time.time()
49
+ dataset = json.load(open(annotation_file, 'r'))
50
+ assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset))
51
+ print('Done (t={:0.2f}s)'.format(time.time()- tic))
52
+ self.dataset = dataset
53
+ self.createIndex()
54
+
55
+ def createIndex(self):
56
+ # create index
57
+ print('creating index...')
58
+ anns, cats, vids = {}, {}, {}
59
+ vidToAnns,catToVids = defaultdict(list),defaultdict(list)
60
+ if 'annotations' in self.dataset:
61
+ for ann in self.dataset['annotations']:
62
+ vidToAnns[ann['video_id']].append(ann)
63
+ anns[ann['id']] = ann
64
+
65
+ if 'videos' in self.dataset:
66
+ for vid in self.dataset['videos']:
67
+ vids[vid['id']] = vid
68
+
69
+ if 'categories' in self.dataset:
70
+ for cat in self.dataset['categories']:
71
+ cats[cat['id']] = cat
72
+
73
+ if 'annotations' in self.dataset and 'categories' in self.dataset:
74
+ for ann in self.dataset['annotations']:
75
+ catToVids[ann['category_id']].append(ann['video_id'])
76
+
77
+ print('index created!')
78
+
79
+ # create class members
80
+ self.anns = anns
81
+ self.vidToAnns = vidToAnns
82
+ self.catToVids = catToVids
83
+ self.vids = vids
84
+ self.cats = cats
85
+
86
+ def info(self):
87
+ """
88
+ Print information about the annotation file.
89
+ :return:
90
+ """
91
+ for key, value in self.dataset['info'].items():
92
+ print('{}: {}'.format(key, value))
93
+
94
+ def getAnnIds(self, vidIds=[], catIds=[], areaRng=[], iscrowd=None):
95
+ """
96
+ Get ann ids that satisfy given filter conditions. default skips that filter
97
+ :param vidIds (int array) : get anns for given vids
98
+ catIds (int array) : get anns for given cats
99
+ areaRng (float array) : get anns for given area range (e.g. [0 inf])
100
+ iscrowd (boolean) : get anns for given crowd label (False or True)
101
+ :return: ids (int array) : integer array of ann ids
102
+ """
103
+ vidIds = vidIds if _isArrayLike(vidIds) else [vidIds]
104
+ catIds = catIds if _isArrayLike(catIds) else [catIds]
105
+
106
+ if len(vidIds) == len(catIds) == len(areaRng) == 0:
107
+ anns = self.dataset['annotations']
108
+ else:
109
+ if not len(vidIds) == 0:
110
+ lists = [self.vidToAnns[vidId] for vidId in vidIds if vidId in self.vidToAnns]
111
+ anns = list(itertools.chain.from_iterable(lists))
112
+ else:
113
+ anns = self.dataset['annotations']
114
+ anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds]
115
+ anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['avg_area'] > areaRng[0] and ann['avg_area'] < areaRng[1]]
116
+ if not iscrowd == None:
117
+ ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd]
118
+ else:
119
+ ids = [ann['id'] for ann in anns]
120
+ return ids
121
+
122
+ def getCatIds(self, catNms=[], supNms=[], catIds=[]):
123
+ """
124
+ filtering parameters. default skips that filter.
125
+ :param catNms (str array) : get cats for given cat names
126
+ :param supNms (str array) : get cats for given supercategory names
127
+ :param catIds (int array) : get cats for given cat ids
128
+ :return: ids (int array) : integer array of cat ids
129
+ """
130
+ catNms = catNms if _isArrayLike(catNms) else [catNms]
131
+ supNms = supNms if _isArrayLike(supNms) else [supNms]
132
+ catIds = catIds if _isArrayLike(catIds) else [catIds]
133
+
134
+ if len(catNms) == len(supNms) == len(catIds) == 0:
135
+ cats = self.dataset['categories']
136
+ else:
137
+ cats = self.dataset['categories']
138
+ cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms]
139
+ cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms]
140
+ cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds]
141
+ ids = [cat['id'] for cat in cats]
142
+ return ids
143
+
144
+ def getVidIds(self, vidIds=[], catIds=[]):
145
+ '''
146
+ Get vid ids that satisfy given filter conditions.
147
+ :param vidIds (int array) : get vids for given ids
148
+ :param catIds (int array) : get vids with all given cats
149
+ :return: ids (int array) : integer array of vid ids
150
+ '''
151
+ vidIds = vidIds if _isArrayLike(vidIds) else [vidIds]
152
+ catIds = catIds if _isArrayLike(catIds) else [catIds]
153
+
154
+ if len(vidIds) == len(catIds) == 0:
155
+ ids = self.vids.keys()
156
+ else:
157
+ ids = set(vidIds)
158
+ for i, catId in enumerate(catIds):
159
+ if i == 0 and len(ids) == 0:
160
+ ids = set(self.catToVids[catId])
161
+ else:
162
+ ids &= set(self.catToVids[catId])
163
+ return list(ids)
164
+
165
+ def loadAnns(self, ids=[]):
166
+ """
167
+ Load anns with the specified ids.
168
+ :param ids (int array) : integer ids specifying anns
169
+ :return: anns (object array) : loaded ann objects
170
+ """
171
+ if _isArrayLike(ids):
172
+ return [self.anns[id] for id in ids]
173
+ elif type(ids) == int:
174
+ return [self.anns[ids]]
175
+
176
+ def loadCats(self, ids=[]):
177
+ """
178
+ Load cats with the specified ids.
179
+ :param ids (int array) : integer ids specifying cats
180
+ :return: cats (object array) : loaded cat objects
181
+ """
182
+ if _isArrayLike(ids):
183
+ return [self.cats[id] for id in ids]
184
+ elif type(ids) == int:
185
+ return [self.cats[ids]]
186
+
187
+ def loadVids(self, ids=[]):
188
+ """
189
+ Load vids with the specified ids.
190
+ :param ids (int array) : integer ids specifying vid
191
+ :return: vids (object array) : loaded vid objects
192
+ """
193
+ if _isArrayLike(ids):
194
+ return [self.vids[id] for id in ids]
195
+ elif type(ids) == int:
196
+ return [self.vids[ids]]
197
+
198
+
199
+ def loadRes(self, resFile):
200
+ """
201
+ Load result file and return a result api object.
202
+ :param resFile (str) : file name of result file
203
+ :return: res (obj) : result api object
204
+ """
205
+ res = AVOS()
206
+ res.dataset['videos'] = [img for img in self.dataset['videos']]
207
+
208
+ print('Loading and preparing results...')
209
+ tic = time.time()
210
+ if type(resFile) == str or (PYTHON_VERSION == 2 and type(resFile) == unicode):
211
+ anns = json.load(open(resFile))
212
+ elif type(resFile) == np.ndarray:
213
+ anns = self.loadNumpyAnnotations(resFile)
214
+ else:
215
+ anns = resFile
216
+ assert type(anns) == list, 'results is not an array of objects'
217
+ annsVidIds = [ann['video_id'] for ann in anns]
218
+ assert set(annsVidIds) == (set(annsVidIds) & set(self.getVidIds())), \
219
+ 'Results do not correspond to current coco set'
220
+ if 'segmentations' in anns[0]:
221
+ res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
222
+ for id, ann in enumerate(anns):
223
+ ann['areas'] = []
224
+ if not 'bboxes' in ann:
225
+ ann['bboxes'] = []
226
+ for seg in ann['segmentations']:
227
+ # now only support compressed RLE format as segmentation results
228
+ if seg:
229
+ ann['areas'].append(maskUtils.area(seg))
230
+ if len(ann['bboxes']) < len(ann['areas']):
231
+ ann['bboxes'].append(maskUtils.toBbox(seg))
232
+ else:
233
+ ann['areas'].append(None)
234
+ if len(ann['bboxes']) < len(ann['areas']):
235
+ ann['bboxes'].append(None)
236
+ ann['id'] = id+1
237
+ l = [a for a in ann['areas'] if a]
238
+ if len(l)==0:
239
+ ann['avg_area'] = 0
240
+ else:
241
+ ann['avg_area'] = np.array(l).mean()
242
+ ann['iscrowd'] = 0
243
+ print('DONE (t={:0.2f}s)'.format(time.time()- tic))
244
+
245
+ res.dataset['annotations'] = anns
246
+ res.createIndex()
247
+ return res
248
+
249
+ def annToRLE(self, ann, frameId):
250
+ """
251
+ Convert annotation which can be polygons, uncompressed RLE to RLE.
252
+ :return: binary mask (numpy 2D array)
253
+ """
254
+ t = self.vids[ann['video_id']]
255
+ h, w = t['height'], t['width']
256
+ segm = ann['segmentations'][frameId]
257
+ if type(segm) == list:
258
+ # polygon -- a single object might consist of multiple parts
259
+ # we merge all parts into one mask rle code
260
+ rles = maskUtils.frPyObjects(segm, h, w)
261
+ rle = maskUtils.merge(rles)
262
+ elif type(segm['counts']) == list:
263
+ # uncompressed RLE
264
+ rle = maskUtils.frPyObjects(segm, h, w)
265
+ else:
266
+ # rle
267
+ rle = segm
268
+ return rle
269
+
270
+ def annToMask(self, ann, frameId):
271
+ """
272
+ Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask.
273
+ :return: binary mask (numpy 2D array)
274
+ """
275
+ rle = self.annToRLE(ann, frameId)
276
+ m = maskUtils.decode(rle)
277
+ return m
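
A minimal sketch of the `AVOS` helper API defined above; the annotation path is a placeholder for any AVIS-format ground-truth json.

```python
# Placeholder path; demonstrates the id/load/decode round trip of the AVOS API.
from avism.data.datasets.avis_api.avos import AVOS

gt = AVOS("datasets/AVIS/train.json")            # placeholder annotation file

vid_ids = gt.getVidIds()
ann_ids = gt.getAnnIds(vidIds=vid_ids[:1])       # annotations of the first video
anns = gt.loadAnns(ann_ids)

# Decode the first annotation's mask in its first annotated frame.
ann = anns[0]
frame_id = next(i for i, s in enumerate(ann['segmentations']) if s)
mask = gt.annToMask(ann, frame_id)               # 2D numpy array of 0/1
print(mask.shape, mask.sum())
```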
avism/data/datasets/avis_api/avoseval.py ADDED
@@ -0,0 +1,559 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+
3
+ import numpy as np
4
+ import datetime
5
+ import time
6
+ from collections import defaultdict
7
+ from pycocotools import mask as maskUtils
8
+ import copy
9
+
10
+ class AVOSeval:
11
+ # Interface for evaluating video instance segmentation on the AVIS dataset.
12
+ #
13
+ # The usage for AVOSeval is as follows:
14
+ # cocoGt=..., cocoDt=... # load dataset and results
15
+ # E = AVOSeval(cocoGt,cocoDt); # initialize AVOSeval object
16
+ # E.params.recThrs = ...; # set parameters as desired
17
+ # E.evaluate(); # run per image evaluation
18
+ # E.accumulate(); # accumulate per image results
19
+ # E.summarize(); # display summary metrics of results
20
+ #
21
+ # The evaluation parameters are as follows (defaults in brackets):
22
+ # imgIds - [all] N img ids to use for evaluation
23
+ # catIds - [all] K cat ids to use for evaluation
24
+ # iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation
25
+ # recThrs - [0:.01:1] R=101 recall thresholds for evaluation
26
+ # areaRng - [...] A=4 object area ranges for evaluation
27
+ # maxDets - [1 10 100] M=3 thresholds on max detections per image
28
+ # iouType - ['segm'] set iouType to 'segm', 'bbox' or 'keypoints'
29
+ # iouType replaced the now DEPRECATED useSegm parameter.
30
+ # useCats - [1] if true use category labels for evaluation
31
+ # Note: if useCats=0 category labels are ignored as in proposal scoring.
32
+ # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified.
33
+ #
34
+ # evaluate(): evaluates detections on every image and every category and
35
+ # concats the results into the "evalImgs" with fields:
36
+ # dtIds - [1xD] id for each of the D detections (dt)
37
+ # gtIds - [1xG] id for each of the G ground truths (gt)
38
+ # dtMatches - [TxD] matching gt id at each IoU or 0
39
+ # gtMatches - [TxG] matching dt id at each IoU or 0
40
+ # dtScores - [1xD] confidence of each dt
41
+ # gtIgnore - [1xG] ignore flag for each gt
42
+ # dtIgnore - [TxD] ignore flag for each dt at each IoU
43
+ #
44
+ # accumulate(): accumulates the per-image, per-category evaluation
45
+ # results in "evalImgs" into the dictionary "eval" with fields:
46
+ # params - parameters used for evaluation
47
+ # date - date evaluation was performed
48
+ # counts - [T,R,K,A,M] parameter dimensions (see above)
49
+ # precision - [TxRxKxAxM] precision for every evaluation setting
50
+ # recall - [TxKxAxM] max recall for every evaluation setting
51
+ # Note: precision and recall==-1 for settings with no gt objects.
52
+ #
53
+ # See also coco, mask, pycocoDemo, pycocoEvalDemo
54
+ def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'):
55
+ '''
56
+ Initialize CocoEval using coco APIs for gt and dt
57
+ :param cocoGt: coco object with ground truth annotations
58
+ :param cocoDt: coco object with detection results
59
+ :return: None
60
+ '''
61
+ if not iouType:
62
+ print('iouType not specified. use default iouType segm')
63
+ self.cocoGt = cocoGt # ground truth COCO API
64
+ self.cocoDt = cocoDt # detections COCO API
65
+ self.params = {} # evaluation parameters
66
+ self.evalVids = defaultdict(list) # per-image per-category evaluation results [KxAxI] elements
67
+ self.eval = {} # accumulated evaluation results
68
+ self._gts = defaultdict(list) # gt for evaluation
69
+ self._dts = defaultdict(list) # dt for evaluation
70
+ self.params = Params(iouType=iouType) # parameters
71
+ self._paramsEval = {} # parameters for evaluation
72
+ self.stats = [] # result summarization
73
+ self.ious = {} # ious between all gts and dts
74
+ if not cocoGt is None:
75
+ self.params.vidIds = sorted(cocoGt.getVidIds())
76
+ self.params.catIds = sorted(cocoGt.getCatIds())
77
+
78
+
79
+ def _prepare(self):
80
+ '''
81
+ Prepare ._gts and ._dts for evaluation based on params
82
+ :return: None
83
+ '''
84
+ def _toMask(anns, coco):
85
+ # modify ann['segmentation'] by reference
86
+ for ann in anns:
87
+ for i, a in enumerate(ann['segmentations']):
88
+ if a:
89
+ rle = coco.annToRLE(ann, i)
90
+ ann['segmentations'][i] = rle
91
+ l = [a for a in ann['areas'] if a]
92
+ if len(l)==0:
93
+ ann['avg_area'] = 0
94
+ else:
95
+ ann['avg_area'] = np.array(l).mean()
96
+ p = self.params
97
+ if p.useCats:
98
+ gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(vidIds=p.vidIds, catIds=p.catIds))
99
+ dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(vidIds=p.vidIds, catIds=p.catIds))
100
+ else:
101
+ gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(vidIds=p.vidIds))
102
+ dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(vidIds=p.vidIds))
103
+
104
+ # convert ground truth to mask if iouType == 'segm'
105
+ if p.iouType == 'segm':
106
+ _toMask(gts, self.cocoGt)
107
+ _toMask(dts, self.cocoDt)
108
+ # set ignore flag
109
+ for gt in gts:
110
+ gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0
111
+ gt['ignore'] = 'iscrowd' in gt and gt['iscrowd']
112
+ if p.iouType == 'keypoints':
113
+ gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore']
114
+
115
+ self._gts = defaultdict(list) # gt for evaluation
116
+ self._dts = defaultdict(list) # dt for evaluation
117
+ for gt in gts:
118
+ self._gts[gt['video_id'], gt['category_id']].append(gt)
119
+ for dt in dts:
120
+ self._dts[dt['video_id'], dt['category_id']].append(dt)
121
+ self.evalVids = defaultdict(list) # per-image per-category evaluation results
122
+ self.eval = {} # accumulated evaluation results
123
+
124
+ def evaluate(self):
125
+ '''
126
+ Run per-video evaluation on the given videos and store the results (a list of dicts) in self.evalImgs
127
+ :return: None
128
+ '''
129
+ tic = time.time()
130
+ print('Running per image evaluation...')
131
+ p = self.params
132
+ # add backward compatibility if useSegm is specified in params
133
+ if not p.useSegm is None:
134
+ p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
135
+ print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
136
+ print('Evaluate annotation type *{}*'.format(p.iouType))
137
+ p.vidIds = list(np.unique(p.vidIds))
138
+ if p.useCats:
139
+ p.catIds = list(np.unique(p.catIds))
140
+ p.maxDets = sorted(p.maxDets)
141
+ self.params=p
142
+
143
+ self._prepare()
144
+ # loop through images, area range, max detection number
145
+ catIds = p.catIds if p.useCats else [-1]
146
+
147
+ if p.iouType == 'segm' or p.iouType == 'bbox':
148
+ computeIoU = self.computeIoU
149
+ elif p.iouType == 'keypoints':
150
+ computeIoU = self.computeOks
151
+ self.ious = {(vidId, catId): computeIoU(vidId, catId) \
152
+ for vidId in p.vidIds
153
+ for catId in catIds}
154
+
155
+ evaluateVid = self.evaluateVid
156
+ maxDet = p.maxDets[-1]
157
+
158
+
159
+ self.evalImgs = [evaluateVid(vidId, catId, areaRng, maxDet)
160
+ for catId in catIds
161
+ for areaRng in p.areaRng
162
+ for vidId in p.vidIds
163
+ ]
164
+ self._paramsEval = copy.deepcopy(self.params)
165
+ toc = time.time()
166
+ print('DONE (t={:0.2f}s).'.format(toc-tic))
167
+
168
+ def computeIoU(self, vidId, catId):
169
+ p = self.params
170
+ if p.useCats:
171
+ gt = self._gts[vidId,catId]
172
+ dt = self._dts[vidId,catId]
173
+ else:
174
+ gt = [_ for cId in p.catIds for _ in self._gts[vidId,cId]]
175
+ dt = [_ for cId in p.catIds for _ in self._dts[vidId,cId]]
176
+ if len(gt) == 0 and len(dt) ==0:
177
+ return []
178
+ inds = np.argsort([-d['score'] for d in dt], kind='mergesort')
179
+ dt = [dt[i] for i in inds]
180
+ if len(dt) > p.maxDets[-1]:
181
+ dt=dt[0:p.maxDets[-1]]
182
+
183
+ if p.iouType == 'segm':
184
+ g = [g['segmentations'] for g in gt]
185
+ d = [d['segmentations'] for d in dt]
186
+ elif p.iouType == 'bbox':
187
+ g = [g['bboxes'] for g in gt]
188
+ d = [d['bboxes'] for d in dt]
189
+ else:
190
+ raise Exception('unknown iouType for iou computation')
191
+
192
+ # compute iou between each dt and gt region
193
+ iscrowd = [int(o['iscrowd']) for o in gt]
194
+ #ious = maskUtils.iou(d,g,iscrowd)
195
+ def iou_seq(d_seq, g_seq):
196
+ i = .0
197
+ u = .0
198
+ for d, g in zip(d_seq, g_seq):
199
+ if d and g:
200
+ i += maskUtils.area(maskUtils.merge([d, g], True))
201
+ u += maskUtils.area(maskUtils.merge([d, g], False))
202
+ elif not d and g:
203
+ u += maskUtils.area(g)
204
+ elif d and not g:
205
+ u += maskUtils.area(d)
206
+ if not u > .0:
207
+ print("Mask sizes in video {} and category {} may not match!".format(vidId, catId))
208
+ iou = i / u if u > .0 else .0
209
+ return iou
210
+ ious = np.zeros([len(d), len(g)])
211
+ for i, j in np.ndindex(ious.shape):
212
+ ious[i, j] = iou_seq(d[i], g[j])
213
+ #print(vidId, catId, ious.shape, ious)
214
+ return ious
215
+
216
+ def computeOks(self, imgId, catId):
217
+ p = self.params
218
+ # dimension here should be Nxm
219
+ gts = self._gts[imgId, catId]
220
+ dts = self._dts[imgId, catId]
221
+ inds = np.argsort([-d['score'] for d in dts], kind='mergesort')
222
+ dts = [dts[i] for i in inds]
223
+ if len(dts) > p.maxDets[-1]:
224
+ dts = dts[0:p.maxDets[-1]]
225
+ # if len(gts) == 0 and len(dts) == 0:
226
+ if len(gts) == 0 or len(dts) == 0:
227
+ return []
228
+ ious = np.zeros((len(dts), len(gts)))
229
+ sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62,.62, 1.07, 1.07, .87, .87, .89, .89])/10.0
230
+ vars = (sigmas * 2)**2
231
+ k = len(sigmas)
232
+ # compute oks between each detection and ground truth object
233
+ for j, gt in enumerate(gts):
234
+ # create bounds for ignore regions(double the gt bbox)
235
+ g = np.array(gt['keypoints'])
236
+ xg = g[0::3]; yg = g[1::3]; vg = g[2::3]
237
+ k1 = np.count_nonzero(vg > 0)
238
+ bb = gt['bbox']
239
+ x0 = bb[0] - bb[2]; x1 = bb[0] + bb[2] * 2
240
+ y0 = bb[1] - bb[3]; y1 = bb[1] + bb[3] * 2
241
+ for i, dt in enumerate(dts):
242
+ d = np.array(dt['keypoints'])
243
+ xd = d[0::3]; yd = d[1::3]
244
+ if k1>0:
245
+ # measure the per-keypoint distance if keypoints visible
246
+ dx = xd - xg
247
+ dy = yd - yg
248
+ else:
249
+ # measure minimum distance to keypoints in (x0,y0) & (x1,y1)
250
+ z = np.zeros((k))
251
+ dx = np.max((z, x0-xd),axis=0)+np.max((z, xd-x1),axis=0)
252
+ dy = np.max((z, y0-yd),axis=0)+np.max((z, yd-y1),axis=0)
253
+ e = (dx**2 + dy**2) / vars / (gt['avg_area']+np.spacing(1)) / 2
254
+ if k1 > 0:
255
+ e=e[vg > 0]
256
+ ious[i, j] = np.sum(np.exp(-e)) / e.shape[0]
257
+ return ious
258
+
259
+ def evaluateVid(self, vidId, catId, aRng, maxDet):
260
+ '''
261
+ perform evaluation for a single category and video
262
+ :return: dict (single video results)
263
+ '''
264
+ p = self.params
265
+ if p.useCats:
266
+ gt = self._gts[vidId,catId]
267
+ dt = self._dts[vidId,catId]
268
+ else:
269
+ gt = [_ for cId in p.catIds for _ in self._gts[vidId,cId]]
270
+ dt = [_ for cId in p.catIds for _ in self._dts[vidId,cId]]
271
+ if len(gt) == 0 and len(dt) ==0:
272
+ return None
273
+
274
+ for g in gt:
275
+ if g['ignore'] or (g['avg_area']<aRng[0] or g['avg_area']>aRng[1]):
276
+ g['_ignore'] = 1
277
+ else:
278
+ g['_ignore'] = 0
279
+
280
+ # sort dt highest score first, sort gt ignore last
281
+ gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort')
282
+ gt = [gt[i] for i in gtind]
283
+ dtind = np.argsort([-d['score'] for d in dt], kind='mergesort')
284
+ dt = [dt[i] for i in dtind[0:maxDet]]
285
+ iscrowd = [int(o['iscrowd']) for o in gt]
286
+ # load computed ious
287
+ ious = self.ious[vidId, catId][:, gtind] if len(self.ious[vidId, catId]) > 0 else self.ious[vidId, catId]
288
+
289
+ T = len(p.iouThrs)
290
+ G = len(gt)
291
+ D = len(dt)
292
+ gtm = np.zeros((T,G))
293
+ dtm = np.zeros((T,D))
294
+ gtIg = np.array([g['_ignore'] for g in gt])
295
+ dtIg = np.zeros((T,D))
296
+ if not len(ious)==0:
297
+ for tind, t in enumerate(p.iouThrs):
298
+ for dind, d in enumerate(dt):
299
+ # information about best match so far (m=-1 -> unmatched)
300
+ iou = min([t,1-1e-10])
301
+ m = -1
302
+ for gind, g in enumerate(gt):
303
+ # if this gt already matched, and not a crowd, continue
304
+ if gtm[tind,gind]>0 and not iscrowd[gind]:
305
+ continue
306
+ # if dt matched to reg gt, and on ignore gt, stop
307
+ if m>-1 and gtIg[m]==0 and gtIg[gind]==1:
308
+ break
309
+ # continue to next gt unless better match made
310
+ if ious[dind,gind] < iou:
311
+ continue
312
+ # if match successful and best so far, store appropriately
313
+ iou=ious[dind,gind]
314
+ m=gind
315
+ # if match made store id of match for both dt and gt
316
+ if m ==-1:
317
+ continue
318
+ dtIg[tind,dind] = gtIg[m]
319
+ dtm[tind,dind] = gt[m]['id']
320
+ gtm[tind,m] = d['id']
321
+ # set unmatched detections outside of area range to ignore
322
+ a = np.array([d['avg_area']<aRng[0] or d['avg_area']>aRng[1] for d in dt]).reshape((1, len(dt)))
323
+ dtIg = np.logical_or(dtIg, np.logical_and(dtm==0, np.repeat(a,T,0)))
324
+ # store results for given image and category
325
+ return {
326
+ 'video_id': vidId,
327
+ 'category_id': catId,
328
+ 'aRng': aRng,
329
+ 'maxDet': maxDet,
330
+ 'dtIds': [d['id'] for d in dt],
331
+ 'gtIds': [g['id'] for g in gt],
332
+ 'dtMatches': dtm,
333
+ 'gtMatches': gtm,
334
+ 'dtScores': [d['score'] for d in dt],
335
+ 'gtIgnore': gtIg,
336
+ 'dtIgnore': dtIg,
337
+ }
338
+
339
+ def accumulate(self, p = None):
340
+ '''
341
+ Accumulate per image evaluation results and store the result in self.eval
342
+ :param p: input params for evaluation
343
+ :return: None
344
+ '''
345
+ print('Accumulating evaluation results...')
346
+ tic = time.time()
347
+ if not self.evalImgs:
348
+ print('Please run evaluate() first')
349
+ # allows input customized parameters
350
+ if p is None:
351
+ p = self.params
352
+ p.catIds = p.catIds if p.useCats == 1 else [-1]
353
+ T = len(p.iouThrs)
354
+ R = len(p.recThrs)
355
+ K = len(p.catIds) if p.useCats else 1
356
+ A = len(p.areaRng)
357
+ M = len(p.maxDets)
358
+ precision = -np.ones((T,R,K,A,M)) # -1 for the precision of absent categories
359
+ recall = -np.ones((T,K,A,M))
360
+ scores = -np.ones((T,R,K,A,M))
361
+
362
+ # create dictionary for future indexing
363
+ _pe = self._paramsEval
364
+ catIds = _pe.catIds if _pe.useCats else [-1]
365
+ setK = set(catIds)
366
+ setA = set(map(tuple, _pe.areaRng))
367
+ setM = set(_pe.maxDets)
368
+ setI = set(_pe.vidIds)
369
+ # get inds to evaluate
370
+ k_list = [n for n, k in enumerate(p.catIds) if k in setK]
371
+ m_list = [m for n, m in enumerate(p.maxDets) if m in setM]
372
+ a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA]
373
+ i_list = [n for n, i in enumerate(p.vidIds) if i in setI]
374
+ I0 = len(_pe.vidIds)
375
+ A0 = len(_pe.areaRng)
376
+ # retrieve E at each category, area range, and max number of detections
377
+ for k, k0 in enumerate(k_list):
378
+ Nk = k0*A0*I0
379
+ for a, a0 in enumerate(a_list):
380
+ Na = a0*I0
381
+ for m, maxDet in enumerate(m_list):
382
+ E = [self.evalImgs[Nk + Na + i] for i in i_list]
383
+ E = [e for e in E if not e is None]
384
+ if len(E) == 0:
385
+ continue
386
+ dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in E])
387
+
388
+ # Different sorting methods generate slightly different results.
389
+ # mergesort is used to be consistent with the Matlab implementation.
390
+ inds = np.argsort(-dtScores, kind='mergesort')
391
+ dtScoresSorted = dtScores[inds]
392
+
393
+ dtm = np.concatenate([e['dtMatches'][:,0:maxDet] for e in E], axis=1)[:,inds]
394
+ dtIg = np.concatenate([e['dtIgnore'][:,0:maxDet] for e in E], axis=1)[:,inds]
395
+ gtIg = np.concatenate([e['gtIgnore'] for e in E])
396
+ npig = np.count_nonzero(gtIg==0 )
397
+ if npig == 0:
398
+ continue
399
+ tps = np.logical_and( dtm, np.logical_not(dtIg) )
400
+ fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg) )
401
+
402
+ tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float64)
403
+ fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float64)
404
+ for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
405
+ tp = np.array(tp)
406
+ fp = np.array(fp)
407
+ nd = len(tp)
408
+ rc = tp / npig
409
+ pr = tp / (fp+tp+np.spacing(1))
410
+ q = np.zeros((R,))
411
+ ss = np.zeros((R,))
412
+
413
+ if nd:
414
+ recall[t,k,a,m] = rc[-1]
415
+ else:
416
+ recall[t,k,a,m] = 0
417
+
418
+ # numpy is slow without cython optimization for accessing elements
419
+ # use python array gets significant speed improvement
420
+ pr = pr.tolist(); q = q.tolist()
421
+
422
+ for i in range(nd-1, 0, -1):
423
+ if pr[i] > pr[i-1]:
424
+ pr[i-1] = pr[i]
425
+
426
+ inds = np.searchsorted(rc, p.recThrs, side='left')
427
+ try:
428
+ for ri, pi in enumerate(inds):
429
+ q[ri] = pr[pi]
430
+ ss[ri] = dtScoresSorted[pi]
431
+ except:
432
+ pass
433
+ precision[t,:,k,a,m] = np.array(q)
434
+ scores[t,:,k,a,m] = np.array(ss)
435
+ self.eval = {
436
+ 'params': p,
437
+ 'counts': [T, R, K, A, M],
438
+ 'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
439
+ 'precision': precision,
440
+ 'recall': recall,
441
+ 'scores': scores,
442
+ }
443
+ toc = time.time()
444
+ print('DONE (t={:0.2f}s).'.format( toc-tic))
445
+
446
+ def summarize(self):
447
+ '''
448
+ Compute and display summary metrics for evaluation results.
449
+ Note this function can *only* be applied on the default parameter setting
450
+ '''
451
+ def _summarize( ap=1, iouThr=None, areaRng='all', maxDets=100 ):
452
+ p = self.params
453
+ iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}'
454
+ titleStr = 'Average Precision' if ap == 1 else 'Average Recall'
455
+ typeStr = '(AP)' if ap==1 else '(AR)'
456
+ iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \
457
+ if iouThr is None else '{:0.2f}'.format(iouThr)
458
+
459
+ aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
460
+ mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
461
+ if ap == 1:
462
+ # dimension of precision: [TxRxKxAxM]
463
+ s = self.eval['precision']
464
+ # IoU
465
+ if iouThr is not None:
466
+ t = np.where(iouThr == p.iouThrs)[0]
467
+ s = s[t]
468
+ s = s[:,:,:,aind,mind]
469
+ else:
470
+ # dimension of recall: [TxKxAxM]
471
+ s = self.eval['recall']
472
+ if iouThr is not None:
473
+ t = np.where(iouThr == p.iouThrs)[0]
474
+ s = s[t]
475
+ s = s[:,:,aind,mind]
476
+ if len(s[s>-1])==0:
477
+ mean_s = -1
478
+ else:
479
+ mean_s = np.mean(s[s>-1])
480
+ print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s))
481
+ return mean_s
482
+ def _summarizeDets():
483
+ stats = np.zeros((12,))
484
+ stats[0] = _summarize(1)
485
+ stats[1] = _summarize(1, iouThr=.5, maxDets=self.params.maxDets[2])
486
+ stats[2] = _summarize(1, iouThr=.75, maxDets=self.params.maxDets[2])
487
+ stats[3] = _summarize(1, areaRng='small', maxDets=self.params.maxDets[2])
488
+ stats[4] = _summarize(1, areaRng='medium', maxDets=self.params.maxDets[2])
489
+ stats[5] = _summarize(1, areaRng='large', maxDets=self.params.maxDets[2])
490
+ stats[6] = _summarize(0, maxDets=self.params.maxDets[0])
491
+ stats[7] = _summarize(0, maxDets=self.params.maxDets[1])
492
+ stats[8] = _summarize(0, maxDets=self.params.maxDets[2])
493
+ stats[9] = _summarize(0, areaRng='small', maxDets=self.params.maxDets[2])
494
+ stats[10] = _summarize(0, areaRng='medium', maxDets=self.params.maxDets[2])
495
+ stats[11] = _summarize(0, areaRng='large', maxDets=self.params.maxDets[2])
496
+ return stats
497
+ def _summarizeKps():
498
+ stats = np.zeros((10,))
499
+ stats[0] = _summarize(1, maxDets=20)
500
+ stats[1] = _summarize(1, maxDets=20, iouThr=.5)
501
+ stats[2] = _summarize(1, maxDets=20, iouThr=.75)
502
+ stats[3] = _summarize(1, maxDets=20, areaRng='medium')
503
+ stats[4] = _summarize(1, maxDets=20, areaRng='large')
504
+ stats[5] = _summarize(0, maxDets=20)
505
+ stats[6] = _summarize(0, maxDets=20, iouThr=.5)
506
+ stats[7] = _summarize(0, maxDets=20, iouThr=.75)
507
+ stats[8] = _summarize(0, maxDets=20, areaRng='medium')
508
+ stats[9] = _summarize(0, maxDets=20, areaRng='large')
509
+ return stats
510
+ if not self.eval:
511
+ raise Exception('Please run accumulate() first')
512
+ iouType = self.params.iouType
513
+ if iouType == 'segm' or iouType == 'bbox':
514
+ summarize = _summarizeDets
515
+ elif iouType == 'keypoints':
516
+ summarize = _summarizeKps
517
+ self.stats = summarize()
518
+
519
+ def __str__(self):
520
+ self.summarize()
521
+
522
+ class Params:
523
+ '''
524
+ Params for coco evaluation api
525
+ '''
526
+ def setDetParams(self):
527
+ self.vidIds = []
528
+ self.catIds = []
529
+ # np.arange causes trouble. the data point on arange is slightly larger than the true value
530
+ #self.iouThrs = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True)
531
+ #self.recThrs = np.linspace(.0, 1.00, np.round((1.00 - .0) / .01) + 1, endpoint=True)
532
+ self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
533
+ self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
534
+ self.maxDets = [1, 10, 100]
535
+ self.areaRng = [[0 ** 2, 1e5 ** 2], [0 ** 2, 128 ** 2], [ 128 ** 2, 256 ** 2], [256 ** 2, 1e5 ** 2]]
536
+ self.areaRngLbl = ['all', 'small', 'medium', 'large']
537
+ self.useCats = 1
538
+
539
+ def setKpParams(self):
540
+ self.vidIds = []
541
+ self.catIds = []
542
+ # np.arange causes trouble. the data point on arange is slightly larger than the true value
543
+ self.iouThrs = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True)
544
+ self.recThrs = np.linspace(.0, 1.00, np.round((1.00 - .0) / .01) + 1, endpoint=True)
545
+ self.maxDets = [20]
546
+ self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
547
+ self.areaRngLbl = ['all', 'medium', 'large']
548
+ self.useCats = 1
549
+
550
+ def __init__(self, iouType='segm'):
551
+ if iouType == 'segm' or iouType == 'bbox':
552
+ self.setDetParams()
553
+ elif iouType == 'keypoints':
554
+ self.setKpParams()
555
+ else:
556
+ raise Exception('iouType not supported')
557
+ self.iouType = iouType
558
+ # useSegm is deprecated
559
+ self.useSegm = None
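A short driver following the usage comment at the top of `AVOSeval`. How the ground-truth and result API objects are built is left out and assumed here; this is a sketch, not part of the commit.

```python
from avism.data.datasets.avis_api.avoseval import AVOSeval

def evaluate_avis(avis_gt, avis_dt):
    """avis_gt / avis_dt: ground-truth and result API objects (built elsewhere)."""
    E = AVOSeval(avis_gt, avis_dt, iouType='segm')
    E.evaluate()      # per-video, per-category matching at the 10 IoU thresholds
    E.accumulate()    # fills E.eval['precision'] with shape [T, R, K, A, M]
    E.summarize()     # prints the 12 AP/AR numbers and stores them in E.stats
    return E.stats
```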
avism/data/datasets/builtin.py ADDED
@@ -0,0 +1,29 @@
1
+ import os
2
+
3
+ from .avis import (
4
+ register_avis_instances,
5
+ _get_avis_instances_meta,
6
+ )
7
+
8
+
9
+ # ==== Predefined splits for AVIS ===========
10
+ _PREDEFINED_SPLITS_AVIS = {
11
+ "avis_train": ("train/JPEGImages", "train.json"),
12
+ "avis_val": ("val/JPEGImages", "val.json"),
13
+ "avis_test": ("test/JPEGImages", "test.json"),
14
+ }
15
+
16
+ def register_all_avis(root):
17
+ for key, (image_root, json_file) in _PREDEFINED_SPLITS_AVIS.items():
18
+ # Assume pre-defined datasets live in `./datasets`.
19
+ register_avis_instances(
20
+ key,
21
+ _get_avis_instances_meta(),
22
+ os.path.join(root, json_file) if "://" not in json_file else json_file,
23
+ os.path.join(root, image_root),
24
+ )
25
+
26
+ if __name__.endswith(".builtin"):
27
+ # Assume pre-defined datasets live in `./datasets`.
28
+ _root = os.getenv("DETECTRON2_DATASETS", "datasets")
29
+ register_all_avis(_root)
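Once this module is imported, the three splits are available by name. A sketch of consuming a registered split through Detectron2's catalogs, assuming `register_avis_instances` follows the usual YTVIS-style registration into `DatasetCatalog`/`MetadataCatalog` and that `./datasets/train.json` exists locally:

```python
import os
os.environ.setdefault("DETECTRON2_DATASETS", "datasets")  # root assumed by builtin.py

from detectron2.data import DatasetCatalog, MetadataCatalog
import avism.data.datasets.builtin  # noqa: F401  (triggers register_all_avis)

dicts = DatasetCatalog.get("avis_train")   # list of per-video records
meta = MetadataCatalog.get("avis_train")
print(len(dicts), meta.get("thing_classes", [])[:5])
```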
avism/data/datasets/extract_audio_feat/audio_feature_extractor.py ADDED
@@ -0,0 +1,77 @@
1
+ import os
2
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
3
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1" # set gpu number
4
+ import numpy as np
5
+ import tensorflow as tf
6
+
7
+ import vggish_input
8
+ import vggish_params
9
+ import vggish_slim
10
+ import contextlib
11
+ import wave
12
+
13
+
14
+ # get audio length
15
+ def get_audio_len(audio_file):
16
+ with contextlib.closing(wave.open(audio_file, 'r')) as f:
17
+ frames = f.getnframes()
18
+ rate = f.getframerate()
19
+ wav_length = int(frames / float(rate))
20
+ return wav_length
21
+
22
+ # Paths to downloaded VGGish files.
23
+ checkpoint_path = './vggish_model.ckpt'
24
+ pca_params_path = './vggish_pca_params.npz'
25
+ freq = 1000
26
+ sr = 44100
27
+
28
+
29
+ audio_root = "./datasets/"
30
+ for subset in ["train", "val", "test"]:
31
+ print("{} ----------> ".format(subset))
32
+
33
+ audio_dir = os.path.join(audio_root, subset, "WAVAudios")
34
+ save_dir = os.path.join(audio_root, subset, "FEATAudios")
35
+ if not os.path.exists(save_dir):
36
+ os.makedirs(save_dir)
37
+
38
+ lis = sorted(os.listdir(audio_dir))
39
+ len_data = len(lis)
40
+ print(len_data)
41
+
42
+ i = 0
43
+ for n in range(len_data):
44
+ i += 1
45
+ # save file
46
+ outfile = os.path.join(save_dir, lis[n][:-4] + '.npy')
47
+ if os.path.exists(outfile):
48
+ print("\nProcessing: ", i, " / ", len_data, " ----> ", lis[n][:-4] + '.npy', " is already exist! ")
49
+ continue
50
+
51
+ '''feature learning by VGG-net trained by audioset'''
52
+ audio_index = os.path.join(audio_dir, lis[n]) # path of your audio files
53
+ num_secs = len(os.listdir(os.path.join(audio_root, subset, "JPEGImages", lis[n][:-4])))
54
+ # num_secs_real = get_audio_len(audio_index)
55
+ # print("\nProcessing: ", i, " / ", len_data, " --------> video: ", lis[n], " ---> sec: ", num_secs_real)
56
+
57
+ input_batch = vggish_input.wavfile_to_examples(audio_index, num_secs)
58
+ np.testing.assert_equal(
59
+ input_batch.shape,
60
+ [num_secs, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS])
61
+
62
+ # Define VGGish, load the checkpoint, and run the batch through the model to produce embeddings.
63
+ with tf.Graph().as_default(), tf.compat.v1.Session() as sess:
64
+ vggish_slim.define_vggish_slim()
65
+ vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
66
+
67
+ features_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
68
+ embedding_tensor = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)
69
+ [embedding_batch] = sess.run([embedding_tensor], feed_dict={features_tensor: input_batch})
70
+ np.save(outfile, embedding_batch)
71
+ print(" save info: ", lis[n][:-4] + '.npy', " ---> ", embedding_batch.shape)
72
+
73
+ i += 1
74
+
75
+ print("\n---------------------------------- end ----------------------------------\n")
76
+
77
+
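The script above writes one `.npy` per video whose first dimension matches the number of seconds/frames. A quick check of a saved feature file (the file name here is hypothetical):

```python
import numpy as np

feat = np.load("./datasets/train/FEATAudios/example_video.npy")  # hypothetical name
print(feat.shape)  # (num_secs, 128): one 128-D VGGish embedding per second
```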
avism/data/datasets/extract_audio_feat/mel_features.py ADDED
@@ -0,0 +1,233 @@
1
+ # Copyright 2017 The TensorFlow Authors All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+
16
+ """Defines routines to compute mel spectrogram features from audio waveform."""
17
+
18
+ import numpy as np
19
+
20
+
21
+ def frame(data, window_length, hop_length):
22
+ """Convert array into a sequence of successive possibly overlapping frames.
23
+
24
+ An n-dimensional array of shape (num_samples, ...) is converted into an
25
+ (n+1)-D array of shape (num_frames, window_length, ...), where each frame
26
+ starts hop_length points after the preceding one.
27
+
28
+ This is accomplished using stride_tricks, so the original data is not
29
+ copied. However, there is no zero-padding, so any incomplete frames at the
30
+ end are not included.
31
+
32
+ Args:
33
+ data: np.array of dimension N >= 1.
34
+ window_length: Number of samples in each frame.
35
+ hop_length: Advance (in samples) between each window.
36
+
37
+ Returns:
38
+ (N+1)-D np.array with as many rows as there are complete frames that can be
39
+ extracted.
40
+ """
41
+
42
+ # print("data: ", data.shape)
43
+ num_samples = data.shape[0]
44
+ # print("num_samples: ", num_samples)
45
+ num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length))
46
+ # print("num_frames: ", num_frames)
47
+ shape = (num_frames, window_length) + data.shape[1:]
48
+ # print("shape: ", shape)
49
+ strides = (data.strides[0] * hop_length,) + data.strides
50
+ # print("strides: ", strides)
51
+
52
+
53
+
54
+ # Split the data into overlapping frames of the given shape via stride tricks
55
+ return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides)
56
+
57
+
58
+ def periodic_hann(window_length):
59
+ """Calculate a "periodic" Hann window.
60
+
61
+ The classic Hann window is defined as a raised cosine that starts and
62
+ ends on zero, and where every value appears twice, except the middle
63
+ point for an odd-length window. Matlab calls this a "symmetric" window
64
+ and np.hanning() returns it. However, for Fourier analysis, this
65
+ actually represents just over one cycle of a period N-1 cosine, and
66
+ thus is not compactly expressed on a length-N Fourier basis. Instead,
67
+ it's better to use a raised cosine that ends just before the final
68
+ zero value - i.e. a complete cycle of a period-N cosine. Matlab
69
+ calls this a "periodic" window. This routine calculates it.
70
+
71
+ Args:
72
+ window_length: The number of points in the returned window.
73
+
74
+ Returns:
75
+ A 1D np.array containing the periodic hann window.
76
+ """
77
+ return 0.5 - (0.5 * np.cos(2 * np.pi / window_length *
78
+ np.arange(window_length)))
79
+
80
+
81
+ def stft_magnitude(signal, fft_length,
82
+ hop_length=None,
83
+ window_length=None):
84
+ """Calculate the short-time Fourier transform magnitude.
85
+
86
+ Args:
87
+ signal: 1D np.array of the input time-domain signal.
88
+ fft_length: Size of the FFT to apply.
89
+ hop_length: Advance (in samples) between each frame passed to FFT.
90
+ window_length: Length of each block of samples to pass to FFT.
91
+
92
+ Returns:
93
+ 2D np.array where each row contains the magnitudes of the fft_length/2+1
94
+ unique values of the FFT for the corresponding frame of input samples.
95
+ """
96
+ frames = frame(signal, window_length, hop_length)
97
+ # Apply frame window to each frame. We use a periodic Hann (cosine of period
98
+ # window_length) instead of the symmetric Hann of np.hanning (period
99
+ # window_length-1).
100
+ window = periodic_hann(window_length)
101
+ windowed_frames = frames * window
102
+ return np.abs(np.fft.rfft(windowed_frames, int(fft_length)))
103
+
104
+
105
+ # Mel spectrum constants and functions.
106
+ _MEL_BREAK_FREQUENCY_HERTZ = 700.0
107
+ _MEL_HIGH_FREQUENCY_Q = 1127.0
108
+
109
+
110
+ def hertz_to_mel(frequencies_hertz):
111
+ """Convert frequencies to mel scale using HTK formula.
112
+
113
+ Args:
114
+ frequencies_hertz: Scalar or np.array of frequencies in hertz.
115
+
116
+ Returns:
117
+ Object of same size as frequencies_hertz containing corresponding values
118
+ on the mel scale.
119
+ """
120
+ return _MEL_HIGH_FREQUENCY_Q * np.log(
121
+ 1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ))
122
+
123
+
124
+ def spectrogram_to_mel_matrix(num_mel_bins=20,
125
+ num_spectrogram_bins=129,
126
+ audio_sample_rate=8000,
127
+ lower_edge_hertz=125.0,
128
+ upper_edge_hertz=3800.0):
129
+ """Return a matrix that can post-multiply spectrogram rows to make mel.
130
+
131
+ Returns a np.array matrix A that can be used to post-multiply a matrix S of
132
+ spectrogram values (STFT magnitudes) arranged as frames x bins to generate a
133
+ "mel spectrogram" M of frames x num_mel_bins. M = S A.
134
+
135
+ The classic HTK algorithm exploits the complementarity of adjacent mel bands
136
+ to multiply each FFT bin by only one mel weight, then add it, with positive
137
+ and negative signs, to the two adjacent mel bands to which that bin
138
+ contributes. Here, by expressing this operation as a matrix multiply, we go
139
+ from num_fft multiplies per frame (plus around 2*num_fft adds) to around
140
+ num_fft^2 multiplies and adds. However, because these are all presumably
141
+ accomplished in a single call to np.dot(), it's not clear which approach is
142
+ faster in Python. The matrix multiplication has the attraction of being more
143
+ general and flexible, and much easier to read.
144
+
145
+ Args:
146
+ num_mel_bins: How many bands in the resulting mel spectrum. This is
147
+ the number of columns in the output matrix.
148
+ num_spectrogram_bins: How many bins there are in the source spectrogram
149
+ data, which is understood to be fft_size/2 + 1, i.e. the spectrogram
150
+ only contains the nonredundant FFT bins.
151
+ audio_sample_rate: Samples per second of the audio at the input to the
152
+ spectrogram. We need this to figure out the actual frequencies for
153
+ each spectrogram bin, which dictates how they are mapped into mel.
154
+ lower_edge_hertz: Lower bound on the frequencies to be included in the mel
155
+ spectrum. This corresponds to the lower edge of the lowest triangular
156
+ band.
157
+ upper_edge_hertz: The desired top edge of the highest frequency band.
158
+
159
+ Returns:
160
+ An np.array with shape (num_spectrogram_bins, num_mel_bins).
161
+
162
+ Raises:
163
+ ValueError: if frequency edges are incorrectly ordered or out of range.
164
+ """
165
+ nyquist_hertz = audio_sample_rate / 2.
166
+ if lower_edge_hertz < 0.0:
167
+ raise ValueError("lower_edge_hertz %.1f must be >= 0" % lower_edge_hertz)
168
+ if lower_edge_hertz >= upper_edge_hertz:
169
+ raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" %
170
+ (lower_edge_hertz, upper_edge_hertz))
171
+ if upper_edge_hertz > nyquist_hertz:
172
+ raise ValueError("upper_edge_hertz %.1f is greater than Nyquist %.1f" %
173
+ (upper_edge_hertz, nyquist_hertz))
174
+ spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz, num_spectrogram_bins)
175
+ spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz)
176
+ # The i'th mel band (starting from i=1) has center frequency
177
+ # band_edges_mel[i], lower edge band_edges_mel[i-1], and higher edge
178
+ # band_edges_mel[i+1]. Thus, we need num_mel_bins + 2 values in
179
+ # the band_edges_mel arrays.
180
+ band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz),
181
+ hertz_to_mel(upper_edge_hertz), num_mel_bins + 2)
182
+ # Matrix to post-multiply feature arrays whose rows are num_spectrogram_bins
183
+ # of spectrogram values.
184
+ mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins))
185
+ for i in range(num_mel_bins):
186
+ lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3]
187
+ # Calculate lower and upper slopes for every spectrogram bin.
188
+ # Line segments are linear in the *mel* domain, not hertz.
189
+ lower_slope = ((spectrogram_bins_mel - lower_edge_mel) /
190
+ (center_mel - lower_edge_mel))
191
+ upper_slope = ((upper_edge_mel - spectrogram_bins_mel) /
192
+ (upper_edge_mel - center_mel))
193
+ # .. then intersect them with each other and zero.
194
+ mel_weights_matrix[:, i] = np.maximum(0.0, np.minimum(lower_slope,
195
+ upper_slope))
196
+ # HTK excludes the spectrogram DC bin; make sure it always gets a zero
197
+ # coefficient.
198
+ mel_weights_matrix[0, :] = 0.0
199
+ return mel_weights_matrix
200
+
201
+
202
+ def log_mel_spectrogram(data,
203
+ audio_sample_rate=8000,
204
+ log_offset=0.0,
205
+ window_length_secs=0.025,
206
+ hop_length_secs=0.010,
207
+ **kwargs):
208
+ """Convert waveform to a log magnitude mel-frequency spectrogram.
209
+
210
+ Args:
211
+ data: 1D np.array of waveform data.
212
+ audio_sample_rate: The sampling rate of data.
213
+ log_offset: Add this to values when taking log to avoid -Infs.
214
+ window_length_secs: Duration of each window to analyze.
215
+ hop_length_secs: Advance between successive analysis windows.
216
+ **kwargs: Additional arguments to pass to spectrogram_to_mel_matrix.
217
+
218
+ Returns:
219
+ 2D np.array of (num_frames, num_mel_bins) consisting of log mel filterbank
220
+ magnitudes for successive frames.
221
+ """
222
+ window_length_samples = int(round(audio_sample_rate * window_length_secs))
223
+ hop_length_samples = int(round(audio_sample_rate * hop_length_secs))
224
+ fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0)))
225
+ spectrogram = stft_magnitude(
226
+ data,
227
+ fft_length=fft_length,
228
+ hop_length=hop_length_samples,
229
+ window_length=window_length_samples)
230
+ mel_spectrogram = np.dot(spectrogram, spectrogram_to_mel_matrix(
231
+ num_spectrogram_bins=spectrogram.shape[1],
232
+ audio_sample_rate=audio_sample_rate, **kwargs))
233
+ return np.log(mel_spectrogram + log_offset)
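A worked example of `log_mel_spectrogram` with VGGish-style settings on one second of a 440 Hz tone; the frame count follows the formula in `frame()`: 1 + floor((16000 - 400) / 160) = 98. It assumes this directory is on `sys.path` so the flat `import mel_features` used by the sibling scripts works.

```python
import numpy as np
import mel_features  # flat import, as in vggish_input.py

sr = 16000
t = np.arange(sr) / sr
tone = 0.5 * np.sin(2 * np.pi * 440.0 * t)

log_mel = mel_features.log_mel_spectrogram(
    tone, audio_sample_rate=sr, log_offset=0.01,
    window_length_secs=0.025, hop_length_secs=0.010, num_mel_bins=64)
print(log_mel.shape)  # (98, 64)
```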
avism/data/datasets/extract_audio_feat/vggish_input.py ADDED
@@ -0,0 +1,103 @@
1
+ # Copyright 2017 The TensorFlow Authors All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+
16
+ """Compute input examples for VGGish from audio waveform."""
17
+
18
+ import numpy as np
19
+ import resampy
20
+ from scipy.io import wavfile
21
+
22
+ import mel_features
23
+ import vggish_params
24
+
25
+
26
+ def waveform_to_examples(data, sample_rate):
27
+ """Converts audio waveform into an array of examples for VGGish.
28
+
29
+ Args:
30
+ data: np.array of either one dimension (mono) or two dimensions
31
+ (multi-channel, with the outer dimension representing channels).
32
+ Each sample is generally expected to lie in the range [-1.0, +1.0],
33
+ although this is not required.
34
+ sample_rate: Sample rate of data.
35
+
36
+ Returns:
37
+ 3-D np.array of shape [num_examples, num_frames, num_bands] which represents
38
+ a sequence of examples, each of which contains a patch of log mel
39
+ spectrogram, covering num_frames frames of audio and num_bands mel frequency
40
+ bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.
41
+ """
42
+ # Convert to mono.
43
+ if len(data.shape) > 1:
44
+ data = np.mean(data, axis=1)
45
+ # Resample to the rate assumed by VGGish.
46
+ if sample_rate != vggish_params.SAMPLE_RATE:
47
+ data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)
48
+
49
+ # Compute log mel spectrogram features.
50
+ log_mel = mel_features.log_mel_spectrogram(
51
+ data,
52
+ audio_sample_rate=vggish_params.SAMPLE_RATE,
53
+ log_offset=vggish_params.LOG_OFFSET,
54
+ window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
55
+ hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
56
+ num_mel_bins=vggish_params.NUM_MEL_BINS,
57
+ lower_edge_hertz=vggish_params.MEL_MIN_HZ,
58
+ upper_edge_hertz=vggish_params.MEL_MAX_HZ)
59
+
60
+ # Frame features into examples.
61
+ features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
62
+ example_window_length = int(round(
63
+ vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))
64
+ example_hop_length = int(round(
65
+ vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate))
66
+ log_mel_examples = mel_features.frame(
67
+ log_mel,
68
+ window_length=example_window_length,
69
+ hop_length=example_hop_length)
70
+ return log_mel_examples
71
+
72
+
73
+ def wavfile_to_examples(wav_file, num_secs):
74
+ """Convenience wrapper around waveform_to_examples() for a common WAV format.
75
+ Args:
76
+ wav_file: String path to a file, or a file-like object. The file
77
+ is assumed to contain WAV audio data with signed 16-bit PCM samples.
78
+
79
+ Returns:
80
+ See waveform_to_examples.
81
+ """
82
+ sr, snd = wavfile.read(wav_file)
83
+ L = sr * num_secs
84
+ wav_data = snd[:L]  # slice the first axis only, so mono (1-D) input also works
85
+ wav_data = wav_data / 32768.0 # Convert to [-1.0, +1.0]
86
+ T = num_secs
87
+ log_mel = np.zeros([T, 96, 64])
88
+
89
+ for i in range(T):
90
+ s = i * sr
91
+ e = (i + 1) * sr
92
+ if len(wav_data.shape) > 1:
93
+ data = wav_data[s:e, :]
94
+ else:
95
+ data = wav_data[s:e]
96
+
97
+ wave_data_array = waveform_to_examples(data, sr)
98
+ if len(wave_data_array) != 0:
99
+ log_mel[i, :, :] = wave_data_array
100
+ else:
101
+ log_mel[i, :, :] = np.zeros((1, 96, 64), dtype=float)
102
+
103
+ return log_mel
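A sketch of `waveform_to_examples` on three seconds of synthetic mono audio; per the docstring the result is `[num_examples, 96, 64]`, and with 0.96 s windows and hops that comes to one patch per second here.

```python
import numpy as np
import vggish_input  # assumes this directory is on sys.path

sr = 16000
data = np.random.uniform(-1.0, 1.0, size=3 * sr)   # 3 s of mono noise in [-1, 1]
examples = vggish_input.waveform_to_examples(data, sr)
print(examples.shape)  # (3, 96, 64)
```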
avism/data/datasets/extract_audio_feat/vggish_params.py ADDED
@@ -0,0 +1,53 @@
1
+ # Copyright 2017 The TensorFlow Authors All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+
16
+ """Global parameters for the VGGish model.
17
+
18
+ See vggish_slim.py for more information.
19
+ """
20
+
21
+ # Architectural constants.
22
+ NUM_FRAMES = 96 # Frames in input mel-spectrogram patch.
23
+ NUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch.
24
+ EMBEDDING_SIZE = 128 # Size of embedding layer.
25
+
26
+ # Hyperparameters used in feature and example generation.
27
+ SAMPLE_RATE = 16000
28
+ STFT_WINDOW_LENGTH_SECONDS = 0.025
29
+ STFT_HOP_LENGTH_SECONDS = 0.010
30
+ NUM_MEL_BINS = NUM_BANDS
31
+ MEL_MIN_HZ = 125
32
+ MEL_MAX_HZ = 7500
33
+ LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram.
34
+ EXAMPLE_WINDOW_SECONDS = 0.96 # Each example contains 96 10ms frames
35
+ EXAMPLE_HOP_SECONDS = 0.96 # with zero overlap.
36
+
37
+ # Parameters used for embedding postprocessing.
38
+ PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors'
39
+ PCA_MEANS_NAME = 'pca_means'
40
+ QUANTIZE_MIN_VAL = -2.0
41
+ QUANTIZE_MAX_VAL = +2.0
42
+
43
+ # Hyperparameters used in training.
44
+ INIT_STDDEV = 0.01 # Standard deviation used to initialize weights.
45
+ LEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer.
46
+ ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer.
47
+
48
+ # Names of ops, tensors, and features.
49
+ INPUT_OP_NAME = 'vggish/input_features'
50
+ INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0'
51
+ OUTPUT_OP_NAME = 'vggish/embedding'
52
+ OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0'
53
+ AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding'
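A quick consistency check of the patch geometry implied by these constants: a 0.96 s example at a 10 ms STFT hop is exactly `NUM_FRAMES` frames, and `NUM_MEL_BINS` equals `NUM_BANDS`.

```python
import vggish_params as params  # assumes this directory is on sys.path

frames_per_example = int(round(
    params.EXAMPLE_WINDOW_SECONDS / params.STFT_HOP_LENGTH_SECONDS))
assert frames_per_example == params.NUM_FRAMES == 96
assert params.NUM_MEL_BINS == params.NUM_BANDS == 64
```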
avism/data/datasets/extract_audio_feat/vggish_slim.py ADDED
@@ -0,0 +1,134 @@
1
+ # Copyright 2017 The TensorFlow Authors All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+
16
+ """Defines the 'VGGish' model used to generate AudioSet embedding features.
17
+
18
+ The public AudioSet release (https://research.google.com/audioset/download.html)
19
+ includes 128-D features extracted from the embedding layer of a VGG-like model
20
+ that was trained on a large Google-internal YouTube dataset. Here we provide
21
+ a TF-Slim definition of the same model, without any dependences on libraries
22
+ internal to Google. We call it 'VGGish'.
23
+
24
+ Note that we only define the model up to the embedding layer, which is the
25
+ penultimate layer before the final classifier layer. We also provide various
26
+ hyperparameter values (in vggish_params.py) that were used to train this model
27
+ internally.
28
+
29
+ For comparison, here is TF-Slim's VGG definition:
30
+ https://github.com/tensorflow/models/blob/master/research/slim/nets/vgg.py
31
+ """
32
+
33
+ import tensorflow._api.v2.compat.v1 as tf
34
+ tf.disable_v2_behavior()
35
+ import tf_slim as slim
36
+
37
+ import vggish_params as params
38
+
39
+
40
+ def define_vggish_slim(training=False):
41
+ """Defines the VGGish TensorFlow model.
42
+
43
+ All ops are created in the current default graph, under the scope 'vggish/'.
44
+
45
+ The input is a placeholder named 'vggish/input_features' of type float32 and
46
+ shape [batch_size, num_frames, num_bands] where batch_size is variable and
47
+ num_frames and num_bands are constants, and [num_frames, num_bands] represents
48
+ a log-mel-scale spectrogram patch covering num_bands frequency bands and
49
+ num_frames time frames (where each frame step is usually 10ms). This is
50
+ produced by computing the stabilized log(mel-spectrogram + params.LOG_OFFSET).
51
+ The output is an op named 'vggish/embedding' which produces the activations of
52
+ a 128-D embedding layer, which is usually the penultimate layer when used as
53
+ part of a full model with a final classifier layer.
54
+
55
+ Args:
56
+ training: If true, all parameters are marked trainable.
57
+
58
+ Returns:
59
+ The op 'vggish/embeddings'.
60
+ """
61
+ # Defaults:
62
+ # - All weights are initialized to N(0, INIT_STDDEV).
63
+ # - All biases are initialized to 0.
64
+ # - All activations are ReLU.
65
+ # - All convolutions are 3x3 with stride 1 and SAME padding.
66
+ # - All max-pools are 2x2 with stride 2 and SAME padding.
67
+ with slim.arg_scope([slim.conv2d, slim.fully_connected],
68
+ weights_initializer=tf.truncated_normal_initializer(
69
+ stddev=params.INIT_STDDEV),
70
+ biases_initializer=tf.zeros_initializer(),
71
+ activation_fn=tf.nn.relu,
72
+ trainable=training), \
73
+ slim.arg_scope([slim.conv2d],
74
+ kernel_size=[3, 3], stride=1, padding='SAME'), \
75
+ slim.arg_scope([slim.max_pool2d],
76
+ kernel_size=[2, 2], stride=2, padding='SAME'), \
77
+ tf.compat.v1.variable_scope('vggish'):
78
+ # tf.variable_scope('vggish'):
79
+ # Input: a batch of 2-D log-mel-spectrogram patches.
80
+ # features = tf.placeholder(
81
+ features = tf.compat.v1.placeholder(
82
+ tf.float32, shape=(None, params.NUM_FRAMES, params.NUM_BANDS),
83
+ name='input_features')
84
+ # Reshape to 4-D so that we can convolve a batch with conv2d().
85
+ net = tf.reshape(features, [-1, params.NUM_FRAMES, params.NUM_BANDS, 1])
86
+
87
+ # The VGG stack of alternating convolutions and max-pools.
88
+ net = slim.conv2d(net, 64, scope='conv1')
89
+ net = slim.max_pool2d(net, scope='pool1')
90
+ net = slim.conv2d(net, 128, scope='conv2')
91
+ net = slim.max_pool2d(net, scope='pool2')
92
+ net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')
93
+ net = slim.max_pool2d(net, scope='pool3')
94
+ net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')
95
+ net = slim.max_pool2d(net, scope='pool4')
96
+
97
+ # Flatten before entering fully-connected layers
98
+ net = slim.flatten(net)
99
+ net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
100
+ # The embedding layer.
101
+ net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2')
102
+ return tf.identity(net, name='embedding')
103
+
104
+
105
+ def load_vggish_slim_checkpoint(session, checkpoint_path):
106
+ """Loads a pre-trained VGGish-compatible checkpoint.
107
+
108
+ This function can be used as an initialization function (referred to as
109
+ init_fn in TensorFlow documentation) which is called in a Session after
110
+ initializating all variables. When used as an init_fn, this will load
111
+ a pre-trained checkpoint that is compatible with the VGGish model
112
+ definition. Only variables defined by VGGish will be loaded.
113
+
114
+ Args:
115
+ session: an active TensorFlow session.
116
+ checkpoint_path: path to a file containing a checkpoint that is
117
+ compatible with the VGGish model definition.
118
+ """
119
+ # Get the list of names of all VGGish variables that exist in
120
+ # the checkpoint (i.e., all inference-mode VGGish variables).
121
+ with tf.Graph().as_default():
122
+ define_vggish_slim(training=False)
123
+ # vggish_var_names = [v.name for v in tf.global_variables()]
124
+ vggish_var_names = [v.name for v in tf.compat.v1.global_variables()]
125
+
126
+ # Get the list of all currently existing variables that match
127
+ # the list of variable names we just computed.
128
+ # vggish_vars = [v for v in tf.global_variables() if v.name in vggish_var_names]
129
+ vggish_vars = [v for v in tf.compat.v1.global_variables() if v.name in vggish_var_names]
130
+
131
+ # Use a Saver to restore just the variables selected above.
132
+ # saver = tf.train.Saver(vggish_vars, name='vggish_load_pretrained')
133
+ saver = tf.compat.v1.train.Saver(vggish_vars, name='vggish_load_pretrained')
134
+ saver.restore(session, checkpoint_path)
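A minimal inference sketch mirroring the pattern used in `audio_feature_extractor.py`: define the graph, restore the released checkpoint (path assumed), and run a random batch of log-mel patches through to get 128-D embeddings.

```python
import numpy as np
import tensorflow._api.v2.compat.v1 as tf
tf.disable_v2_behavior()

import vggish_params
import vggish_slim  # assumes this directory is on sys.path

batch = np.random.rand(5, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS)
with tf.Graph().as_default(), tf.Session() as sess:
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, './vggish_model.ckpt')
    features = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
    embedding = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)
    [emb] = sess.run([embedding], feed_dict={features: batch})
print(emb.shape)  # (5, 128)
```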
avism/modeling/__init__.py ADDED
File without changes
avism/modeling/avism_criterion.py ADDED
@@ -0,0 +1,335 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from torch import nn
4
+
5
+ from detectron2.utils.comm import get_world_size
6
+ from detectron2.projects.point_rend.point_features import (
7
+ get_uncertain_point_coords_with_randomness,
8
+ point_sample,
9
+ )
10
+
11
+ from ..utils.misc import is_dist_avail_and_initialized
12
+
13
+
14
+ def dice_loss(
15
+ inputs: torch.Tensor,
16
+ targets: torch.Tensor,
17
+ num_masks: float,
18
+ ):
19
+ """
20
+ Compute the DICE loss, similar to generalized IOU for masks
21
+ Args:
22
+ inputs: A float tensor of arbitrary shape.
23
+ The predictions for each example.
24
+ targets: A float tensor with the same shape as inputs. Stores the binary
25
+ classification label for each element in inputs
26
+ (0 for the negative class and 1 for the positive class).
27
+ """
28
+ inputs = inputs.sigmoid()
29
+ inputs = inputs.flatten(1)
30
+ numerator = 2 * (inputs * targets).sum(-1)
31
+ denominator = inputs.sum(-1) + targets.sum(-1)
32
+ loss = 1 - (numerator + 1) / (denominator + 1)
33
+ return loss.sum() / num_masks
34
+
35
+
36
+ dice_loss_jit = torch.jit.script(
37
+ dice_loss
38
+ ) # type: torch.jit.ScriptModule
39
+
40
+
41
+ def sigmoid_ce_loss(
42
+ inputs: torch.Tensor,
43
+ targets: torch.Tensor,
44
+ num_masks: float,
45
+ ):
46
+ """
47
+ Args:
48
+ inputs: A float tensor of arbitrary shape.
49
+ The predictions for each example.
50
+ targets: A float tensor with the same shape as inputs. Stores the binary
51
+ classification label for each element in inputs
52
+ (0 for the negative class and 1 for the positive class).
53
+ Returns:
54
+ Loss tensor
55
+ """
56
+ loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
57
+
58
+ return loss.mean(1).sum() / num_masks
59
+
60
+
61
+ sigmoid_ce_loss_jit = torch.jit.script(
62
+ sigmoid_ce_loss
63
+ ) # type: torch.jit.ScriptModule
64
+
65
+
66
+ def calculate_uncertainty(logits):
67
+ """
68
+ We estimate uncerainty as L1 distance between 0.0 and the logit prediction in 'logits' for the
69
+ foreground class in `classes`.
70
+ Args:
71
+ logits (Tensor): A tensor of shape (R, 1, ...) for class-specific or
72
+ class-agnostic, where R is the total number of predicted masks in all images and C is
73
+ the number of foreground classes. The values are logits.
74
+ Returns:
75
+ scores (Tensor): A tensor of shape (R, 1, ...) that contains uncertainty scores with
76
+ the most uncertain locations having the highest uncertainty score.
77
+ """
78
+ assert logits.shape[1] == 1
79
+ gt_class_logits = logits.clone()
80
+ return -(torch.abs(gt_class_logits))
81
+
82
+
83
+ class AvismSetCriterion(nn.Module):
84
+ """This class computes the loss for DETR.
85
+ The process happens in two steps:
86
+ 1) we compute hungarian assignment between ground truth boxes and the outputs of the model
87
+ 2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
88
+ """
89
+
90
+ def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses,
91
+ num_points, oversample_ratio, importance_sample_ratio, sim_use_clip):
92
+ """Create the criterion.
93
+ Parameters:
94
+ num_classes: number of object categories, omitting the special no-object category
95
+ matcher: module able to compute a matching between targets and proposals
96
+ weight_dict: dict containing as key the names of the losses and as values their relative weight.
97
+ eos_coef: relative classification weight applied to the no-object category
98
+ losses: list of all the losses to be applied. See get_loss for list of available losses.
99
+ """
100
+ super().__init__()
101
+ self.num_classes = num_classes
102
+ self.matcher = matcher
103
+ self.weight_dict = weight_dict
104
+ self.eos_coef = eos_coef
105
+ self.losses = losses
106
+ empty_weight = torch.ones(self.num_classes + 1)
107
+ empty_weight[-1] = self.eos_coef
108
+ self.register_buffer("empty_weight", empty_weight)
109
+
110
+ # pointwise mask loss parameters
111
+ self.num_points = num_points
112
+ self.oversample_ratio = oversample_ratio
113
+ self.importance_sample_ratio = importance_sample_ratio
114
+ self.sim_use_clip = sim_use_clip
115
+
116
+ def loss_labels(self, outputs, targets, indices, num_masks):
117
+ """Classification loss (NLL)
118
+ targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
119
+ """
120
+ assert "pred_logits" in outputs
121
+ src_logits = outputs['pred_logits']
122
+ L, B, cQ, _ = src_logits.shape
123
+ src_logits = src_logits.reshape(L*B, cQ, self.num_classes+1)
124
+
125
+ idx = self._get_src_permutation_idx(indices)
126
+ target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets * L, indices)])
127
+ target_classes = torch.full(
128
+ src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device
129
+ )
130
+ target_classes[idx] = target_classes_o
131
+
132
+ loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight)
133
+ losses = {'loss_avism_ce': loss_ce}
134
+
135
+ return losses
136
+
137
+ def loss_masks(self, outputs, targets, indices, num_masks):
138
+ """Compute the losses related to the masks: the focal loss and the dice loss.
139
+ targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
140
+ """
141
+ assert "pred_masks" in outputs
142
+
143
+ idx = self._get_src_permutation_idx(indices)
144
+ src_masks = outputs["pred_masks"]
145
+ L, B, cQ, T, H, W = src_masks.shape
146
+ src_masks = src_masks.reshape(L*B, cQ, T, H, W)
147
+
148
+ src_masks = src_masks[idx] # Nt x T x Hp x Wp
149
+ target_masks = torch.cat([t['masks'][i] for t, (_, i) in zip(targets * L, indices)]).to(src_masks)
150
+ # Nt x T x Ht x Wt
151
+ src_masks = src_masks.flatten(0, 1)[:, None]
152
+ target_masks = target_masks.flatten(0, 1)[:, None]
153
+
154
+ with torch.no_grad():
155
+ # sample point_coords
156
+ point_coords = get_uncertain_point_coords_with_randomness(
157
+ src_masks,
158
+ lambda logits: calculate_uncertainty(logits),
159
+ self.num_points,
160
+ self.oversample_ratio,
161
+ self.importance_sample_ratio,
162
+ )
163
+ # get gt labels
164
+ point_labels = point_sample(
165
+ target_masks,
166
+ point_coords,
167
+ align_corners=False,
168
+ ).squeeze(1)
169
+
170
+ point_logits = point_sample(
171
+ src_masks,
172
+ point_coords,
173
+ align_corners=False,
174
+ ).squeeze(1)
175
+
176
+ # Nt*T, randN -> Nt, T*randN
177
+ point_logits = point_logits.view(len(idx[0]), T * self.num_points)
178
+ point_labels = point_labels.view(len(idx[0]), T * self.num_points)
179
+
180
+ losses = {
181
+ "loss_avism_mask": sigmoid_ce_loss_jit(point_logits, point_labels, num_masks),
182
+ "loss_avism_dice": dice_loss_jit(point_logits, point_labels, num_masks),
183
+ }
184
+
185
+ del src_masks
186
+ del target_masks
187
+ return losses
188
+
189
+ def loss_fg_sim(
190
+ self, outputs, clip_targets, frame_targets,
191
+ clip_indices, frame_indices, num_masks, MULTIPLIER=1000
192
+ ):
193
+ total_src_q, total_tgt_ids, total_batch_idx = [], [], []
194
+
195
+ # Frame
196
+ src_fq = outputs["pred_fq_embed"] # L, B, T, fQ, C
197
+ # L = number of frame_decoder layers
198
+ L, B, T, fQ, C = src_fq.shape
199
+ src_fq = src_fq.flatten(0, 2) # LBT, fQ, C
200
+
201
+ frame_indices = sum(frame_indices, [])
202
+ frame_src_idx = self._get_src_permutation_idx(frame_indices) # len = LBT
203
+ src_fq = src_fq[frame_src_idx] # Nf, C
204
+ target_frame_ids = torch.cat(
205
+ [t["ids"][J] for t, (_, J) in zip(frame_targets * L, frame_indices)]
206
+ )
207
+ frame_batch_idx = torch.div(frame_src_idx[0].to(device=src_fq.device), T, rounding_mode="floor")
208
+ is_frame_valid = target_frame_ids != -1
209
+ target_frame_ids += frame_batch_idx * MULTIPLIER
210
+
211
+ total_src_q.append(src_fq[is_frame_valid])
212
+ total_tgt_ids.append(target_frame_ids[is_frame_valid])
213
+ total_batch_idx.append(frame_batch_idx[is_frame_valid])
214
+
215
+ # Clip
216
+ if self.sim_use_clip:
217
+ src_cq = outputs["pred_cq_embed"] # L, B, cQ, C
218
+ src_cq = src_cq.flatten(0, 1) # LB , cQ, C
219
+
220
+ clip_src_idx = self._get_src_permutation_idx(clip_indices) # len = LB
221
+ src_cq = src_cq[clip_src_idx] # Nc, C
222
+ target_clip_ids = torch.cat( # clip_ids' shape = (N, num_frames) -> (N,)
223
+ [t["ids"][J] for t, (_, J) in zip(clip_targets * L, clip_indices)]
224
+ ).amax(dim=1)
225
+ clip_batch_idx = clip_src_idx[0].to(device=src_fq.device)
226
+ is_clip_valid = target_clip_ids != -1
227
+ target_clip_ids += clip_batch_idx * MULTIPLIER
228
+
229
+ total_src_q.append(src_cq[is_clip_valid])
230
+ total_tgt_ids.append(target_clip_ids[is_clip_valid])
231
+ total_batch_idx.append(clip_batch_idx[is_clip_valid])
232
+
233
+ # Clip + Frame
234
+ total_src_q = torch.cat(total_src_q) # Nc+Nf, C
235
+ total_tgt_ids = torch.cat(total_tgt_ids) # Nc+Nf
236
+ total_batch_idx = torch.cat(total_batch_idx) # Nc+Nf
237
+
238
+ sim_pred_logits = torch.matmul(total_src_q, total_src_q.T) # Nc+Nf, Nc+Nf
239
+ sim_tgt = (total_tgt_ids[:, None] == total_tgt_ids[None]).float() # Nc+Nf, Nc+Nf
240
+
241
+ same_clip = (total_batch_idx[:, None] == total_batch_idx[None]).float()
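+ # Only pairs of embeddings coming from the same video contribute to the similarity loss.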
242
+ loss = F.binary_cross_entropy_with_logits(sim_pred_logits, sim_tgt, reduction='none')
243
+
244
+ loss = loss * same_clip
245
+ loss_clip_sim = loss.sum() / (same_clip.sum() + 1e-6)
246
+
247
+ return {"loss_clip_sim": loss_clip_sim}
248
+
249
+ def _get_src_permutation_idx(self, indices):
250
+ # permute predictions following indices
251
+ batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
252
+ src_idx = torch.cat([src for (src, _) in indices])
253
+ return batch_idx, src_idx
254
+
255
+ def _get_tgt_permutation_idx(self, indices):
256
+ # permute targets following indices
257
+ batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
258
+ tgt_idx = torch.cat([tgt for (_, tgt) in indices])
259
+ return batch_idx, tgt_idx
260
+
261
+ def get_loss(
262
+ self, loss, outputs, clip_targets, frame_targets, clip_indices, frame_indices, num_masks
263
+ ):
264
+ loss_map = {
265
+ 'avism_labels': self.loss_labels,
266
+ 'avism_masks': self.loss_masks,
267
+ 'fg_sim': self.loss_fg_sim,
268
+ }
269
+ assert loss in loss_map, f"do you really want to compute {loss} loss?"
270
+ if loss == 'fg_sim':
271
+ return loss_map[loss](
272
+ outputs, clip_targets, frame_targets, clip_indices, frame_indices, num_masks
273
+ )
274
+ return loss_map[loss](outputs, clip_targets, clip_indices, num_masks)
275
+
276
+ def forward(self, outputs, clip_targets, frame_targets, frame_indices=None):
277
+ """This performs the loss computation.
278
+ Parameters:
279
+ outputs: dict of tensors, see the output specification of the model for the format
280
+ clip_targets / frame_targets: lists of dicts, such that len(clip_targets) == batch_size.
281
+ The expected keys in each dict depend on the losses applied; see each loss's doc.
282
+ """
283
+ outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"}
284
+
285
+ # Retrieve the matching between the outputs of the last layer and the targets
286
+ clip_indices = self.matcher(outputs_without_aux, clip_targets)
287
+
288
+ # Compute the average number of target boxes across all nodes, for normalization purposes
289
+ num_masks = sum(len(t["labels"]) for t in clip_targets) * len(outputs_without_aux["pred_masks"])
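+ # Scaled by L (the leading dim of pred_masks) because every frame-decoder layer contributes its own matched set.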
290
+ num_masks = torch.as_tensor(
291
+ [num_masks], dtype=torch.float, device=next(iter(outputs.values())).device
292
+ )
293
+ if is_dist_avail_and_initialized():
294
+ torch.distributed.all_reduce(num_masks)
295
+ num_masks = torch.clamp(num_masks / get_world_size(), min=1).item()
296
+
297
+ # Compute all the requested losses
298
+ losses = {}
299
+ for loss in self.losses:
300
+ losses.update(
301
+ self.get_loss(
302
+ loss, outputs, clip_targets, frame_targets, clip_indices, frame_indices, num_masks
303
+ )
304
+ )
305
+
306
+ # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
307
+ if "aux_outputs" in outputs:
308
+ for i, aux_outputs in enumerate(outputs["aux_outputs"]):
309
+ clip_indices = self.matcher(aux_outputs, clip_targets)
310
+ for loss in self.losses:
311
+ if loss == "fg_sim":
312
+ continue
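+ # The frame/clip similarity loss is only computed on the final outputs, not on the auxiliary layer outputs.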
313
+ l_dict = self.get_loss(
314
+ loss, aux_outputs, clip_targets, frame_targets, clip_indices, frame_indices, num_masks
315
+ )
316
+ l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
317
+ losses.update(l_dict)
318
+
319
+ return losses
320
+
321
+ def __repr__(self):
322
+ head = "Criterion " + self.__class__.__name__
323
+ body = [
324
+ "matcher: {}".format(self.matcher.__repr__(_repr_indent=8)),
325
+ "losses: {}".format(self.losses),
326
+ "weight_dict: {}".format(self.weight_dict),
327
+ "num_classes: {}".format(self.num_classes),
328
+ "eos_coef: {}".format(self.eos_coef),
329
+ "num_points: {}".format(self.num_points),
330
+ "oversample_ratio: {}".format(self.oversample_ratio),
331
+ "importance_sample_ratio: {}".format(self.importance_sample_ratio),
332
+ ]
333
+ _repr_indent = 4
334
+ lines = [head] + [" " * _repr_indent + line for line in body]
335
+ return "\n".join(lines)
avism/modeling/avism_matcher.py ADDED
@@ -0,0 +1,194 @@
1
+ """
2
+ Modules to compute the matching cost and solve the corresponding LSAP.
3
+ """
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from scipy.optimize import linear_sum_assignment
7
+ from torch import nn
8
+ from torch.cuda.amp import autocast
9
+
10
+ from detectron2.projects.point_rend.point_features import point_sample
11
+
12
+
13
+ def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor):
14
+ """
15
+ Compute the DICE loss, similar to generalized IOU for masks
16
+ Args:
17
+ inputs: A float tensor of arbitrary shape.
18
+ The predictions for each example.
19
+ targets: A float tensor with the same shape as inputs. Stores the binary
20
+ classification label for each element in inputs
21
+ (0 for the negative class and 1 for the positive class).
22
+ """
23
+ inputs = inputs.sigmoid()
24
+ inputs = inputs.flatten(1)
25
+ numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets)
26
+ denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :]
27
+ loss = 1 - (numerator + 1) / (denominator + 1)
28
+ return loss
29
+
30
+
31
+ batch_dice_loss_jit = torch.jit.script(
32
+ batch_dice_loss
33
+ ) # type: torch.jit.ScriptModule
34
+
35
+
36
+ def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor):
37
+ """
38
+ Args:
39
+ inputs: A float tensor of arbitrary shape.
40
+ The predictions for each example.
41
+ targets: A float tensor with the same shape as inputs. Stores the binary
42
+ classification label for each element in inputs
43
+ (0 for the negative class and 1 for the positive class).
44
+ Returns:
45
+ Loss tensor
46
+ """
47
+ hw = inputs.shape[1]
48
+
49
+ pos = F.binary_cross_entropy_with_logits(
50
+ inputs, torch.ones_like(inputs), reduction="none"
51
+ )
52
+ neg = F.binary_cross_entropy_with_logits(
53
+ inputs, torch.zeros_like(inputs), reduction="none"
54
+ )
55
+
56
+ loss = torch.einsum("nc,mc->nm", pos, targets) + torch.einsum(
57
+ "nc,mc->nm", neg, (1 - targets)
58
+ )
59
+
60
+ return loss / hw
61
+
62
+
63
+ batch_sigmoid_ce_loss_jit = torch.jit.script(
64
+ batch_sigmoid_ce_loss
65
+ ) # type: torch.jit.ScriptModule
66
+
67
+
68
+ class AvismHungarianMatcher(nn.Module):
69
+ """This class computes an assignment between the targets and the predictions of the network
70
+
71
+ For efficiency reasons, the targets don't include the no_object. Because of this, in general,
72
+ there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
73
+ while the others are un-matched (and thus treated as non-objects).
74
+ """
75
+
76
+ def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1, num_points: int = 0):
77
+ """Creates the matcher
78
+
79
+ Params:
80
+ cost_class: This is the relative weight of the classification error in the matching cost
81
+ cost_mask: This is the relative weight of the focal loss of the binary mask in the matching cost
82
+ cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost
83
+ """
84
+ super().__init__()
85
+ self.cost_class = cost_class
86
+ self.cost_mask = cost_mask
87
+ self.cost_dice = cost_dice
88
+
89
+ assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs can't be 0"
90
+
91
+ self.num_points = num_points
92
+
93
+ @torch.no_grad()
94
+ def memory_efficient_forward(self, outputs, targets):
95
+ # We flatten to compute the cost matrices in a batch
96
+
97
+ # Here, "L" is the number of frame-level decoder layers.
98
+ out_prob = outputs["pred_logits"].softmax(-1) # L, B, cQ, K+1
99
+ out_mask = outputs["pred_masks"] # L, B, cQ, T, H, W
100
+
101
+ L, B, cQ, T, s_h, s_w = out_mask.shape
102
+
103
+ out_prob = out_prob.reshape(L*B, cQ, -1)
104
+ out_mask = out_mask.reshape(L*B, cQ, T, s_h, s_w)
105
+
106
+ # If target is [vid1, vid2, vid3],
107
+ # it now becomes [vid1, vid2, vid3, vid1, vid2, vid3, ...].
108
+ targets = targets * L
109
+
110
+ indices = []
111
+ for b in range(L*B):
112
+ b_out_prob = out_prob[b]
113
+ tgt_ids = targets[b]["labels"]
114
+ # Compute the classification cost. Contrary to the loss, we don't use the NLL,
115
+ # but approximate it in 1 - proba[target class].
116
+ # The 1 is a constant that doesn't change the matching, it can be ommitted.
117
+ cost_class = -b_out_prob[:, tgt_ids]
118
+
119
+ b_out_mask = out_mask[b] # cQ x T x H_pred x W_pred
120
+ # gt masks are already padded when preparing target
121
+ tgt_mask = targets[b]["masks"].to(b_out_mask) # Nins x T x H_tgt x W_tgt
122
+
123
+ # out_mask = out_mask[:, None]
124
+ # tgt_mask = tgt_mask[:, None]
125
+ # all masks share the same set of points for efficient matching!
126
+ point_coords = torch.rand(1, self.num_points, 2, device=b_out_mask.device)
127
+ # get gt labels
128
+ tgt_mask = point_sample(
129
+ tgt_mask,
130
+ point_coords.repeat(tgt_mask.shape[0], 1, 1),
131
+ align_corners=False,
132
+ ).flatten(1)
133
+
134
+ b_out_mask = point_sample(
135
+ b_out_mask,
136
+ point_coords.repeat(b_out_mask.shape[0], 1, 1),
137
+ align_corners=False,
138
+ ).flatten(1)
139
+
140
+ with autocast(enabled=False):
141
+ b_out_mask = b_out_mask.float()
142
+ tgt_mask = tgt_mask.float()
143
+ # Compute the focal loss between masks
144
+ cost_mask = batch_sigmoid_ce_loss_jit(b_out_mask, tgt_mask)
145
+ # Compute the dice loss betwen masks
146
+ cost_dice = batch_dice_loss(b_out_mask, tgt_mask)
147
+
148
+ # Final cost matrix
149
+ C = (
150
+ self.cost_mask * cost_mask
151
+ + self.cost_class * cost_class
152
+ + self.cost_dice * cost_dice
153
+ )
154
+ C = C.reshape(cQ, -1).cpu()
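+ # linear_sum_assignment needs a dense CPU cost matrix: rows are queries, columns are ground-truth instances.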
155
+
156
+ indices.append(linear_sum_assignment(C))
157
+
158
+ return [
159
+ (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64))
160
+ for i, j in indices
161
+ ]
162
+
163
+ @torch.no_grad()
164
+ def forward(self, outputs, targets):
165
+ """Performs the matching
166
+
167
+ Params:
168
+ outputs: This is a dict that contains at least these entries:
169
+ "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
170
+ "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks
171
+
172
+ targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
173
+ "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
174
+ objects in the target) containing the class labels
175
+ "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks
176
+
177
+ Returns:
178
+ A list of size batch_size, containing tuples of (index_i, index_j) where:
179
+ - index_i is the indices of the selected predictions (in order)
180
+ - index_j is the indices of the corresponding selected targets (in order)
181
+ For each batch element, it holds:
182
+ len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
183
+ """
184
+ return self.memory_efficient_forward(outputs, targets)
185
+
186
+ def __repr__(self, _repr_indent=4):
187
+ head = "Matcher " + self.__class__.__name__
188
+ body = [
189
+ "cost_class: {}".format(self.cost_class),
190
+ "cost_mask: {}".format(self.cost_mask),
191
+ "cost_dice: {}".format(self.cost_dice),
192
+ ]
193
+ lines = [head] + [" " * _repr_indent + line for line in body]
194
+ return "\n".join(lines)
avism/modeling/transformer_decoder/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .avism_transformer_decoder import AVISMMultiScaleMaskedTransformerDecoder
avism/modeling/transformer_decoder/avism.py ADDED
@@ -0,0 +1,675 @@
1
+ from math import ceil
2
+ import fvcore.nn.weight_init as weight_init
3
+ from typing import Optional
4
+ import torch
5
+ from torch import nn, Tensor
6
+ from torch.nn import functional as F
7
+ import copy
8
+
9
+ from detectron2.config import configurable
10
+ from detectron2.layers import Conv2d
11
+
12
+
13
+ class SelfAttentionLayer(nn.Module):
14
+
15
+ def __init__(self, d_model, nhead, dropout=0.0,
16
+ activation="relu", normalize_before=False):
17
+ super().__init__()
18
+ self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
19
+
20
+ self.norm = nn.LayerNorm(d_model)
21
+ self.dropout = nn.Dropout(dropout)
22
+
23
+ self.activation = _get_activation_fn(activation)
24
+ self.normalize_before = normalize_before
25
+
26
+ self._reset_parameters()
27
+
28
+ def _reset_parameters(self):
29
+ for p in self.parameters():
30
+ if p.dim() > 1:
31
+ nn.init.xavier_uniform_(p)
32
+
33
+ def with_pos_embed(self, tensor, pos: Optional[Tensor]):
34
+ return tensor if pos is None else tensor + pos
35
+
36
+ def forward_post(self, tgt,
37
+ tgt_mask: Optional[Tensor] = None,
38
+ tgt_key_padding_mask: Optional[Tensor] = None,
39
+ query_pos: Optional[Tensor] = None):
40
+ q = k = self.with_pos_embed(tgt, query_pos)
41
+ tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask,
42
+ key_padding_mask=tgt_key_padding_mask)[0]
43
+ tgt = tgt + self.dropout(tgt2)
44
+ tgt = self.norm(tgt)
45
+
46
+ return tgt
47
+
48
+ def forward_pre(self, tgt,
49
+ tgt_mask: Optional[Tensor] = None,
50
+ tgt_key_padding_mask: Optional[Tensor] = None,
51
+ query_pos: Optional[Tensor] = None):
52
+ tgt2 = self.norm(tgt)
53
+ q = k = self.with_pos_embed(tgt2, query_pos)
54
+ tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
55
+ key_padding_mask=tgt_key_padding_mask)[0]
56
+ tgt = tgt + self.dropout(tgt2)
57
+
58
+ return tgt
59
+
60
+ def forward(self, tgt,
61
+ tgt_mask: Optional[Tensor] = None,
62
+ tgt_key_padding_mask: Optional[Tensor] = None,
63
+ query_pos: Optional[Tensor] = None):
64
+ if self.normalize_before:
65
+ return self.forward_pre(tgt, tgt_mask,
66
+ tgt_key_padding_mask, query_pos)
67
+ return self.forward_post(tgt, tgt_mask,
68
+ tgt_key_padding_mask, query_pos)
69
+
70
+
71
+ class CrossAttentionLayer(nn.Module):
72
+
73
+ def __init__(self, d_model, nhead, dropout=0.0,
74
+ activation="relu", normalize_before=False):
75
+ super().__init__()
76
+ self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
77
+
78
+ self.norm = nn.LayerNorm(d_model)
79
+ self.dropout = nn.Dropout(dropout)
80
+
81
+ self.activation = _get_activation_fn(activation)
82
+ self.normalize_before = normalize_before
83
+
84
+ self._reset_parameters()
85
+
86
+ def _reset_parameters(self):
87
+ for p in self.parameters():
88
+ if p.dim() > 1:
89
+ nn.init.xavier_uniform_(p)
90
+
91
+ def with_pos_embed(self, tensor, pos: Optional[Tensor]):
92
+ return tensor if pos is None else tensor + pos
93
+
94
+ def forward_post(self, tgt, memory,
95
+ memory_mask: Optional[Tensor] = None,
96
+ memory_key_padding_mask: Optional[Tensor] = None,
97
+ pos: Optional[Tensor] = None,
98
+ query_pos: Optional[Tensor] = None):
99
+ tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos),
100
+ key=self.with_pos_embed(memory, pos),
101
+ value=memory, attn_mask=memory_mask,
102
+ key_padding_mask=memory_key_padding_mask)[0]
103
+ tgt = tgt + self.dropout(tgt2)
104
+ tgt = self.norm(tgt)
105
+
106
+ return tgt
107
+
108
+ def forward_pre(self, tgt, memory,
109
+ memory_mask: Optional[Tensor] = None,
110
+ memory_key_padding_mask: Optional[Tensor] = None,
111
+ pos: Optional[Tensor] = None,
112
+ query_pos: Optional[Tensor] = None):
113
+ tgt2 = self.norm(tgt)
114
+ tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos),
115
+ key=self.with_pos_embed(memory, pos),
116
+ value=memory, attn_mask=memory_mask,
117
+ key_padding_mask=memory_key_padding_mask)[0]
118
+ tgt = tgt + self.dropout(tgt2)
119
+
120
+ return tgt
121
+
122
+ def forward(self, tgt, memory,
123
+ memory_mask: Optional[Tensor] = None,
124
+ memory_key_padding_mask: Optional[Tensor] = None,
125
+ pos: Optional[Tensor] = None,
126
+ query_pos: Optional[Tensor] = None):
127
+ if self.normalize_before:
128
+ return self.forward_pre(tgt, memory, memory_mask,
129
+ memory_key_padding_mask, pos, query_pos)
130
+ return self.forward_post(tgt, memory, memory_mask,
131
+ memory_key_padding_mask, pos, query_pos)
132
+
133
+
134
+ class FFNLayer(nn.Module):
135
+
136
+ def __init__(self, d_model, dim_feedforward=2048, dropout=0.0,
137
+ activation="relu", normalize_before=False):
138
+ super().__init__()
139
+ # Implementation of Feedforward model
140
+ self.linear1 = nn.Linear(d_model, dim_feedforward)
141
+ self.dropout = nn.Dropout(dropout)
142
+ self.linear2 = nn.Linear(dim_feedforward, d_model)
143
+
144
+ self.norm = nn.LayerNorm(d_model)
145
+
146
+ self.activation = _get_activation_fn(activation)
147
+ self.normalize_before = normalize_before
148
+
149
+ self._reset_parameters()
150
+
151
+ def _reset_parameters(self):
152
+ for p in self.parameters():
153
+ if p.dim() > 1:
154
+ nn.init.xavier_uniform_(p)
155
+
156
+ def with_pos_embed(self, tensor, pos: Optional[Tensor]):
157
+ return tensor if pos is None else tensor + pos
158
+
159
+ def forward_post(self, tgt):
160
+ tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
161
+ tgt = tgt + self.dropout(tgt2)
162
+ tgt = self.norm(tgt)
163
+ return tgt
164
+
165
+ def forward_pre(self, tgt):
166
+ tgt2 = self.norm(tgt)
167
+ tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
168
+ tgt = tgt + self.dropout(tgt2)
169
+ return tgt
170
+
171
+ def forward(self, tgt):
172
+ if self.normalize_before:
173
+ return self.forward_pre(tgt)
174
+ return self.forward_post(tgt)
175
+
176
+
177
+ def _get_activation_fn(activation):
178
+ """Return an activation function given a string"""
179
+ if activation == "relu":
180
+ return F.relu
181
+ if activation == "gelu":
182
+ return F.gelu
183
+ if activation == "glu":
184
+ return F.glu
185
+ raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
186
+
187
+
188
+ class MLP(nn.Module):
189
+ """ Very simple multi-layer perceptron (also called FFN)"""
190
+
191
+ def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
192
+ super().__init__()
193
+ self.num_layers = num_layers
194
+ h = [hidden_dim] * (num_layers - 1)
195
+ self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
196
+
197
+ def forward(self, x):
198
+ for i, layer in enumerate(self.layers):
199
+ x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
200
+ return x
201
+
202
+
203
+ def _get_clones(module, N):
204
+ return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
205
+
206
+
207
+ class Avism(nn.Module):
208
+
209
+ @configurable
210
+ def __init__(
211
+ self,
212
+ in_channels,
213
+ aux_loss,
214
+ *,
215
+ hidden_dim: int,
216
+ num_frame_queries: int,
217
+ num_queries: int,
218
+ nheads: int,
219
+ dim_feedforward: int,
220
+ enc_layers: int,
221
+ dec_layers: int,
222
+ enc_window_size: int,
223
+ pre_norm: bool,
224
+ enforce_input_project: bool,
225
+ num_frames: int,
226
+ num_classes: int,
227
+ clip_last_layer_num: bool,
228
+ conv_dim: int,
229
+ mask_dim: int,
230
+ sim_use_clip: list,
231
+ use_sim: bool,
232
+ ):
233
+ """
234
+ NOTE: this interface is experimental.
235
+ Args:
236
+ in_channels: channels of the input features
237
+ hidden_dim: Transformer feature dimension
238
+ num_queries: number of queries
239
+ nheads: number of heads
240
+ dim_feedforward: feature dimension in feedforward network
241
+ enc_layers: number of Transformer encoder layers
242
+ dec_layers: number of Transformer decoder layers
243
+ pre_norm: whether to use pre-LayerNorm or not
244
+ enforce_input_project: add input project 1x1 conv even if input
245
+ channels and hidden dim is identical
246
+ """
247
+ super().__init__()
248
+
249
+ # define Transformer decoder here
250
+ self.num_heads = nheads
251
+ self.num_layers = dec_layers
252
+ self.transformer_self_attention_layers = nn.ModuleList()
253
+ self.transformer_cross_attention_layers = nn.ModuleList()
254
+ self.transformer_ffn_layers = nn.ModuleList()
255
+ self.num_frames = num_frames
256
+ self.num_classes = num_classes
257
+ self.clip_last_layer_num = clip_last_layer_num
258
+
259
+ self.enc_layers = enc_layers
260
+ self.window_size = enc_window_size
261
+ self.sim_use_clip = sim_use_clip
262
+ self.use_sim = use_sim
263
+ self.aux_loss = aux_loss
264
+
265
+ self.av_proj = nn.Linear(128, hidden_dim)
266
+
267
+ self.enc_layers = enc_layers
268
+ if enc_layers > 0:
269
+ self.enc_self_attn = nn.ModuleList()
270
+ self.enc_ffn = nn.ModuleList()
271
+ for _ in range(self.enc_layers):
272
+ self.enc_self_attn.append(
273
+ SelfAttentionLayer(
274
+ d_model=hidden_dim,
275
+ nhead=nheads,
276
+ dropout=0.0,
277
+ normalize_before=pre_norm,
278
+ ),
279
+ )
280
+ self.enc_ffn.append(
281
+ FFNLayer(
282
+ d_model=hidden_dim,
283
+ dim_feedforward=dim_feedforward,
284
+ dropout=0.0,
285
+ normalize_before=pre_norm,
286
+ )
287
+ )
288
+
289
+ if enc_layers > 0:
290
+ self.enc_av_cross_attn = nn.ModuleList()
291
+ self.enc_av_ffn = nn.ModuleList()
292
+ for _ in range(self.enc_layers):
293
+ self.enc_av_cross_attn.append(
294
+ CrossAttentionLayer(
295
+ d_model=hidden_dim,
296
+ nhead=nheads,
297
+ dropout=0.0,
298
+ normalize_before=pre_norm,
299
+ ),
300
+ )
301
+ self.enc_av_ffn.append(
302
+ FFNLayer(
303
+ d_model=hidden_dim,
304
+ dim_feedforward=dim_feedforward,
305
+ dropout=0.0,
306
+ normalize_before=pre_norm,
307
+ )
308
+ )
309
+
310
+ for _ in range(self.num_layers):
311
+ self.transformer_self_attention_layers.append(
312
+ SelfAttentionLayer(
313
+ d_model=hidden_dim,
314
+ nhead=nheads,
315
+ dropout=0.0,
316
+ normalize_before=pre_norm,
317
+ )
318
+ )
319
+
320
+ self.transformer_cross_attention_layers.append(
321
+ CrossAttentionLayer(
322
+ d_model=hidden_dim,
323
+ nhead=nheads,
324
+ dropout=0.0,
325
+ normalize_before=pre_norm,
326
+ )
327
+ )
328
+
329
+ self.transformer_ffn_layers.append(
330
+ FFNLayer(
331
+ d_model=hidden_dim,
332
+ dim_feedforward=dim_feedforward,
333
+ dropout=0.0,
334
+ normalize_before=pre_norm,
335
+ )
336
+ )
337
+
338
+ self.avism_mask_features = Conv2d(
339
+ conv_dim,
340
+ mask_dim,
341
+ kernel_size=1,
342
+ stride=1,
343
+ padding=0,
344
+ )
345
+ weight_init.c2_xavier_fill(self.avism_mask_features)
346
+
347
+ self.decoder_norm = nn.LayerNorm(hidden_dim)
348
+
349
+ self.num_queries = num_queries
350
+ # learnable query features
351
+ self.query_feat = nn.Embedding(num_queries, hidden_dim)
352
+ # learnable query p.e.
353
+ self.query_embed = nn.Embedding(num_queries, hidden_dim)
354
+
355
+ self.fq_pos = nn.Embedding(num_frame_queries, hidden_dim)
356
+
357
+ if in_channels != hidden_dim or enforce_input_project:
358
+ self.input_proj_dec = nn.Linear(hidden_dim, hidden_dim)
359
+ else:
360
+ self.input_proj_dec = nn.Sequential()
361
+ self.src_embed = nn.Identity()
362
+
363
+ self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
364
+ self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)
365
+ if self.use_sim:
366
+ self.sim_embed_frame = nn.Linear(hidden_dim, hidden_dim)
367
+ if self.sim_use_clip:
368
+ self.sim_embed_clip = nn.Linear(hidden_dim, hidden_dim)
369
+
370
+ @classmethod
371
+ def from_config(cls, cfg, in_channels):
372
+ ret = {}
373
+ ret["in_channels"] = in_channels
374
+
375
+ ret["hidden_dim"] = cfg.MODEL.AVISM.HIDDEN_DIM
376
+ ret["num_frame_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES
377
+ ret["num_queries"] = cfg.MODEL.AVISM.NUM_OBJECT_QUERIES
378
+ # Transformer parameters:
379
+ ret["nheads"] = cfg.MODEL.AVISM.NHEADS
380
+ ret["dim_feedforward"] = cfg.MODEL.AVISM.DIM_FEEDFORWARD
381
+
382
+ assert cfg.MODEL.AVISM.DEC_LAYERS >= 1
383
+ ret["enc_layers"] = cfg.MODEL.AVISM.ENC_LAYERS
384
+ ret["dec_layers"] = cfg.MODEL.AVISM.DEC_LAYERS
385
+ ret["enc_window_size"] = cfg.MODEL.AVISM.ENC_WINDOW_SIZE
386
+ ret["pre_norm"] = cfg.MODEL.AVISM.PRE_NORM
387
+ ret["enforce_input_project"] = cfg.MODEL.AVISM.ENFORCE_INPUT_PROJ
388
+
389
+ ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES
390
+ ret["num_frames"] = cfg.INPUT.SAMPLING_FRAME_NUM
391
+ ret["clip_last_layer_num"] = cfg.MODEL.AVISM.LAST_LAYER_NUM
392
+
393
+ ret["conv_dim"] = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
394
+ ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
395
+ ret["sim_use_clip"] = cfg.MODEL.AVISM.SIM_USE_CLIP
396
+ ret["use_sim"] = cfg.MODEL.AVISM.SIM_WEIGHT > 0.0
397
+
398
+ return ret
399
+
400
+ def forward(self, frame_query, audio_features):
401
+ """
402
+ L: Number of Layers.
403
+ B: Batch size.
404
+ T: Temporal window size. Number of frames per video.
405
+ C: Channel size.
406
+ fQ: Number of frame-wise queries from IFC.
407
+ cQ: Number of clip-wise queries to decode Q.
408
+ """
409
+ if not self.training:
410
+ frame_query = frame_query[[-1]]
411
+
412
+ L, BT, fQ, C = frame_query.shape
413
+ B = BT // self.num_frames if self.training else 1
414
+ T = self.num_frames if self.training else BT // B
415
+
416
+ frame_query = frame_query.reshape(L * B, T, fQ, C)
417
+ frame_query = frame_query.permute(1, 2, 0, 3).contiguous()
418
+ frame_query = self.input_proj_dec(frame_query) # T, fQ, LB, C
419
+
420
+ audio_feat = self.av_proj(audio_features) # T, C
421
+ audio_feat = audio_feat[:, None, None, :].repeat(1, fQ, L * B, 1)
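+ # Broadcast the per-frame audio embedding so every frame query sees the same audio vector: T, fQ, LB, C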
422
+
423
+ if self.window_size > 0:
424
+ pad = int(ceil(T / self.window_size)) * self.window_size - T
425
+ _T = pad + T
426
+ frame_query = F.pad(frame_query, (0, 0, 0, 0, 0, 0, 0, pad)) # _T, fQ, LB, C
427
+ audio_feat = F.pad(audio_feat, (0, 0, 0, 0, 0, 0, 0, pad))
428
+ enc_mask = frame_query.new_ones(L * B, _T).bool() # LB, _T
429
+ enc_mask[:, :T] = False
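+ # True marks padded frames (index >= T) that the encoder attention should ignore.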
430
+ else:
431
+ enc_mask = None
432
+
433
+ frame_query = self.encode_frame_query(frame_query, enc_mask)
434
+
435
+ # audio
436
+ av_feat = self.encode_av_fusion(frame_query, enc_mask, audio_feat)
437
+
438
+ frame_query = frame_query[:T].flatten(0, 1) # TfQ, LB, C
439
+ av_feat = av_feat[:T].flatten(0, 1)
440
+ frame_query = frame_query + av_feat
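+ # Residual fusion: the audio-conditioned features are added back onto the visual frame queries.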
441
+
442
+ if self.use_sim:
443
+ pred_fq_embed = self.sim_embed_frame(frame_query) # TfQ, LB, C
444
+ pred_fq_embed = pred_fq_embed.transpose(0, 1).reshape(L, B, T, fQ, C)
445
+ else:
446
+ pred_fq_embed = None
447
+
448
+ src = self.src_embed(frame_query) # TfQ, LB, C
449
+ dec_pos = self.fq_pos.weight[None, :, None, :].repeat(T, 1, L * B, 1).flatten(0, 1) # TfQ, LB, C
450
+
451
+ # QxNxC
452
+ query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, L * B, 1) # cQ, LB, C
453
+ output = self.query_feat.weight.unsqueeze(1).repeat(1, L * B, 1) # cQ, LB, C
454
+
455
+ decoder_outputs = []
456
+ for i in range(self.num_layers):
457
+ # attention: cross-attention first
458
+ output = self.transformer_cross_attention_layers[i](
459
+ output, src,
460
+ memory_mask=None,
461
+ memory_key_padding_mask=None,
462
+ pos=dec_pos, query_pos=query_embed
463
+ )
464
+
465
+ output = self.transformer_self_attention_layers[i](
466
+ output, tgt_mask=None,
467
+ tgt_key_padding_mask=None,
468
+ query_pos=query_embed
469
+ )
470
+
471
+ # FFN
472
+ output = self.transformer_ffn_layers[i](
473
+ output
474
+ )
475
+
476
+ if (self.training and self.aux_loss) or (i == self.num_layers - 1):
477
+ dec_out = self.decoder_norm(output) # cQ, LB, C
478
+ dec_out = dec_out.transpose(0, 1) # LB, cQ, C
479
+ decoder_outputs.append(dec_out.view(L, B, self.num_queries, C))
480
+
481
+ decoder_outputs = torch.stack(decoder_outputs, dim=0) # D, L, B, cQ, C
482
+
483
+ pred_cls = self.class_embed(decoder_outputs)
484
+ pred_mask_embed = self.mask_embed(decoder_outputs)
485
+ if self.use_sim and self.sim_use_clip:
486
+ pred_cq_embed = self.sim_embed_clip(decoder_outputs)
487
+ else:
488
+ pred_cq_embed = [None] * self.num_layers
489
+
490
+ out = {
491
+ 'pred_logits': pred_cls[-1],
492
+ 'pred_mask_embed': pred_mask_embed[-1],
493
+ 'pred_fq_embed': pred_fq_embed,
494
+ 'pred_cq_embed': pred_cq_embed[-1],
495
+ 'aux_outputs': self._set_aux_loss(
496
+ pred_cls, pred_mask_embed, pred_cq_embed, pred_fq_embed
497
+ )
498
+ }
499
+ return out
500
+
501
+ @torch.jit.unused
502
+ def _set_aux_loss(
503
+ self, outputs_cls, outputs_mask_embed, outputs_cq_embed, outputs_fq_embed
504
+ ):
505
+ return [{"pred_logits": a, "pred_mask_embed": b, "pred_cq_embed": c, "pred_fq_embed": outputs_fq_embed}
506
+ for a, b, c in zip(outputs_cls[:-1], outputs_mask_embed[:-1], outputs_cq_embed[:-1])]
507
+
508
+ def encode_frame_query(self, frame_query, attn_mask):
509
+ """
510
+ input shape (frame_query) : T, fQ, LB, C
511
+ output shape (frame_query) : T, fQ, LB, C
512
+ """
513
+
514
+ # Not using window-based attention if self.window_size == 0.
515
+ if self.window_size == 0:
516
+ return_shape = frame_query.shape # T, fQ, LB, C
517
+ frame_query = frame_query.flatten(0, 1) # TfQ, LB, C
518
+
519
+ for i in range(self.enc_layers):
520
+ frame_query = self.enc_self_attn[i](frame_query)
521
+ frame_query = self.enc_ffn[i](frame_query)
522
+
523
+ frame_query = frame_query.view(return_shape)
524
+ return frame_query
525
+ # Using window-based attention if self.window_size > 0.
526
+ else:
527
+ T, fQ, LB, C = frame_query.shape
528
+ W = self.window_size
529
+ Nw = T // W
530
+ half_W = int(ceil(W / 2))
531
+
532
+ window_mask = attn_mask.view(LB * Nw, W)[..., None].repeat(1, 1, fQ).flatten(1)
533
+
534
+ _attn_mask = torch.roll(attn_mask, half_W, 1)
535
+ _attn_mask = _attn_mask.view(LB, Nw, W)[..., None].repeat(1, 1, 1, W) # LB, Nw, W, W
536
+ _attn_mask[:, 0] = _attn_mask[:, 0] | _attn_mask[:, 0].transpose(-2, -1)
537
+ _attn_mask[:, -1] = _attn_mask[:, -1] | _attn_mask[:, -1].transpose(-2, -1)
538
+ _attn_mask[:, 0, :half_W, half_W:] = True
539
+ _attn_mask[:, 0, half_W:, :half_W] = True
540
+ _attn_mask = _attn_mask.view(LB * Nw, 1, W, 1, W, 1).repeat(1, self.num_heads, 1, fQ, 1, fQ).view(
541
+ LB * Nw * self.num_heads, W * fQ, W * fQ)
542
+ shift_window_mask = _attn_mask.float() * -1000
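+ # Additive mask with large negative values blocks attention across the shifted-window boundary.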
543
+
544
+ for layer_idx in range(self.enc_layers):
545
+ if self.training or layer_idx % 2 == 0:
546
+ frame_query = self._window_attn(frame_query, window_mask, layer_idx)
547
+ else:
548
+ frame_query = self._shift_window_attn(frame_query, shift_window_mask, layer_idx)
549
+ return frame_query
550
+
551
+ def _window_attn(self, frame_query, attn_mask, layer_idx):
552
+ T, fQ, LB, C = frame_query.shape
553
+ # LBN, WTfQ = attn_mask.shape
554
+
555
+ W = self.window_size
556
+ Nw = T // W
557
+
558
+ frame_query = frame_query.view(Nw, W, fQ, LB, C)
559
+ frame_query = frame_query.permute(1, 2, 3, 0, 4).reshape(W * fQ, LB * Nw, C)
560
+
561
+ frame_query = self.enc_self_attn[layer_idx](frame_query, tgt_key_padding_mask=attn_mask)
562
+ frame_query = self.enc_ffn[layer_idx](frame_query)
563
+ frame_query = frame_query.reshape(W, fQ, LB, Nw, C).permute(3, 0, 1, 2, 4).reshape(T, fQ, LB, C)
564
+
565
+ return frame_query
566
+
567
+ def _shift_window_attn(self, frame_query, attn_mask, layer_idx):
568
+ T, fQ, LB, C = frame_query.shape
569
+ # LBNH, WfQ, WfQ = attn_mask.shape
570
+
571
+ W = self.window_size
572
+ Nw = T // W
573
+ half_W = int(ceil(W / 2))
574
+
575
+ frame_query = torch.roll(frame_query, half_W, 0)
576
+ frame_query = frame_query.view(Nw, W, fQ, LB, C)
577
+ frame_query = frame_query.permute(1, 2, 3, 0, 4).reshape(W * fQ, LB * Nw, C)
578
+
579
+ frame_query = self.enc_self_attn[layer_idx](frame_query, tgt_mask=attn_mask)
580
+ frame_query = self.enc_ffn[layer_idx](frame_query)
581
+ frame_query = frame_query.reshape(W, fQ, LB, Nw, C).permute(3, 0, 1, 2, 4).reshape(T, fQ, LB, C)
582
+
583
+ frame_query = torch.roll(frame_query, -half_W, 0)
584
+
585
+ return frame_query
586
+
587
+ def encode_av_fusion(self, frame_query, attn_mask, audio_feats):
588
+ """
589
+ input shape (frame_query) : T, fQ, LB, C
590
+ output shape (audio_feats) : T, fQ, LB, C
591
+ """
592
+
593
+ # Not using window-based attention if self.window_size == 0.
594
+ if self.window_size == 0:
595
+ return_shape = frame_query.shape # T, fQ, LB, C
596
+ frame_query = frame_query.flatten(0, 1) # TfQ, LB, C
597
+ audio_feats = audio_feats.flatten(0, 1)
598
+
599
+ for i in range(self.enc_layers):
600
+ audio_feats = self.enc_av_cross_attn[i](audio_feats, frame_query)
601
+ audio_feats = self.enc_av_ffn[i](audio_feats)
602
+
603
+ audio_feats = audio_feats.view(return_shape)
604
+ return audio_feats
605
+ # Using window-based attention if self.window_size > 0.
606
+ else:
607
+ T, fQ, LB, C = frame_query.shape
608
+ W = self.window_size
609
+ Nw = T // W
610
+ half_W = int(ceil(W / 2))
611
+
612
+ window_mask = attn_mask.view(LB * Nw, W)[..., None].repeat(1, 1, fQ).flatten(1)
613
+
614
+ _attn_mask = torch.roll(attn_mask, half_W, 1)
615
+ _attn_mask = _attn_mask.view(LB, Nw, W)[..., None].repeat(1, 1, 1, W) # LB, Nw, W, W
616
+ _attn_mask[:, 0] = _attn_mask[:, 0] | _attn_mask[:, 0].transpose(-2, -1)
617
+ _attn_mask[:, -1] = _attn_mask[:, -1] | _attn_mask[:, -1].transpose(-2, -1)
618
+ _attn_mask[:, 0, :half_W, half_W:] = True
619
+ _attn_mask[:, 0, half_W:, :half_W] = True
620
+ _attn_mask = _attn_mask.view(LB * Nw, 1, W, 1, W, 1).repeat(1, self.num_heads, 1, fQ, 1, fQ).view(
621
+ LB * Nw * self.num_heads, W * fQ, W * fQ)
622
+ shift_window_mask = _attn_mask.float() * -1000
623
+
624
+ for layer_idx in range(self.enc_layers):
625
+ if layer_idx % 2 == 0:
626
+ frame_query, audio_feats = self._window_av_attn(frame_query, window_mask, layer_idx, audio_feats)
627
+ else:
628
+ frame_query, audio_feats = self._shift_window_av_attn(frame_query, shift_window_mask, layer_idx, audio_feats)
629
+ return audio_feats
630
+
631
+ def _window_av_attn(self, frame_query, attn_mask, layer_idx, audio_feats):
632
+ T, fQ, LB, C = frame_query.shape
633
+
634
+ W = self.window_size
635
+ Nw = T // W
636
+
637
+ frame_query = frame_query.view(Nw, W, fQ, LB, C)
638
+ frame_query = frame_query.permute(1, 2, 3, 0, 4).reshape(W * fQ, LB * Nw, C)
639
+
640
+ audio_feats = audio_feats.view(Nw, W, fQ, LB, C)
641
+ audio_feats = audio_feats.permute(1, 2, 3, 0, 4).reshape(W * fQ, LB * Nw, C)
642
+
643
+ audio_feats = self.enc_av_cross_attn[layer_idx](audio_feats, frame_query, memory_key_padding_mask=attn_mask)
644
+ audio_feats = self.enc_av_ffn[layer_idx](audio_feats)
645
+
646
+ frame_query = frame_query.reshape(W, fQ, LB, Nw, C).permute(3, 0, 1, 2, 4).reshape(T, fQ, LB, C)
647
+ audio_feats = audio_feats.reshape(W, fQ, LB, Nw, C).permute(3, 0, 1, 2, 4).reshape(T, fQ, LB, C)
648
+
649
+ return frame_query, audio_feats
650
+
651
+ def _shift_window_av_attn(self, frame_query, attn_mask, layer_idx, audio_feats):
652
+ T, fQ, LB, C = frame_query.shape
653
+
654
+ W = self.window_size
655
+ Nw = T // W
656
+ half_W = int(ceil(W / 2))
657
+
658
+ frame_query = torch.roll(frame_query, half_W, 0)
659
+ frame_query = frame_query.view(Nw, W, fQ, LB, C)
660
+ frame_query = frame_query.permute(1, 2, 3, 0, 4).reshape(W * fQ, LB * Nw, C)
661
+
662
+ audio_feats = torch.roll(audio_feats, half_W, 0)
663
+ audio_feats = audio_feats.view(Nw, W, fQ, LB, C)
664
+ audio_feats = audio_feats.permute(1, 2, 3, 0, 4).reshape(W * fQ, LB * Nw, C)
665
+
666
+ audio_feats = self.enc_av_cross_attn[layer_idx](audio_feats, frame_query, memory_mask=attn_mask)
667
+ audio_feats = self.enc_av_ffn[layer_idx](audio_feats)
668
+
669
+ frame_query = frame_query.reshape(W, fQ, LB, Nw, C).permute(3, 0, 1, 2, 4).reshape(T, fQ, LB, C)
670
+ frame_query = torch.roll(frame_query, -half_W, 0)
671
+
672
+ audio_feats = audio_feats.reshape(W, fQ, LB, Nw, C).permute(3, 0, 1, 2, 4).reshape(T, fQ, LB, C)
673
+ audio_feats = torch.roll(audio_feats, -half_W, 0)
674
+
675
+ return frame_query, audio_feats
avism/modeling/transformer_decoder/avism_coco.py ADDED
@@ -0,0 +1,675 @@
1
+ from math import ceil
2
+ import fvcore.nn.weight_init as weight_init
3
+ from typing import Optional
4
+ import torch
5
+ from torch import nn, Tensor
6
+ from torch.nn import functional as F
7
+ import copy
8
+
9
+ from detectron2.config import configurable
10
+ from detectron2.layers import Conv2d
11
+
12
+
13
+ class SelfAttentionLayer(nn.Module):
14
+
15
+ def __init__(self, d_model, nhead, dropout=0.0,
16
+ activation="relu", normalize_before=False):
17
+ super().__init__()
18
+ self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
19
+
20
+ self.norm = nn.LayerNorm(d_model)
21
+ self.dropout = nn.Dropout(dropout)
22
+
23
+ self.activation = _get_activation_fn(activation)
24
+ self.normalize_before = normalize_before
25
+
26
+ self._reset_parameters()
27
+
28
+ def _reset_parameters(self):
29
+ for p in self.parameters():
30
+ if p.dim() > 1:
31
+ nn.init.xavier_uniform_(p)
32
+
33
+ def with_pos_embed(self, tensor, pos: Optional[Tensor]):
34
+ return tensor if pos is None else tensor + pos
35
+
36
+ def forward_post(self, tgt,
37
+ tgt_mask: Optional[Tensor] = None,
38
+ tgt_key_padding_mask: Optional[Tensor] = None,
39
+ query_pos: Optional[Tensor] = None):
40
+ q = k = self.with_pos_embed(tgt, query_pos)
41
+ tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask,
42
+ key_padding_mask=tgt_key_padding_mask)[0]
43
+ tgt = tgt + self.dropout(tgt2)
44
+ tgt = self.norm(tgt)
45
+
46
+ return tgt
47
+
48
+ def forward_pre(self, tgt,
49
+ tgt_mask: Optional[Tensor] = None,
50
+ tgt_key_padding_mask: Optional[Tensor] = None,
51
+ query_pos: Optional[Tensor] = None):
52
+ tgt2 = self.norm(tgt)
53
+ q = k = self.with_pos_embed(tgt2, query_pos)
54
+ tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
55
+ key_padding_mask=tgt_key_padding_mask)[0]
56
+ tgt = tgt + self.dropout(tgt2)
57
+
58
+ return tgt
59
+
60
+ def forward(self, tgt,
61
+ tgt_mask: Optional[Tensor] = None,
62
+ tgt_key_padding_mask: Optional[Tensor] = None,
63
+ query_pos: Optional[Tensor] = None):
64
+ if self.normalize_before:
65
+ return self.forward_pre(tgt, tgt_mask,
66
+ tgt_key_padding_mask, query_pos)
67
+ return self.forward_post(tgt, tgt_mask,
68
+ tgt_key_padding_mask, query_pos)
69
+
70
+
71
+ class CrossAttentionLayer(nn.Module):
72
+
73
+ def __init__(self, d_model, nhead, dropout=0.0,
74
+ activation="relu", normalize_before=False):
75
+ super().__init__()
76
+ self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
77
+
78
+ self.norm = nn.LayerNorm(d_model)
79
+ self.dropout = nn.Dropout(dropout)
80
+
81
+ self.activation = _get_activation_fn(activation)
82
+ self.normalize_before = normalize_before
83
+
84
+ self._reset_parameters()
85
+
86
+ def _reset_parameters(self):
87
+ for p in self.parameters():
88
+ if p.dim() > 1:
89
+ nn.init.xavier_uniform_(p)
90
+
91
+ def with_pos_embed(self, tensor, pos: Optional[Tensor]):
92
+ return tensor if pos is None else tensor + pos
93
+
94
+ def forward_post(self, tgt, memory,
95
+ memory_mask: Optional[Tensor] = None,
96
+ memory_key_padding_mask: Optional[Tensor] = None,
97
+ pos: Optional[Tensor] = None,
98
+ query_pos: Optional[Tensor] = None):
99
+ tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos),
100
+ key=self.with_pos_embed(memory, pos),
101
+ value=memory, attn_mask=memory_mask,
102
+ key_padding_mask=memory_key_padding_mask)[0]
103
+ tgt = tgt + self.dropout(tgt2)
104
+ tgt = self.norm(tgt)
105
+
106
+ return tgt
107
+
108
+ def forward_pre(self, tgt, memory,
109
+ memory_mask: Optional[Tensor] = None,
110
+ memory_key_padding_mask: Optional[Tensor] = None,
111
+ pos: Optional[Tensor] = None,
112
+ query_pos: Optional[Tensor] = None):
113
+ tgt2 = self.norm(tgt)
114
+ tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos),
115
+ key=self.with_pos_embed(memory, pos),
116
+ value=memory, attn_mask=memory_mask,
117
+ key_padding_mask=memory_key_padding_mask)[0]
118
+ tgt = tgt + self.dropout(tgt2)
119
+
120
+ return tgt
121
+
122
+ def forward(self, tgt, memory,
123
+ memory_mask: Optional[Tensor] = None,
124
+ memory_key_padding_mask: Optional[Tensor] = None,
125
+ pos: Optional[Tensor] = None,
126
+ query_pos: Optional[Tensor] = None):
127
+ if self.normalize_before:
128
+ return self.forward_pre(tgt, memory, memory_mask,
129
+ memory_key_padding_mask, pos, query_pos)
130
+ return self.forward_post(tgt, memory, memory_mask,
131
+ memory_key_padding_mask, pos, query_pos)
132
+
133
+
134
+ class FFNLayer(nn.Module):
135
+
136
+ def __init__(self, d_model, dim_feedforward=2048, dropout=0.0,
137
+ activation="relu", normalize_before=False):
138
+ super().__init__()
139
+ # Implementation of Feedforward model
140
+ self.linear1 = nn.Linear(d_model, dim_feedforward)
141
+ self.dropout = nn.Dropout(dropout)
142
+ self.linear2 = nn.Linear(dim_feedforward, d_model)
143
+
144
+ self.norm = nn.LayerNorm(d_model)
145
+
146
+ self.activation = _get_activation_fn(activation)
147
+ self.normalize_before = normalize_before
148
+
149
+ self._reset_parameters()
150
+
151
+ def _reset_parameters(self):
152
+ for p in self.parameters():
153
+ if p.dim() > 1:
154
+ nn.init.xavier_uniform_(p)
155
+
156
+ def with_pos_embed(self, tensor, pos: Optional[Tensor]):
157
+ return tensor if pos is None else tensor + pos
158
+
159
+ def forward_post(self, tgt):
160
+ tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
161
+ tgt = tgt + self.dropout(tgt2)
162
+ tgt = self.norm(tgt)
163
+ return tgt
164
+
165
+ def forward_pre(self, tgt):
166
+ tgt2 = self.norm(tgt)
167
+ tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
168
+ tgt = tgt + self.dropout(tgt2)
169
+ return tgt
170
+
171
+ def forward(self, tgt):
172
+ if self.normalize_before:
173
+ return self.forward_pre(tgt)
174
+ return self.forward_post(tgt)
175
+
176
+
177
+ def _get_activation_fn(activation):
178
+ """Return an activation function given a string"""
179
+ if activation == "relu":
180
+ return F.relu
181
+ if activation == "gelu":
182
+ return F.gelu
183
+ if activation == "glu":
184
+ return F.glu
185
+ raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
186
+
187
+
188
+ class MLP(nn.Module):
189
+ """ Very simple multi-layer perceptron (also called FFN)"""
190
+
191
+ def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
192
+ super().__init__()
193
+ self.num_layers = num_layers
194
+ h = [hidden_dim] * (num_layers - 1)
195
+ self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
196
+
197
+ def forward(self, x):
198
+ for i, layer in enumerate(self.layers):
199
+ x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
200
+ return x
201
+
202
+
203
+ def _get_clones(module, N):
204
+ return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
205
+
206
+
207
+ class Avism_COCO(nn.Module):
208
+
209
+ @configurable
210
+ def __init__(
211
+ self,
212
+ in_channels,
213
+ aux_loss,
214
+ *,
215
+ hidden_dim: int,
216
+ num_frame_queries: int,
217
+ num_queries: int,
218
+ nheads: int,
219
+ dim_feedforward: int,
220
+ enc_layers: int,
221
+ dec_layers: int,
222
+ enc_window_size: int,
223
+ pre_norm: bool,
224
+ enforce_input_project: bool,
225
+ num_frames: int,
226
+ num_classes: int,
227
+ clip_last_layer_num: bool,
228
+ conv_dim: int,
229
+ mask_dim: int,
230
+ sim_use_clip: list,
231
+ use_sim: bool,
232
+ ):
233
+ """
234
+ NOTE: this interface is experimental.
235
+ Args:
236
+ in_channels: channels of the input features
237
+ hidden_dim: Transformer feature dimension
238
+ num_queries: number of queries
239
+ nheads: number of heads
240
+ dim_feedforward: feature dimension in feedforward network
241
+ enc_layers: number of Transformer encoder layers
242
+ dec_layers: number of Transformer decoder layers
243
+ pre_norm: whether to use pre-LayerNorm or not
244
+ enforce_input_project: add input project 1x1 conv even if input
245
+ channels and hidden dim is identical
246
+ """
247
+ super().__init__()
248
+
249
+ # define Transformer decoder here
250
+ self.num_heads = nheads
251
+ self.num_layers = dec_layers
252
+ self.transformer_self_attention_layers = nn.ModuleList()
253
+ self.transformer_cross_attention_layers = nn.ModuleList()
254
+ self.transformer_ffn_layers = nn.ModuleList()
255
+ self.num_frames = num_frames
256
+ self.num_classes = num_classes
257
+ self.clip_last_layer_num = clip_last_layer_num
258
+
259
+ self.enc_layers = enc_layers
260
+ self.window_size = enc_window_size
261
+ self.sim_use_clip = sim_use_clip
262
+ self.use_sim = use_sim
263
+ self.aux_loss = aux_loss
264
+
265
+ self.av_proj = nn.Linear(128, hidden_dim)
266
+
267
+ self.enc_layers = enc_layers
268
+ if enc_layers > 0:
269
+ self.enc_self_attn = nn.ModuleList()
270
+ self.enc_ffn = nn.ModuleList()
271
+ for _ in range(self.enc_layers):
272
+ self.enc_self_attn.append(
273
+ SelfAttentionLayer(
274
+ d_model=hidden_dim,
275
+ nhead=nheads,
276
+ dropout=0.0,
277
+ normalize_before=pre_norm,
278
+ ),
279
+ )
280
+ self.enc_ffn.append(
281
+ FFNLayer(
282
+ d_model=hidden_dim,
283
+ dim_feedforward=dim_feedforward,
284
+ dropout=0.0,
285
+ normalize_before=pre_norm,
286
+ )
287
+ )
288
+
289
+ if enc_layers > 0:
290
+ self.enc_av_cross_attn = nn.ModuleList()
291
+ self.enc_av_ffn = nn.ModuleList()
292
+ for _ in range(self.enc_layers):
293
+ self.enc_av_cross_attn.append(
294
+ CrossAttentionLayer(
295
+ d_model=hidden_dim,
296
+ nhead=nheads,
297
+ dropout=0.0,
298
+ normalize_before=pre_norm,
299
+ ),
300
+ )
301
+ self.enc_av_ffn.append(
302
+ FFNLayer(
303
+ d_model=hidden_dim,
304
+ dim_feedforward=dim_feedforward,
305
+ dropout=0.0,
306
+ normalize_before=pre_norm,
307
+ )
308
+ )
309
+
310
+ for _ in range(self.num_layers):
311
+ self.transformer_self_attention_layers.append(
312
+ SelfAttentionLayer(
313
+ d_model=hidden_dim,
314
+ nhead=nheads,
315
+ dropout=0.0,
316
+ normalize_before=pre_norm,
317
+ )
318
+ )
319
+
320
+ self.transformer_cross_attention_layers.append(
321
+ CrossAttentionLayer(
322
+ d_model=hidden_dim,
323
+ nhead=nheads,
324
+ dropout=0.0,
325
+ normalize_before=pre_norm,
326
+ )
327
+ )
328
+
329
+ self.transformer_ffn_layers.append(
330
+ FFNLayer(
331
+ d_model=hidden_dim,
332
+ dim_feedforward=dim_feedforward,
333
+ dropout=0.0,
334
+ normalize_before=pre_norm,
335
+ )
336
+ )
337
+
338
+ self.vita_mask_features = Conv2d(
339
+ conv_dim,
340
+ mask_dim,
341
+ kernel_size=1,
342
+ stride=1,
343
+ padding=0,
344
+ )
345
+ weight_init.c2_xavier_fill(self.vita_mask_features)
346
+
347
+ self.decoder_norm = nn.LayerNorm(hidden_dim)
348
+
349
+ self.num_queries = num_queries
350
+ # learnable query features
351
+ self.query_feat = nn.Embedding(num_queries, hidden_dim)
352
+ # learnable query p.e.
353
+ self.query_embed = nn.Embedding(num_queries, hidden_dim)
354
+
355
+ self.fq_pos = nn.Embedding(num_frame_queries, hidden_dim)
356
+
357
+ if in_channels != hidden_dim or enforce_input_project:
358
+ self.input_proj_dec = nn.Linear(hidden_dim, hidden_dim)
359
+ else:
360
+ self.input_proj_dec = nn.Sequential()
361
+ self.src_embed = nn.Identity()
362
+
363
+ self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
364
+ self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)
365
+ if self.use_sim:
366
+ self.sim_embed_frame = nn.Linear(hidden_dim, hidden_dim)
367
+ if self.sim_use_clip:
368
+ self.sim_embed_clip = nn.Linear(hidden_dim, hidden_dim)
369
+
370
+ @classmethod
371
+ def from_config(cls, cfg, in_channels):
372
+ ret = {}
373
+ ret["in_channels"] = in_channels
374
+
375
+ ret["hidden_dim"] = cfg.MODEL.AVISM.HIDDEN_DIM
376
+ ret["num_frame_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES
377
+ ret["num_queries"] = cfg.MODEL.AVISM.NUM_OBJECT_QUERIES
378
+ # Transformer parameters:
379
+ ret["nheads"] = cfg.MODEL.AVISM.NHEADS
380
+ ret["dim_feedforward"] = cfg.MODEL.AVISM.DIM_FEEDFORWARD
381
+
382
+ assert cfg.MODEL.AVISM.DEC_LAYERS >= 1
383
+ ret["enc_layers"] = cfg.MODEL.AVISM.ENC_LAYERS
384
+ ret["dec_layers"] = cfg.MODEL.AVISM.DEC_LAYERS
385
+ ret["enc_window_size"] = cfg.MODEL.AVISM.ENC_WINDOW_SIZE
386
+ ret["pre_norm"] = cfg.MODEL.AVISM.PRE_NORM
387
+ ret["enforce_input_project"] = cfg.MODEL.AVISM.ENFORCE_INPUT_PROJ
388
+
389
+ ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES
390
+ ret["num_frames"] = cfg.INPUT.SAMPLING_FRAME_NUM
391
+ ret["clip_last_layer_num"] = cfg.MODEL.AVISM.LAST_LAYER_NUM
392
+
393
+ ret["conv_dim"] = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
394
+ ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
395
+ ret["sim_use_clip"] = cfg.MODEL.AVISM.SIM_USE_CLIP
396
+ ret["use_sim"] = cfg.MODEL.AVISM.SIM_WEIGHT > 0.0
397
+
398
+ return ret
399
+
400
+ def forward(self, frame_query, audio_features):
401
+ """
402
+ L: Number of Layers.
403
+ B: Batch size.
404
+ T: Temporal window size. Number of frames per video.
405
+ C: Channel size.
406
+ fQ: Number of frame-wise queries from IFC.
407
+ cQ: Number of clip-wise queries to decode Q.
408
+ """
409
+ if not self.training:
410
+ frame_query = frame_query[[-1]]
411
+
412
+ L, BT, fQ, C = frame_query.shape
413
+ B = BT // self.num_frames if self.training else 1
414
+ T = self.num_frames if self.training else BT // B
415
+
416
+ frame_query = frame_query.reshape(L * B, T, fQ, C)
417
+ frame_query = frame_query.permute(1, 2, 0, 3).contiguous()
418
+ frame_query = self.input_proj_dec(frame_query) # T, fQ, LB, C
419
+
420
+ audio_feat = self.av_proj(audio_features) # T, C
421
+ audio_feat = audio_feat[:, None, None, :].repeat(1, fQ, L * B, 1)
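+ # Broadcast the per-frame audio embedding across frame queries: T, fQ, LB, C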
422
+
423
+ if self.window_size > 0:
424
+ pad = int(ceil(T / self.window_size)) * self.window_size - T
425
+ _T = pad + T
426
+ frame_query = F.pad(frame_query, (0, 0, 0, 0, 0, 0, 0, pad)) # _T, fQ, LB, C
427
+ audio_feat = F.pad(audio_feat, (0, 0, 0, 0, 0, 0, 0, pad))
428
+ enc_mask = frame_query.new_ones(L * B, _T).bool() # LB, _T
429
+ enc_mask[:, :T] = False
430
+ else:
431
+ enc_mask = None
432
+
433
+ frame_query = self.encode_frame_query(frame_query, enc_mask)
434
+
435
+ # audio
436
+ av_feat = self.encode_av_fusion(frame_query, enc_mask, audio_feat)
437
+
438
+ frame_query = frame_query[:T].flatten(0, 1) # TfQ, LB, C
439
+ av_feat = av_feat[:T].flatten(0, 1)
440
+ frame_query = frame_query + av_feat
441
+
442
+ if self.use_sim:
443
+ pred_fq_embed = self.sim_embed_frame(frame_query) # TfQ, LB, C
444
+ pred_fq_embed = pred_fq_embed.transpose(0, 1).reshape(L, B, T, fQ, C)
445
+ else:
446
+ pred_fq_embed = None
447
+
448
+ src = self.src_embed(frame_query) # TfQ, LB, C
449
+ dec_pos = self.fq_pos.weight[None, :, None, :].repeat(T, 1, L * B, 1).flatten(0, 1) # TfQ, LB, C
450
+
451
+ # QxNxC
452
+ query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, L * B, 1) # cQ, LB, C
453
+ output = self.query_feat.weight.unsqueeze(1).repeat(1, L * B, 1) # cQ, LB, C
454
+
455
+ decoder_outputs = []
456
+ for i in range(self.num_layers):
457
+ # attention: cross-attention first
458
+ output = self.transformer_cross_attention_layers[i](
459
+ output, src,
460
+ memory_mask=None,
461
+ memory_key_padding_mask=None,
462
+ pos=dec_pos, query_pos=query_embed
463
+ )
464
+
465
+ output = self.transformer_self_attention_layers[i](
466
+ output, tgt_mask=None,
467
+ tgt_key_padding_mask=None,
468
+ query_pos=query_embed
469
+ )
470
+
471
+ # FFN
472
+ output = self.transformer_ffn_layers[i](
473
+ output
474
+ )
475
+
476
+ if (self.training and self.aux_loss) or (i == self.num_layers - 1):
477
+ dec_out = self.decoder_norm(output) # cQ, LB, C
478
+ dec_out = dec_out.transpose(0, 1) # LB, cQ, C
479
+ decoder_outputs.append(dec_out.view(L, B, self.num_queries, C))
480
+
481
+ decoder_outputs = torch.stack(decoder_outputs, dim=0) # D, L, B, cQ, C
482
+
483
+ pred_cls = self.class_embed(decoder_outputs)
484
+ pred_mask_embed = self.mask_embed(decoder_outputs)
485
+ if self.use_sim and self.sim_use_clip:
486
+ pred_cq_embed = self.sim_embed_clip(decoder_outputs)
487
+ else:
488
+ pred_cq_embed = [None] * self.num_layers
489
+
490
+        out = {
+            'pred_logits': pred_cls[-1],
+            'pred_mask_embed': pred_mask_embed[-1],
+            'pred_fq_embed': pred_fq_embed,
+            'pred_cq_embed': pred_cq_embed[-1],
+            'aux_outputs': self._set_aux_loss(
+                pred_cls, pred_mask_embed, pred_cq_embed, pred_fq_embed
+            )
+        }
+        return out
+
+    @torch.jit.unused
+    def _set_aux_loss(
+        self, outputs_cls, outputs_mask_embed, outputs_cq_embed, outputs_fq_embed
+    ):
+        return [{"pred_logits": a, "pred_mask_embed": b, "pred_cq_embed": c, "pred_fq_embed": outputs_fq_embed}
+                for a, b, c in zip(outputs_cls[:-1], outputs_mask_embed[:-1], outputs_cq_embed[:-1])]
+
+    def encode_frame_query(self, frame_query, attn_mask):
+        """
+        input shape (frame_query) : T, fQ, LB, C
+        output shape (frame_query) : T, fQ, LB, C
+        """
+
+        # No window-based attention when self.window_size == 0: attend over all frames at once.
+        if self.window_size == 0:
+            return_shape = frame_query.shape  # T, fQ, LB, C
+            frame_query = frame_query.flatten(0, 1)  # TfQ, LB, C
+
+            for i in range(self.enc_layers):
+                frame_query = self.enc_self_attn[i](frame_query)
+                frame_query = self.enc_ffn[i](frame_query)
+
+            frame_query = frame_query.view(return_shape)
+            return frame_query
+        # Window-based attention when self.window_size > 0.
+        else:
+            T, fQ, LB, C = frame_query.shape
+            W = self.window_size
+            Nw = T // W
+            half_W = int(ceil(W / 2))
+
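+            # window_mask: key padding mask over the W * fQ tokens of each of the LB * Nw windows.
+            # shift_window_mask (built below): additive attention mask for the shifted windows,
+            # which masks padded frames and blocks attention across the wrap-around boundary
+            # introduced by torch.roll.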
+            window_mask = attn_mask.view(LB * Nw, W)[..., None].repeat(1, 1, fQ).flatten(1)
+
+            _attn_mask = torch.roll(attn_mask, half_W, 1)
+            _attn_mask = _attn_mask.view(LB, Nw, W)[..., None].repeat(1, 1, 1, W)  # LB, Nw, W, W
+            _attn_mask[:, 0] = _attn_mask[:, 0] | _attn_mask[:, 0].transpose(-2, -1)
+            _attn_mask[:, -1] = _attn_mask[:, -1] | _attn_mask[:, -1].transpose(-2, -1)
+            _attn_mask[:, 0, :half_W, half_W:] = True
+            _attn_mask[:, 0, half_W:, :half_W] = True
+            _attn_mask = _attn_mask.view(LB * Nw, 1, W, 1, W, 1).repeat(1, self.num_heads, 1, fQ, 1, fQ).view(
+                LB * Nw * self.num_heads, W * fQ, W * fQ)
+            shift_window_mask = _attn_mask.float() * -1000
+
+            for layer_idx in range(self.enc_layers):
+                if self.training or layer_idx % 2 == 0:
+                    frame_query = self._window_attn(frame_query, window_mask, layer_idx)
+                else:
+                    frame_query = self._shift_window_attn(frame_query, shift_window_mask, layer_idx)
+            return frame_query
+
+    def _window_attn(self, frame_query, attn_mask, layer_idx):
+        T, fQ, LB, C = frame_query.shape
+        # LBNw, WfQ = attn_mask.shape
+
+        W = self.window_size
+        Nw = T // W
+
+        frame_query = frame_query.view(Nw, W, fQ, LB, C)
+        frame_query = frame_query.permute(1, 2, 3, 0, 4).reshape(W * fQ, LB * Nw, C)
+
+        frame_query = self.enc_self_attn[layer_idx](frame_query, tgt_key_padding_mask=attn_mask)
+        frame_query = self.enc_ffn[layer_idx](frame_query)
+        frame_query = frame_query.reshape(W, fQ, LB, Nw, C).permute(3, 0, 1, 2, 4).reshape(T, fQ, LB, C)
+
+        return frame_query
+
+    def _shift_window_attn(self, frame_query, attn_mask, layer_idx):
+        T, fQ, LB, C = frame_query.shape
+        # LBNwH, WfQ, WfQ = attn_mask.shape
+
+        W = self.window_size
+        Nw = T // W
+        half_W = int(ceil(W / 2))
+
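+        # Shift the frames by half a window before windowing (shifted-window attention),
+        # attend with the precomputed additive mask, then shift back afterwards.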
+        frame_query = torch.roll(frame_query, half_W, 0)
+        frame_query = frame_query.view(Nw, W, fQ, LB, C)
+        frame_query = frame_query.permute(1, 2, 3, 0, 4).reshape(W * fQ, LB * Nw, C)
+
+        frame_query = self.enc_self_attn[layer_idx](frame_query, tgt_mask=attn_mask)
+        frame_query = self.enc_ffn[layer_idx](frame_query)
+        frame_query = frame_query.reshape(W, fQ, LB, Nw, C).permute(3, 0, 1, 2, 4).reshape(T, fQ, LB, C)
+
+        frame_query = torch.roll(frame_query, -half_W, 0)
+
+        return frame_query
+
+    def encode_av_fusion(self, frame_query, attn_mask, audio_feats):
+        """
+        input shape (frame_query, audio_feats) : T, fQ, LB, C
+        output shape (audio_feats) : T, fQ, LB, C
+        """
+
+        # No window-based attention when self.window_size == 0: audio features cross-attend
+        # to all frame queries at once.
+        if self.window_size == 0:
+            return_shape = frame_query.shape  # T, fQ, LB, C
+            frame_query = frame_query.flatten(0, 1)  # TfQ, LB, C
+            audio_feats = audio_feats.flatten(0, 1)
+
+            for i in range(self.enc_layers):
+                audio_feats = self.enc_av_cross_attn[i](audio_feats, frame_query)
+                audio_feats = self.enc_av_ffn[i](audio_feats)
+
+            audio_feats = audio_feats.view(return_shape)
+            return audio_feats
+        # Window-based attention when self.window_size > 0, using the same window and
+        # shifted-window masks as encode_frame_query.
+        else:
+            T, fQ, LB, C = frame_query.shape
+            W = self.window_size
+            Nw = T // W
+            half_W = int(ceil(W / 2))
+
+            window_mask = attn_mask.view(LB * Nw, W)[..., None].repeat(1, 1, fQ).flatten(1)
+
+            _attn_mask = torch.roll(attn_mask, half_W, 1)
+            _attn_mask = _attn_mask.view(LB, Nw, W)[..., None].repeat(1, 1, 1, W)  # LB, Nw, W, W
+            _attn_mask[:, 0] = _attn_mask[:, 0] | _attn_mask[:, 0].transpose(-2, -1)
+            _attn_mask[:, -1] = _attn_mask[:, -1] | _attn_mask[:, -1].transpose(-2, -1)
+            _attn_mask[:, 0, :half_W, half_W:] = True
+            _attn_mask[:, 0, half_W:, :half_W] = True
+            _attn_mask = _attn_mask.view(LB * Nw, 1, W, 1, W, 1).repeat(1, self.num_heads, 1, fQ, 1, fQ).view(
+                LB * Nw * self.num_heads, W * fQ, W * fQ)
+            shift_window_mask = _attn_mask.float() * -1000
+
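+            # Alternate windowed and shifted-window audio-visual cross-attention across layers.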
+            for layer_idx in range(self.enc_layers):
+                if layer_idx % 2 == 0:
+                    frame_query, audio_feats = self._window_av_attn(frame_query, window_mask, layer_idx, audio_feats)
+                else:
+                    frame_query, audio_feats = self._shift_window_av_attn(frame_query, shift_window_mask, layer_idx, audio_feats)
+            return audio_feats
+
+    def _window_av_attn(self, frame_query, attn_mask, layer_idx, audio_feats):
+        T, fQ, LB, C = frame_query.shape
+
+        W = self.window_size
+        Nw = T // W
+
+        frame_query = frame_query.view(Nw, W, fQ, LB, C)
+        frame_query = frame_query.permute(1, 2, 3, 0, 4).reshape(W * fQ, LB * Nw, C)
+
+        audio_feats = audio_feats.view(Nw, W, fQ, LB, C)
+        audio_feats = audio_feats.permute(1, 2, 3, 0, 4).reshape(W * fQ, LB * Nw, C)
+
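+        # Within each window, audio features (queries) cross-attend to the frame queries
+        # (keys/values); padded frames are excluded via the key padding mask.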
+        audio_feats = self.enc_av_cross_attn[layer_idx](audio_feats, frame_query, memory_key_padding_mask=attn_mask)
+        audio_feats = self.enc_av_ffn[layer_idx](audio_feats)
+
+        frame_query = frame_query.reshape(W, fQ, LB, Nw, C).permute(3, 0, 1, 2, 4).reshape(T, fQ, LB, C)
+        audio_feats = audio_feats.reshape(W, fQ, LB, Nw, C).permute(3, 0, 1, 2, 4).reshape(T, fQ, LB, C)
+
+        return frame_query, audio_feats
+
+    def _shift_window_av_attn(self, frame_query, attn_mask, layer_idx, audio_feats):
+        T, fQ, LB, C = frame_query.shape
+
+        W = self.window_size
+        Nw = T // W
+        half_W = int(ceil(W / 2))
+
+        frame_query = torch.roll(frame_query, half_W, 0)
+        frame_query = frame_query.view(Nw, W, fQ, LB, C)
+        frame_query = frame_query.permute(1, 2, 3, 0, 4).reshape(W * fQ, LB * Nw, C)
+
+        audio_feats = torch.roll(audio_feats, half_W, 0)
+        audio_feats = audio_feats.view(Nw, W, fQ, LB, C)
+        audio_feats = audio_feats.permute(1, 2, 3, 0, 4).reshape(W * fQ, LB * Nw, C)
+
+        audio_feats = self.enc_av_cross_attn[layer_idx](audio_feats, frame_query, memory_mask=attn_mask)
+        audio_feats = self.enc_av_ffn[layer_idx](audio_feats)
+
+        frame_query = frame_query.reshape(W, fQ, LB, Nw, C).permute(3, 0, 1, 2, 4).reshape(T, fQ, LB, C)
+        frame_query = torch.roll(frame_query, -half_W, 0)
+
+        audio_feats = audio_feats.reshape(W, fQ, LB, Nw, C).permute(3, 0, 1, 2, 4).reshape(T, fQ, LB, C)
+        audio_feats = torch.roll(audio_feats, -half_W, 0)
+
+        return frame_query, audio_feats