init
- .gitignore +60 -0
- README.md +5 -8
- app.py +52 -0
- configs/Base-RCNN-FPN-OPENDET.yaml +25 -0
- configs/Base-RCNN-FPN.yaml +44 -0
- configs/Base-RetinaNet.yaml +26 -0
- configs/faster_rcnn_R_50_FPN_3x_baseline.yaml +16 -0
- configs/faster_rcnn_R_50_FPN_3x_ds.yaml +18 -0
- configs/faster_rcnn_R_50_FPN_3x_opendet.yaml +16 -0
- configs/faster_rcnn_R_50_FPN_3x_proser.yaml +16 -0
- configs/faster_rcnn_Swin_T_FPN_3x_opendet.yaml +25 -0
- configs/retinanet_R_50_FPN_3x_baseline.yaml +17 -0
- configs/retinanet_R_50_FPN_3x_opendet.yaml +25 -0
- datasets/README.md +51 -0
- demo/demo.py +194 -0
- demo/predictor.py +224 -0
- opendet2/__init__.py +7 -0
- opendet2/config/__init__.py +3 -0
- opendet2/config/defaults.py +50 -0
- opendet2/data/__init__.py +4 -0
- opendet2/data/build.py +299 -0
- opendet2/data/builtin.py +31 -0
- opendet2/data/voc_coco.py +35 -0
- opendet2/engine/__init__.py +3 -0
- opendet2/engine/defaults.py +441 -0
- opendet2/evaluation/__init__.py +3 -0
- opendet2/evaluation/pascal_voc_evaluation.py +377 -0
- opendet2/modeling/__init__.py +5 -0
- opendet2/modeling/backbone/__init__.py +3 -0
- opendet2/modeling/backbone/swin_transformer.py +726 -0
- opendet2/modeling/layers/__init__.py +3 -0
- opendet2/modeling/layers/mlp.py +46 -0
- opendet2/modeling/losses/__init__.py +4 -0
- opendet2/modeling/losses/instance_contrastive_loss.py +40 -0
- opendet2/modeling/losses/unknown_probability_loss.py +93 -0
- opendet2/modeling/meta_arch/__init__.py +3 -0
- opendet2/modeling/meta_arch/retinanet.py +483 -0
- opendet2/modeling/roi_heads/__init__.py +4 -0
- opendet2/modeling/roi_heads/box_head.py +163 -0
- opendet2/modeling/roi_heads/fast_rcnn.py +645 -0
- opendet2/modeling/roi_heads/roi_heads.py +150 -0
- opendet2/solver/__init__.py +3 -0
- opendet2/solver/build.py +57 -0
- setup.py +14 -0
- tools/convert_swin_to_d2.py +36 -0
- tools/train_net.py +79 -0
.gitignore
ADDED
@@ -0,0 +1,60 @@
# output dir
output*
instant_test_output
inference_test_output


*.png
*.json
*.diff
*.jpg
!/projects/DensePose/doc/images/*.jpg

# compilation and distribution
__pycache__
_ext
*.pyc
*.pyd
*.so
*.dll
*.egg-info/
build/
dist/
wheels/

# pytorch/python/numpy formats
*.pth
*.pkl
*.npy
*.ts
model_ts*.txt

# ipython/jupyter notebooks
*.ipynb
**/.ipynb_checkpoints/

# Editor temporaries
*.swn
*.swo
*.swp
*~

# editor settings
.idea
.vscode
_darcs

# project dirs
/detectron2/model_zoo/configs
/datasets/*
!/datasets/*.*
/projects/*/datasets
/models
/snippet
res*
/checkpoints
detectron2
results
checkpoints
demo_images
exps

README.md
CHANGED
@@ -1,12 +1,9 @@
 ---
-title:
-emoji
-colorFrom:
-colorTo:
+title: OpenDet2
+emoji:🦓
+colorFrom: gray
+colorTo: pink
 sdk: gradio
-sdk_version: 2.8.12
 app_file: app.py
 pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference

app.py
ADDED
@@ -0,0 +1,52 @@
import os
os.system('pip install torch==1.9 torchvision')
os.system('pip install detectron2==0.5 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu102/torch1.9/index.html')
os.system('pip install timm opencv-python-headless')


import gradio as gr

from demo.predictor import VisualizationDemo
from detectron2.config import get_cfg
from opendet2 import add_opendet_config


model_cfgs = {
    "FR-CNN": ["configs/faster_rcnn_R_50_FPN_3x_baseline.yaml", "frcnn_r50.pth"],
    "OpenDet-R50": ["configs/faster_rcnn_R_50_FPN_3x_opendet.yaml", "opendet2_r50.pth"],
    "OpenDet-SwinT": ["configs/faster_rcnn_Swin_T_FPN_18e_opendet_voc.yaml", "opendet2_swint.pth"],
}


def setup_cfg(model):
    cfg = get_cfg()
    add_opendet_config(cfg)
    model_cfg = model_cfgs[model]
    cfg.merge_from_file(model_cfg[0])
    cfg.MODEL.WEIGHTS = model_cfg[1]
    cfg.MODEL.DEVICE = "cpu"
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5
    cfg.MODEL.ROI_HEADS.VIS_IOU_THRESH = 0.8
    cfg.freeze()
    return cfg


def inference(input, model):
    cfg = setup_cfg(model)
    demo = VisualizationDemo(cfg)
    # use PIL, to be consistent with evaluation
    predictions, visualized_output = demo.run_on_image(input)
    output = visualized_output.get_image()[:, :, ::-1]
    return output


iface = gr.Interface(
    inference,
    [
        "image",
        gr.inputs.Radio(
            ["FR-CNN", "OpenDet-R50", "OpenDet-SwinT"], default='OpenDet-R50'),
    ],
    "image")

iface.launch()

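Note (editor's sketch, not part of the commit): the same inference path can be exercised locally without launching the Gradio UI. This assumes a checkpoint such as "opendet2_r50.pth" has been downloaded next to the configs; the input image path is only illustrative.

# local_infer.py -- hedged sketch, mirrors what app.py's inference() does
import cv2
from detectron2.config import get_cfg
from opendet2 import add_opendet_config          # also registers the voc_coco splits and models
from demo.predictor import VisualizationDemo

cfg = get_cfg()
add_opendet_config(cfg)
cfg.merge_from_file("configs/faster_rcnn_R_50_FPN_3x_opendet.yaml")
cfg.MODEL.WEIGHTS = "opendet2_r50.pth"           # assumed to exist locally
cfg.MODEL.DEVICE = "cpu"
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5
cfg.MODEL.ROI_HEADS.VIS_IOU_THRESH = 0.8
cfg.freeze()

demo = VisualizationDemo(cfg)
predictions, vis = demo.run_on_image(cv2.imread("demo_images/sample.jpg"))  # BGR input
cv2.imwrite("result.jpg", vis.get_image()[:, :, ::-1])                      # RGB -> BGR for imwrite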
configs/Base-RCNN-FPN-OPENDET.yaml
ADDED
@@ -0,0 +1,25 @@
_BASE_: "./Base-RCNN-FPN.yaml"
MODEL:
  MASK_ON: False
  ROI_HEADS:
    NAME: "OpenSetStandardROIHeads"
    NUM_CLASSES: 81
    NUM_KNOWN_CLASSES: 20
  ROI_BOX_HEAD:
    NAME: "FastRCNNSeparateConvFCHead"
    OUTPUT_LAYERS: "OpenDetFastRCNNOutputLayers"
    CLS_AGNOSTIC_BBOX_REG: True
UPLOSS:
  START_ITER: 100
  SAMPLING_METRIC: "min_score"
  TOPK: 3
  ALPHA: 1.0
  WEIGHT: 1.0
ICLOSS:
  OUT_DIM: 128
  QUEUE_SIZE: 256
  IN_QUEUE_SIZE: 16
  BATCH_IOU_THRESH: 0.5
  QUEUE_IOU_THRESH: 0.7
  TEMPERATURE: 0.1
  WEIGHT: 0.1

configs/Base-RCNN-FPN.yaml
ADDED
@@ -0,0 +1,44 @@
# The same as detectron2/configs/Base-RCNN-FPN.yaml
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  BACKBONE:
    NAME: "build_resnet_fpn_backbone"
  RESNETS:
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
  FPN:
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
  ANCHOR_GENERATOR:
    SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
  RPN:
    IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
    PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
    PRE_NMS_TOPK_TEST: 1000  # Per FPN level
    # Detectron1 uses 2000 proposals per-batch,
    # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
    # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
    POST_NMS_TOPK_TRAIN: 1000
    POST_NMS_TOPK_TEST: 1000
  ROI_HEADS:
    NAME: "StandardROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
  ROI_BOX_HEAD:
    NAME: "FastRCNNConvFCHead"
    NUM_FC: 2
    POOLER_RESOLUTION: 7
  ROI_MASK_HEAD:
    NAME: "MaskRCNNConvUpsampleHead"
    NUM_CONV: 4
    POOLER_RESOLUTION: 14
DATASETS:
  TRAIN: ("coco_2017_train",)
  TEST: ("coco_2017_val",)
SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.02
  STEPS: (60000, 80000)
  MAX_ITER: 90000
INPUT:
  MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
  MIN_SIZE_TEST: 800
VERSION: 2

configs/Base-RetinaNet.yaml
ADDED
@@ -0,0 +1,26 @@
# The same as detectron2/configs/Base-RetinaNet.yaml
MODEL:
  META_ARCHITECTURE: "RetinaNet"
  BACKBONE:
    NAME: "build_retinanet_resnet_fpn_backbone"
  RESNETS:
    OUT_FEATURES: ["res3", "res4", "res5"]
  ANCHOR_GENERATOR:
    SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3) ] for x in [32, 64, 128, 256, 512 ]]"]
  FPN:
    IN_FEATURES: ["res3", "res4", "res5"]
  RETINANET:
    IOU_THRESHOLDS: [0.4, 0.5]
    IOU_LABELS: [0, -1, 1]
    SMOOTH_L1_LOSS_BETA: 0.0
DATASETS:
  TRAIN: ("coco_2017_train",)
  TEST: ("coco_2017_val",)
SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.01  # Note that RetinaNet uses a different default learning rate
  STEPS: (60000, 80000)
  MAX_ITER: 90000
INPUT:
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2

configs/faster_rcnn_R_50_FPN_3x_baseline.yaml
ADDED
@@ -0,0 +1,16 @@
_BASE_: "./Base-RCNN-FPN-OPENDET.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  ROI_BOX_HEAD:
    OUTPUT_LAYERS: "CosineFastRCNNOutputLayers"  # baseline use a simple cosine FRCNN
DATASETS:
  TRAIN: ('voc_2007_train', 'voc_2012_trainval')
  TEST: ('voc_2007_test', 'voc_coco_20_40_test', 'voc_coco_20_60_test', 'voc_coco_20_80_test', 'voc_coco_2500_test', 'voc_coco_5000_test', 'voc_coco_10000_test', 'voc_coco_20000_test')
SOLVER:
  STEPS: (21000, 29000)
  MAX_ITER: 32000
  WARMUP_ITERS: 100
  AMP:
    ENABLED: True

configs/faster_rcnn_R_50_FPN_3x_ds.yaml
ADDED
@@ -0,0 +1,18 @@
_BASE_: "./Base-RCNN-FPN-OPENDET.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  ROI_HEADS:
    NAME: "DropoutStandardROIHeads"
  ROI_BOX_HEAD:
    OUTPUT_LAYERS: "DropoutFastRCNNOutputLayers"
DATASETS:
  TRAIN: ('voc_2007_train', 'voc_2012_trainval')
  TEST: ('voc_2007_test', 'voc_coco_20_40_test', 'voc_coco_20_60_test', 'voc_coco_20_80_test', 'voc_coco_2500_test', 'voc_coco_5000_test', 'voc_coco_10000_test', 'voc_coco_20000_test')
SOLVER:
  STEPS: (21000, 29000)
  MAX_ITER: 32000
  WARMUP_ITERS: 100
  AMP:
    ENABLED: True

configs/faster_rcnn_R_50_FPN_3x_opendet.yaml
ADDED
@@ -0,0 +1,16 @@
_BASE_: "./Base-RCNN-FPN-OPENDET.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
DATASETS:
  TRAIN: ('voc_2007_train', 'voc_2012_trainval')
  TEST: ('voc_2007_test', 'voc_coco_20_40_test', 'voc_coco_20_60_test', 'voc_coco_20_80_test', 'voc_coco_2500_test', 'voc_coco_5000_test', 'voc_coco_10000_test', 'voc_coco_20000_test')
SOLVER:
  STEPS: (21000, 29000)
  MAX_ITER: 32000
  WARMUP_ITERS: 100
  AMP:
    ENABLED: True

# UPLOSS.WEIGHT: former two are 0.5, the last is 1.0

configs/faster_rcnn_R_50_FPN_3x_proser.yaml
ADDED
@@ -0,0 +1,16 @@
_BASE_: "./Base-RCNN-FPN-OPENDET.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  ROI_BOX_HEAD:
    OUTPUT_LAYERS: "PROSERFastRCNNOutputLayers"
DATASETS:
  TRAIN: ('voc_2007_train', 'voc_2012_trainval')
  TEST: ('voc_2007_test', 'voc_coco_20_40_test', 'voc_coco_20_60_test', 'voc_coco_20_80_test', 'voc_coco_2500_test', 'voc_coco_5000_test', 'voc_coco_10000_test', 'voc_coco_20000_test')
SOLVER:
  STEPS: (21000, 29000)
  MAX_ITER: 32000
  WARMUP_ITERS: 100
  AMP:
    ENABLED: True

configs/faster_rcnn_Swin_T_FPN_3x_opendet.yaml
ADDED
@@ -0,0 +1,25 @@
_BASE_: "./Base-RCNN-FPN-OPENDET.yaml"
MODEL:
  WEIGHTS: "checkpoints/swin_tiny_patch4_window7_224_d2.pth"
  PIXEL_MEAN: [123.675, 116.28, 103.53]
  PIXEL_STD: [58.395, 57.12, 57.375]
  RESNETS:
    DEPTH: 50
  BACKBONE:
    NAME: "build_swint_fpn_backbone"
  SWINT:
    OUT_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
  FPN:
    IN_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
DATASETS:
  TRAIN: ('voc_2007_train', 'voc_2012_trainval')
  TEST: ('voc_2007_test', 'voc_coco_20_40_test', 'voc_coco_20_60_test', 'voc_coco_20_80_test', 'voc_coco_2500_test', 'voc_coco_5000_test', 'voc_coco_10000_test', 'voc_coco_20000_test')
SOLVER:
  STEPS: (21000, 29000)
  MAX_ITER: 32000
  WARMUP_ITERS: 100
  WEIGHT_DECAY: 0.05
  BASE_LR: 0.0001
  OPTIMIZER: "ADAMW"
  AMP:
    ENABLED: True

configs/retinanet_R_50_FPN_3x_baseline.yaml
ADDED
@@ -0,0 +1,17 @@
_BASE_: "./Base-RetinaNet.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  RETINANET:
    NUM_CLASSES: 81
    NUM_KNOWN_CLASSES: 20
DATASETS:
  TRAIN: ('voc_2007_train', 'voc_2012_trainval')
  TEST: ('voc_2007_test', 'voc_coco_20_40_test', 'voc_coco_20_60_test', 'voc_coco_20_80_test', 'voc_coco_2500_test', 'voc_coco_5000_test', 'voc_coco_10000_test', 'voc_coco_20000_test')
SOLVER:
  STEPS: (21000, 29000)
  MAX_ITER: 32000
  WARMUP_ITERS: 1000
  AMP:
    ENABLED: True

configs/retinanet_R_50_FPN_3x_opendet.yaml
ADDED
@@ -0,0 +1,25 @@
_BASE_: "./Base-RetinaNet.yaml"
MODEL:
  META_ARCHITECTURE: "OpenSetRetinaNet"
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  RETINANET:
    NUM_CLASSES: 81
    NUM_KNOWN_CLASSES: 20
DATASETS:
  TRAIN: ('voc_2007_train', 'voc_2012_trainval')
  TEST: ('voc_2007_test', 'voc_coco_20_40_test', 'voc_coco_20_60_test', 'voc_coco_20_80_test', 'voc_coco_2500_test', 'voc_coco_5000_test', 'voc_coco_10000_test', 'voc_coco_20000_test')
SOLVER:
  STEPS: (21000, 29000)
  MAX_ITER: 32000
  WARMUP_ITERS: 1000
  AMP:
    ENABLED: True
UPLOSS:
  TOPK: 10
  WEIGHT: 0.2
ICLOSS:
  QUEUE_SIZE: 1024
  IN_QUEUE_SIZE: 64
  WEIGHT: 0.2

datasets/README.md
ADDED
@@ -0,0 +1,51 @@
# Use Builtin Datasets

A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog)
for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc).
This document explains how to setup the builtin datasets so they can be used by the above APIs.
[Use Custom Datasets](https://detectron2.readthedocs.io/tutorials/datasets.html) gives a deeper dive on how to use `DatasetCatalog` and `MetadataCatalog`,
and how to add new datasets to them.

Detectron2 has builtin support for a few datasets.
The datasets are assumed to exist in a directory specified by the environment variable
`DETECTRON2_DATASETS`.
Under this directory, detectron2 will look for datasets in the structure described below, if needed.
```
$DETECTRON2_DATASETS/
  coco/
  VOC20{07,12}/
```

You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`.
If left unset, the default is `./datasets` relative to your current working directory.

The [model zoo](https://github.com/facebookresearch/detectron2/blob/master/MODEL_ZOO.md)
contains configs and models that use these builtin datasets.

## Expected dataset structure for [COCO instance/keypoint detection](https://cocodataset.org/#download):

```
coco/
  annotations/
    instances_{train,val}2017.json
    person_keypoints_{train,val}2017.json
  {train,val}2017/
    # image files that are mentioned in the corresponding json
```

You can use the 2014 version of the dataset as well.

Some of the builtin tests (`dev/run_*_tests.sh`) uses a tiny version of the COCO dataset,
which you can download with `./datasets/prepare_for_tests.sh`.

## Expected dataset structure for [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/index.html):
```
VOC20{07,12}/
  Annotations/
  ImageSets/
    Main/
      trainval.txt
      test.txt
      # train.txt or val.txt, if you use these splits
  JPEGImages/
```

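Note (editor's sketch, not part of the commit): the README above describes the catalog APIs and the `DETECTRON2_DATASETS` layout; the snippet below shows how a split registered by this repo could be inspected. It assumes the voc_coco split files have been prepared on disk under `$DETECTRON2_DATASETS`.

import os
os.environ.setdefault("DETECTRON2_DATASETS", "datasets")  # default used by opendet2/data/builtin.py

# Importing the data module runs the voc_coco registrations (see opendet2/data/builtin.py).
from opendet2.data import builtin  # noqa: F401
from detectron2.data import DatasetCatalog, MetadataCatalog

dicts = DatasetCatalog.get("voc_coco_val")       # list[dict] in the standard dataset format
meta = MetadataCatalog.get("voc_coco_val")       # class names, dirname, split, ...
print(len(dicts), meta.thing_classes[:3], meta.thing_classes[-1])  # last class is "unknown"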
demo/demo.py
ADDED
@@ -0,0 +1,194 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import argparse
import glob
import multiprocessing as mp
import numpy as np
import os
import tempfile
import time
import warnings
import cv2
import tqdm

from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.utils.logger import setup_logger

from predictor import VisualizationDemo

import sys
sys.path.insert(-1, "../")
from opendet2 import add_opendet_config, builtin, OpenDetTrainer

# constants
WINDOW_NAME = "COCO detections"


def setup_cfg(args):
    # load config from file and command-line arguments
    cfg = get_cfg()
    add_opendet_config(cfg)
    # To use demo for Panoptic-DeepLab, please uncomment the following two lines.
    # from detectron2.projects.panoptic_deeplab import add_panoptic_deeplab_config  # noqa
    # add_panoptic_deeplab_config(cfg)
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    # Set score_threshold for builtin models
    cfg.MODEL.RETINANET.SCORE_THRESH_TEST = args.confidence_threshold
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold
    cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = args.confidence_threshold
    cfg.MODEL.ROI_HEADS.VIS_IOU_THRESH = 0.8
    cfg.freeze()
    return cfg


def get_parser():
    parser = argparse.ArgumentParser(description="Detectron2 demo for builtin configs")
    parser.add_argument(
        "--config-file",
        default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument("--webcam", action="store_true", help="Take inputs from webcam.")
    parser.add_argument("--video-input", help="Path to video file.")
    parser.add_argument(
        "--input",
        nargs="+",
        help="A list of space separated input images; "
        "or a single glob pattern such as 'directory/*.jpg'",
    )
    parser.add_argument(
        "--output",
        help="A file or directory to save output visualizations. "
        "If not given, will show output in an OpenCV window.",
    )

    parser.add_argument(
        "--confidence-threshold",
        type=float,
        default=0.5,
        help="Minimum score for instance predictions to be shown",
    )
    parser.add_argument(
        "--opts",
        help="Modify config options using the command-line 'KEY VALUE' pairs",
        default=[],
        nargs=argparse.REMAINDER,
    )
    return parser


def test_opencv_video_format(codec, file_ext):
    with tempfile.TemporaryDirectory(prefix="video_format_test") as dir:
        filename = os.path.join(dir, "test_file" + file_ext)
        writer = cv2.VideoWriter(
            filename=filename,
            fourcc=cv2.VideoWriter_fourcc(*codec),
            fps=float(30),
            frameSize=(10, 10),
            isColor=True,
        )
        [writer.write(np.zeros((10, 10, 3), np.uint8)) for _ in range(30)]
        writer.release()
        if os.path.isfile(filename):
            return True
        return False


if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)
    args = get_parser().parse_args()
    setup_logger(name="fvcore")
    logger = setup_logger()
    logger.info("Arguments: " + str(args))

    cfg = setup_cfg(args)

    demo = VisualizationDemo(cfg)

    if args.input:
        if len(args.input) == 1:
            args.input = glob.glob(os.path.expanduser(args.input[0]))
            assert args.input, "The input path(s) was not found"
        for path in tqdm.tqdm(args.input, disable=not args.output):
            # use PIL, to be consistent with evaluation
            img = read_image(path, format="BGR")
            start_time = time.time()
            predictions, visualized_output = demo.run_on_image(img)
            logger.info(
                "{}: {} in {:.2f}s".format(
                    path,
                    "detected {} instances".format(len(predictions["instances"]))
                    if "instances" in predictions
                    else "finished",
                    time.time() - start_time,
                )
            )

            if args.output:
                if os.path.isdir(args.output):
                    assert os.path.isdir(args.output), args.output
                    out_filename = os.path.join(args.output, os.path.basename(path))
                else:
                    assert len(args.input) == 1, "Please specify a directory with args.output"
                    out_filename = args.output
                visualized_output.save(out_filename)
            else:
                cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
                cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
                if cv2.waitKey(0) == 27:
                    break  # esc to quit
    elif args.webcam:
        assert args.input is None, "Cannot have both --input and --webcam!"
        assert args.output is None, "output not yet supported with --webcam!"
        cam = cv2.VideoCapture(0)
        for vis in tqdm.tqdm(demo.run_on_video(cam)):
            cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
            cv2.imshow(WINDOW_NAME, vis)
            if cv2.waitKey(1) == 27:
                break  # esc to quit
        cam.release()
        cv2.destroyAllWindows()
    elif args.video_input:
        video = cv2.VideoCapture(args.video_input)
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frames_per_second = video.get(cv2.CAP_PROP_FPS)
        num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        basename = os.path.basename(args.video_input)
        codec, file_ext = (
            ("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4")
        )
        if codec == ".mp4v":
            warnings.warn("x264 codec not available, switching to mp4v")
        if args.output:
            if os.path.isdir(args.output):
                output_fname = os.path.join(args.output, basename)
                output_fname = os.path.splitext(output_fname)[0] + file_ext
            else:
                output_fname = args.output
            assert not os.path.isfile(output_fname), output_fname
            output_file = cv2.VideoWriter(
                filename=output_fname,
                # some installation of opencv may not support x264 (due to its license),
                # you can try other format (e.g. MPEG)
                fourcc=cv2.VideoWriter_fourcc(*codec),
                fps=float(frames_per_second),
                frameSize=(width, height),
                isColor=True,
            )
        assert os.path.isfile(args.video_input)
        for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames):
            if args.output:
                output_file.write(vis_frame)
            else:
                cv2.namedWindow(basename, cv2.WINDOW_NORMAL)
                cv2.imshow(basename, vis_frame)
                if cv2.waitKey(1) == 27:
                    break  # esc to quit
        video.release()
        if args.output:
            output_file.release()
        else:
            cv2.destroyAllWindows()

demo/predictor.py
ADDED
@@ -0,0 +1,224 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import atexit
import bisect
import multiprocessing as mp
from collections import deque
import cv2
import torch

from detectron2.data import MetadataCatalog
from detectron2.engine.defaults import DefaultPredictor
from detectron2.utils.video_visualizer import VideoVisualizer
from detectron2.utils.visualizer import ColorMode, Visualizer
from detectron2.data.datasets.builtin_meta import _get_coco_instances_meta


class VisualizationDemo(object):
    def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
        """
        Args:
            cfg (CfgNode):
            instance_mode (ColorMode):
            parallel (bool): whether to run the model in different processes from visualization.
                Useful since the visualization logic can be slow.
        """
        self.metadata = MetadataCatalog.get(
            cfg.DATASETS.TEST[-1] if len(cfg.DATASETS.TEST) else "__unused"
        )
        thing_colors = _get_coco_instances_meta()["thing_colors"]
        thing_colors.append((0,0,0))
        self.metadata.set(thing_colors=thing_colors)
        self.cpu_device = torch.device("cpu")
        self.instance_mode = instance_mode

        self.parallel = parallel
        if parallel:
            num_gpu = torch.cuda.device_count()
            self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
        else:
            self.predictor = DefaultPredictor(cfg)

    def run_on_image(self, image):
        """
        Args:
            image (np.ndarray): an image of shape (H, W, C) (in BGR order).
                This is the format used by OpenCV.

        Returns:
            predictions (dict): the output of the model.
            vis_output (VisImage): the visualized image output.
        """
        vis_output = None
        predictions = self.predictor(image)
        # Convert image from OpenCV BGR format to Matplotlib RGB format.
        image = image[:, :, ::-1]
        visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode)
        if "panoptic_seg" in predictions:
            panoptic_seg, segments_info = predictions["panoptic_seg"]
            vis_output = visualizer.draw_panoptic_seg_predictions(
                panoptic_seg.to(self.cpu_device), segments_info
            )
        else:
            if "sem_seg" in predictions:
                vis_output = visualizer.draw_sem_seg(
                    predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
                )
            if "instances" in predictions:
                instances = predictions["instances"].to(self.cpu_device)
                vis_output = visualizer.draw_instance_predictions(predictions=instances)

        return predictions, vis_output

    def _frame_from_video(self, video):
        while video.isOpened():
            success, frame = video.read()
            if success:
                yield frame
            else:
                break

    def run_on_video(self, video):
        """
        Visualizes predictions on frames of the input video.

        Args:
            video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be
                either a webcam or a video file.

        Yields:
            ndarray: BGR visualizations of each video frame.
        """
        video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)

        def process_predictions(frame, predictions):
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            if "panoptic_seg" in predictions:
                panoptic_seg, segments_info = predictions["panoptic_seg"]
                vis_frame = video_visualizer.draw_panoptic_seg_predictions(
                    frame, panoptic_seg.to(self.cpu_device), segments_info
                )
            elif "instances" in predictions:
                predictions = predictions["instances"].to(self.cpu_device)
                vis_frame = video_visualizer.draw_instance_predictions(frame, predictions)
            elif "sem_seg" in predictions:
                vis_frame = video_visualizer.draw_sem_seg(
                    frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
                )

            # Converts Matplotlib RGB format to OpenCV BGR format
            vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR)
            return vis_frame

        frame_gen = self._frame_from_video(video)
        if self.parallel:
            buffer_size = self.predictor.default_buffer_size

            frame_data = deque()

            for cnt, frame in enumerate(frame_gen):
                frame_data.append(frame)
                self.predictor.put(frame)

                if cnt >= buffer_size:
                    frame = frame_data.popleft()
                    predictions = self.predictor.get()
                    yield process_predictions(frame, predictions)

            while len(frame_data):
                frame = frame_data.popleft()
                predictions = self.predictor.get()
                yield process_predictions(frame, predictions)
        else:
            for frame in frame_gen:
                yield process_predictions(frame, self.predictor(frame))


class AsyncPredictor:
    """
    A predictor that runs the model asynchronously, possibly on >1 GPUs.
    Because rendering the visualization takes considerably amount of time,
    this helps improve throughput a little bit when rendering videos.
    """

    class _StopToken:
        pass

    class _PredictWorker(mp.Process):
        def __init__(self, cfg, task_queue, result_queue):
            self.cfg = cfg
            self.task_queue = task_queue
            self.result_queue = result_queue
            super().__init__()

        def run(self):
            predictor = DefaultPredictor(self.cfg)

            while True:
                task = self.task_queue.get()
                if isinstance(task, AsyncPredictor._StopToken):
                    break
                idx, data = task
                result = predictor(data)
                self.result_queue.put((idx, result))

    def __init__(self, cfg, num_gpus: int = 1):
        """
        Args:
            cfg (CfgNode):
            num_gpus (int): if 0, will run on CPU
        """
        num_workers = max(num_gpus, 1)
        self.task_queue = mp.Queue(maxsize=num_workers * 3)
        self.result_queue = mp.Queue(maxsize=num_workers * 3)
        self.procs = []
        for gpuid in range(max(num_gpus, 1)):
            cfg = cfg.clone()
            cfg.defrost()
            cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu"
            self.procs.append(
                AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
            )

        self.put_idx = 0
        self.get_idx = 0
        self.result_rank = []
        self.result_data = []

        for p in self.procs:
            p.start()
        atexit.register(self.shutdown)

    def put(self, image):
        self.put_idx += 1
        self.task_queue.put((self.put_idx, image))

    def get(self):
        self.get_idx += 1  # the index needed for this request
        if len(self.result_rank) and self.result_rank[0] == self.get_idx:
            res = self.result_data[0]
            del self.result_data[0], self.result_rank[0]
            return res

        while True:
            # make sure the results are returned in the correct order
            idx, res = self.result_queue.get()
            if idx == self.get_idx:
                return res
            insert = bisect.bisect(self.result_rank, idx)
            self.result_rank.insert(insert, idx)
            self.result_data.insert(insert, res)

    def __len__(self):
        return self.put_idx - self.get_idx

    def __call__(self, image):
        self.put(image)
        return self.get()

    def shutdown(self):
        for _ in self.procs:
            self.task_queue.put(AsyncPredictor._StopToken())

    @property
    def default_buffer_size(self):
        return len(self.procs) * 5

opendet2/__init__.py
ADDED
@@ -0,0 +1,7 @@
from .config import *
from .data import *
from .engine import *
from .evaluation import *
from .modeling import *

__all__ = [k for k in globals().keys() if not k.startswith("_")]

opendet2/config/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .defaults import add_opendet_config

__all__ = [k for k in globals().keys() if not k.startswith("_")]

opendet2/config/defaults.py
ADDED
@@ -0,0 +1,50 @@
from detectron2.config import CfgNode as CN


def add_opendet_config(cfg):
    _C = cfg

    # unknown probability loss
    _C.UPLOSS = CN()
    _C.UPLOSS.START_ITER = 100  # usually the same as warmup iter
    _C.UPLOSS.SAMPLING_METRIC = "min_score"
    _C.UPLOSS.TOPK = 3
    _C.UPLOSS.ALPHA = 1.0
    _C.UPLOSS.WEIGHT = 0.5

    # instance contrastive loss
    _C.ICLOSS = CN()
    _C.ICLOSS.OUT_DIM = 128
    _C.ICLOSS.QUEUE_SIZE = 256
    _C.ICLOSS.IN_QUEUE_SIZE = 16
    _C.ICLOSS.BATCH_IOU_THRESH = 0.5
    _C.ICLOSS.QUEUE_IOU_THRESH = 0.7
    _C.ICLOSS.TEMPERATURE = 0.1
    _C.ICLOSS.WEIGHT = 0.1

    # register RoI output layer
    _C.MODEL.ROI_BOX_HEAD.OUTPUT_LAYERS = "FastRCNNOutputLayers"
    # known classes
    _C.MODEL.ROI_HEADS.NUM_KNOWN_CLASSES = 20
    _C.MODEL.RETINANET.NUM_KNOWN_CLASSES = 20
    # thresh for visualization results.
    _C.MODEL.ROI_HEADS.VIS_IOU_THRESH = 1.0
    # scale for cosine classifier
    _C.MODEL.ROI_HEADS.COSINE_SCALE = 20

    # swin transformer
    _C.MODEL.SWINT = CN()
    _C.MODEL.SWINT.EMBED_DIM = 96
    _C.MODEL.SWINT.OUT_FEATURES = ["stage2", "stage3", "stage4", "stage5"]
    _C.MODEL.SWINT.DEPTHS = [2, 2, 6, 2]
    _C.MODEL.SWINT.NUM_HEADS = [3, 6, 12, 24]
    _C.MODEL.SWINT.WINDOW_SIZE = 7
    _C.MODEL.SWINT.MLP_RATIO = 4
    _C.MODEL.SWINT.DROP_PATH_RATE = 0.2
    _C.MODEL.SWINT.APE = False
    _C.MODEL.BACKBONE.FREEZE_AT = -1
    _C.MODEL.FPN.TOP_LEVELS = 2

    # solver, e.g., adamw for swin
    _C.SOLVER.OPTIMIZER = 'SGD'
    _C.SOLVER.BETAS = (0.9, 0.999)

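Note (editor's sketch, not part of the commit): the keys above must be registered before merging any of the YAML configs in this repo, otherwise yacs rejects UPLOSS/ICLOSS/SWINT as non-existent config keys. A minimal usage sketch:

from detectron2.config import get_cfg
from opendet2.config import add_opendet_config

cfg = get_cfg()
add_opendet_config(cfg)   # registers UPLOSS, ICLOSS, SWINT, SOLVER.OPTIMIZER, ...
cfg.merge_from_file("configs/faster_rcnn_R_50_FPN_3x_opendet.yaml")
print(cfg.UPLOSS.WEIGHT, cfg.ICLOSS.QUEUE_SIZE, cfg.SOLVER.OPTIMIZER)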
opendet2/data/__init__.py
ADDED
@@ -0,0 +1,4 @@
from .build import *
from . import builtin

__all__ = [k for k in globals().keys() if not k.startswith("_")]

opendet2/data/build.py
ADDED
@@ -0,0 +1,299 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import itertools
import logging
import numpy as np
import copy
import torch.utils.data
import torch

from detectron2.config import configurable
from detectron2.utils.logger import _log_api_usage

from detectron2.data.catalog import DatasetCatalog, MetadataCatalog
from detectron2.data.common import DatasetFromList, MapDataset
from detectron2.data.dataset_mapper import DatasetMapper
from detectron2.data.detection_utils import check_metadata_consistency
from detectron2.data.samplers import InferenceSampler, RepeatFactorTrainingSampler, TrainingSampler

from detectron2.data.build import trivial_batch_collator
from detectron2.data import (build_batch_data_loader,
                             print_instances_class_histogram,
                             load_proposals_into_dataset)
from detectron2.data.build import (filter_images_with_few_keypoints,
                                   filter_images_with_only_crowd_annotations)

"""
This file contains the default logic to build a dataloader for training or testing.
"""
__all__ = [
    "build_detection_train_loader",
    "build_detection_test_loader",
    "get_detection_dataset_dicts",
]


def get_detection_dataset_dicts(names, filter_empty=True, min_keypoints=0, proposal_files=None, cfg=None):
    """
    Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation.

    Args:
        names (str or list[str]): a dataset name or a list of dataset names
        filter_empty (bool): whether to filter out images without instance annotations
        min_keypoints (int): filter out images with fewer keypoints than
            `min_keypoints`. Set to 0 to do nothing.
        proposal_files (list[str]): if given, a list of object proposal files
            that match each dataset in `names`.

    Returns:
        list[dict]: a list of dicts following the standard dataset dict format.
    """
    if isinstance(names, str):
        names = [names]
    assert len(names), names
    dataset_dicts = [DatasetCatalog.get(dataset_name)
                     for dataset_name in names]
    for dataset_name, dicts in zip(names, dataset_dicts):
        assert len(dicts), "Dataset '{}' is empty!".format(dataset_name)

    if proposal_files is not None:
        assert len(names) == len(proposal_files)
        # load precomputed proposals from proposal files
        dataset_dicts = [
            load_proposals_into_dataset(dataset_i_dicts, proposal_file)
            for dataset_i_dicts, proposal_file in zip(dataset_dicts, proposal_files)
        ]

    dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts))

    has_instances = "annotations" in dataset_dicts[0]
    if filter_empty and has_instances:
        dataset_dicts = filter_images_with_only_crowd_annotations(
            dataset_dicts)
    if min_keypoints > 0 and has_instances:
        dataset_dicts = filter_images_with_few_keypoints(
            dataset_dicts, min_keypoints)

    d_name = names[0]
    # if 'voc_coco' in d_name:
    if 'train' in d_name:
        dataset_dicts = remove_unk_instances(cfg, dataset_dicts)
    elif 'test' in d_name:
        dataset_dicts = label_known_class_and_unknown(cfg, dataset_dicts)

    if has_instances:
        try:
            class_names = MetadataCatalog.get(names[0]).thing_classes
            check_metadata_consistency("thing_classes", names)
            print_instances_class_histogram(dataset_dicts, class_names)
        except AttributeError:  # class names are not available for this dataset
            pass

    assert len(dataset_dicts), "No valid data found in {}.".format(
        ",".join(names))
    return dataset_dicts


def remove_unk_instances(cfg, dataset_dicts):
    num_known_classes = cfg.MODEL.ROI_HEADS.NUM_KNOWN_CLASSES
    valid_classes = range(0, num_known_classes)

    logger = logging.getLogger(__name__)
    logger.info("Valid classes: " + str(valid_classes))
    logger.info("Removing unknown objects...")

    for entry in copy.copy(dataset_dicts):
        annos = entry["annotations"]
        for annotation in copy.copy(annos):
            if annotation["category_id"] not in valid_classes:
                annos.remove(annotation)
        if len(annos) == 0:
            dataset_dicts.remove(entry)

    return dataset_dicts


def label_known_class_and_unknown(cfg, dataset_dicts):
    num_known_classes = cfg.MODEL.ROI_HEADS.NUM_KNOWN_CLASSES
    total_num_class = cfg.MODEL.ROI_HEADS.NUM_CLASSES

    known_classes = range(0, num_known_classes)

    logger = logging.getLogger(__name__)
    logger.info("Known classes: " + str(known_classes))
    logger.info(
        "Labelling known instances the corresponding label, and unknown instances as unknown...")

    for entry in dataset_dicts:
        annos = entry["annotations"]
        for annotation in annos:
            if annotation["category_id"] not in known_classes:
                annotation["category_id"] = total_num_class - 1

    return dataset_dicts


def _train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None):
    if dataset is None:
        dataset = get_detection_dataset_dicts(
            cfg.DATASETS.TRAIN,
            filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
            min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
            if cfg.MODEL.KEYPOINT_ON
            else 0,
            proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
            cfg=cfg
        )
        _log_api_usage("dataset." + cfg.DATASETS.TRAIN[0])

    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    else:
        mapper = mapper(cfg, True)

    if sampler is None:
        sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
        logger = logging.getLogger(__name__)
        logger.info("Using training sampler {}".format(sampler_name))
        if sampler_name == "TrainingSampler":
            sampler = TrainingSampler(len(dataset))
        elif sampler_name == "RepeatFactorTrainingSampler":
            repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
                dataset, cfg.DATALOADER.REPEAT_THRESHOLD
            )
            sampler = RepeatFactorTrainingSampler(repeat_factors)
        else:
            raise ValueError(
                "Unknown training sampler: {}".format(sampler_name))

    return {
        "dataset": dataset,
        "sampler": sampler,
        "mapper": mapper,
        "total_batch_size": cfg.SOLVER.IMS_PER_BATCH,
        "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        "num_workers": cfg.DATALOADER.NUM_WORKERS,
    }


# TODO can allow dataset as an iterable or IterableDataset to make this function more general
@configurable(from_config=_train_loader_from_config)
def build_detection_train_loader(
    dataset, *, mapper, sampler=None, total_batch_size, aspect_ratio_grouping=True, num_workers=0
):
    """
    Build a dataloader for object detection with some default features.
    This interface is experimental.

    Args:
        dataset (list or torch.utils.data.Dataset): a list of dataset dicts,
            or a map-style pytorch dataset. They can be obtained by using
            :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
        mapper (callable): a callable which takes a sample (dict) from dataset and
            returns the format to be consumed by the model.
            When using cfg, the default choice is ``DatasetMapper(cfg, is_train=True)``.
        sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces
            indices to be applied on ``dataset``. Default to :class:`TrainingSampler`,
            which coordinates an infinite random shuffle sequence across all workers.
        total_batch_size (int): total batch size across all workers. Batching
            simply puts data into a list.
        aspect_ratio_grouping (bool): whether to group images with similar
            aspect ratio for efficiency. When enabled, it requires each
            element in dataset be a dict with keys "width" and "height".
        num_workers (int): number of parallel data loading workers

    Returns:
        torch.utils.data.DataLoader:
            a dataloader. Each output from it is a ``list[mapped_element]`` of length
            ``total_batch_size / num_workers``, where ``mapped_element`` is produced
            by the ``mapper``.
    """
    if isinstance(dataset, list):
        dataset = DatasetFromList(dataset, copy=False)
    if mapper is not None:
        dataset = MapDataset(dataset, mapper)
    if sampler is None:
        sampler = TrainingSampler(len(dataset))
    assert isinstance(sampler, torch.utils.data.sampler.Sampler)
    return build_batch_data_loader(
        dataset,
        sampler,
        total_batch_size,
        aspect_ratio_grouping=aspect_ratio_grouping,
        num_workers=num_workers,
    )


def _test_loader_from_config(cfg, dataset_name, mapper=None):
    """
    Uses the given `dataset_name` argument (instead of the names in cfg), because the
    standard practice is to evaluate each test set individually (not combining them).
    """
    if isinstance(dataset_name, str):
        dataset_name = [dataset_name]

    dataset = get_detection_dataset_dicts(
        dataset_name,
        filter_empty=False,
        proposal_files=[
            cfg.DATASETS.PROPOSAL_FILES_TEST[list(
                cfg.DATASETS.TEST).index(dataset_name)]
        ]
        if cfg.MODEL.LOAD_PROPOSALS
        else None,
        cfg=cfg
    )
    if mapper is None:
        mapper = DatasetMapper(cfg, False)
    return {"dataset": dataset, "mapper": mapper, "num_workers": cfg.DATALOADER.NUM_WORKERS}


@configurable(from_config=_test_loader_from_config)
def build_detection_test_loader(dataset, *, mapper, sampler=None, num_workers=0):
    """
    Similar to `build_detection_train_loader`, but uses a batch size of 1,
    and :class:`InferenceSampler`. This sampler coordinates all workers to
    produce the exact set of all samples.
    This interface is experimental.

    Args:
        dataset (list or torch.utils.data.Dataset): a list of dataset dicts,
            or a map-style pytorch dataset. They can be obtained by using
            :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
        mapper (callable): a callable which takes a sample (dict) from dataset
            and returns the format to be consumed by the model.
            When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``.
        sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces
            indices to be applied on ``dataset``. Default to :class:`InferenceSampler`,
            which splits the dataset across all workers.
        num_workers (int): number of parallel data loading workers

    Returns:
        DataLoader: a torch DataLoader, that loads the given detection
        dataset, with test-time transformation and batching.

    Examples:
    ::
        data_loader = build_detection_test_loader(
            DatasetRegistry.get("my_test"),
            mapper=DatasetMapper(...))

        # or, instantiate with a CfgNode:
        data_loader = build_detection_test_loader(cfg, "my_test")
    """
    if isinstance(dataset, list):
        dataset = DatasetFromList(dataset, copy=False)
    if mapper is not None:
        dataset = MapDataset(dataset, mapper)
    if sampler is None:
        sampler = InferenceSampler(len(dataset))
    # Always use 1 image per worker during inference since this is the
    # standard when reporting inference time in papers.
    batch_sampler = torch.utils.data.sampler.BatchSampler(
        sampler, 1, drop_last=False)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        num_workers=num_workers,
        batch_sampler=batch_sampler,
        collate_fn=trivial_batch_collator,
    )
    return data_loader

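Note (editor's sketch, not part of the commit): a toy illustration of the two open-set relabeling helpers above. Train-time entries drop annotations outside the known classes; test-time entries map them to the last class index, NUM_CLASSES - 1 (the "unknown" slot). The toy dicts are made up for illustration.

from detectron2.config import get_cfg
from opendet2.config import add_opendet_config
from opendet2.data.build import remove_unk_instances, label_known_class_and_unknown

cfg = get_cfg()
add_opendet_config(cfg)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 81       # as in Base-RCNN-FPN-OPENDET.yaml
cfg.MODEL.ROI_HEADS.NUM_KNOWN_CLASSES = 20

train = [{"annotations": [{"category_id": 3}, {"category_id": 45}]}]
test = [{"annotations": [{"category_id": 3}, {"category_id": 45}]}]
print(remove_unk_instances(cfg, train))            # category 45 is removed from the entry
print(label_known_class_and_unknown(cfg, test))    # category 45 is relabeled to 80 ("unknown")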
opendet2/data/builtin.py
ADDED
@@ -0,0 +1,31 @@
import os

from .voc_coco import register_voc_coco
from detectron2.data import MetadataCatalog


def register_all_voc_coco(root):
    SPLITS = [
        # VOC_COCO_openset
        ("voc_coco_20_40_test", "voc_coco", "voc_coco_20_40_test"),
        ("voc_coco_20_60_test", "voc_coco", "voc_coco_20_60_test"),
        ("voc_coco_20_80_test", "voc_coco", "voc_coco_20_80_test"),

        ("voc_coco_2500_test", "voc_coco", "voc_coco_2500_test"),
        ("voc_coco_5000_test", "voc_coco", "voc_coco_5000_test"),
        ("voc_coco_10000_test", "voc_coco", "voc_coco_10000_test"),
        ("voc_coco_20000_test", "voc_coco", "voc_coco_20000_test"),

        ("voc_coco_val", "voc_coco", "voc_coco_val"),

    ]
    for name, dirname, split in SPLITS:
        year = 2007 if "2007" in name else 2012
        register_voc_coco(name, os.path.join(root, dirname), split, year)
        MetadataCatalog.get(name).evaluator_type = "pascal_voc"


if __name__.endswith(".builtin"):
    # Register them all under "./datasets"
    _root = os.getenv("DETECTRON2_DATASETS", "datasets")
    register_all_voc_coco(_root)

opendet2/data/voc_coco.py
ADDED
@@ -0,0 +1,35 @@
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets import load_voc_instances

VOC_COCO_CATEGORIES = [
    # VOC
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
    # COCO-20-40
    "truck", "traffic light", "fire hydrant", "stop sign", "parking meter",
    "bench", "elephant", "bear", "zebra", "giraffe",
    "backpack", "umbrella", "handbag", "tie", "suitcase",
    "microwave", "oven", "toaster", "sink", "refrigerator",
    # COCO-40-60
    "frisbee", "skis", "snowboard", "sports ball", "kite",
    "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
    "banana", "apple", "sandwich", "orange", "broccoli",
    "carrot", "hot dog", "pizza", "donut", "cake",
    # COCO-60-80
    "bed", "toilet", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "book", "clock",
    "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
    "wine glass", "cup", "fork", "knife", "spoon", "bowl",
    # Unknown
    "unknown",
]


def register_voc_coco(name, dirname, split, year):
    class_names = VOC_COCO_CATEGORIES
    DatasetCatalog.register(
        name, lambda: load_voc_instances(dirname, split, class_names))
    MetadataCatalog.get(name).set(
        thing_classes=list(class_names), dirname=dirname, year=year, split=split
    )

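Note (editor's sketch, not part of the commit): an extra split could be registered with the helper above. The split name "my_voc_coco_extra" and the directory path are hypothetical; the data must follow the VOC layout from datasets/README.md, with a matching ImageSets/Main/voc_coco_extra.txt file.

from opendet2.data.voc_coco import register_voc_coco
from detectron2.data import MetadataCatalog

register_voc_coco("my_voc_coco_extra", "datasets/voc_coco", "voc_coco_extra", 2012)
MetadataCatalog.get("my_voc_coco_extra").evaluator_type = "pascal_voc"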
opendet2/engine/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .defaults import OpenDetTrainer

__all__ = [k for k in globals().keys() if not k.startswith("_")]

opendet2/engine/defaults.py
ADDED
@@ -0,0 +1,441 @@
import logging
import os
import weakref
from collections import OrderedDict
from typing import Dict

import torch
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import CfgNode
from detectron2.data import MetadataCatalog
from detectron2.engine import (AMPTrainer, SimpleTrainer,
                               TrainerBase, create_ddp_model, hooks, create_ddp_model, default_writers)
from detectron2.evaluation import (DatasetEvaluator, DatasetEvaluators,
                                   inference_on_dataset, print_csv_format,
                                   verify_results)
from detectron2.modeling import GeneralizedRCNNWithTTA, build_model
from detectron2.solver import build_lr_scheduler
from detectron2.utils import comm
from detectron2.utils.logger import setup_logger
from fvcore.nn.precise_bn import get_bn_modules

from ..data import build_detection_test_loader, build_detection_train_loader
from ..evaluation import PascalVOCDetectionEvaluator
from ..solver import build_optimizer


class OpenDetTrainer(TrainerBase):
    """
    A trainer with default training logic. It does the following:

    1. Create a :class:`SimpleTrainer` using model, optimizer, dataloader
       defined by the given config. Create a LR scheduler defined by the config.
    2. Load the last checkpoint or `cfg.MODEL.WEIGHTS`, if exists, when
       `resume_or_load` is called.
    3. Register a few common hooks defined by the config.

    It is created to simplify the **standard model training workflow** and reduce code boilerplate
    for users who only need the standard training workflow, with standard features.
    It means this class makes *many assumptions* about your training logic that
40 |
+
may easily become invalid in a new research. In fact, any assumptions beyond those made in the
|
41 |
+
:class:`SimpleTrainer` are too much for research.
|
42 |
+
|
43 |
+
The code of this class has been annotated about restrictive assumptions it makes.
|
44 |
+
When they do not work for you, you're encouraged to:
|
45 |
+
|
46 |
+
1. Overwrite methods of this class, OR:
|
47 |
+
2. Use :class:`SimpleTrainer`, which only does minimal SGD training and
|
48 |
+
nothing else. You can then add your own hooks if needed. OR:
|
49 |
+
3. Write your own training loop similar to `tools/plain_train_net.py`.
|
50 |
+
|
51 |
+
See the :doc:`/tutorials/training` tutorials for more details.
|
52 |
+
|
53 |
+
Note that the behavior of this class, like other functions/classes in
|
54 |
+
this file, is not stable, since it is meant to represent the "common default behavior".
|
55 |
+
It is only guaranteed to work well with the standard models and training workflow in detectron2.
|
56 |
+
To obtain more stable behavior, write your own training logic with other public APIs.
|
57 |
+
|
58 |
+
Examples:
|
59 |
+
::
|
60 |
+
trainer = DefaultTrainer(cfg)
|
61 |
+
trainer.resume_or_load() # load last checkpoint or MODEL.WEIGHTS
|
62 |
+
trainer.train()
|
63 |
+
|
64 |
+
Attributes:
|
65 |
+
scheduler:
|
66 |
+
checkpointer (DetectionCheckpointer):
|
67 |
+
cfg (CfgNode):
|
68 |
+
"""
|
69 |
+
|
70 |
+
def __init__(self, cfg):
|
71 |
+
"""
|
72 |
+
Args:
|
73 |
+
cfg (CfgNode):
|
74 |
+
"""
|
75 |
+
super().__init__()
|
76 |
+
logger = logging.getLogger("detectron2")
|
77 |
+
# setup_logger is not called for d2
|
78 |
+
if not logger.isEnabledFor(logging.INFO):
|
79 |
+
setup_logger()
|
80 |
+
cfg = OpenDetTrainer.auto_scale_workers(cfg, comm.get_world_size())
|
81 |
+
|
82 |
+
# Assume these objects must be constructed in this order.
|
83 |
+
model = self.build_model(cfg)
|
84 |
+
optimizer = self.build_optimizer(cfg, model)
|
85 |
+
data_loader = self.build_train_loader(cfg)
|
86 |
+
|
87 |
+
model = create_ddp_model(
|
88 |
+
model, broadcast_buffers=False, find_unused_parameters=True)
|
89 |
+
# model = create_ddp_model(model, broadcast_buffers=False)
|
90 |
+
self._trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)(
|
91 |
+
model, data_loader, optimizer
|
92 |
+
)
|
93 |
+
|
94 |
+
self.scheduler = self.build_lr_scheduler(cfg, optimizer)
|
95 |
+
self.checkpointer = DetectionCheckpointer(
|
96 |
+
# Assume you want to save checkpoints together with logs/statistics
|
97 |
+
model,
|
98 |
+
cfg.OUTPUT_DIR,
|
99 |
+
trainer=weakref.proxy(self),
|
100 |
+
)
|
101 |
+
self.start_iter = 0
|
102 |
+
self.max_iter = cfg.SOLVER.MAX_ITER
|
103 |
+
self.cfg = cfg
|
104 |
+
|
105 |
+
self.register_hooks(self.build_hooks())
|
106 |
+
|
107 |
+
def resume_or_load(self, resume=True):
|
108 |
+
"""
|
109 |
+
If `resume==True` and `cfg.OUTPUT_DIR` contains the last checkpoint (defined by
|
110 |
+
a `last_checkpoint` file), resume from the file. Resuming means loading all
|
111 |
+
available states (eg. optimizer and scheduler) and update iteration counter
|
112 |
+
from the checkpoint. ``cfg.MODEL.WEIGHTS`` will not be used.
|
113 |
+
|
114 |
+
Otherwise, this is considered as an independent training. The method will load model
|
115 |
+
weights from the file `cfg.MODEL.WEIGHTS` (but will not load other states) and start
|
116 |
+
from iteration 0.
|
117 |
+
|
118 |
+
Args:
|
119 |
+
resume (bool): whether to do resume or not
|
120 |
+
"""
|
121 |
+
self.checkpointer.resume_or_load(self.cfg.MODEL.WEIGHTS, resume=resume)
|
122 |
+
if resume and self.checkpointer.has_checkpoint():
|
123 |
+
# The checkpoint stores the training iteration that just finished, thus we start
|
124 |
+
# at the next iteration
|
125 |
+
self.start_iter = self.iter + 1
|
126 |
+
|
127 |
+
def build_hooks(self):
|
128 |
+
"""
|
129 |
+
Build a list of default hooks, including timing, evaluation,
|
130 |
+
checkpointing, lr scheduling, precise BN, writing events.
|
131 |
+
|
132 |
+
Returns:
|
133 |
+
list[HookBase]:
|
134 |
+
"""
|
135 |
+
cfg = self.cfg.clone()
|
136 |
+
cfg.defrost()
|
137 |
+
cfg.DATALOADER.NUM_WORKERS = 0 # save some memory and time for PreciseBN
|
138 |
+
|
139 |
+
ret = [
|
140 |
+
hooks.IterationTimer(),
|
141 |
+
hooks.LRScheduler(),
|
142 |
+
hooks.PreciseBN(
|
143 |
+
# Run at the same freq as (but before) evaluation.
|
144 |
+
cfg.TEST.EVAL_PERIOD,
|
145 |
+
self.model,
|
146 |
+
# Build a new data loader to not affect training
|
147 |
+
self.build_train_loader(cfg),
|
148 |
+
cfg.TEST.PRECISE_BN.NUM_ITER,
|
149 |
+
)
|
150 |
+
if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model)
|
151 |
+
else None,
|
152 |
+
]
|
153 |
+
|
154 |
+
# Do PreciseBN before checkpointer, because it updates the model and need to
|
155 |
+
# be saved by checkpointer.
|
156 |
+
# This is not always the best: if checkpointing has a different frequency,
|
157 |
+
# some checkpoints may have more precise statistics than others.
|
158 |
+
if comm.is_main_process():
|
159 |
+
ret.append(hooks.PeriodicCheckpointer(
|
160 |
+
self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD))
|
161 |
+
|
162 |
+
def test_and_save_results():
|
163 |
+
self._last_eval_results = self.test(self.cfg, self.model)
|
164 |
+
return self._last_eval_results
|
165 |
+
|
166 |
+
# Do evaluation after checkpointer, because then if it fails,
|
167 |
+
# we can use the saved checkpoint to debug.
|
168 |
+
ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))
|
169 |
+
|
170 |
+
if comm.is_main_process():
|
171 |
+
# Here the default print/log frequency of each writer is used.
|
172 |
+
# run writers in the end, so that evaluation metrics are written
|
173 |
+
ret.append(hooks.PeriodicWriter(self.build_writers(), period=20))
|
174 |
+
return ret
|
175 |
+
|
176 |
+
def build_writers(self):
|
177 |
+
"""
|
178 |
+
Build a list of writers to be used using :func:`default_writers()`.
|
179 |
+
If you'd like a different list of writers, you can overwrite it in
|
180 |
+
your trainer.
|
181 |
+
|
182 |
+
Returns:
|
183 |
+
list[EventWriter]: a list of :class:`EventWriter` objects.
|
184 |
+
"""
|
185 |
+
return default_writers(self.cfg.OUTPUT_DIR, self.max_iter)
|
186 |
+
|
187 |
+
def train(self):
|
188 |
+
"""
|
189 |
+
Run training.
|
190 |
+
|
191 |
+
Returns:
|
192 |
+
OrderedDict of results, if evaluation is enabled. Otherwise None.
|
193 |
+
"""
|
194 |
+
super().train(self.start_iter, self.max_iter)
|
195 |
+
if len(self.cfg.TEST.EXPECTED_RESULTS) and comm.is_main_process():
|
196 |
+
assert hasattr(
|
197 |
+
self, "_last_eval_results"
|
198 |
+
), "No evaluation results obtained during training!"
|
199 |
+
verify_results(self.cfg, self._last_eval_results)
|
200 |
+
return self._last_eval_results
|
201 |
+
|
202 |
+
def run_step(self):
|
203 |
+
self._trainer.iter = self.iter
|
204 |
+
self._trainer.run_step()
|
205 |
+
|
206 |
+
@classmethod
|
207 |
+
def build_model(cls, cfg):
|
208 |
+
"""
|
209 |
+
Returns:
|
210 |
+
torch.nn.Module:
|
211 |
+
|
212 |
+
It now calls :func:`detectron2.modeling.build_model`.
|
213 |
+
Overwrite it if you'd like a different model.
|
214 |
+
"""
|
215 |
+
model = build_model(cfg)
|
216 |
+
logger = logging.getLogger(__name__)
|
217 |
+
logger.info("Model:\n{}".format(model))
|
218 |
+
return model
|
219 |
+
|
220 |
+
@classmethod
|
221 |
+
def build_optimizer(cls, cfg, model):
|
222 |
+
"""
|
223 |
+
Returns:
|
224 |
+
torch.optim.Optimizer:
|
225 |
+
|
226 |
+
It now calls :func:`detectron2.solver.build_optimizer`.
|
227 |
+
Overwrite it if you'd like a different optimizer.
|
228 |
+
"""
|
229 |
+
return build_optimizer(cfg, model)
|
230 |
+
|
231 |
+
@classmethod
|
232 |
+
def build_lr_scheduler(cls, cfg, optimizer):
|
233 |
+
"""
|
234 |
+
It now calls :func:`detectron2.solver.build_lr_scheduler`.
|
235 |
+
Overwrite it if you'd like a different scheduler.
|
236 |
+
"""
|
237 |
+
return build_lr_scheduler(cfg, optimizer)
|
238 |
+
|
239 |
+
@classmethod
|
240 |
+
def build_train_loader(cls, cfg):
|
241 |
+
"""
|
242 |
+
Returns:
|
243 |
+
iterable
|
244 |
+
|
245 |
+
It now calls :func:`detectron2.data.build_detection_train_loader`.
|
246 |
+
Overwrite it if you'd like a different data loader.
|
247 |
+
"""
|
248 |
+
return build_detection_train_loader(cfg)
|
249 |
+
|
250 |
+
@classmethod
|
251 |
+
def build_test_loader(cls, cfg, dataset_name):
|
252 |
+
"""
|
253 |
+
Returns:
|
254 |
+
iterable
|
255 |
+
|
256 |
+
It now calls :func:`detectron2.data.build_detection_test_loader`.
|
257 |
+
Overwrite it if you'd like a different data loader.
|
258 |
+
"""
|
259 |
+
return build_detection_test_loader(cfg, dataset_name)
|
260 |
+
|
261 |
+
@classmethod
|
262 |
+
def build_evaluator(cls, cfg, dataset_name, output_folder=None):
|
263 |
+
if output_folder is None:
|
264 |
+
output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
|
265 |
+
evaluator_list = []
|
266 |
+
evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
|
267 |
+
|
268 |
+
if evaluator_type == "pascal_voc":
|
269 |
+
return PascalVOCDetectionEvaluator(dataset_name, cfg)
|
270 |
+
|
271 |
+
if len(evaluator_list) == 0:
|
272 |
+
raise NotImplementedError(
|
273 |
+
"no Evaluator for the dataset {} with the type {}".format(
|
274 |
+
dataset_name, evaluator_type
|
275 |
+
)
|
276 |
+
)
|
277 |
+
elif len(evaluator_list) == 1:
|
278 |
+
return evaluator_list[0]
|
279 |
+
return DatasetEvaluators(evaluator_list)
|
280 |
+
|
281 |
+
@classmethod
|
282 |
+
def test_with_TTA(cls, cfg, model):
|
283 |
+
logger = logging.getLogger("detectron2.trainer")
|
284 |
+
# In the end of training, run an evaluation with TTA
|
285 |
+
# Only support some R-CNN models.
|
286 |
+
logger.info("Running inference with test-time augmentation ...")
|
287 |
+
model = GeneralizedRCNNWithTTA(cfg, model)
|
288 |
+
evaluators = [
|
289 |
+
cls.build_evaluator(
|
290 |
+
cfg, name, output_folder=os.path.join(
|
291 |
+
cfg.OUTPUT_DIR, "inference_TTA")
|
292 |
+
)
|
293 |
+
for name in cfg.DATASETS.TEST
|
294 |
+
]
|
295 |
+
res = cls.test(cfg, model, evaluators)
|
296 |
+
res = OrderedDict({k + "_TTA": v for k, v in res.items()})
|
297 |
+
return res
|
298 |
+
|
299 |
+
@classmethod
|
300 |
+
def test(cls, cfg, model, evaluators=None):
|
301 |
+
"""
|
302 |
+
Args:
|
303 |
+
cfg (CfgNode):
|
304 |
+
model (nn.Module):
|
305 |
+
evaluators (list[DatasetEvaluator] or None): if None, will call
|
306 |
+
:meth:`build_evaluator`. Otherwise, must have the same length as
|
307 |
+
``cfg.DATASETS.TEST``.
|
308 |
+
|
309 |
+
Returns:
|
310 |
+
dict: a dict of result metrics
|
311 |
+
"""
|
312 |
+
logger = logging.getLogger(__name__)
|
313 |
+
if isinstance(evaluators, DatasetEvaluator):
|
314 |
+
evaluators = [evaluators]
|
315 |
+
if evaluators is not None:
|
316 |
+
assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format(
|
317 |
+
len(cfg.DATASETS.TEST), len(evaluators)
|
318 |
+
)
|
319 |
+
|
320 |
+
results = OrderedDict()
|
321 |
+
for idx, dataset_name in enumerate(cfg.DATASETS.TEST):
|
322 |
+
data_loader = cls.build_test_loader(cfg, dataset_name)
|
323 |
+
# When evaluators are passed in as arguments,
|
324 |
+
# implicitly assume that evaluators can be created before data_loader.
|
325 |
+
if evaluators is not None:
|
326 |
+
evaluator = evaluators[idx]
|
327 |
+
else:
|
328 |
+
try:
|
329 |
+
evaluator = cls.build_evaluator(cfg, dataset_name)
|
330 |
+
except NotImplementedError:
|
331 |
+
logger.warn(
|
332 |
+
"No evaluator found. Use `DefaultTrainer.test(evaluators=)`, "
|
333 |
+
"or implement its `build_evaluator` method."
|
334 |
+
)
|
335 |
+
results[dataset_name] = {}
|
336 |
+
continue
|
337 |
+
results_i = inference_on_dataset(model, data_loader, evaluator)
|
338 |
+
results[dataset_name] = results_i
|
339 |
+
if comm.is_main_process():
|
340 |
+
assert isinstance(
|
341 |
+
results_i, dict
|
342 |
+
), "Evaluator must return a dict on the main process. Got {} instead.".format(
|
343 |
+
results_i
|
344 |
+
)
|
345 |
+
logger.info(
|
346 |
+
"Evaluation results for {} in csv format:".format(dataset_name))
|
347 |
+
print_csv_format(results_i)
|
348 |
+
|
349 |
+
if len(results) == 1:
|
350 |
+
results = list(results.values())[0]
|
351 |
+
return results
|
352 |
+
|
353 |
+
@staticmethod
|
354 |
+
def auto_scale_workers(cfg, num_workers: int):
|
355 |
+
"""
|
356 |
+
When the config is defined for certain number of workers (according to
|
357 |
+
``cfg.SOLVER.REFERENCE_WORLD_SIZE``) that's different from the number of
|
358 |
+
workers currently in use, returns a new cfg where the total batch size
|
359 |
+
is scaled so that the per-GPU batch size stays the same as the
|
360 |
+
original ``IMS_PER_BATCH // REFERENCE_WORLD_SIZE``.
|
361 |
+
|
362 |
+
Other config options are also scaled accordingly:
|
363 |
+
* training steps and warmup steps are scaled inverse proportionally.
|
364 |
+
* learning rate are scaled proportionally, following :paper:`ImageNet in 1h`.
|
365 |
+
|
366 |
+
For example, with the original config like the following:
|
367 |
+
|
368 |
+
.. code-block:: yaml
|
369 |
+
|
370 |
+
IMS_PER_BATCH: 16
|
371 |
+
BASE_LR: 0.1
|
372 |
+
REFERENCE_WORLD_SIZE: 8
|
373 |
+
MAX_ITER: 5000
|
374 |
+
STEPS: (4000,)
|
375 |
+
CHECKPOINT_PERIOD: 1000
|
376 |
+
|
377 |
+
When this config is used on 16 GPUs instead of the reference number 8,
|
378 |
+
calling this method will return a new config with:
|
379 |
+
|
380 |
+
.. code-block:: yaml
|
381 |
+
|
382 |
+
IMS_PER_BATCH: 32
|
383 |
+
BASE_LR: 0.2
|
384 |
+
REFERENCE_WORLD_SIZE: 16
|
385 |
+
MAX_ITER: 2500
|
386 |
+
STEPS: (2000,)
|
387 |
+
CHECKPOINT_PERIOD: 500
|
388 |
+
|
389 |
+
Note that both the original config and this new config can be trained on 16 GPUs.
|
390 |
+
It's up to user whether to enable this feature (by setting ``REFERENCE_WORLD_SIZE``).
|
391 |
+
|
392 |
+
Returns:
|
393 |
+
CfgNode: a new config. Same as original if ``cfg.SOLVER.REFERENCE_WORLD_SIZE==0``.
|
394 |
+
"""
|
395 |
+
old_world_size = cfg.SOLVER.REFERENCE_WORLD_SIZE
|
396 |
+
if old_world_size == 0 or old_world_size == num_workers:
|
397 |
+
return cfg
|
398 |
+
cfg = cfg.clone()
|
399 |
+
frozen = cfg.is_frozen()
|
400 |
+
cfg.defrost()
|
401 |
+
|
402 |
+
assert (
|
403 |
+
cfg.SOLVER.IMS_PER_BATCH % old_world_size == 0
|
404 |
+
), "Invalid REFERENCE_WORLD_SIZE in config!"
|
405 |
+
scale = num_workers / old_world_size
|
406 |
+
bs = cfg.SOLVER.IMS_PER_BATCH = int(
|
407 |
+
round(cfg.SOLVER.IMS_PER_BATCH * scale))
|
408 |
+
lr = cfg.SOLVER.BASE_LR = cfg.SOLVER.BASE_LR * scale
|
409 |
+
max_iter = cfg.SOLVER.MAX_ITER = int(
|
410 |
+
round(cfg.SOLVER.MAX_ITER / scale))
|
411 |
+
warmup_iter = cfg.SOLVER.WARMUP_ITERS = int(
|
412 |
+
round(cfg.SOLVER.WARMUP_ITERS / scale))
|
413 |
+
cfg.SOLVER.STEPS = tuple(int(round(s / scale))
|
414 |
+
for s in cfg.SOLVER.STEPS)
|
415 |
+
cfg.TEST.EVAL_PERIOD = int(round(cfg.TEST.EVAL_PERIOD / scale))
|
416 |
+
cfg.SOLVER.CHECKPOINT_PERIOD = int(
|
417 |
+
round(cfg.SOLVER.CHECKPOINT_PERIOD / scale))
|
418 |
+
cfg.SOLVER.REFERENCE_WORLD_SIZE = num_workers # maintain invariant
|
419 |
+
logger = logging.getLogger(__name__)
|
420 |
+
logger.info(
|
421 |
+
f"Auto-scaling the config to batch_size={bs}, learning_rate={lr}, "
|
422 |
+
f"max_iter={max_iter}, warmup={warmup_iter}."
|
423 |
+
)
|
424 |
+
|
425 |
+
if frozen:
|
426 |
+
cfg.freeze()
|
427 |
+
return cfg
|
428 |
+
|
429 |
+
|
430 |
+
# Access basic attributes from the underlying trainer
|
431 |
+
for _attr in ["model", "data_loader", "optimizer"]:
|
432 |
+
setattr(
|
433 |
+
OpenDetTrainer,
|
434 |
+
_attr,
|
435 |
+
property(
|
436 |
+
# getter
|
437 |
+
lambda self, x=_attr: getattr(self._trainer, x),
|
438 |
+
# setter
|
439 |
+
lambda self, value, x=_attr: setattr(self._trainer, x, value),
|
440 |
+
),
|
441 |
+
)
|
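Note (usage sketch, not part of this commit): how the trainer above is typically driven, in the spirit of tools/train_net.py. The config helper name `add_opendet_config` and the package-level re-export are assumptions based on the repository layout, not verified from this file.

# Hedged usage sketch of OpenDetTrainer (paths and helper names are assumptions).
from detectron2.config import get_cfg

from opendet2 import add_opendet_config   # assumed export of opendet2.config
from opendet2.engine import OpenDetTrainer

cfg = get_cfg()
add_opendet_config(cfg)                    # add OpenDet-specific config keys
cfg.merge_from_file("configs/faster_rcnn_R_50_FPN_3x_opendet.yaml")
cfg.freeze()

trainer = OpenDetTrainer(cfg)
trainer.resume_or_load(resume=False)       # load cfg.MODEL.WEIGHTS, start at iteration 0
trainer.train()                            # hooks handle checkpointing and evaluation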
opendet2/evaluation/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .pascal_voc_evaluation import *

__all__ = [k for k in globals().keys() if not k.startswith("_")]
opendet2/evaluation/pascal_voc_evaluation.py
ADDED
@@ -0,0 +1,377 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# Code is modified from https://github.com/JosephKJ/OWOD

import logging
import os
import tempfile
import xml.etree.ElementTree as ET
from collections import OrderedDict, defaultdict
from functools import lru_cache
from tabulate import tabulate

import numpy as np
import torch
from detectron2.data import MetadataCatalog
from detectron2.evaluation import DatasetEvaluator
from detectron2.evaluation.pascal_voc_evaluation import voc_ap
from detectron2.utils import comm
from detectron2.utils.file_io import PathManager


class PascalVOCDetectionEvaluator(DatasetEvaluator):
    def __init__(self, dataset_name, cfg=None):
        """
        Args:
            dataset_name (str): name of the dataset, e.g., "voc_2007_test"
        """
        self._dataset_name = dataset_name
        meta = MetadataCatalog.get(dataset_name)

        # Too many tiny files, download all to local for speed.
        annotation_dir_local = PathManager.get_local_path(
            os.path.join(meta.dirname, "Annotations/")
        )
        self._anno_file_template = os.path.join(annotation_dir_local, "{}.xml")
        self._image_set_path = os.path.join(
            meta.dirname, "ImageSets", "Main", meta.split + ".txt")
        self._class_names = meta.thing_classes
        assert meta.year in [2007, 2012], meta.year
        self.logger = logging.getLogger(__name__)
        self._is_2007 = meta.year == 2007
        self._cpu_device = torch.device("cpu")
        if cfg is not None:
            self.output_dir = cfg.OUTPUT_DIR
            self.total_num_class = cfg.MODEL.ROI_HEADS.NUM_CLASSES
            self.unknown_class_index = self.total_num_class - 1
            self.num_known_classes = cfg.MODEL.ROI_HEADS.NUM_KNOWN_CLASSES
            self.known_classes = self._class_names[:self.num_known_classes]

    def reset(self):
        # class name -> list of prediction strings
        self._predictions = defaultdict(list)

    def process(self, inputs, outputs):
        for input, output in zip(inputs, outputs):
            image_id = input["image_id"]
            instances = output["instances"].to(self._cpu_device)
            boxes = instances.pred_boxes.tensor.numpy()
            scores = instances.scores.tolist()
            classes = instances.pred_classes.tolist()

            for box, score, cls in zip(boxes, scores, classes):
                xmin, ymin, xmax, ymax = box
                # The inverse of data loading logic in `datasets/pascal_voc.py`
                xmin += 1
                ymin += 1
                self._predictions[cls].append(
                    f"{image_id} {score:.3f} {xmin:.1f} {ymin:.1f} {xmax:.1f} {ymax:.1f}"
                )

    def compute_WI_at_many_recall_level(self, recalls, tp_plus_fp_cs, fp_os):
        wi_at_recall = {}
        # for r in range(1, 10):
        for r in [8]:
            r = r/10
            wi = self.compute_WI_at_a_recall_level(
                recalls, tp_plus_fp_cs, fp_os, recall_level=r)
            wi_at_recall[r] = wi
        return wi_at_recall

    def compute_WI_at_a_recall_level(self, recalls, tp_plus_fp_cs, fp_os, recall_level=0.5):
        wi_at_iou = {}
        for iou, recall in recalls.items():
            tp_plus_fps = []
            fps = []
            for cls_id, rec in enumerate(recall):
                if cls_id in range(self.num_known_classes) and len(rec) > 0:
                    index = min(range(len(rec)), key=lambda i: abs(
                        rec[i] - recall_level))
                    tp_plus_fp = tp_plus_fp_cs[iou][cls_id][index]
                    tp_plus_fps.append(tp_plus_fp)
                    fp = fp_os[iou][cls_id][index]
                    fps.append(fp)
            if len(tp_plus_fps) > 0:
                wi_at_iou[iou] = np.mean(fps) / np.mean(tp_plus_fps)
            else:
                wi_at_iou[iou] = 0
        return wi_at_iou

    def evaluate(self):
        """
        Returns:
            dict: has a key "segm", whose value is a dict of "AP", "AP50", and "AP75".
        """
        all_predictions = comm.gather(self._predictions, dst=0)
        if not comm.is_main_process():
            return
        predictions = defaultdict(list)
        for predictions_per_rank in all_predictions:
            for clsid, lines in predictions_per_rank.items():
                predictions[clsid].extend(lines)
        del all_predictions

        self.logger.info(
            "Evaluating {} using {} metric. "
            "Note that results do not use the official Matlab API.".format(
                self._dataset_name, 2007 if self._is_2007 else 2012
            )
        )

        dirname = os.path.join(self.output_dir, 'pascal_voc_eval')
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        # with tempfile.TemporaryDirectory(prefix="pascal_voc_eval_") as dirname:
        res_file_template = os.path.join(dirname, "{}.txt")

        aps = defaultdict(list)  # iou -> ap per class
        recs = defaultdict(list)
        precs = defaultdict(list)
        all_recs = defaultdict(list)
        all_precs = defaultdict(list)
        unk_det_as_knowns = defaultdict(list)
        num_unks = defaultdict(list)
        tp_plus_fp_cs = defaultdict(list)
        fp_os = defaultdict(list)

        for cls_id, cls_name in enumerate(self._class_names):
            lines = predictions.get(cls_id, [""])

            with open(res_file_template.format(cls_name), "w") as f:
                f.write("\n".join(lines))

            for thresh in [50, ]:
                # for thresh in range(50, 100, 5):
                (rec, prec, ap, unk_det_as_known, num_unk,
                 tp_plus_fp_closed_set, fp_open_set) = voc_eval(
                    res_file_template,
                    self._anno_file_template,
                    self._image_set_path,
                    cls_name,
                    ovthresh=thresh / 100.0,
                    use_07_metric=self._is_2007,
                    known_classes=self.known_classes
                )
                aps[thresh].append(ap * 100)
                unk_det_as_knowns[thresh].append(unk_det_as_known)
                num_unks[thresh].append(num_unk)
                all_precs[thresh].append(prec)
                all_recs[thresh].append(rec)
                tp_plus_fp_cs[thresh].append(tp_plus_fp_closed_set)
                fp_os[thresh].append(fp_open_set)
                try:
                    recs[thresh].append(rec[-1] * 100)
                    precs[thresh].append(prec[-1] * 100)
                except:
                    recs[thresh].append(0)
                    precs[thresh].append(0)

        results_2d = {}
        mAP = {iou: np.mean(x) for iou, x in aps.items()}
        results_2d['mAP'] = mAP[50]

        wi = self.compute_WI_at_many_recall_level(
            all_recs, tp_plus_fp_cs, fp_os)
        results_2d['WI'] = wi[0.8][50] * 100

        total_num_unk_det_as_known = {iou: np.sum(
            x) for iou, x in unk_det_as_knowns.items()}
        # total_num_unk = num_unks[50][0]
        # self.logger.info('num_unk ' + str(total_num_unk))
        results_2d['AOSE'] = total_num_unk_det_as_known[50]

        # class-wise P-R
        # self.logger.info(self._class_names)
        # self.logger.info("AP50: " + str(['%.1f' % x for x in aps[50]]))
        # self.logger.info("P50: " + str(['%.1f' % x for x in precs[50]]))
        # self.logger.info("R50: " + str(['%.1f' % x for x in recs[50]]))

        # Known
        results_2d.update({
            "AP@K": np.mean(aps[50][:self.num_known_classes]),
            "P@K": np.mean(precs[50][:self.num_known_classes]),
            "R@K": np.mean(recs[50][:self.num_known_classes]),
        })

        # Unknown
        results_2d.update({
            "AP@U": np.mean(aps[50][-1]),
            "P@U": np.mean(precs[50][-1]),
            "R@U": np.mean(recs[50][-1]),
        })
        results_head = list(results_2d.keys())
        results_data = [[float(results_2d[k]) for k in results_2d]]
        table = tabulate(
            results_data,
            tablefmt="pipe",
            floatfmt=".2f",
            headers=results_head,
            numalign="left",
        )
        self.logger.info("\n" + table)

        return {",".join(results_head): ",".join([str(round(x, 2)) for x in results_data[0]])}


@lru_cache(maxsize=None)
def parse_rec(filename, known_classes):
    """Parse a PASCAL VOC xml file."""
    with PathManager.open(filename) as f:
        tree = ET.parse(f)
    objects = []
    for obj in tree.findall("object"):
        obj_struct = {}
        cls_name = obj.find("name").text
        # translate unseen classes to unknown
        if cls_name not in known_classes:
            cls_name = 'unknown'

        obj_struct["name"] = cls_name
        # obj_struct["pose"] = obj.find("pose").text
        # obj_struct["truncated"] = int(obj.find("truncated").text)
        obj_struct["difficult"] = int(obj.find("difficult").text)
        bbox = obj.find("bndbox")
        obj_struct["bbox"] = [
            int(bbox.find("xmin").text),
            int(bbox.find("ymin").text),
            int(bbox.find("xmax").text),
            int(bbox.find("ymax").text),
        ]
        objects.append(obj_struct)

    return objects


def compute_overlaps(BBGT, bb):
    # compute overlaps
    # intersection
    ixmin = np.maximum(BBGT[:, 0], bb[0])
    iymin = np.maximum(BBGT[:, 1], bb[1])
    ixmax = np.minimum(BBGT[:, 2], bb[2])
    iymax = np.minimum(BBGT[:, 3], bb[3])
    iw = np.maximum(ixmax - ixmin + 1.0, 0.0)
    ih = np.maximum(iymax - iymin + 1.0, 0.0)
    inters = iw * ih

    # union
    uni = (
        (bb[2] - bb[0] + 1.0) * (bb[3] - bb[1] + 1.0)
        + (BBGT[:, 2] - BBGT[:, 0] + 1.0) * (BBGT[:, 3] - BBGT[:, 1] + 1.0)
        - inters
    )

    return inters / uni


def voc_eval(detpath, annopath, imagesetfile, classname, ovthresh=0.5, use_07_metric=False, known_classes=None):
    # first load gt
    # read list of images
    with PathManager.open(imagesetfile, "r") as f:
        lines = f.readlines()
    imagenames = [x.strip() for x in lines]

    # load annots
    recs = {}
    for imagename in imagenames:
        recs[imagename] = parse_rec(
            annopath.format(imagename), tuple(known_classes))

    # extract gt objects for this class
    class_recs = {}
    npos = 0
    for imagename in imagenames:
        R = [obj for obj in recs[imagename] if obj["name"] == classname]
        bbox = np.array([x["bbox"] for x in R])
        difficult = np.array([x["difficult"] for x in R]).astype(np.bool)
        # difficult = np.array([False for x in R]).astype(np.bool)  # treat all "difficult" as GT
        det = [False] * len(R)
        npos = npos + sum(~difficult)
        class_recs[imagename] = {"bbox": bbox,
                                 "difficult": difficult, "det": det}

    # read dets
    detfile = detpath.format(classname)
    with open(detfile, "r") as f:
        lines = f.readlines()

    splitlines = [x.strip().split(" ") for x in lines]
    image_ids = [x[0] for x in splitlines]
    confidence = np.array([float(x[1]) for x in splitlines])
    BB = np.array([[float(z) for z in x[2:]]
                   for x in splitlines]).reshape(-1, 4)

    # sort by confidence
    sorted_ind = np.argsort(-confidence)
    BB = BB[sorted_ind, :]
    image_ids = [image_ids[x] for x in sorted_ind]

    # go down dets and mark TPs and FPs
    nd = len(image_ids)
    tp = np.zeros(nd)
    fp = np.zeros(nd)
    for d in range(nd):
        R = class_recs[image_ids[d]]
        bb = BB[d, :].astype(float)
        ovmax = -np.inf
        BBGT = R["bbox"].astype(float)

        if BBGT.size > 0:
            overlaps = compute_overlaps(BBGT, bb)
            ovmax = np.max(overlaps)
            jmax = np.argmax(overlaps)

        if ovmax > ovthresh:
            if not R["difficult"][jmax]:
                if not R["det"][jmax]:
                    tp[d] = 1.0
                    R["det"][jmax] = 1
                else:
                    fp[d] = 1.0
        else:
            fp[d] = 1.0

    # compute precision recall
    fp = np.cumsum(fp)
    tp = np.cumsum(tp)
    rec = tp / float(npos)
    # avoid divide by zero in case the first detection matches a difficult
    # ground truth
    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
    ap = voc_ap(rec, prec, use_07_metric)

    # compute unknown det as known
    unknown_class_recs = {}
    n_unk = 0
    for imagename in imagenames:
        R = [obj for obj in recs[imagename] if obj["name"] == 'unknown']
        bbox = np.array([x["bbox"] for x in R])
        difficult = np.array([x["difficult"] for x in R]).astype(np.bool)
        det = [False] * len(R)
        n_unk = n_unk + sum(~difficult)
        unknown_class_recs[imagename] = {
            "bbox": bbox, "difficult": difficult, "det": det}

    if classname == 'unknown':
        return rec, prec, ap, 0, n_unk, None, None

    # Go down each detection and see if it has an overlap with an unknown object.
    # If so, it is an unknown object that was classified as known.
    is_unk = np.zeros(nd)
    for d in range(nd):
        R = unknown_class_recs[image_ids[d]]
        bb = BB[d, :].astype(float)
        ovmax = -np.inf
        BBGT = R["bbox"].astype(float)

        if BBGT.size > 0:
            overlaps = compute_overlaps(BBGT, bb)
            ovmax = np.max(overlaps)
            jmax = np.argmax(overlaps)

        if ovmax > ovthresh:
            is_unk[d] = 1.0

    is_unk_sum = np.sum(is_unk)
    tp_plus_fp_closed_set = tp+fp
    fp_open_set = np.cumsum(is_unk)

    return rec, prec, ap, is_unk_sum, n_unk, tp_plus_fp_closed_set, fp_open_set
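Note (synthetic illustration, not part of this commit): the Wilderness Impact reported above as results_2d['WI'] is, per IoU threshold, mean(fp_open_set) / mean(tp_plus_fp_closed_set), with each per-class value read off at the point on that known class's recall curve closest to recall 0.8. With made-up numbers for a single class:

# Synthetic single-class walk-through of the WI computation above (made-up numbers).
import numpy as np

recall_level = 0.8
rec = np.array([0.2, 0.5, 0.7, 0.82, 0.9])                 # cumulative recall per detection rank
tp_plus_fp_closed = np.array([10., 30., 60., 90., 140.])   # cumulative closed-set detections
fp_open = np.array([0., 2., 5., 9., 20.])                  # detections overlapping unknown GT

idx = int(np.argmin(np.abs(rec - recall_level)))           # rank closest to the target recall (0.82 here)
wi = fp_open[idx] / tp_plus_fp_closed[idx]                 # 9 / 90 = 0.10 for this one class
print(f"WI@{recall_level} = {wi:.2f}")                     # evaluator uses mean(fp)/mean(tp+fp) over known classes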
opendet2/modeling/__init__.py
ADDED
@@ -0,0 +1,5 @@
from .meta_arch import OpenSetRetinaNet
from .backbone import *
from .roi_heads import *

__all__ = list(globals().keys())
opendet2/modeling/backbone/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .swin_transformer import SwinTransformer

__all__ = [k for k in globals().keys() if not k.startswith("_")]
opendet2/modeling/backbone/swin_transformer.py
ADDED
@@ -0,0 +1,726 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# --------------------------------------------------------
|
2 |
+
# Swin Transformer
|
3 |
+
# modified from https://github.com/xiaohu2015/SwinT_detectron2/blob/main/swint/swin_transformer.py
|
4 |
+
# --------------------------------------------------------
|
5 |
+
|
6 |
+
import torch
|
7 |
+
import torch.nn as nn
|
8 |
+
import torch.nn.functional as F
|
9 |
+
import torch.utils.checkpoint as checkpoint
|
10 |
+
import numpy as np
|
11 |
+
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
|
12 |
+
|
13 |
+
from detectron2.modeling.backbone import Backbone
|
14 |
+
from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
|
15 |
+
from detectron2.modeling.backbone.fpn import FPN, LastLevelMaxPool, LastLevelP6P7
|
16 |
+
from detectron2.layers import ShapeSpec
|
17 |
+
|
18 |
+
|
19 |
+
class Mlp(nn.Module):
|
20 |
+
""" Multilayer perceptron."""
|
21 |
+
|
22 |
+
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
|
23 |
+
super().__init__()
|
24 |
+
out_features = out_features or in_features
|
25 |
+
hidden_features = hidden_features or in_features
|
26 |
+
self.fc1 = nn.Linear(in_features, hidden_features)
|
27 |
+
self.act = act_layer()
|
28 |
+
self.fc2 = nn.Linear(hidden_features, out_features)
|
29 |
+
self.drop = nn.Dropout(drop)
|
30 |
+
|
31 |
+
def forward(self, x):
|
32 |
+
x = self.fc1(x)
|
33 |
+
x = self.act(x)
|
34 |
+
x = self.drop(x)
|
35 |
+
x = self.fc2(x)
|
36 |
+
x = self.drop(x)
|
37 |
+
return x
|
38 |
+
|
39 |
+
|
40 |
+
def window_partition(x, window_size):
|
41 |
+
"""
|
42 |
+
Args:
|
43 |
+
x: (B, H, W, C)
|
44 |
+
window_size (int): window size
|
45 |
+
Returns:
|
46 |
+
windows: (num_windows*B, window_size, window_size, C)
|
47 |
+
"""
|
48 |
+
B, H, W, C = x.shape
|
49 |
+
x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
|
50 |
+
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
|
51 |
+
return windows
|
52 |
+
|
53 |
+
|
54 |
+
def window_reverse(windows, window_size, H, W):
|
55 |
+
"""
|
56 |
+
Args:
|
57 |
+
windows: (num_windows*B, window_size, window_size, C)
|
58 |
+
window_size (int): Window size
|
59 |
+
H (int): Height of image
|
60 |
+
W (int): Width of image
|
61 |
+
Returns:
|
62 |
+
x: (B, H, W, C)
|
63 |
+
"""
|
64 |
+
B = int(windows.shape[0] / (H * W / window_size / window_size))
|
65 |
+
x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
|
66 |
+
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
|
67 |
+
return x
|
68 |
+
|
69 |
+
|
70 |
+
class WindowAttention(nn.Module):
|
71 |
+
""" Window based multi-head self attention (W-MSA) module with relative position bias.
|
72 |
+
It supports both of shifted and non-shifted window.
|
73 |
+
Args:
|
74 |
+
dim (int): Number of input channels.
|
75 |
+
window_size (tuple[int]): The height and width of the window.
|
76 |
+
num_heads (int): Number of attention heads.
|
77 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
78 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
|
79 |
+
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
|
80 |
+
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
|
81 |
+
"""
|
82 |
+
|
83 |
+
def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
|
84 |
+
|
85 |
+
super().__init__()
|
86 |
+
self.dim = dim
|
87 |
+
self.window_size = window_size # Wh, Ww
|
88 |
+
self.num_heads = num_heads
|
89 |
+
head_dim = dim // num_heads
|
90 |
+
self.scale = qk_scale or head_dim ** -0.5
|
91 |
+
|
92 |
+
# define a parameter table of relative position bias
|
93 |
+
self.relative_position_bias_table = nn.Parameter(
|
94 |
+
torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH
|
95 |
+
|
96 |
+
# get pair-wise relative position index for each token inside the window
|
97 |
+
coords_h = torch.arange(self.window_size[0])
|
98 |
+
coords_w = torch.arange(self.window_size[1])
|
99 |
+
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
|
100 |
+
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
|
101 |
+
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
|
102 |
+
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
|
103 |
+
relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
|
104 |
+
relative_coords[:, :, 1] += self.window_size[1] - 1
|
105 |
+
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
|
106 |
+
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
|
107 |
+
self.register_buffer("relative_position_index", relative_position_index)
|
108 |
+
|
109 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
110 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
111 |
+
self.proj = nn.Linear(dim, dim)
|
112 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
113 |
+
|
114 |
+
trunc_normal_(self.relative_position_bias_table, std=.02)
|
115 |
+
self.softmax = nn.Softmax(dim=-1)
|
116 |
+
|
117 |
+
def forward(self, x, mask=None):
|
118 |
+
""" Forward function.
|
119 |
+
Args:
|
120 |
+
x: input features with shape of (num_windows*B, N, C)
|
121 |
+
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
|
122 |
+
"""
|
123 |
+
B_, N, C = x.shape
|
124 |
+
qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
|
125 |
+
q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
|
126 |
+
|
127 |
+
q = q * self.scale
|
128 |
+
attn = (q @ k.transpose(-2, -1))
|
129 |
+
|
130 |
+
relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
|
131 |
+
self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH
|
132 |
+
relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
|
133 |
+
attn = attn + relative_position_bias.unsqueeze(0)
|
134 |
+
|
135 |
+
if mask is not None:
|
136 |
+
nW = mask.shape[0]
|
137 |
+
attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
|
138 |
+
attn = attn.view(-1, self.num_heads, N, N)
|
139 |
+
attn = self.softmax(attn)
|
140 |
+
else:
|
141 |
+
attn = self.softmax(attn)
|
142 |
+
|
143 |
+
attn = self.attn_drop(attn)
|
144 |
+
|
145 |
+
x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
|
146 |
+
x = self.proj(x)
|
147 |
+
x = self.proj_drop(x)
|
148 |
+
return x
|
149 |
+
|
150 |
+
|
151 |
+
class SwinTransformerBlock(nn.Module):
|
152 |
+
""" Swin Transformer Block.
|
153 |
+
Args:
|
154 |
+
dim (int): Number of input channels.
|
155 |
+
num_heads (int): Number of attention heads.
|
156 |
+
window_size (int): Window size.
|
157 |
+
shift_size (int): Shift size for SW-MSA.
|
158 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
159 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
160 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
|
161 |
+
drop (float, optional): Dropout rate. Default: 0.0
|
162 |
+
attn_drop (float, optional): Attention dropout rate. Default: 0.0
|
163 |
+
drop_path (float, optional): Stochastic depth rate. Default: 0.0
|
164 |
+
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
|
165 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
166 |
+
"""
|
167 |
+
|
168 |
+
def __init__(self, dim, num_heads, window_size=7, shift_size=0,
|
169 |
+
mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
|
170 |
+
act_layer=nn.GELU, norm_layer=nn.LayerNorm):
|
171 |
+
super().__init__()
|
172 |
+
self.dim = dim
|
173 |
+
self.num_heads = num_heads
|
174 |
+
self.window_size = window_size
|
175 |
+
self.shift_size = shift_size
|
176 |
+
self.mlp_ratio = mlp_ratio
|
177 |
+
assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
|
178 |
+
|
179 |
+
self.norm1 = norm_layer(dim)
|
180 |
+
self.attn = WindowAttention(
|
181 |
+
dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
|
182 |
+
qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
|
183 |
+
|
184 |
+
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
|
185 |
+
self.norm2 = norm_layer(dim)
|
186 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
187 |
+
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
|
188 |
+
|
189 |
+
self.H = None
|
190 |
+
self.W = None
|
191 |
+
|
192 |
+
def forward(self, x, mask_matrix):
|
193 |
+
""" Forward function.
|
194 |
+
Args:
|
195 |
+
x: Input feature, tensor size (B, H*W, C).
|
196 |
+
H, W: Spatial resolution of the input feature.
|
197 |
+
mask_matrix: Attention mask for cyclic shift.
|
198 |
+
"""
|
199 |
+
B, L, C = x.shape
|
200 |
+
H, W = self.H, self.W
|
201 |
+
assert L == H * W, "input feature has wrong size"
|
202 |
+
|
203 |
+
shortcut = x
|
204 |
+
x = self.norm1(x)
|
205 |
+
x = x.view(B, H, W, C)
|
206 |
+
|
207 |
+
# pad feature maps to multiples of window size
|
208 |
+
pad_l = pad_t = 0
|
209 |
+
pad_r = (self.window_size - W % self.window_size) % self.window_size
|
210 |
+
pad_b = (self.window_size - H % self.window_size) % self.window_size
|
211 |
+
x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
|
212 |
+
_, Hp, Wp, _ = x.shape
|
213 |
+
|
214 |
+
# cyclic shift
|
215 |
+
if self.shift_size > 0:
|
216 |
+
shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
|
217 |
+
attn_mask = mask_matrix
|
218 |
+
else:
|
219 |
+
shifted_x = x
|
220 |
+
attn_mask = None
|
221 |
+
|
222 |
+
# partition windows
|
223 |
+
x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C
|
224 |
+
x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C
|
225 |
+
|
226 |
+
# W-MSA/SW-MSA
|
227 |
+
attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C
|
228 |
+
|
229 |
+
# merge windows
|
230 |
+
attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
|
231 |
+
shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C
|
232 |
+
|
233 |
+
# reverse cyclic shift
|
234 |
+
if self.shift_size > 0:
|
235 |
+
x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
|
236 |
+
else:
|
237 |
+
x = shifted_x
|
238 |
+
|
239 |
+
if pad_r > 0 or pad_b > 0:
|
240 |
+
x = x[:, :H, :W, :].contiguous()
|
241 |
+
|
242 |
+
x = x.view(B, H * W, C)
|
243 |
+
|
244 |
+
# FFN
|
245 |
+
x = shortcut + self.drop_path(x)
|
246 |
+
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
247 |
+
|
248 |
+
return x
|
249 |
+
|
250 |
+
|
251 |
+
class PatchMerging(nn.Module):
|
252 |
+
""" Patch Merging Layer
|
253 |
+
Args:
|
254 |
+
dim (int): Number of input channels.
|
255 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
256 |
+
"""
|
257 |
+
def __init__(self, dim, norm_layer=nn.LayerNorm):
|
258 |
+
super().__init__()
|
259 |
+
self.dim = dim
|
260 |
+
self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
|
261 |
+
self.norm = norm_layer(4 * dim)
|
262 |
+
|
263 |
+
def forward(self, x, H, W):
|
264 |
+
""" Forward function.
|
265 |
+
Args:
|
266 |
+
x: Input feature, tensor size (B, H*W, C).
|
267 |
+
H, W: Spatial resolution of the input feature.
|
268 |
+
"""
|
269 |
+
B, L, C = x.shape
|
270 |
+
assert L == H * W, "input feature has wrong size"
|
271 |
+
|
272 |
+
x = x.view(B, H, W, C)
|
273 |
+
|
274 |
+
# padding
|
275 |
+
pad_input = (H % 2 == 1) or (W % 2 == 1)
|
276 |
+
if pad_input:
|
277 |
+
x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
|
278 |
+
|
279 |
+
x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
|
280 |
+
x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
|
281 |
+
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
|
282 |
+
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
|
283 |
+
x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
|
284 |
+
x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
|
285 |
+
|
286 |
+
x = self.norm(x)
|
287 |
+
x = self.reduction(x)
|
288 |
+
|
289 |
+
return x
|
290 |
+
|
291 |
+
|
292 |
+
class BasicLayer(nn.Module):
|
293 |
+
""" A basic Swin Transformer layer for one stage.
|
294 |
+
Args:
|
295 |
+
dim (int): Number of feature channels
|
296 |
+
depth (int): Depths of this stage.
|
297 |
+
num_heads (int): Number of attention head.
|
298 |
+
window_size (int): Local window size. Default: 7.
|
299 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
|
300 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
301 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
|
302 |
+
drop (float, optional): Dropout rate. Default: 0.0
|
303 |
+
attn_drop (float, optional): Attention dropout rate. Default: 0.0
|
304 |
+
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
|
305 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
306 |
+
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
|
307 |
+
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
|
308 |
+
"""
|
309 |
+
|
310 |
+
def __init__(self,
|
311 |
+
dim,
|
312 |
+
depth,
|
313 |
+
num_heads,
|
314 |
+
window_size=7,
|
315 |
+
mlp_ratio=4.,
|
316 |
+
qkv_bias=True,
|
317 |
+
qk_scale=None,
|
318 |
+
drop=0.,
|
319 |
+
attn_drop=0.,
|
320 |
+
drop_path=0.,
|
321 |
+
norm_layer=nn.LayerNorm,
|
322 |
+
downsample=None,
|
323 |
+
use_checkpoint=False):
|
324 |
+
super().__init__()
|
325 |
+
self.window_size = window_size
|
326 |
+
self.shift_size = window_size // 2
|
327 |
+
self.depth = depth
|
328 |
+
self.use_checkpoint = use_checkpoint
|
329 |
+
|
330 |
+
# build blocks
|
331 |
+
self.blocks = nn.ModuleList([
|
332 |
+
SwinTransformerBlock(
|
333 |
+
dim=dim,
|
334 |
+
num_heads=num_heads,
|
335 |
+
window_size=window_size,
|
336 |
+
shift_size=0 if (i % 2 == 0) else window_size // 2,
|
337 |
+
mlp_ratio=mlp_ratio,
|
338 |
+
qkv_bias=qkv_bias,
|
339 |
+
qk_scale=qk_scale,
|
340 |
+
drop=drop,
|
341 |
+
attn_drop=attn_drop,
|
342 |
+
drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
|
343 |
+
norm_layer=norm_layer)
|
344 |
+
for i in range(depth)])
|
345 |
+
|
346 |
+
# patch merging layer
|
347 |
+
if downsample is not None:
|
348 |
+
self.downsample = downsample(dim=dim, norm_layer=norm_layer)
|
349 |
+
else:
|
350 |
+
self.downsample = None
|
351 |
+
|
352 |
+
def forward(self, x, H, W):
|
353 |
+
""" Forward function.
|
354 |
+
Args:
|
355 |
+
x: Input feature, tensor size (B, H*W, C).
|
356 |
+
H, W: Spatial resolution of the input feature.
|
357 |
+
"""
|
358 |
+
|
359 |
+
# calculate attention mask for SW-MSA
|
360 |
+
Hp = int(np.ceil(H / self.window_size)) * self.window_size
|
361 |
+
Wp = int(np.ceil(W / self.window_size)) * self.window_size
|
362 |
+
img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1
|
363 |
+
h_slices = (slice(0, -self.window_size),
|
364 |
+
slice(-self.window_size, -self.shift_size),
|
365 |
+
slice(-self.shift_size, None))
|
366 |
+
w_slices = (slice(0, -self.window_size),
|
367 |
+
slice(-self.window_size, -self.shift_size),
|
368 |
+
slice(-self.shift_size, None))
|
369 |
+
cnt = 0
|
370 |
+
for h in h_slices:
|
371 |
+
for w in w_slices:
|
372 |
+
img_mask[:, h, w, :] = cnt
|
373 |
+
cnt += 1
|
374 |
+
|
375 |
+
mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1
|
376 |
+
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
|
377 |
+
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
|
378 |
+
attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
|
379 |
+
|
380 |
+
for blk in self.blocks:
|
381 |
+
blk.H, blk.W = H, W
|
382 |
+
if self.use_checkpoint:
|
383 |
+
x = checkpoint.checkpoint(blk, x, attn_mask)
|
384 |
+
else:
|
385 |
+
x = blk(x, attn_mask)
|
386 |
+
if self.downsample is not None:
|
387 |
+
x_down = self.downsample(x, H, W)
|
388 |
+
Wh, Ww = (H + 1) // 2, (W + 1) // 2
|
389 |
+
return x, H, W, x_down, Wh, Ww
|
390 |
+
else:
|
391 |
+
return x, H, W, x, H, W
|
392 |
+
|
393 |
+
|
394 |
+
class PatchEmbed(nn.Module):
|
395 |
+
""" Image to Patch Embedding
|
396 |
+
Args:
|
397 |
+
patch_size (int): Patch token size. Default: 4.
|
398 |
+
in_chans (int): Number of input image channels. Default: 3.
|
399 |
+
embed_dim (int): Number of linear projection output channels. Default: 96.
|
400 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: None
|
401 |
+
"""
|
402 |
+
|
403 |
+
def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
|
404 |
+
super().__init__()
|
405 |
+
patch_size = to_2tuple(patch_size)
|
406 |
+
self.patch_size = patch_size
|
407 |
+
|
408 |
+
self.in_chans = in_chans
|
409 |
+
self.embed_dim = embed_dim
|
410 |
+
|
411 |
+
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
|
412 |
+
if norm_layer is not None:
|
413 |
+
self.norm = norm_layer(embed_dim)
|
414 |
+
else:
|
415 |
+
self.norm = None
|
416 |
+
|
417 |
+
def forward(self, x):
|
418 |
+
"""Forward function."""
|
419 |
+
# padding
|
420 |
+
_, _, H, W = x.size()
|
421 |
+
if W % self.patch_size[1] != 0:
|
422 |
+
x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
|
423 |
+
if H % self.patch_size[0] != 0:
|
424 |
+
x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
|
425 |
+
|
426 |
+
x = self.proj(x) # B C Wh Ww
|
427 |
+
if self.norm is not None:
|
428 |
+
Wh, Ww = x.size(2), x.size(3)
|
429 |
+
x = x.flatten(2).transpose(1, 2)
|
430 |
+
x = self.norm(x)
|
431 |
+
x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
|
432 |
+
|
433 |
+
return x
|
434 |
+
|
435 |
+
|
436 |
+
class SwinTransformer(Backbone):
|
437 |
+
""" Swin Transformer backbone.
|
438 |
+
A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
|
439 |
+
https://arxiv.org/pdf/2103.14030
|
440 |
+
Args:
|
441 |
+
pretrain_img_size (int): Input image size for training the pretrained model,
|
442 |
+
used in absolute position embedding. Default 224.
|
443 |
+
patch_size (int | tuple(int)): Patch size. Default: 4.
|
444 |
+
in_chans (int): Number of input image channels. Default: 3.
|
445 |
+
embed_dim (int): Number of linear projection output channels. Default: 96.
|
446 |
+
depths (tuple[int]): Depths of each Swin Transformer stage.
|
447 |
+
num_heads (tuple[int]): Number of attention head of each stage.
|
448 |
+
window_size (int): Window size. Default: 7.
|
449 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
|
450 |
+
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
|
451 |
+
qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
|
452 |
+
drop_rate (float): Dropout rate.
|
453 |
+
attn_drop_rate (float): Attention dropout rate. Default: 0.
|
454 |
+
drop_path_rate (float): Stochastic depth rate. Default: 0.2.
|
455 |
+
norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
|
456 |
+
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
|
457 |
+
patch_norm (bool): If True, add normalization after patch embedding. Default: True.
|
458 |
+
out_indices (Sequence[int]): Output from which stages.
|
459 |
+
frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
|
460 |
+
-1 means not freezing any parameters.
|
461 |
+
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
|
462 |
+
"""
|
463 |
+
|
464 |
+
def __init__(self,
|
465 |
+
pretrain_img_size=224,
|
466 |
+
patch_size=4,
|
467 |
+
in_chans=3,
|
468 |
+
embed_dim=96,
|
469 |
+
depths=[2, 2, 6, 2],
|
470 |
+
num_heads=[3, 6, 12, 24],
|
471 |
+
window_size=7,
|
472 |
+
mlp_ratio=4.,
|
473 |
+
qkv_bias=True,
|
474 |
+
qk_scale=None,
|
475 |
+
drop_rate=0.,
|
476 |
+
attn_drop_rate=0.,
|
477 |
+
drop_path_rate=0.2,
|
478 |
+
norm_layer=nn.LayerNorm,
|
479 |
+
ape=False,
|
480 |
+
patch_norm=True,
|
481 |
+
frozen_stages=-1,
|
482 |
+
use_checkpoint=False,
|
483 |
+
out_features=None):
|
484 |
+
super(SwinTransformer, self).__init__()
|
485 |
+
|
486 |
+
self.pretrain_img_size = pretrain_img_size
|
487 |
+
self.num_layers = len(depths)
|
488 |
+
self.embed_dim = embed_dim
|
489 |
+
self.ape = ape
|
490 |
+
self.patch_norm = patch_norm
|
491 |
+
self.frozen_stages = frozen_stages
|
492 |
+
|
493 |
+
self.out_features = out_features
|
494 |
+
|
495 |
+
# split image into non-overlapping patches
|
496 |
+
self.patch_embed = PatchEmbed(
|
497 |
+
patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
|
498 |
+
norm_layer=norm_layer if self.patch_norm else None)
|
499 |
+
|
500 |
+
# absolute position embedding
|
501 |
+
if self.ape:
|
502 |
+
pretrain_img_size = to_2tuple(pretrain_img_size)
|
503 |
+
patch_size = to_2tuple(patch_size)
|
504 |
+
patches_resolution = [pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1]]
|
505 |
+
|
506 |
+
self.absolute_pos_embed = nn.Parameter(torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]))
|
507 |
+
trunc_normal_(self.absolute_pos_embed, std=.02)
|
508 |
+
|
509 |
+
self.pos_drop = nn.Dropout(p=drop_rate)
|
510 |
+
|
511 |
+
# stochastic depth
|
512 |
+
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
|
513 |
+
|
514 |
+
self._out_feature_strides = {}
|
515 |
+
self._out_feature_channels = {}
|
516 |
+
|
517 |
+
# build layers
|
518 |
+
self.layers = nn.ModuleList()
|
519 |
+
for i_layer in range(self.num_layers):
|
520 |
+
layer = BasicLayer(
|
521 |
+
dim=int(embed_dim * 2 ** i_layer),
|
522 |
+
depth=depths[i_layer],
|
523 |
+
num_heads=num_heads[i_layer],
|
524 |
+
window_size=window_size,
|
525 |
+
mlp_ratio=mlp_ratio,
|
526 |
+
qkv_bias=qkv_bias,
|
527 |
+
qk_scale=qk_scale,
|
528 |
+
drop=drop_rate,
|
529 |
+
attn_drop=attn_drop_rate,
|
530 |
+
drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
|
531 |
+
norm_layer=norm_layer,
|
532 |
+
downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
|
533 |
+
use_checkpoint=use_checkpoint)
|
534 |
+
self.layers.append(layer)
|
535 |
+
|
536 |
+
stage = f'stage{i_layer+2}'
|
537 |
+
if stage in self.out_features:
|
538 |
+
self._out_feature_channels[stage] = embed_dim * 2 ** i_layer
|
539 |
+
self._out_feature_strides[stage] = 4 * 2 ** i_layer
|
540 |
+
|
541 |
+
num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
|
542 |
+
self.num_features = num_features
|
543 |
+
|
544 |
+
# add a norm layer for each output
|
545 |
+
for i_layer in range(self.num_layers):
|
546 |
+
stage = f'stage{i_layer+2}'
|
547 |
+
if stage in self.out_features:
|
548 |
+
layer = norm_layer(num_features[i_layer])
|
549 |
+
layer_name = f'norm{i_layer}'
|
550 |
+
self.add_module(layer_name, layer)
|
551 |
+
|
552 |
+
self._freeze_stages()
|
553 |
+
|
554 |
+
def _freeze_stages(self):
|
555 |
+
if self.frozen_stages >= 0:
|
556 |
+
self.patch_embed.eval()
|
557 |
+
for param in self.patch_embed.parameters():
|
558 |
+
param.requires_grad = False
|
559 |
+
|
560 |
+
if self.frozen_stages >= 1 and self.ape:
|
561 |
+
self.absolute_pos_embed.requires_grad = False
|
562 |
+
|
563 |
+
if self.frozen_stages >= 2:
|
564 |
+
self.pos_drop.eval()
|
565 |
+
for i in range(0, self.frozen_stages - 1):
|
566 |
+
m = self.layers[i]
|
567 |
+
m.eval()
|
568 |
+
for param in m.parameters():
|
569 |
+
param.requires_grad = False
|
570 |
+
|
571 |
+
def init_weights(self, pretrained=None):
|
572 |
+
"""Initialize the weights in backbone.
|
573 |
+
Args:
|
574 |
+
pretrained (str, optional): Path to pre-trained weights.
|
575 |
+
Defaults to None.
|
576 |
+
"""
|
577 |
+
|
578 |
+
def _init_weights(m):
|
579 |
+
if isinstance(m, nn.Linear):
|
580 |
+
trunc_normal_(m.weight, std=.02)
|
581 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
582 |
+
nn.init.constant_(m.bias, 0)
|
583 |
+
elif isinstance(m, nn.LayerNorm):
|
584 |
+
nn.init.constant_(m.bias, 0)
|
585 |
+
nn.init.constant_(m.weight, 1.0)
|
586 |
+
|
587 |
+
self.apply(_init_weights)
|
588 |
+
|
589 |
+
def forward(self, x):
|
590 |
+
"""Forward function."""
|
591 |
+
x = self.patch_embed(x)
|
592 |
+
|
593 |
+
Wh, Ww = x.size(2), x.size(3)
|
594 |
+
if self.ape:
|
595 |
+
# interpolate the position embedding to the corresponding size
|
596 |
+
absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
|
597 |
+
x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C
|
598 |
+
else:
|
599 |
+
x = x.flatten(2).transpose(1, 2)
|
600 |
+
x = self.pos_drop(x)
|
601 |
+
|
602 |
+
outs = {}
|
603 |
+
for i in range(self.num_layers):
|
604 |
+
layer = self.layers[i]
|
605 |
+
x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
|
606 |
+
name = f'stage{i+2}'
|
607 |
+
if name in self.out_features:
|
608 |
+
norm_layer = getattr(self, f'norm{i}')
|
609 |
+
x_out = norm_layer(x_out)
|
610 |
+
out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
|
611 |
+
outs[name] = out
|
612 |
+
|
613 |
+
return outs #{"stage%d" % (i+2,): out for i, out in enumerate(outs)} #tuple(outs)
|
614 |
+
|
615 |
+
def train(self, mode=True):
|
616 |
+
"""Convert the model into training mode while keep layers freezed."""
|
617 |
+
super(SwinTransformer, self).train(mode)
|
618 |
+
self._freeze_stages()
|
619 |
+
|
620 |
+
def output_shape(self):
|
621 |
+
return {
|
622 |
+
name: ShapeSpec(
|
623 |
+
channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
|
624 |
+
)
|
625 |
+
for name in self.out_features
|
626 |
+
}
|
627 |
+
|
628 |
+
@BACKBONE_REGISTRY.register()
|
629 |
+
def build_swint_backbone(cfg, input_shape):
|
630 |
+
"""
|
631 |
+
Create a SwinT instance from config.
|
632 |
+
|
633 |
+
Returns:
|
634 |
+
SwinTransformer: a :class:`SwinTransformer` instance.
|
635 |
+
"""
|
636 |
+
out_features = cfg.MODEL.SWINT.OUT_FEATURES
|
637 |
+
|
638 |
+
return SwinTransformer(
|
639 |
+
patch_size=4,
|
640 |
+
in_chans=input_shape.channels,
|
641 |
+
embed_dim=cfg.MODEL.SWINT.EMBED_DIM,
|
642 |
+
depths=cfg.MODEL.SWINT.DEPTHS,
|
643 |
+
num_heads=cfg.MODEL.SWINT.NUM_HEADS,
|
644 |
+
window_size=cfg.MODEL.SWINT.WINDOW_SIZE,
|
645 |
+
mlp_ratio=cfg.MODEL.SWINT.MLP_RATIO,
|
646 |
+
qkv_bias=True,
|
647 |
+
qk_scale=None,
|
648 |
+
drop_rate=0.,
|
649 |
+
attn_drop_rate=0.,
|
650 |
+
drop_path_rate=cfg.MODEL.SWINT.DROP_PATH_RATE,
|
651 |
+
norm_layer=nn.LayerNorm,
|
652 |
+
ape=cfg.MODEL.SWINT.APE,
|
653 |
+
patch_norm=True,
|
654 |
+
frozen_stages=cfg.MODEL.BACKBONE.FREEZE_AT,
|
655 |
+
out_features=out_features
|
656 |
+
)
|
657 |
+
|
658 |
+
|
659 |
+
@BACKBONE_REGISTRY.register()
|
660 |
+
def build_swint_fpn_backbone(cfg, input_shape: ShapeSpec):
|
661 |
+
"""
|
662 |
+
Args:
|
663 |
+
cfg: a detectron2 CfgNode
|
664 |
+
|
665 |
+
Returns:
|
666 |
+
backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
|
667 |
+
"""
|
668 |
+
bottom_up = build_swint_backbone(cfg, input_shape)
|
669 |
+
in_features = cfg.MODEL.FPN.IN_FEATURES
|
670 |
+
out_channels = cfg.MODEL.FPN.OUT_CHANNELS
|
671 |
+
backbone = FPN(
|
672 |
+
bottom_up=bottom_up,
|
673 |
+
in_features=in_features,
|
674 |
+
out_channels=out_channels,
|
675 |
+
norm=cfg.MODEL.FPN.NORM,
|
676 |
+
top_block=LastLevelMaxPool(),
|
677 |
+
fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
|
678 |
+
)
|
679 |
+
return backbone
|
680 |
+
|
681 |
+
class LastLevelP6(nn.Module):
|
682 |
+
"""
|
683 |
+
This module is used in FCOS to generate extra layers
|
684 |
+
"""
|
685 |
+
|
686 |
+
def __init__(self, in_channels, out_channels, in_features="res5"):
|
687 |
+
super().__init__()
|
688 |
+
self.num_levels = 1
|
689 |
+
self.in_feature = in_features
|
690 |
+
self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1)
|
691 |
+
for module in [self.p6]:
|
692 |
+
weight_init.c2_xavier_fill(module)
|
693 |
+
|
694 |
+
def forward(self, x):
|
695 |
+
p6 = self.p6(x)
|
696 |
+
return [p6]
|
697 |
+
|
698 |
+
@BACKBONE_REGISTRY.register()
|
699 |
+
def build_retinanet_swint_fpn_backbone(cfg, input_shape: ShapeSpec):
|
700 |
+
"""
|
701 |
+
Args:
|
702 |
+
cfg: a detectron2 CfgNode
|
703 |
+
|
704 |
+
Returns:
|
705 |
+
backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
|
706 |
+
"""
|
707 |
+
bottom_up = build_swint_backbone(cfg, input_shape)
|
708 |
+
in_features = cfg.MODEL.FPN.IN_FEATURES
|
709 |
+
out_channels = cfg.MODEL.FPN.OUT_CHANNELS
|
710 |
+
top_levels = cfg.MODEL.FPN.TOP_LEVELS
|
711 |
+
in_channels_top = out_channels
|
712 |
+
if top_levels == 2:
|
713 |
+
top_block = LastLevelP6P7(in_channels_top, out_channels, "p5")
|
714 |
+
elif top_levels == 1:
|
715 |
+
top_block = LastLevelP6(in_channels_top, out_channels, "p5")
|
716 |
+
elif top_levels == 0:
|
717 |
+
top_block = None
|
718 |
+
backbone = FPN(
|
719 |
+
bottom_up=bottom_up,
|
720 |
+
in_features=in_features,
|
721 |
+
out_channels=out_channels,
|
722 |
+
norm=cfg.MODEL.FPN.NORM,
|
723 |
+
top_block=top_block,
|
724 |
+
fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
|
725 |
+
)
|
726 |
+
return backbone
|
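A minimal smoke test for the backbone above (a sketch, not part of the committed file): it assumes the Swin-T defaults of this file and the "stage2"-"stage5" naming used by build_swint_backbone.

import torch

# Build a Swin-T backbone and inspect the multi-scale outputs on a 224x224 image.
model = SwinTransformer(out_features=["stage2", "stage3", "stage4", "stage5"]).eval()
with torch.no_grad():
    feats = model(torch.randn(1, 3, 224, 224))
for name, feat in feats.items():
    print(name, tuple(feat.shape))  # stage2 -> (1, 96, 56, 56), ..., stage5 -> (1, 768, 7, 7)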
opendet2/modeling/layers/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .mlp import *

__all__ = [k for k in globals().keys() if not k.startswith("_")]
opendet2/modeling/layers/mlp.py
ADDED
@@ -0,0 +1,46 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import fvcore.nn.weight_init as weight_init


class MLP(nn.Module):
    def __init__(self, in_dim, out_dim, hidden_dim=None):
        super().__init__()
        if not hidden_dim:
            hidden_dim = in_dim
        self.head = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, out_dim),
        )
        for layer in self.head:
            if isinstance(layer, nn.Linear):
                weight_init.c2_xavier_fill(layer)

    def forward(self, x):
        feat = self.head(x)
        feat_norm = F.normalize(feat, dim=1)
        return feat_norm


class ConvMLP(nn.Module):
    def __init__(self, in_dim, out_dim, hidden_dim=None):
        super().__init__()
        if not hidden_dim:
            hidden_dim = in_dim
        self.head = nn.Sequential(
            nn.Conv2d(in_dim, hidden_dim, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(hidden_dim, out_dim, kernel_size=3, stride=1, padding=1),
        )
        # Initialization
        for layer in self.head:
            if isinstance(layer, nn.Conv2d):
                torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
                torch.nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        feat = self.head(x)
        feat_norm = F.normalize(feat, dim=1)
        return feat_norm
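Both heads return L2-normalized embeddings; a short usage sketch (the dimensions below are illustrative, not values from this repo's configs):

import torch

# MLP projects flattened RoI features; ConvMLP projects dense feature maps.
# Both outputs have unit L2 norm along dim=1.
proj = MLP(in_dim=1024, out_dim=128)
z = proj(torch.randn(8, 1024))
print(z.shape, z.norm(dim=1))            # torch.Size([8, 128]), all ~1.0

conv_proj = ConvMLP(in_dim=256, out_dim=128)
zc = conv_proj(torch.randn(2, 256, 32, 32))
print(zc.shape, zc.norm(dim=1).mean())   # torch.Size([2, 128, 32, 32]), ~1.0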
opendet2/modeling/losses/__init__.py
ADDED
@@ -0,0 +1,4 @@
from .unknown_probability_loss import UPLoss
from .instance_contrastive_loss import ICLoss

__all__ = [k for k in globals().keys() if not k.startswith("_")]
opendet2/modeling/losses/instance_contrastive_loss.py
ADDED
@@ -0,0 +1,40 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class ICLoss(nn.Module):
    """ Instance Contrastive Loss
    """
    def __init__(self, tau=0.1):
        super().__init__()
        self.tau = tau

    def forward(self, features, labels, queue_features, queue_labels):
        device = features.device
        mask = torch.eq(labels[:, None], queue_labels[:, None].T).float().to(device)

        # compute logits
        anchor_dot_contrast = torch.div(
            torch.matmul(features, queue_features.T), self.tau)

        # for numerical stability
        logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
        logits = anchor_dot_contrast - logits_max.detach()

        logits_mask = torch.ones_like(logits)
        # mask itself
        logits_mask[logits == 0] = 0

        mask = mask * logits_mask

        # compute log_prob
        exp_logits = torch.exp(logits) * logits_mask
        log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True))

        # compute mean of log-likelihood over positive
        mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1)
        # loss
        loss = - mean_log_prob_pos.mean()
        # trick: avoid loss nan
        return loss if not torch.isnan(loss) else features.new_tensor(0.0)
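Usage sketch for ICLoss (shapes and the 5-class setup are illustrative): features and queue entries are expected to be L2-normalized embeddings such as the MLP outputs above; the final guard returns 0 when the batch finds no positives in the queue.

import torch
import torch.nn.functional as F

# 16 RoI embeddings contrasted against a 64-slot feature queue.
feats = F.normalize(torch.randn(16, 128), dim=1)
labels = torch.randint(0, 5, (16,))
queue = F.normalize(torch.randn(64, 128), dim=1)
queue_labels = torch.randint(0, 5, (64,))
loss = ICLoss(tau=0.1)(feats, labels, queue, queue_labels)
print(loss)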
opendet2/modeling/losses/unknown_probability_loss.py
ADDED
@@ -0,0 +1,93 @@
import torch
import torch.distributions as dists
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor


class UPLoss(nn.Module):
    """Unknown Probability Loss
    """

    def __init__(self,
                 num_classes: int,
                 sampling_metric: str = "min_score",
                 topk: int = 3,
                 alpha: float = 1.0):
        super().__init__()
        self.num_classes = num_classes
        assert sampling_metric in ["min_score", "max_entropy", "random"]
        self.sampling_metric = sampling_metric
        # if topk==-1, sample len(fg)*2 examples
        self.topk = topk
        self.alpha = alpha

    def _soft_cross_entropy(self, input: Tensor, target: Tensor):
        logprobs = F.log_softmax(input, dim=1)
        return -(target * logprobs).sum() / input.shape[0]

    def _sampling(self, scores: Tensor, labels: Tensor):
        fg_inds = labels != self.num_classes
        fg_scores, fg_labels = scores[fg_inds], labels[fg_inds]
        bg_scores, bg_labels = scores[~fg_inds], labels[~fg_inds]

        # remove unknown classes
        _fg_scores = torch.cat(
            [fg_scores[:, :self.num_classes-1], fg_scores[:, -1:]], dim=1)
        _bg_scores = torch.cat(
            [bg_scores[:, :self.num_classes-1], bg_scores[:, -1:]], dim=1)

        num_fg = fg_scores.size(0)
        topk = num_fg if (self.topk == -1) or (num_fg < self.topk) else self.topk
        # use maximum entropy as a metric for uncertainty
        # we select topk proposals with maximum entropy
        if self.sampling_metric == "max_entropy":
            pos_metric = dists.Categorical(
                _fg_scores.softmax(dim=1)).entropy()
            neg_metric = dists.Categorical(
                _bg_scores.softmax(dim=1)).entropy()
        # use minimum score as a metric for uncertainty
        # we select topk proposals with minimum max-score
        elif self.sampling_metric == "min_score":
            pos_metric = -_fg_scores.max(dim=1)[0]
            neg_metric = -_bg_scores.max(dim=1)[0]
        # we randomly select topk proposals
        elif self.sampling_metric == "random":
            pos_metric = torch.rand(_fg_scores.size(0),).to(scores.device)
            neg_metric = torch.rand(_bg_scores.size(0),).to(scores.device)

        _, pos_inds = pos_metric.topk(topk)
        _, neg_inds = neg_metric.topk(topk)
        fg_scores, fg_labels = fg_scores[pos_inds], fg_labels[pos_inds]
        bg_scores, bg_labels = bg_scores[neg_inds], bg_labels[neg_inds]

        return fg_scores, bg_scores, fg_labels, bg_labels

    def forward(self, scores: Tensor, labels: Tensor):
        fg_scores, bg_scores, fg_labels, bg_labels = self._sampling(
            scores, labels)
        # sample both fg and bg
        scores = torch.cat([fg_scores, bg_scores])
        labels = torch.cat([fg_labels, bg_labels])

        num_sample, num_classes = scores.shape
        mask = torch.arange(num_classes).repeat(
            num_sample, 1).to(scores.device)
        inds = mask != labels[:, None].repeat(1, num_classes)
        mask = mask[inds].reshape(num_sample, num_classes-1)

        gt_scores = torch.gather(
            F.softmax(scores, dim=1), 1, labels[:, None]).squeeze(1)
        mask_scores = torch.gather(scores, 1, mask)

        gt_scores[gt_scores < 0] = 0.0
        targets = torch.zeros_like(mask_scores)
        num_fg = fg_scores.size(0)
        targets[:num_fg, self.num_classes-2] = gt_scores[:num_fg] * \
            (1-gt_scores[:num_fg]).pow(self.alpha)
        targets[num_fg:, self.num_classes-1] = gt_scores[num_fg:] * \
            (1-gt_scores[num_fg:]).pow(self.alpha)

        return self._soft_cross_entropy(mask_scores, targets.detach())
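Usage sketch for UPLoss (class counts are illustrative): with num_classes = 20 known + 1 unknown = 21, the classifier is expected to produce num_classes + 1 = 22 columns (background last), and background proposals carry the label 21.

import torch

torch.manual_seed(0)
scores = torch.randn(32, 22)                                   # (N, num_classes + 1) logits
labels = torch.cat([torch.randint(0, 20, (8,)),                # 8 foreground proposals
                    torch.full((24,), 21, dtype=torch.long)])  # 24 background proposals
up_loss = UPLoss(num_classes=21, sampling_metric="min_score", topk=3, alpha=1.0)
print(up_loss(scores, labels))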
opendet2/modeling/meta_arch/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .retinanet import OpenSetRetinaNet

__all__ = list(globals().keys())
opendet2/modeling/meta_arch/retinanet.py
ADDED
@@ -0,0 +1,483 @@
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
import logging
|
3 |
+
from typing import Dict, List, Tuple
|
4 |
+
|
5 |
+
import numpy as np
|
6 |
+
import torch
|
7 |
+
import torch.distributions as dists
|
8 |
+
from detectron2.config import configurable
|
9 |
+
from detectron2.layers import ShapeSpec, cat, cross_entropy
|
10 |
+
from detectron2.modeling import META_ARCH_REGISTRY
|
11 |
+
from detectron2.modeling.box_regression import _dense_box_regression_loss
|
12 |
+
from detectron2.modeling.meta_arch.retinanet import RetinaNet, RetinaNetHead
|
13 |
+
from detectron2.modeling.postprocessing import detector_postprocess
|
14 |
+
from detectron2.structures import Boxes, Instances, pairwise_iou
|
15 |
+
from detectron2.utils.events import get_event_storage
|
16 |
+
from fvcore.nn import sigmoid_focal_loss_jit
|
17 |
+
from torch import Tensor, nn
|
18 |
+
from torch.nn import functional as F
|
19 |
+
|
20 |
+
from ..layers import ConvMLP
|
21 |
+
from ..losses import ICLoss
|
22 |
+
|
23 |
+
logger = logging.getLogger(__name__)
|
24 |
+
|
25 |
+
|
26 |
+
def permute_to_N_HWA_K(tensor, K: int):
|
27 |
+
"""
|
28 |
+
Transpose/reshape a tensor from (N, (Ai x K), H, W) to (N, (HxWxAi), K)
|
29 |
+
"""
|
30 |
+
assert tensor.dim() == 4, tensor.shape
|
31 |
+
N, _, H, W = tensor.shape
|
32 |
+
tensor = tensor.view(N, -1, K, H, W)
|
33 |
+
tensor = tensor.permute(0, 3, 4, 1, 2)
|
34 |
+
tensor = tensor.reshape(N, -1, K) # Size=(N,HWA,K)
|
35 |
+
return tensor
|
36 |
+
|
37 |
+
|
38 |
+
class UPLoss(nn.Module):
|
39 |
+
"""Unknown Probability Loss for RetinaNet
|
40 |
+
"""
|
41 |
+
|
42 |
+
def __init__(self,
|
43 |
+
num_classes: int,
|
44 |
+
sampling_metric: str = "min_score",
|
45 |
+
topk: int = 3,
|
46 |
+
alpha: float = 1.0):
|
47 |
+
super().__init__()
|
48 |
+
self.num_classes = num_classes
|
49 |
+
assert sampling_metric in ["min_score", "max_entropy", "random"]
|
50 |
+
self.sampling_metric = sampling_metric
|
51 |
+
# if topk==-1, sample len(fg)*2 examples
|
52 |
+
self.topk = topk
|
53 |
+
self.alpha = alpha
|
54 |
+
|
55 |
+
def _soft_cross_entropy(self, input: Tensor, target: Tensor):
|
56 |
+
logprobs = F.log_softmax(input, dim=1)
|
57 |
+
return -(target * logprobs).sum() / input.shape[0]
|
58 |
+
|
59 |
+
def _sampling(self, scores: Tensor, labels: Tensor):
|
60 |
+
fg_inds = labels != self.num_classes
|
61 |
+
fg_scores, fg_labels = scores[fg_inds], labels[fg_inds]
|
62 |
+
|
63 |
+
# remove unknown classes
|
64 |
+
_fg_scores = torch.cat(
|
65 |
+
[fg_scores[:, :self.num_classes-1], fg_scores[:, -1:]], dim=1)
|
66 |
+
|
67 |
+
num_fg = fg_scores.size(0)
|
68 |
+
topk = num_fg if (self.topk == -1) or (num_fg <
|
69 |
+
self.topk) else self.topk
|
70 |
+
# use maximum entropy as a metric for uncertainty
|
71 |
+
# we select topk proposals with maximum entropy
|
72 |
+
if self.sampling_metric == "max_entropy":
|
73 |
+
pos_metric = dists.Categorical(
|
74 |
+
_fg_scores.softmax(dim=1)).entropy()
|
75 |
+
# use minimum score as a metric for uncertainty
|
76 |
+
# we select topk proposals with minimum max-score
|
77 |
+
elif self.sampling_metric == "min_score":
|
78 |
+
pos_metric = -_fg_scores.max(dim=1)[0]
|
79 |
+
# we randomly select topk proposals
|
80 |
+
elif self.sampling_metric == "random":
|
81 |
+
pos_metric = torch.rand(_fg_scores.size(0),).to(scores.device)
|
82 |
+
|
83 |
+
_, pos_inds = pos_metric.topk(topk)
|
84 |
+
fg_scores, fg_labels = fg_scores[pos_inds], fg_labels[pos_inds]
|
85 |
+
|
86 |
+
return fg_scores, fg_labels
|
87 |
+
|
88 |
+
def forward(self, scores: Tensor, labels: Tensor):
|
89 |
+
scores, labels = self._sampling(scores, labels)
|
90 |
+
|
91 |
+
num_sample, num_classes = scores.shape
|
92 |
+
mask = torch.arange(num_classes).repeat(
|
93 |
+
num_sample, 1).to(scores.device)
|
94 |
+
inds = mask != labels[:, None].repeat(1, num_classes)
|
95 |
+
mask = mask[inds].reshape(num_sample, num_classes-1)
|
96 |
+
|
97 |
+
gt_scores = torch.gather(
|
98 |
+
F.softmax(scores, dim=1), 1, labels[:, None]).squeeze(1)
|
99 |
+
mask_scores = torch.gather(scores, 1, mask)
|
100 |
+
|
101 |
+
gt_scores[gt_scores < 0] = 0.0
|
102 |
+
targets = torch.zeros_like(mask_scores)
|
103 |
+
targets[:, self.num_classes-2] = gt_scores * \
|
104 |
+
(1-gt_scores).pow(self.alpha)
|
105 |
+
|
106 |
+
return self._soft_cross_entropy(mask_scores, targets.detach())
|
107 |
+
|
108 |
+
|
109 |
+
@META_ARCH_REGISTRY.register()
|
110 |
+
class OpenSetRetinaNet(RetinaNet):
|
111 |
+
"""
|
112 |
+
Open-set RetinaNet: extends :paper:`RetinaNet` with an unknown probability loss and an instance contrastive loss.
|
113 |
+
"""
|
114 |
+
|
115 |
+
@configurable
|
116 |
+
def __init__(
|
117 |
+
self,
|
118 |
+
num_known_classes,
|
119 |
+
max_iters,
|
120 |
+
up_loss_start_iter,
|
121 |
+
up_loss_sampling_metric,
|
122 |
+
up_loss_topk,
|
123 |
+
up_loss_alpha,
|
124 |
+
up_loss_weight,
|
125 |
+
ins_con_out_dim,
|
126 |
+
ins_con_queue_size,
|
127 |
+
ins_con_in_queue_size,
|
128 |
+
ins_con_batch_iou_thr,
|
129 |
+
ins_con_queue_iou_thr,
|
130 |
+
ins_con_queue_tau,
|
131 |
+
ins_con_loss_weight,
|
132 |
+
*args,
|
133 |
+
**kargs,
|
134 |
+
):
|
135 |
+
super().__init__(*args, **kargs)
|
136 |
+
self.num_known_classes = num_known_classes
|
137 |
+
self.max_iters = max_iters
|
138 |
+
|
139 |
+
self.up_loss = UPLoss(
|
140 |
+
self.num_classes,
|
141 |
+
sampling_metric=up_loss_sampling_metric,
|
142 |
+
topk=up_loss_topk,
|
143 |
+
alpha=up_loss_alpha
|
144 |
+
)
|
145 |
+
self.up_loss_start_iter = up_loss_start_iter
|
146 |
+
self.up_loss_weight = up_loss_weight
|
147 |
+
|
148 |
+
self.ins_con_loss = ICLoss(tau=ins_con_queue_tau)
|
149 |
+
self.ins_con_out_dim = ins_con_out_dim
|
150 |
+
self.ins_con_queue_size = ins_con_queue_size
|
151 |
+
self.ins_con_in_queue_size = ins_con_in_queue_size
|
152 |
+
self.ins_con_batch_iou_thr = ins_con_batch_iou_thr
|
153 |
+
self.ins_con_queue_iou_thr = ins_con_queue_iou_thr
|
154 |
+
self.ins_con_loss_weight = ins_con_loss_weight
|
155 |
+
|
156 |
+
self.register_buffer('queue', torch.zeros(
|
157 |
+
self.num_known_classes, ins_con_queue_size, ins_con_out_dim))
|
158 |
+
self.register_buffer('queue_label', torch.empty(
|
159 |
+
self.num_known_classes, ins_con_queue_size).fill_(-1).long())
|
160 |
+
self.register_buffer('queue_ptr', torch.zeros(
|
161 |
+
self.num_known_classes, dtype=torch.long))
|
162 |
+
|
163 |
+
@classmethod
|
164 |
+
def from_config(cls, cfg):
|
165 |
+
ret = super().from_config(cfg)
|
166 |
+
backbone_shape = ret["backbone"].output_shape()
|
167 |
+
feature_shapes = [backbone_shape[f] for f in cfg.MODEL.RETINANET.IN_FEATURES]
|
168 |
+
head = OpenSetRetinaNetHead(cfg, feature_shapes)
|
169 |
+
ret.update({
|
170 |
+
"head": head,
|
171 |
+
"num_known_classes": cfg.MODEL.ROI_HEADS.NUM_KNOWN_CLASSES,
|
172 |
+
"max_iters": cfg.SOLVER.MAX_ITER,
|
173 |
+
|
174 |
+
"up_loss_start_iter": cfg.UPLOSS.START_ITER,
|
175 |
+
"up_loss_sampling_metric": cfg.UPLOSS.SAMPLING_METRIC,
|
176 |
+
"up_loss_topk": cfg.UPLOSS.TOPK,
|
177 |
+
"up_loss_alpha": cfg.UPLOSS.ALPHA,
|
178 |
+
"up_loss_weight": cfg.UPLOSS.WEIGHT,
|
179 |
+
|
180 |
+
"ins_con_out_dim": cfg.ICLOSS.OUT_DIM,
|
181 |
+
"ins_con_queue_size": cfg.ICLOSS.QUEUE_SIZE,
|
182 |
+
"ins_con_in_queue_size": cfg.ICLOSS.IN_QUEUE_SIZE,
|
183 |
+
"ins_con_batch_iou_thr": cfg.ICLOSS.BATCH_IOU_THRESH,
|
184 |
+
"ins_con_queue_iou_thr": cfg.ICLOSS.QUEUE_IOU_THRESH,
|
185 |
+
"ins_con_queue_tau": cfg.ICLOSS.TEMPERATURE,
|
186 |
+
"ins_con_loss_weight": cfg.ICLOSS.WEIGHT,
|
187 |
+
})
|
188 |
+
return ret
|
189 |
+
|
190 |
+
def get_up_loss(self, scores, gt_classes):
|
191 |
+
# start up loss after warmup iters
|
192 |
+
storage = get_event_storage()
|
193 |
+
if storage.iter > self.up_loss_start_iter:
|
194 |
+
loss_cls_up = self.up_loss(scores, gt_classes)
|
195 |
+
else:
|
196 |
+
loss_cls_up = scores.new_tensor(0.0)
|
197 |
+
|
198 |
+
return self.up_loss_weight * loss_cls_up
|
199 |
+
|
200 |
+
def get_ins_con_loss(self, feat, gt_classes, ious):
|
201 |
+
# select foreground and iou > thr instance in a mini-batch
|
202 |
+
pos_inds = (ious > self.ins_con_batch_iou_thr) & (
|
203 |
+
gt_classes != self.num_classes)
|
204 |
+
|
205 |
+
if not pos_inds.sum():
|
206 |
+
return feat.new_tensor(0.0)
|
207 |
+
|
208 |
+
feat, gt_classes = feat[pos_inds], gt_classes[pos_inds]
|
209 |
+
|
210 |
+
queue = self.queue.reshape(-1, self.ins_con_out_dim)
|
211 |
+
queue_label = self.queue_label.reshape(-1)
|
212 |
+
queue_inds = queue_label != -1 # filter empty queue
|
213 |
+
queue, queue_label = queue[queue_inds], queue_label[queue_inds]
|
214 |
+
|
215 |
+
loss_ins_con = self.ins_con_loss(feat, gt_classes, queue, queue_label)
|
216 |
+
# loss decay
|
217 |
+
storage = get_event_storage()
|
218 |
+
decay_weight = 1.0 - storage.iter / self.max_iters
|
219 |
+
return self.ins_con_loss_weight * decay_weight * loss_ins_con
|
220 |
+
|
221 |
+
@torch.no_grad()
|
222 |
+
def _dequeue_and_enqueue(self, feat, gt_classes, ious, iou_thr=0.7):
|
223 |
+
# 1. gather variable
|
224 |
+
# feat = self.concat_all_gather(feat)
|
225 |
+
# gt_classes = self.concat_all_gather(gt_classes)
|
226 |
+
# ious = self.concat_all_gather(ious)
|
227 |
+
# 2. filter by iou and obj, remove bg
|
228 |
+
keep = (ious > iou_thr) & (gt_classes != self.num_classes)
|
229 |
+
feat, gt_classes = feat[keep], gt_classes[keep]
|
230 |
+
|
231 |
+
for i in range(self.num_known_classes):
|
232 |
+
ptr = int(self.queue_ptr[i])
|
233 |
+
cls_ind = gt_classes == i
|
234 |
+
cls_feat, cls_gt_classes = feat[cls_ind], gt_classes[cls_ind]
|
235 |
+
# 3. sort by similarity, low sim ranks first
|
236 |
+
cls_queue = self.queue[i, self.queue_label[i] != -1]
|
237 |
+
_, sim_inds = F.cosine_similarity(
|
238 |
+
cls_feat[:, None], cls_queue[None, :], dim=-1).mean(dim=1).sort()
|
239 |
+
top_sim_inds = sim_inds[:self.ins_con_in_queue_size]
|
240 |
+
cls_feat, cls_gt_classes = cls_feat[top_sim_inds], cls_gt_classes[top_sim_inds]
|
241 |
+
# 4. in queue
|
242 |
+
batch_size = cls_feat.size(
|
243 |
+
0) if ptr + cls_feat.size(0) <= self.ins_con_queue_size else self.ins_con_queue_size - ptr
|
244 |
+
self.queue[i, ptr:ptr+batch_size] = cls_feat[:batch_size]
|
245 |
+
self.queue_label[i, ptr:ptr +
|
246 |
+
batch_size] = cls_gt_classes[:batch_size]
|
247 |
+
|
248 |
+
ptr = ptr + batch_size if ptr + batch_size < self.ins_con_queue_size else 0
|
249 |
+
self.queue_ptr[i] = ptr
|
250 |
+
|
251 |
+
@torch.no_grad()
|
252 |
+
def concat_all_gather(self, tensor):
|
253 |
+
tensors_gather = [torch.ones_like(tensor) for _ in range(
|
254 |
+
torch.distributed.get_world_size())]
|
255 |
+
torch.distributed.all_gather(tensors_gather, tensor, async_op=False)
|
256 |
+
output = torch.cat(tensors_gather, dim=0)
|
257 |
+
return output
|
258 |
+
|
259 |
+
def forward(self, batched_inputs: List[Dict[str, Tensor]]):
|
260 |
+
"""
|
261 |
+
Args:
|
262 |
+
batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
|
263 |
+
Each item in the list contains the inputs for one image.
|
264 |
+
For now, each item in the list is a dict that contains:
|
265 |
+
|
266 |
+
* image: Tensor, image in (C, H, W) format.
|
267 |
+
* instances: Instances
|
268 |
+
|
269 |
+
Other information that's included in the original dicts, such as:
|
270 |
+
|
271 |
+
* "height", "width" (int): the output resolution of the model, used in inference.
|
272 |
+
See :meth:`postprocess` for details.
|
273 |
+
Returns:
|
274 |
+
In training, dict[str, Tensor]: mapping from a named loss to a tensor storing the
|
275 |
+
loss. Used during training only. In inference, the standard output format, described
|
276 |
+
in :doc:`/tutorials/models`.
|
277 |
+
"""
|
278 |
+
images = self.preprocess_image(batched_inputs)
|
279 |
+
features = self.backbone(images.tensor)
|
280 |
+
features = [features[f] for f in self.head_in_features]
|
281 |
+
|
282 |
+
anchors = self.anchor_generator(features)
|
283 |
+
pred_logits, pred_anchor_deltas, pred_mlp_feats = self.head(features)
|
284 |
+
# Transpose the Hi*Wi*A dimension to the middle:
|
285 |
+
pred_logits = [permute_to_N_HWA_K(
|
286 |
+
x, self.num_classes) for x in pred_logits]
|
287 |
+
pred_anchor_deltas = [permute_to_N_HWA_K(
|
288 |
+
x, 4) for x in pred_anchor_deltas]
|
289 |
+
pred_mlp_feats = [permute_to_N_HWA_K(
|
290 |
+
x, self.ins_con_out_dim) for x in pred_mlp_feats]
|
291 |
+
|
292 |
+
if self.training:
|
293 |
+
assert not torch.jit.is_scripting(), "Not supported"
|
294 |
+
assert "instances" in batched_inputs[0], "Instance annotations are missing in training!"
|
295 |
+
gt_instances = [x["instances"].to(
|
296 |
+
self.device) for x in batched_inputs]
|
297 |
+
|
298 |
+
gt_labels, gt_boxes, gt_ious = self.label_anchors(
|
299 |
+
anchors, gt_instances)
|
300 |
+
losses = self.losses(anchors, pred_logits, pred_mlp_feats,
|
301 |
+
gt_labels, pred_anchor_deltas, gt_boxes, gt_ious)
|
302 |
+
|
303 |
+
if self.vis_period > 0:
|
304 |
+
storage = get_event_storage()
|
305 |
+
if storage.iter % self.vis_period == 0:
|
306 |
+
results = self.inference(
|
307 |
+
anchors, pred_logits, pred_anchor_deltas, images.image_sizes
|
308 |
+
)
|
309 |
+
self.visualize_training(batched_inputs, results)
|
310 |
+
|
311 |
+
return losses
|
312 |
+
else:
|
313 |
+
results = self.inference(
|
314 |
+
anchors, pred_logits, pred_anchor_deltas, images.image_sizes)
|
315 |
+
if torch.jit.is_scripting():
|
316 |
+
return results
|
317 |
+
processed_results = []
|
318 |
+
for results_per_image, input_per_image, image_size in zip(
|
319 |
+
results, batched_inputs, images.image_sizes
|
320 |
+
):
|
321 |
+
height = input_per_image.get("height", image_size[0])
|
322 |
+
width = input_per_image.get("width", image_size[1])
|
323 |
+
r = detector_postprocess(results_per_image, height, width)
|
324 |
+
processed_results.append({"instances": r})
|
325 |
+
return processed_results
|
326 |
+
|
327 |
+
def losses(self, anchors, pred_logits, pred_mlp_feats, gt_labels, pred_anchor_deltas, gt_boxes, gt_ious):
|
328 |
+
"""
|
329 |
+
Args:
|
330 |
+
anchors (list[Boxes]): a list of #feature level Boxes
|
331 |
+
gt_labels, gt_boxes: see output of :meth:`RetinaNet.label_anchors`.
|
332 |
+
Their shapes are (N, R) and (N, R, 4), respectively, where R is
|
333 |
+
the total number of anchors across levels, i.e. sum(Hi x Wi x Ai)
|
334 |
+
pred_logits, pred_anchor_deltas: both are list[Tensor]. Each element in the
|
335 |
+
list corresponds to one level and has shape (N, Hi * Wi * Ai, K or 4).
|
336 |
+
Where K is the number of classes used in `pred_logits`.
|
337 |
+
|
338 |
+
Returns:
|
339 |
+
dict[str, Tensor]:
|
340 |
+
mapping from a named loss to a scalar tensor
|
341 |
+
storing the loss. Used during training only. The dict keys are:
|
342 |
+
"loss_cls" and "loss_box_reg"
|
343 |
+
"""
|
344 |
+
num_images = len(gt_labels)
|
345 |
+
gt_labels = torch.stack(gt_labels) # (N, R)
|
346 |
+
|
347 |
+
valid_mask = gt_labels >= 0
|
348 |
+
pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes)
|
349 |
+
num_pos_anchors = pos_mask.sum().item()
|
350 |
+
get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images)
|
351 |
+
self.loss_normalizer = self.loss_normalizer_momentum * self.loss_normalizer + (
|
352 |
+
1 - self.loss_normalizer_momentum
|
353 |
+
) * max(num_pos_anchors, 1)
|
354 |
+
|
355 |
+
# classification and regression loss
|
356 |
+
gt_labels_target = F.one_hot(gt_labels[valid_mask], num_classes=self.num_classes + 1)[
|
357 |
+
:, :-1
|
358 |
+
] # no loss for the last (background) class
|
359 |
+
|
360 |
+
loss_cls_ce = sigmoid_focal_loss_jit(
|
361 |
+
cat(pred_logits, dim=1)[valid_mask],
|
362 |
+
gt_labels_target.to(pred_logits[0].dtype),
|
363 |
+
alpha=self.focal_loss_alpha,
|
364 |
+
gamma=self.focal_loss_gamma,
|
365 |
+
reduction="sum",
|
366 |
+
)
|
367 |
+
|
368 |
+
loss_cls_up = self.get_up_loss(cat(pred_logits, dim=1)[
|
369 |
+
valid_mask], gt_labels[valid_mask])
|
370 |
+
|
371 |
+
gt_ious = torch.stack(gt_ious)
|
372 |
+
# we first store feats in the queue, then compute the loss
|
373 |
+
pred_mlp_feats = cat(pred_mlp_feats, dim=1)[valid_mask] # [N, *, 128]
|
374 |
+
# [N*, 128]
|
375 |
+
pred_mlp_feats = pred_mlp_feats.reshape(-1, pred_mlp_feats.shape[-1])
|
376 |
+
self._dequeue_and_enqueue(
|
377 |
+
pred_mlp_feats, gt_labels[valid_mask], gt_ious[valid_mask], iou_thr=self.ins_con_queue_iou_thr)
|
378 |
+
loss_ins_con = self.get_ins_con_loss(
|
379 |
+
pred_mlp_feats, gt_labels[valid_mask], gt_ious[valid_mask])
|
380 |
+
|
381 |
+
loss_box_reg = _dense_box_regression_loss(
|
382 |
+
anchors,
|
383 |
+
self.box2box_transform,
|
384 |
+
pred_anchor_deltas,
|
385 |
+
gt_boxes,
|
386 |
+
pos_mask,
|
387 |
+
box_reg_loss_type=self.box_reg_loss_type,
|
388 |
+
smooth_l1_beta=self.smooth_l1_beta,
|
389 |
+
)
|
390 |
+
|
391 |
+
return {
|
392 |
+
"loss_cls_ce": loss_cls_ce / self.loss_normalizer,
|
393 |
+
"loss_box_reg": loss_box_reg / self.loss_normalizer,
|
394 |
+
"loss_ins_con": loss_ins_con,
|
395 |
+
"loss_cls_up": loss_cls_up,
|
396 |
+
}
|
397 |
+
|
398 |
+
@torch.no_grad()
|
399 |
+
def label_anchors(self, anchors, gt_instances):
|
400 |
+
|
401 |
+
anchors = Boxes.cat(anchors) # Rx4
|
402 |
+
|
403 |
+
gt_labels = []
|
404 |
+
matched_gt_boxes = []
|
405 |
+
matched_gt_ious = []
|
406 |
+
for gt_per_image in gt_instances:
|
407 |
+
match_quality_matrix = pairwise_iou(gt_per_image.gt_boxes, anchors)
|
408 |
+
matched_idxs, anchor_labels = self.anchor_matcher(
|
409 |
+
match_quality_matrix)
|
410 |
+
# del match_quality_matrix
|
411 |
+
|
412 |
+
if len(gt_per_image) > 0:
|
413 |
+
matched_gt_boxes_i = gt_per_image.gt_boxes.tensor[matched_idxs]
|
414 |
+
matched_gt_ious_i = match_quality_matrix.max(dim=1)[
|
415 |
+
0][matched_idxs]
|
416 |
+
|
417 |
+
gt_labels_i = gt_per_image.gt_classes[matched_idxs]
|
418 |
+
# Anchors with label 0 are treated as background.
|
419 |
+
gt_labels_i[anchor_labels == 0] = self.num_classes
|
420 |
+
# Anchors with label -1 are ignored.
|
421 |
+
gt_labels_i[anchor_labels == -1] = -1
|
422 |
+
else:
|
423 |
+
matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
|
424 |
+
matched_gt_ious_i = torch.zeros_like(matched_idxs)
|
425 |
+
gt_labels_i = torch.zeros_like(matched_idxs) + self.num_classes
|
426 |
+
|
427 |
+
gt_labels.append(gt_labels_i)
|
428 |
+
matched_gt_boxes.append(matched_gt_boxes_i)
|
429 |
+
matched_gt_ious.append(matched_gt_ious_i)
|
430 |
+
|
431 |
+
del match_quality_matrix
|
432 |
+
|
433 |
+
return gt_labels, matched_gt_boxes, matched_gt_ious
|
434 |
+
|
435 |
+
|
436 |
+
class OpenSetRetinaNetHead(RetinaNetHead):
|
437 |
+
"""
|
438 |
+
The head used in RetinaNet for object classification and box regression.
|
439 |
+
It has two subnets for the two tasks, with a common structure but separate parameters.
|
440 |
+
"""
|
441 |
+
|
442 |
+
@configurable
|
443 |
+
def __init__(
|
444 |
+
self,
|
445 |
+
*args,
|
446 |
+
ins_con_out_dim,
|
447 |
+
**kargs
|
448 |
+
):
|
449 |
+
super().__init__(*args, **kargs)
|
450 |
+
self.mlp = ConvMLP(kargs["conv_dims"][-1], ins_con_out_dim * kargs["num_anchors"])
|
451 |
+
|
452 |
+
@classmethod
|
453 |
+
def from_config(cls, cfg, input_shape: List[ShapeSpec]):
|
454 |
+
ret = super().from_config(cfg, input_shape)
|
455 |
+
ret["ins_con_out_dim"] = cfg.ICLOSS.OUT_DIM
|
456 |
+
return ret
|
457 |
+
|
458 |
+
def forward(self, features: List[Tensor]):
|
459 |
+
"""
|
460 |
+
Arguments:
|
461 |
+
features (list[Tensor]): FPN feature map tensors in high to low resolution.
|
462 |
+
Each tensor in the list correspond to different feature levels.
|
463 |
+
|
464 |
+
Returns:
|
465 |
+
logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi).
|
466 |
+
The tensor predicts the classification probability
|
467 |
+
at each spatial position for each of the A anchors and K object
|
468 |
+
classes.
|
469 |
+
bbox_reg (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi).
|
470 |
+
The tensor predicts 4-vector (dx,dy,dw,dh) box
|
471 |
+
regression values for every anchor. These values are the
|
472 |
+
relative offset between the anchor and the ground truth box.
|
473 |
+
"""
|
474 |
+
logits = []
|
475 |
+
mlp_feats = []
|
476 |
+
bbox_reg = []
|
477 |
+
for feature in features:
|
478 |
+
cls_feat = self.cls_subnet(feature)
|
479 |
+
mlp_feats.append(self.mlp(cls_feat))
|
480 |
+
logits.append(self.cls_score(cls_feat))
|
481 |
+
|
482 |
+
bbox_reg.append(self.bbox_pred(self.bbox_subnet(feature)))
|
483 |
+
return logits, bbox_reg, mlp_feats
|
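The head returns per-level (N, AxK, Hi, Wi) maps, and permute_to_N_HWA_K (defined at the top of this file) flattens them to (N, HixWixA, K) before the losses; a small illustration with placeholder sizes (9 anchors, 21 classes, one 32x32 level):

import torch

logits = torch.randn(2, 9 * 21, 32, 32)   # (N, A*K, H, W)
flat = permute_to_N_HWA_K(logits, K=21)   # (N, H*W*A, K)
print(flat.shape)                         # torch.Size([2, 9216, 21]); 32*32*9 = 9216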
opendet2/modeling/roi_heads/__init__.py
ADDED
@@ -0,0 +1,4 @@
from .roi_heads import OpenSetStandardROIHeads
from .box_head import FastRCNNSeparateConvFCHead, FastRCNNSeparateDropoutConvFCHead

__all__ = list(globals().keys())
opendet2/modeling/roi_heads/box_head.py
ADDED
@@ -0,0 +1,163 @@
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
import fvcore.nn.weight_init as weight_init
|
5 |
+
import numpy as np
|
6 |
+
import torch
|
7 |
+
from detectron2.config import configurable
|
8 |
+
from detectron2.layers import Conv2d, ShapeSpec, get_norm
|
9 |
+
from detectron2.modeling.roi_heads import ROI_BOX_HEAD_REGISTRY
|
10 |
+
from detectron2.utils.registry import Registry
|
11 |
+
from torch import nn
|
12 |
+
|
13 |
+
|
14 |
+
@ROI_BOX_HEAD_REGISTRY.register()
|
15 |
+
class FastRCNNSeparateConvFCHead(nn.Module):
|
16 |
+
"""
|
17 |
+
FastRCNN with separate ConvFC layers
|
18 |
+
"""
|
19 |
+
|
20 |
+
@configurable
|
21 |
+
def __init__(
|
22 |
+
self, input_shape: ShapeSpec, *, conv_dims: List[int], fc_dims: List[int], conv_norm=""
|
23 |
+
):
|
24 |
+
"""
|
25 |
+
NOTE: this interface is experimental.
|
26 |
+
|
27 |
+
Args:
|
28 |
+
input_shape (ShapeSpec): shape of the input feature.
|
29 |
+
conv_dims (list[int]): the output dimensions of the conv layers
|
30 |
+
fc_dims (list[int]): the output dimensions of the fc layers
|
31 |
+
conv_norm (str or callable): normalization for the conv layers.
|
32 |
+
See :func:`detectron2.layers.get_norm` for supported types.
|
33 |
+
"""
|
34 |
+
super().__init__()
|
35 |
+
assert len(conv_dims) + len(fc_dims) > 0
|
36 |
+
self.conv_dims = conv_dims
|
37 |
+
self.fc_dims = fc_dims
|
38 |
+
|
39 |
+
self._output_size = (input_shape.channels,
|
40 |
+
input_shape.height, input_shape.width)
|
41 |
+
|
42 |
+
self.reg_conv_norm_relus = self._add_conv_norm_relus(
|
43 |
+
self._output_size[0], conv_dims, conv_norm)
|
44 |
+
self.cls_conv_norm_relus = self._add_conv_norm_relus(
|
45 |
+
self._output_size[0], conv_dims, conv_norm)
|
46 |
+
conv_dim = self._output_size[0] if len(conv_dims) == 0 else conv_dims[-1]
|
47 |
+
self._output_size = (
|
48 |
+
conv_dim, self._output_size[1], self._output_size[2])
|
49 |
+
|
50 |
+
self.reg_fcs = self._add_fcs(np.prod(self._output_size), fc_dims)
|
51 |
+
self.cls_fcs = self._add_fcs(np.prod(self._output_size), fc_dims)
|
52 |
+
self._output_size = self._output_size if len(fc_dims)==0 else fc_dims[-1]
|
53 |
+
|
54 |
+
for layer in self.reg_conv_norm_relus:
|
55 |
+
weight_init.c2_msra_fill(layer)
|
56 |
+
for layer in self.cls_conv_norm_relus:
|
57 |
+
weight_init.c2_msra_fill(layer)
|
58 |
+
for layer in self.cls_fcs:
|
59 |
+
if isinstance(layer, nn.Linear):
|
60 |
+
weight_init.c2_xavier_fill(layer)
|
61 |
+
for layer in self.reg_fcs:
|
62 |
+
if isinstance(layer, nn.Linear):
|
63 |
+
weight_init.c2_xavier_fill(layer)
|
64 |
+
|
65 |
+
@classmethod
|
66 |
+
def from_config(cls, cfg, input_shape):
|
67 |
+
num_conv = cfg.MODEL.ROI_BOX_HEAD.NUM_CONV
|
68 |
+
conv_dim = cfg.MODEL.ROI_BOX_HEAD.CONV_DIM
|
69 |
+
num_fc = cfg.MODEL.ROI_BOX_HEAD.NUM_FC
|
70 |
+
fc_dim = cfg.MODEL.ROI_BOX_HEAD.FC_DIM
|
71 |
+
return {
|
72 |
+
"input_shape": input_shape,
|
73 |
+
"conv_dims": [conv_dim] * num_conv,
|
74 |
+
"fc_dims": [fc_dim] * num_fc,
|
75 |
+
"conv_norm": cfg.MODEL.ROI_BOX_HEAD.NORM,
|
76 |
+
}
|
77 |
+
|
78 |
+
def _add_conv_norm_relus(self, input_dim, conv_dims, conv_norm):
|
79 |
+
conv_norm_relus = []
|
80 |
+
for k, conv_dim in enumerate(conv_dims):
|
81 |
+
conv = Conv2d(
|
82 |
+
input_dim,
|
83 |
+
conv_dim,
|
84 |
+
kernel_size=3,
|
85 |
+
padding=1,
|
86 |
+
bias=not conv_norm,
|
87 |
+
norm=get_norm(conv_norm, conv_dim),
|
88 |
+
activation=nn.ReLU(),
|
89 |
+
)
|
90 |
+
input_dim = conv_dim
|
91 |
+
conv_norm_relus.append(conv)
|
92 |
+
|
93 |
+
return nn.Sequential(*conv_norm_relus)
|
94 |
+
|
95 |
+
def _add_fcs(self, input_dim, fc_dims):
|
96 |
+
fcs = []
|
97 |
+
for k, fc_dim in enumerate(fc_dims):
|
98 |
+
if k == 0:
|
99 |
+
fcs.append(nn.Flatten())
|
100 |
+
fc = nn.Linear(int(input_dim), fc_dim)
|
101 |
+
fcs.append(fc)
|
102 |
+
fcs.append(nn.ReLU())
|
103 |
+
input_dim = fc_dim
|
104 |
+
return nn.Sequential(*fcs)
|
105 |
+
|
106 |
+
def forward(self, x):
|
107 |
+
reg_feat = x
|
108 |
+
cls_feat = x
|
109 |
+
if len(self.conv_dims) > 0:
|
110 |
+
reg_feat = self.reg_conv_norm_relus(x)
|
111 |
+
cls_feat = self.cls_conv_norm_relus(x)
|
112 |
+
if len(self.fc_dims) > 0:
|
113 |
+
reg_feat = self.reg_fcs(reg_feat)
|
114 |
+
cls_feat = self.cls_fcs(cls_feat)
|
115 |
+
return reg_feat, cls_feat
|
116 |
+
|
117 |
+
@property
|
118 |
+
@torch.jit.unused
|
119 |
+
def output_shape(self):
|
120 |
+
"""
|
121 |
+
Returns:
|
122 |
+
ShapeSpec: the output feature shape
|
123 |
+
"""
|
124 |
+
o = self._output_size
|
125 |
+
if isinstance(o, int):
|
126 |
+
return ShapeSpec(channels=o)
|
127 |
+
else:
|
128 |
+
return ShapeSpec(channels=o[0], height=o[1], width=o[2])
|
129 |
+
|
130 |
+
|
131 |
+
@ROI_BOX_HEAD_REGISTRY.register()
|
132 |
+
class FastRCNNSeparateDropoutConvFCHead(FastRCNNSeparateConvFCHead):
|
133 |
+
"""Add dropout before each conv/fc layer
|
134 |
+
"""
|
135 |
+
def _add_conv_norm_relus(self, input_dim, conv_dims, conv_norm):
|
136 |
+
conv_norm_relus = []
|
137 |
+
for k, conv_dim in enumerate(conv_dims):
|
138 |
+
conv = Conv2d(
|
139 |
+
input_dim,
|
140 |
+
conv_dim,
|
141 |
+
kernel_size=3,
|
142 |
+
padding=1,
|
143 |
+
bias=not conv_norm,
|
144 |
+
norm=get_norm(conv_norm, conv_dim),
|
145 |
+
activation=nn.ReLU(),
|
146 |
+
)
|
147 |
+
input_dim = conv_dim
|
148 |
+
conv_norm_relus.append(nn.Dropout2d(p=0.5))
|
149 |
+
conv_norm_relus.append(conv)
|
150 |
+
|
151 |
+
return nn.Sequential(*conv_norm_relus)
|
152 |
+
|
153 |
+
def _add_fcs(self, input_dim, fc_dims):
|
154 |
+
fcs = []
|
155 |
+
for k, fc_dim in enumerate(fc_dims):
|
156 |
+
if k == 0:
|
157 |
+
fcs.append(nn.Flatten())
|
158 |
+
fc = nn.Linear(int(input_dim), fc_dim)
|
159 |
+
fcs.append(nn.Dropout2d(p=0.5))
|
160 |
+
fcs.append(fc)
|
161 |
+
fcs.append(nn.ReLU())
|
162 |
+
input_dim = fc_dim
|
163 |
+
return nn.Sequential(*fcs)
|
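The separate-branch head maps one pooled RoI feature to independent regression and classification features; a sketch with an illustrative 256x7x7 RoI feature and two 1024-d FC layers (typical ROI_BOX_HEAD settings, not necessarily this repo's configs):

import torch
from detectron2.layers import ShapeSpec

head = FastRCNNSeparateConvFCHead(
    ShapeSpec(channels=256, height=7, width=7),
    conv_dims=[], fc_dims=[1024, 1024],
)
reg_feat, cls_feat = head(torch.randn(4, 256, 7, 7))
print(reg_feat.shape, cls_feat.shape)  # torch.Size([4, 1024]) torch.Size([4, 1024])
print(head.output_shape)               # ShapeSpec(channels=1024, ...)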
opendet2/modeling/roi_heads/fast_rcnn.py
ADDED
@@ -0,0 +1,645 @@
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
import itertools
|
3 |
+
import logging
|
4 |
+
import math
|
5 |
+
import os
|
6 |
+
import random
|
7 |
+
from typing import Dict, List, Tuple, Union
|
8 |
+
|
9 |
+
import numpy as np
|
10 |
+
import torch
|
11 |
+
import torch.distributions as dists
|
12 |
+
from detectron2.config import configurable
|
13 |
+
from detectron2.layers import (ShapeSpec, batched_nms, cat, cross_entropy,
|
14 |
+
nonzero_tuple)
|
15 |
+
from detectron2.modeling.box_regression import Box2BoxTransform
|
16 |
+
from detectron2.modeling.roi_heads.fast_rcnn import (FastRCNNOutputLayers,
|
17 |
+
_log_classification_stats)
|
18 |
+
from detectron2.structures import Boxes, Instances, pairwise_iou
|
19 |
+
from detectron2.structures.boxes import matched_boxlist_iou
|
20 |
+
# fast_rcnn_inference)
|
21 |
+
from detectron2.utils import comm
|
22 |
+
from detectron2.utils.events import get_event_storage
|
23 |
+
from detectron2.utils.registry import Registry
|
24 |
+
from fvcore.nn import giou_loss, smooth_l1_loss
|
25 |
+
from torch import nn
|
26 |
+
from torch.nn import functional as F
|
27 |
+
|
28 |
+
from ..layers import MLP
|
29 |
+
from ..losses import ICLoss, UPLoss
+
+ROI_BOX_OUTPUT_LAYERS_REGISTRY = Registry("ROI_BOX_OUTPUT_LAYERS")
+ROI_BOX_OUTPUT_LAYERS_REGISTRY.__doc__ = """
+ROI_BOX_OUTPUT_LAYERS
+"""
+
+
+def fast_rcnn_inference(
+    boxes: List[torch.Tensor],
+    scores: List[torch.Tensor],
+    image_shapes: List[Tuple[int, int]],
+    score_thresh: float,
+    nms_thresh: float,
+    topk_per_image: int,
+    vis_iou_thr: float = 1.0,
+):
+    result_per_image = [
+        fast_rcnn_inference_single_image(
+            boxes_per_image, scores_per_image, image_shape, score_thresh, nms_thresh, topk_per_image, vis_iou_thr
+        )
+        for scores_per_image, boxes_per_image, image_shape in zip(scores, boxes, image_shapes)
+    ]
+    return [x[0] for x in result_per_image], [x[1] for x in result_per_image]
+
+
+def fast_rcnn_inference_single_image(
+    boxes,
+    scores,
+    image_shape: Tuple[int, int],
+    score_thresh: float,
+    nms_thresh: float,
+    topk_per_image: int,
+    vis_iou_thr: float,
+):
+    valid_mask = torch.isfinite(boxes).all(
+        dim=1) & torch.isfinite(scores).all(dim=1)
+    if not valid_mask.all():
+        boxes = boxes[valid_mask]
+        scores = scores[valid_mask]
+
+    scores = scores[:, :-1]
+    num_bbox_reg_classes = boxes.shape[1] // 4
+    # Convert to Boxes to use the `clip` function ...
+    boxes = Boxes(boxes.reshape(-1, 4))
+    boxes.clip(image_shape)
+    boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4)  # R x C x 4
+
+    # 1. Filter results based on detection scores. It can make NMS more efficient
+    #    by filtering out low-confidence detections.
+    filter_mask = scores > score_thresh  # R x K
+    # R' x 2. First column contains indices of the R predictions;
+    # Second column contains indices of classes.
+    filter_inds = filter_mask.nonzero()
+    if num_bbox_reg_classes == 1:
+        boxes = boxes[filter_inds[:, 0], 0]
+    else:
+        boxes = boxes[filter_mask]
+    scores = scores[filter_mask]
+
+    # 2. Apply NMS for each class independently.
+    keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh)
+    if topk_per_image >= 0:
+        keep = keep[:topk_per_image]
+    boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep]
+
+    # apply nms between known classes and unknown class for visualization.
+    if vis_iou_thr < 1.0:
+        boxes, scores, filter_inds = unknown_aware_nms(
+            boxes, scores, filter_inds, iou_thr=vis_iou_thr)
+
+    result = Instances(image_shape)
+    result.pred_boxes = Boxes(boxes)
+    result.scores = scores
+    result.pred_classes = filter_inds[:, 1]
+    return result, filter_inds[:, 0]
+
+
+def unknown_aware_nms(boxes, scores, labels, ukn_class_id=80, iou_thr=0.9):
+    u_inds = labels[:, 1] == ukn_class_id
+    k_inds = ~u_inds
+    if k_inds.sum() == 0 or u_inds.sum() == 0:
+        return boxes, scores, labels
+
+    k_boxes, k_scores, k_labels = boxes[k_inds], scores[k_inds], labels[k_inds]
+    u_boxes, u_scores, u_labels = boxes[u_inds], scores[u_inds], labels[u_inds]
+
+    ious = pairwise_iou(Boxes(k_boxes), Boxes(u_boxes))
+    mask = torch.ones((ious.size(0), ious.size(1), 2), device=ious.device)
+    inds = (ious > iou_thr).nonzero()
+    if not inds.numel():
+        return boxes, scores, labels
+
+    for [ind_x, ind_y] in inds:
+        if k_scores[ind_x] >= u_scores[ind_y]:
+            mask[ind_x, ind_y, 1] = 0
+        else:
+            mask[ind_x, ind_y, 0] = 0
+
+    k_inds = mask[..., 0].mean(dim=1) == 1
+    u_inds = mask[..., 1].mean(dim=0) == 1
+
+    k_boxes, k_scores, k_labels = k_boxes[k_inds], k_scores[k_inds], k_labels[k_inds]
+    u_boxes, u_scores, u_labels = u_boxes[u_inds], u_scores[u_inds], u_labels[u_inds]
+
+    boxes = torch.cat([k_boxes, u_boxes])
+    scores = torch.cat([k_scores, u_scores])
+    labels = torch.cat([k_labels, u_labels])
+
+    return boxes, scores, labels
+
+
+logger = logging.getLogger(__name__)
+
+
+def build_roi_box_output_layers(cfg, input_shape):
+    """
+    Build the box output layers defined by `cfg.MODEL.ROI_BOX_HEAD.OUTPUT_LAYERS`.
+    """
+    name = cfg.MODEL.ROI_BOX_HEAD.OUTPUT_LAYERS
+    return ROI_BOX_OUTPUT_LAYERS_REGISTRY.get(name)(cfg, input_shape)
+
+
+@ROI_BOX_OUTPUT_LAYERS_REGISTRY.register()
+class CosineFastRCNNOutputLayers(FastRCNNOutputLayers):
+
+    @configurable
+    def __init__(
+        self,
+        *args,
+        scale: int = 20,
+        vis_iou_thr: float = 1.0,
+        **kargs,
+    ):
+        super().__init__(*args, **kargs)
+        # prediction layer for num_classes foreground classes and one background class (hence + 1)
+        self.cls_score = nn.Linear(
+            self.cls_score.in_features, self.num_classes + 1, bias=False)
+        nn.init.normal_(self.cls_score.weight, std=0.01)
+        # scaling factor
+        self.scale = scale
+        self.vis_iou_thr = vis_iou_thr
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        ret = super().from_config(cfg, input_shape)
+        ret['scale'] = cfg.MODEL.ROI_HEADS.COSINE_SCALE
+        ret['vis_iou_thr'] = cfg.MODEL.ROI_HEADS.VIS_IOU_THRESH
+        return ret
+
+    def forward(self, feats):
+        # support shared & separate head
+        if isinstance(feats, tuple):
+            reg_x, cls_x = feats
+        else:
+            reg_x = cls_x = feats
+
+        if reg_x.dim() > 2:
+            reg_x = torch.flatten(reg_x, start_dim=1)
+            cls_x = torch.flatten(cls_x, start_dim=1)
+
+        x_norm = torch.norm(cls_x, p=2, dim=1).unsqueeze(1).expand_as(cls_x)
+        x_normalized = cls_x.div(x_norm + 1e-5)
+
+        # normalize weight
+        temp_norm = (
+            torch.norm(self.cls_score.weight.data, p=2, dim=1)
+            .unsqueeze(1)
+            .expand_as(self.cls_score.weight.data)
+        )
+        self.cls_score.weight.data = self.cls_score.weight.data.div(
+            temp_norm + 1e-5
+        )
+        cos_dist = self.cls_score(x_normalized)
+        scores = self.scale * cos_dist
+        proposal_deltas = self.bbox_pred(reg_x)
+
+        return scores, proposal_deltas
+
+    def inference(self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]):
+        boxes = self.predict_boxes(predictions, proposals)
+        scores = self.predict_probs(predictions, proposals)
+        image_shapes = [x.image_size for x in proposals]
+        return fast_rcnn_inference(
+            boxes,
+            scores,
+            image_shapes,
+            self.test_score_thresh,
+            self.test_nms_thresh,
+            self.test_topk_per_image,
+            self.vis_iou_thr,
+        )
+
+    def predict_boxes(
+        self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]
+    ):
+        if not len(proposals):
+            return []
+        proposal_deltas = predictions[1]
+        num_prop_per_image = [len(p) for p in proposals]
+        proposal_boxes = cat(
+            [p.proposal_boxes.tensor for p in proposals], dim=0)
+        predict_boxes = self.box2box_transform.apply_deltas(
+            proposal_deltas,
+            proposal_boxes,
+        )  # Nx(KxB)
+        return predict_boxes.split(num_prop_per_image)
+
+    def predict_probs(
+        self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]
+    ):
+        scores = predictions[0]
+        num_inst_per_image = [len(p) for p in proposals]
+        probs = F.softmax(scores, dim=-1)
+        return probs.split(num_inst_per_image, dim=0)
+
+
+@ROI_BOX_OUTPUT_LAYERS_REGISTRY.register()
+class OpenDetFastRCNNOutputLayers(CosineFastRCNNOutputLayers):
+    @configurable
+    def __init__(
+        self,
+        *args,
+        num_known_classes,
+        max_iters,
+        up_loss_start_iter,
+        up_loss_sampling_metric,
+        up_loss_topk,
+        up_loss_alpha,
+        up_loss_weight,
+        ic_loss_out_dim,
+        ic_loss_queue_size,
+        ic_loss_in_queue_size,
+        ic_loss_batch_iou_thr,
+        ic_loss_queue_iou_thr,
+        ic_loss_queue_tau,
+        ic_loss_weight,
+        **kargs
+    ):
+        super().__init__(*args, **kargs)
+        self.num_known_classes = num_known_classes
+        self.max_iters = max_iters
+
+        self.up_loss = UPLoss(
+            self.num_classes,
+            sampling_metric=up_loss_sampling_metric,
+            topk=up_loss_topk,
+            alpha=up_loss_alpha
+        )
+        self.up_loss_start_iter = up_loss_start_iter
+        self.up_loss_weight = up_loss_weight
+
+        self.encoder = MLP(self.cls_score.in_features, ic_loss_out_dim)
+        self.ic_loss_loss = ICLoss(tau=ic_loss_queue_tau)
+        self.ic_loss_out_dim = ic_loss_out_dim
+        self.ic_loss_queue_size = ic_loss_queue_size
+        self.ic_loss_in_queue_size = ic_loss_in_queue_size
+        self.ic_loss_batch_iou_thr = ic_loss_batch_iou_thr
+        self.ic_loss_queue_iou_thr = ic_loss_queue_iou_thr
+        self.ic_loss_weight = ic_loss_weight
+
+        self.register_buffer('queue', torch.zeros(
+            self.num_known_classes, ic_loss_queue_size, ic_loss_out_dim))
+        self.register_buffer('queue_label', torch.empty(
+            self.num_known_classes, ic_loss_queue_size).fill_(-1).long())
+        self.register_buffer('queue_ptr', torch.zeros(
+            self.num_known_classes, dtype=torch.long))
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        ret = super().from_config(cfg, input_shape)
+        ret.update({
+            'num_known_classes': cfg.MODEL.ROI_HEADS.NUM_KNOWN_CLASSES,
+            "max_iters": cfg.SOLVER.MAX_ITER,
+
+            "up_loss_start_iter": cfg.UPLOSS.START_ITER,
+            "up_loss_sampling_metric": cfg.UPLOSS.SAMPLING_METRIC,
+            "up_loss_topk": cfg.UPLOSS.TOPK,
+            "up_loss_alpha": cfg.UPLOSS.ALPHA,
+            "up_loss_weight": cfg.UPLOSS.WEIGHT,
+
+            "ic_loss_out_dim": cfg.ICLOSS.OUT_DIM,
+            "ic_loss_queue_size": cfg.ICLOSS.QUEUE_SIZE,
+            "ic_loss_in_queue_size": cfg.ICLOSS.IN_QUEUE_SIZE,
+            "ic_loss_batch_iou_thr": cfg.ICLOSS.BATCH_IOU_THRESH,
+            "ic_loss_queue_iou_thr": cfg.ICLOSS.QUEUE_IOU_THRESH,
+            "ic_loss_queue_tau": cfg.ICLOSS.TEMPERATURE,
+            "ic_loss_weight": cfg.ICLOSS.WEIGHT,
+        })
+        return ret
+
+    def forward(self, feats):
+        # support shared & separate head
+        if isinstance(feats, tuple):
+            reg_x, cls_x = feats
+        else:
+            reg_x = cls_x = feats
+
+        if reg_x.dim() > 2:
+            reg_x = torch.flatten(reg_x, start_dim=1)
+            cls_x = torch.flatten(cls_x, start_dim=1)
+
+        x_norm = torch.norm(cls_x, p=2, dim=1).unsqueeze(1).expand_as(cls_x)
+        x_normalized = cls_x.div(x_norm + 1e-5)
+
+        # normalize weight
+        temp_norm = (
+            torch.norm(self.cls_score.weight.data, p=2, dim=1)
+            .unsqueeze(1)
+            .expand_as(self.cls_score.weight.data)
+        )
+        self.cls_score.weight.data = self.cls_score.weight.data.div(
+            temp_norm + 1e-5
+        )
+        cos_dist = self.cls_score(x_normalized)
+        scores = self.scale * cos_dist
+        proposal_deltas = self.bbox_pred(reg_x)
+
+        # encode feature with MLP
+        mlp_feat = self.encoder(cls_x)
+
+        return scores, proposal_deltas, mlp_feat
+
+    def get_up_loss(self, scores, gt_classes):
+        # start up loss after several warmup iters
+        storage = get_event_storage()
+        if storage.iter > self.up_loss_start_iter:
+            loss_cls_up = self.up_loss(scores, gt_classes)
+        else:
+            loss_cls_up = scores.new_tensor(0.0)
+
+        return {"loss_cls_up": self.up_loss_weight * loss_cls_up}
+
+    def get_ic_loss(self, feat, gt_classes, ious):
+        # select foreground and iou > thr instance in a mini-batch
+        pos_inds = (ious > self.ic_loss_batch_iou_thr) & (
+            gt_classes != self.num_classes)
+        feat, gt_classes = feat[pos_inds], gt_classes[pos_inds]
+
+        queue = self.queue.reshape(-1, self.ic_loss_out_dim)
+        queue_label = self.queue_label.reshape(-1)
+        queue_inds = queue_label != -1  # filter empty queue
+        queue, queue_label = queue[queue_inds], queue_label[queue_inds]
+
+        loss_ic_loss = self.ic_loss_loss(feat, gt_classes, queue, queue_label)
+        # loss decay
+        storage = get_event_storage()
+        decay_weight = 1.0 - storage.iter / self.max_iters
+        return {"loss_cls_ic": self.ic_loss_weight * decay_weight * loss_ic_loss}
+
+    @torch.no_grad()
+    def _dequeue_and_enqueue(self, feat, gt_classes, ious, iou_thr=0.7):
+        # 1. gather variable
+        feat = self.concat_all_gather(feat)
+        gt_classes = self.concat_all_gather(gt_classes)
+        ious = self.concat_all_gather(ious)
+        # 2. filter by iou and obj, remove bg
+        keep = (ious > iou_thr) & (gt_classes != self.num_classes)
+        feat, gt_classes = feat[keep], gt_classes[keep]
+
+        for i in range(self.num_known_classes):
+            ptr = int(self.queue_ptr[i])
+            cls_ind = gt_classes == i
+            cls_feat, cls_gt_classes = feat[cls_ind], gt_classes[cls_ind]
+            # 3. sort by similarity, low sim ranks first
+            cls_queue = self.queue[i, self.queue_label[i] != -1]
+            _, sim_inds = F.cosine_similarity(
+                cls_feat[:, None], cls_queue[None, :], dim=-1).mean(dim=1).sort()
+            top_sim_inds = sim_inds[:self.ic_loss_in_queue_size]
+            cls_feat, cls_gt_classes = cls_feat[top_sim_inds], cls_gt_classes[top_sim_inds]
+            # 4. in queue
+            batch_size = cls_feat.size(
+                0) if ptr + cls_feat.size(0) <= self.ic_loss_queue_size else self.ic_loss_queue_size - ptr
+            self.queue[i, ptr:ptr+batch_size] = cls_feat[:batch_size]
+            self.queue_label[i, ptr:ptr + batch_size] = cls_gt_classes[:batch_size]
+
+            ptr = ptr + batch_size if ptr + batch_size < self.ic_loss_queue_size else 0
+            self.queue_ptr[i] = ptr
+
+    @torch.no_grad()
+    def concat_all_gather(self, tensor):
+        world_size = comm.get_world_size()
+        # single GPU, directly return the tensor
+        if world_size == 1:
+            return tensor
+        # multiple GPUs, gather tensors
+        tensors_gather = [torch.ones_like(tensor) for _ in range(world_size)]
+        torch.distributed.all_gather(tensors_gather, tensor, async_op=False)
+        output = torch.cat(tensors_gather, dim=0)
+        return output
+
+    def losses(self, predictions, proposals, input_features=None):
+        """
+        Args:
+            predictions: return values of :meth:`forward()`.
+            proposals (list[Instances]): proposals that match the features that were used
+                to compute predictions. The fields ``proposal_boxes``, ``gt_boxes``,
+                ``gt_classes`` are expected.
+
+        Returns:
+            Dict[str, Tensor]: dict of losses
+        """
+        scores, proposal_deltas, mlp_feat = predictions
+        # parse classification outputs
+        gt_classes = (
+            cat([p.gt_classes for p in proposals], dim=0) if len(
+                proposals) else torch.empty(0)
+        )
+        _log_classification_stats(scores, gt_classes)
+
+        # parse box regression outputs
+        if len(proposals):
+            proposal_boxes = cat(
+                [p.proposal_boxes.tensor for p in proposals], dim=0)  # Nx4
+            assert not proposal_boxes.requires_grad, "Proposals should not require gradients!"
+            # If "gt_boxes" does not exist, the proposals must be all negative and
+            # should not be included in regression loss computation.
+            # Here we just use proposal_boxes as an arbitrary placeholder because its
+            # value won't be used in self.box_reg_loss().
+            gt_boxes = cat(
+                [(p.gt_boxes if p.has("gt_boxes")
+                  else p.proposal_boxes).tensor for p in proposals],
+                dim=0,
+            )
+        else:
+            proposal_boxes = gt_boxes = torch.empty(
+                (0, 4), device=proposal_deltas.device)
+
+        losses = {
+            "loss_cls_ce": cross_entropy(scores, gt_classes, reduction="mean"),
+            "loss_box_reg": self.box_reg_loss(
+                proposal_boxes, gt_boxes, proposal_deltas, gt_classes
+            ),
+        }
+
+        # up loss
+        losses.update(self.get_up_loss(scores, gt_classes))
+
+        ious = cat([p.iou for p in proposals], dim=0)
+        # we first store feats in the queue, then compute the loss
+        self._dequeue_and_enqueue(
+            mlp_feat, gt_classes, ious, iou_thr=self.ic_loss_queue_iou_thr)
+        losses.update(self.get_ic_loss(mlp_feat, gt_classes, ious))
+
+        return {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}
+
+
+@ROI_BOX_OUTPUT_LAYERS_REGISTRY.register()
+class PROSERFastRCNNOutputLayers(CosineFastRCNNOutputLayers):
+    """PROSER
+    """
+    @configurable
+    def __init__(self, *args, **kargs):
+        super().__init__(*args, **kargs)
+        self.proser_weight = 0.1
+
+    def get_proser_loss(self, scores, gt_classes):
+        num_sample, num_classes = scores.shape
+        mask = torch.arange(num_classes).repeat(
+            num_sample, 1).to(scores.device)
+        inds = mask != gt_classes[:, None].repeat(1, num_classes)
+        mask = mask[inds].reshape(num_sample, num_classes-1)
+        mask_scores = torch.gather(scores, 1, mask)
+
+        targets = torch.zeros_like(gt_classes)
+        fg_inds = gt_classes != self.num_classes
+        targets[fg_inds] = self.num_classes-2
+        targets[~fg_inds] = self.num_classes-1
+
+        loss_cls_proser = cross_entropy(mask_scores, targets)
+        return {"loss_cls_proser": self.proser_weight * loss_cls_proser}
+
+    def losses(self, predictions, proposals, input_features=None):
+        """
+        Args:
+            predictions: return values of :meth:`forward()`.
+            proposals (list[Instances]): proposals that match the features that were used
+                to compute predictions. The fields ``proposal_boxes``, ``gt_boxes``,
+                ``gt_classes`` are expected.
+
+        Returns:
+            Dict[str, Tensor]: dict of losses
+        """
+        scores, proposal_deltas = predictions
+        # parse classification outputs
+        gt_classes = (
+            cat([p.gt_classes for p in proposals], dim=0) if len(
+                proposals) else torch.empty(0)
+        )
+        _log_classification_stats(scores, gt_classes)
+
+        # parse box regression outputs
+        if len(proposals):
+            proposal_boxes = cat(
+                [p.proposal_boxes.tensor for p in proposals], dim=0)  # Nx4
+            assert not proposal_boxes.requires_grad, "Proposals should not require gradients!"
+            # If "gt_boxes" does not exist, the proposals must be all negative and
+            # should not be included in regression loss computation.
+            # Here we just use proposal_boxes as an arbitrary placeholder because its
+            # value won't be used in self.box_reg_loss().
+            gt_boxes = cat(
+                [(p.gt_boxes if p.has("gt_boxes")
+                  else p.proposal_boxes).tensor for p in proposals],
+                dim=0,
+            )
+        else:
+            proposal_boxes = gt_boxes = torch.empty(
+                (0, 4), device=proposal_deltas.device)
+
+        losses = {
+            "loss_cls_ce": cross_entropy(scores, gt_classes, reduction="mean"),
+            "loss_box_reg": self.box_reg_loss(
+                proposal_boxes, gt_boxes, proposal_deltas, gt_classes
+            ),
+        }
+        losses.update(self.get_proser_loss(scores, gt_classes))
+
+        return {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}
+
+
+@ROI_BOX_OUTPUT_LAYERS_REGISTRY.register()
+class DropoutFastRCNNOutputLayers(CosineFastRCNNOutputLayers):
+
+    @configurable
+    def __init__(self, *args, **kargs):
+        super().__init__(*args, **kargs)
+        self.dropout = nn.Dropout(p=0.5)
+        self.entropy_thr = 0.25
+
+    def forward(self, feats, testing=False):
+        # support shared & separate head
+        if isinstance(feats, tuple):
+            reg_x, cls_x = feats
+        else:
+            reg_x = cls_x = feats
+
+        if reg_x.dim() > 2:
+            reg_x = torch.flatten(reg_x, start_dim=1)
+            cls_x = torch.flatten(cls_x, start_dim=1)
+
+        x_norm = torch.norm(cls_x, p=2, dim=1).unsqueeze(1).expand_as(cls_x)
+        x_normalized = cls_x.div(x_norm + 1e-5)
+
+        # normalize weight
+        temp_norm = (
+            torch.norm(self.cls_score.weight.data, p=2, dim=1)
+            .unsqueeze(1)
+            .expand_as(self.cls_score.weight.data)
+        )
+        self.cls_score.weight.data = self.cls_score.weight.data.div(
+            temp_norm + 1e-5
+        )
+        if testing:
+            self.dropout.train()
+            x_normalized = self.dropout(x_normalized)
+        cos_dist = self.cls_score(x_normalized)
+        scores = self.scale * cos_dist
+        proposal_deltas = self.bbox_pred(reg_x)
+
+        return scores, proposal_deltas
+
+    def inference(self, predictions: List[Tuple[torch.Tensor, torch.Tensor]], proposals: List[Instances]):
+        """
+        Args:
+            predictions: return values of :meth:`forward()`.
+            proposals (list[Instances]): proposals that match the features that were
+                used to compute predictions. The ``proposal_boxes`` field is expected.
+
+        Returns:
+            list[Instances]: same as `fast_rcnn_inference`.
+            list[Tensor]: same as `fast_rcnn_inference`.
+        """
+        boxes = self.predict_boxes(predictions[0], proposals)
+        scores = self.predict_probs(predictions, proposals)
+        image_shapes = [x.image_size for x in proposals]
+        return fast_rcnn_inference(
+            boxes,
+            scores,
+            image_shapes,
+            self.test_score_thresh,
+            self.test_nms_thresh,
+            self.test_topk_per_image,
+        )
+
+    def predict_probs(
+        self, predictions: List[Tuple[torch.Tensor, torch.Tensor]], proposals: List[Instances]
+    ):
+        """
+        Args:
+            predictions: return values of :meth:`forward()`.
+            proposals (list[Instances]): proposals that match the features that were
+                used to compute predictions.
+
+        Returns:
+            list[Tensor]:
+                A list of Tensors of predicted class probabilities for each image.
+                Element i has shape (Ri, K + 1), where Ri is the number of proposals for image i.
+        """
+        # mean of multiple observations
+        scores = torch.stack([pred[0] for pred in predictions], dim=-1)
+        scores = scores.mean(dim=-1)
+        # threshold by entropy
+        norm_entropy = dists.Categorical(scores.softmax(
+            dim=1)).entropy() / np.log(self.num_classes)
+        inds = norm_entropy > self.entropy_thr
+        max_scores = scores.max(dim=1)[0]
+        # treat those with high entropy as unknown objects
+        scores[inds, :] = 0.0
+        scores[inds, self.num_classes-1] = max_scores[inds]
+
+        num_inst_per_image = [len(p) for p in proposals]
+        probs = F.softmax(scores, dim=-1)
+        return probs.split(num_inst_per_image, dim=0)
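
A quick toy check of `unknown_aware_nms` above (illustrative only, not part of the commit; the boxes, scores, and the explicit `iou_thr=0.5` are made-up values):

import torch
from opendet2.modeling.roi_heads.fast_rcnn import unknown_aware_nms

# two heavily overlapping detections: one known class (id 0), one unknown (id 80)
boxes = torch.tensor([[0., 0., 10., 10.], [1., 1., 10., 10.]])
scores = torch.tensor([0.9, 0.6])
labels = torch.tensor([[0, 0], [1, 80]])  # (prediction index, class id)
kept_boxes, kept_scores, kept_labels = unknown_aware_nms(
    boxes, scores, labels, iou_thr=0.5)
# the lower-scoring unknown box is suppressed; only the known box (score 0.9) survives
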
opendet2/modeling/roi_heads/roi_heads.py
ADDED
@@ -0,0 +1,150 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import logging
+from typing import Dict, List
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from detectron2.config import configurable
+from detectron2.layers import ShapeSpec
+from detectron2.modeling.poolers import ROIPooler
+from detectron2.modeling.roi_heads.box_head import build_box_head
+from detectron2.modeling.roi_heads.roi_heads import (
+    ROI_HEADS_REGISTRY, StandardROIHeads, add_ground_truth_to_proposals)
+from detectron2.structures import Boxes, Instances, pairwise_iou
+from detectron2.utils.events import get_event_storage
+from detectron2.utils.registry import Registry
+from torch import nn
+
+from .fast_rcnn import build_roi_box_output_layers
+
+logger = logging.getLogger(__name__)
+
+
+@ROI_HEADS_REGISTRY.register()
+class OpenSetStandardROIHeads(StandardROIHeads):
+
+    @torch.no_grad()
+    def label_and_sample_proposals(self, proposals: List[Instances], targets: List[Instances]) -> List[Instances]:
+        if self.proposal_append_gt:
+            proposals = add_ground_truth_to_proposals(targets, proposals)
+
+        proposals_with_gt = []
+
+        num_fg_samples = []
+        num_bg_samples = []
+        for proposals_per_image, targets_per_image in zip(proposals, targets):
+            has_gt = len(targets_per_image) > 0
+            match_quality_matrix = pairwise_iou(
+                targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
+            )
+            matched_idxs, matched_labels = self.proposal_matcher(
+                match_quality_matrix)
+            sampled_idxs, gt_classes = self._sample_proposals(
+                matched_idxs, matched_labels, targets_per_image.gt_classes
+            )
+
+            # Set target attributes of the sampled proposals:
+            proposals_per_image = proposals_per_image[sampled_idxs]
+            proposals_per_image.gt_classes = gt_classes
+            # NOTE: add iou of each proposal
+            ious, _ = match_quality_matrix.max(dim=0)
+            proposals_per_image.iou = ious[sampled_idxs]
+
+            if has_gt:
+                sampled_targets = matched_idxs[sampled_idxs]
+                for (trg_name, trg_value) in targets_per_image.get_fields().items():
+                    if trg_name.startswith("gt_") and not proposals_per_image.has(trg_name):
+                        proposals_per_image.set(
+                            trg_name, trg_value[sampled_targets])
+
+            num_bg_samples.append(
+                (gt_classes == self.num_classes).sum().item())
+            num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
+            proposals_with_gt.append(proposals_per_image)
+
+        # Log the number of fg/bg samples that are selected for training ROI heads
+        storage = get_event_storage()
+        storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples))
+        storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples))
+
+        return proposals_with_gt
+
+    @classmethod
+    def _init_box_head(cls, cfg, input_shape):
+        # fmt: off
+        in_features       = cfg.MODEL.ROI_HEADS.IN_FEATURES
+        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
+        pooler_scales     = tuple(1.0 / input_shape[k].stride for k in in_features)
+        sampling_ratio    = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
+        pooler_type       = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
+        # fmt: on
+
+        # If StandardROIHeads is applied on multiple feature maps (as in FPN),
+        # then we share the same predictors and therefore the channel counts must be the same
+        in_channels = [input_shape[f].channels for f in in_features]
+        # Check all channel counts are equal
+        assert len(set(in_channels)) == 1, in_channels
+        in_channels = in_channels[0]
+
+        box_pooler = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type=pooler_type,
+        )
+
+        box_head = build_box_head(
+            cfg, ShapeSpec(channels=in_channels,
+                           height=pooler_resolution, width=pooler_resolution)
+        )
+        # register output layers
+        box_predictor = build_roi_box_output_layers(cfg, box_head.output_shape)
+        return {
+            "box_in_features": in_features,
+            "box_pooler": box_pooler,
+            "box_head": box_head,
+            "box_predictor": box_predictor,
+        }
+
+
+@ROI_HEADS_REGISTRY.register()
+class DropoutStandardROIHeads(OpenSetStandardROIHeads):
+    @configurable
+    def __init__(self, *args, **kwargs,):
+        super().__init__(*args, **kwargs)
+        # num of sampling
+        self.num_sample = 30
+
+    def _forward_box(self, features: Dict[str, torch.Tensor], proposals: List[Instances], targets=None):
+        features = [features[f] for f in self.box_in_features]
+        box_features = self.box_pooler(
+            features, [x.proposal_boxes for x in proposals])
+        box_features = self.box_head(box_features)
+
+        # if testing, we run multiple inference for dropout sampling
+        if self.training:
+            predictions = self.box_predictor(box_features)
+        else:
+            predictions = [self.box_predictor(
+                box_features, testing=True) for _ in range(self.num_sample)]
+
+        del box_features
+
+        if self.training:
+            losses = self.box_predictor.losses(predictions, proposals)
+            # proposals is modified in-place below, so losses must be computed first.
+            if self.train_on_pred_boxes:
+                with torch.no_grad():
+                    pred_boxes = self.box_predictor.predict_boxes_for_gt_classes(
+                        predictions, proposals
+                    )
+                    for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes):
+                        proposals_per_image.proposal_boxes = Boxes(
+                            pred_boxes_per_image)
+            return losses
+        else:
+            pred_instances, _ = self.box_predictor.inference(
+                predictions, proposals)
+            return pred_instances
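
For reference, a minimal sketch of how these registered heads are selected purely through the config (not part of the commit; it assumes `add_opendet_config` is what registers the OpenDet2-specific keys such as `MODEL.ROI_BOX_HEAD.OUTPUT_LAYERS`):

from detectron2.config import get_cfg
from opendet2 import add_opendet_config

cfg = get_cfg()
add_opendet_config(cfg)
# pick the open-set ROI heads registered above ...
cfg.MODEL.ROI_HEADS.NAME = "OpenSetStandardROIHeads"
# ... and the box output layers registered in fast_rcnn.py
cfg.MODEL.ROI_BOX_HEAD.OUTPUT_LAYERS = "OpenDetFastRCNNOutputLayers"
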
opendet2/solver/__init__.py
ADDED
@@ -0,0 +1,3 @@
+from .build import *
+
+__all__ = list(globals().keys())
opendet2/solver/build.py
ADDED
@@ -0,0 +1,57 @@
+from typing import Any, Dict, List, Set
+
+import torch
+from detectron2.config import CfgNode
+from detectron2.solver.build import maybe_add_gradient_clipping
+
+
+def build_optimizer(cfg: CfgNode, model: torch.nn.Module) -> torch.optim.Optimizer:
+    """
+    Build an optimizer from config.
+    """
+    norm_module_types = (
+        torch.nn.BatchNorm1d,
+        torch.nn.BatchNorm2d,
+        torch.nn.BatchNorm3d,
+        torch.nn.SyncBatchNorm,
+        # NaiveSyncBatchNorm inherits from BatchNorm2d
+        torch.nn.GroupNorm,
+        torch.nn.InstanceNorm1d,
+        torch.nn.InstanceNorm2d,
+        torch.nn.InstanceNorm3d,
+        torch.nn.LayerNorm,
+        torch.nn.LocalResponseNorm,
+    )
+    params: List[Dict[str, Any]] = []
+    memo: Set[torch.nn.parameter.Parameter] = set()
+    for module in model.modules():
+        for key, value in module.named_parameters(recurse=False):
+            if not value.requires_grad:
+                continue
+            # Avoid duplicating parameters
+            if value in memo:
+                continue
+            memo.add(value)
+            lr = cfg.SOLVER.BASE_LR
+            weight_decay = cfg.SOLVER.WEIGHT_DECAY
+            if isinstance(module, norm_module_types):
+                weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM
+            elif key == "bias":
+                # NOTE: unlike Detectron v1, we now default BIAS_LR_FACTOR to 1.0
+                # and WEIGHT_DECAY_BIAS to WEIGHT_DECAY so that bias optimizer
+                # hyperparameters are by default exactly the same as for regular
+                # weights.
+                lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR
+                weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
+            params += [{"params": [value], "lr": lr,
+                        "weight_decay": weight_decay}]
+
+    # To support AdamW for swin_transformer
+    if cfg.SOLVER.OPTIMIZER == "ADAMW":
+        optimizer = torch.optim.AdamW(
+            params, lr=cfg.SOLVER.BASE_LR, betas=cfg.SOLVER.BETAS, weight_decay=cfg.SOLVER.WEIGHT_DECAY)
+    else:
+        optimizer = torch.optim.SGD(
+            params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM)
+    optimizer = maybe_add_gradient_clipping(cfg, optimizer)
+    return optimizer
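
A minimal usage sketch for `build_optimizer` (illustrative, not part of the commit; `SOLVER.OPTIMIZER` and `SOLVER.BETAS` are assumed to be OpenDet2-specific keys normally added by the project config, so they are set by hand here):

import torch
from detectron2.config import get_cfg
from opendet2.solver.build import build_optimizer

cfg = get_cfg()
cfg.SOLVER.OPTIMIZER = "ADAMW"      # assumed OpenDet2 key
cfg.SOLVER.BETAS = (0.9, 0.999)     # assumed OpenDet2 key
model = torch.nn.Sequential(
    torch.nn.Conv2d(3, 8, 3, bias=False),  # plain weights -> SOLVER.WEIGHT_DECAY
    torch.nn.BatchNorm2d(8),               # norm params  -> SOLVER.WEIGHT_DECAY_NORM
)
optimizer = build_optimizer(cfg, model)    # AdamW here, SGD otherwise
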
setup.py
ADDED
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+from setuptools import setup
+
+setup(
+    name="opendet2",
+    version=0.1,
+    author="csuhan",
+    url="https://github.com/csuhan/opendet2",
+    description="Codebase for open set object detection",
+    python_requires=">=3.6",
+    install_requires=[
+        'timm', 'opencv-python'
+    ],
+)
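
Usage note (not part of the commit): the package is installed locally with `pip install -e .` from the repository root; detectron2 itself is not listed in `install_requires`, so it has to be installed separately beforehand.
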
tools/convert_swin_to_d2.py
ADDED
@@ -0,0 +1,36 @@
+import argparse
+import os
+
+import torch
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("Convert Swin Transformer to Detectron2")
+
+    parser.add_argument("source_model", default="", type=str,
+                        help="Source model")
+    parser.add_argument("output_model", default="", type=str,
+                        help="Output model")
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    if os.path.splitext(args.source_model)[-1] != ".pth":
+        raise ValueError("You should save weights as pth file")
+
+    source_weights = torch.load(
+        args.source_model, map_location=torch.device('cpu'))["model"]
+    converted_weights = {}
+    keys = list(source_weights.keys())
+
+    prefix = 'backbone.bottom_up.'
+    for key in keys:
+        converted_weights[prefix+key] = source_weights[key]
+
+    torch.save(converted_weights, args.output_model)
+
+
+if __name__ == "__main__":
+    main()
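
Hypothetical invocation (file names are placeholders, not part of the commit): `python tools/convert_swin_to_d2.py swin_checkpoint.pth swin_d2.pth`. The script simply prefixes every key of the source state dict with `backbone.bottom_up.` so the checkpoint matches the Detectron2 FPN naming used by this codebase.
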
tools/train_net.py
ADDED
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates.
+import os
+
+import detectron2.utils.comm as comm
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.engine import (default_argument_parser, default_setup, hooks,
+                               launch)
+from detectron2.evaluation import verify_results
+from detectron2.utils.logger import setup_logger
+from opendet2 import OpenDetTrainer, add_opendet_config, builtin
+
+
+def setup(args):
+    """
+    Create configs and perform basic setups.
+    """
+    cfg = get_cfg()
+    # add opendet config
+    add_opendet_config(cfg)
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    # Note: we use the key ROI_HEADS.NUM_KNOWN_CLASSES
+    # for open-set data processing and evaluation.
+    if 'RetinaNet' in cfg.MODEL.META_ARCHITECTURE:
+        cfg.MODEL.ROI_HEADS.NUM_KNOWN_CLASSES = cfg.MODEL.RETINANET.NUM_KNOWN_CLASSES
+    # add output dir if it does not exist
+    if cfg.OUTPUT_DIR == "./output":
+        config_name = os.path.basename(args.config_file).split(".yaml")[0]
+        cfg.OUTPUT_DIR = os.path.join(cfg.OUTPUT_DIR, config_name)
+    cfg.freeze()
+    default_setup(cfg, args)
+    setup_logger(output=cfg.OUTPUT_DIR,
+                 distributed_rank=comm.get_rank(), name="opendet2")
+    return cfg
+
+
+def main(args):
+    cfg = setup(args)
+
+    if args.eval_only:
+        model = OpenDetTrainer.build_model(cfg)
+        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+            cfg.MODEL.WEIGHTS, resume=args.resume
+        )
+        res = OpenDetTrainer.test(cfg, model)
+        if cfg.TEST.AUG.ENABLED:
+            res.update(OpenDetTrainer.test_with_TTA(cfg, model))
+        if comm.is_main_process():
+            verify_results(cfg, res)
+        return res
+
+    """
+    If you'd like to do anything fancier than the standard training logic,
+    consider writing your own training loop (see plain_train_net.py) or
+    subclassing the trainer.
+    """
+    trainer = OpenDetTrainer(cfg)
+    trainer.resume_or_load(resume=args.resume)
+    if cfg.TEST.AUG.ENABLED:
+        trainer.register_hooks(
+            [hooks.EvalHook(
+                0, lambda: trainer.test_with_TTA(cfg, trainer.model))]
+        )
+    return trainer.train()
+
+
+if __name__ == "__main__":
+    args = default_argument_parser().parse_args()
+    print("Command Line Args:", args)
+    launch(
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(args,),
+    )
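
Typical launch (illustrative; the config path is a placeholder): `python tools/train_net.py --num-gpus 8 --config-file configs/<your_config>.yaml`. Evaluation-only runs add `--eval-only MODEL.WEIGHTS <path/to/model.pth>`; these flags come from detectron2's `default_argument_parser`, which `tools/train_net.py` reuses unchanged.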