diff --git a/groundingdino/.ipynb_checkpoints/__init__-checkpoint.py b/groundingdino/.ipynb_checkpoints/__init__-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/groundingdino/_C.cpython-310-x86_64-linux-gnu.so b/groundingdino/_C.cpython-310-x86_64-linux-gnu.so new file mode 100755 index 0000000000000000000000000000000000000000..12c9c7bceff09304006965fe4a2c8c73ad465685 --- /dev/null +++ b/groundingdino/_C.cpython-310-x86_64-linux-gnu.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee590993174e3c9b9b4110d8c90aee2873edec5a61da94c577d381b320791520 +size 9940696 diff --git a/groundingdino/__init__.py b/groundingdino/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/groundingdino/__pycache__/__init__.cpython-310.pyc b/groundingdino/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f099950a89c0fb7f204d20b5887d9e36f78f3f1 Binary files /dev/null and b/groundingdino/__pycache__/__init__.cpython-310.pyc differ diff --git a/groundingdino/config/.ipynb_checkpoints/GroundingDINO_SwinB_cfg-checkpoint.py b/groundingdino/config/.ipynb_checkpoints/GroundingDINO_SwinB_cfg-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..f490c4bbd598a35de43d36ceafcbd769e7ff21bf --- /dev/null +++ b/groundingdino/config/.ipynb_checkpoints/GroundingDINO_SwinB_cfg-checkpoint.py @@ -0,0 +1,43 @@ +batch_size = 1 +modelname = "groundingdino" +backbone = "swin_B_384_22k" +position_embedding = "sine" +pe_temperatureH = 20 +pe_temperatureW = 20 +return_interm_indices = [1, 2, 3] +backbone_freeze_keywords = None +enc_layers = 6 +dec_layers = 6 +pre_norm = False +dim_feedforward = 2048 +hidden_dim = 256 +dropout = 0.0 +nheads = 8 +num_queries = 900 +query_dim = 4 +num_patterns = 0 +num_feature_levels = 4 +enc_n_points = 4 +dec_n_points = 4 +two_stage_type = "standard" +two_stage_bbox_embed_share = False +two_stage_class_embed_share = False +transformer_activation = "relu" +dec_pred_bbox_embed_share = True +dn_box_noise_scale = 1.0 +dn_label_noise_ratio = 0.5 +dn_label_coef = 1.0 +dn_bbox_coef = 1.0 +embed_init_tgt = True +dn_labelbook_size = 2000 +max_text_len = 256 +text_encoder_type = "bert-base-uncased" +use_text_enhancer = True +use_fusion_layer = True +use_checkpoint = True +use_transformer_ckpt = True +use_text_cross_attention = True +text_dropout = 0.0 +fusion_dropout = 0.0 +fusion_droppath = 0.1 +sub_sentence_present = True diff --git a/groundingdino/config/GroundingDINO_SwinB_cfg.py b/groundingdino/config/GroundingDINO_SwinB_cfg.py new file mode 100644 index 0000000000000000000000000000000000000000..f490c4bbd598a35de43d36ceafcbd769e7ff21bf --- /dev/null +++ b/groundingdino/config/GroundingDINO_SwinB_cfg.py @@ -0,0 +1,43 @@ +batch_size = 1 +modelname = "groundingdino" +backbone = "swin_B_384_22k" +position_embedding = "sine" +pe_temperatureH = 20 +pe_temperatureW = 20 +return_interm_indices = [1, 2, 3] +backbone_freeze_keywords = None +enc_layers = 6 +dec_layers = 6 +pre_norm = False +dim_feedforward = 2048 +hidden_dim = 256 +dropout = 0.0 +nheads = 8 +num_queries = 900 +query_dim = 4 +num_patterns = 0 +num_feature_levels = 4 +enc_n_points = 4 +dec_n_points = 4 +two_stage_type = "standard" +two_stage_bbox_embed_share = False +two_stage_class_embed_share = False +transformer_activation = "relu" +dec_pred_bbox_embed_share = True 
+dn_box_noise_scale = 1.0 +dn_label_noise_ratio = 0.5 +dn_label_coef = 1.0 +dn_bbox_coef = 1.0 +embed_init_tgt = True +dn_labelbook_size = 2000 +max_text_len = 256 +text_encoder_type = "bert-base-uncased" +use_text_enhancer = True +use_fusion_layer = True +use_checkpoint = True +use_transformer_ckpt = True +use_text_cross_attention = True +text_dropout = 0.0 +fusion_dropout = 0.0 +fusion_droppath = 0.1 +sub_sentence_present = True diff --git a/groundingdino/config/GroundingDINO_SwinT_OGC.py b/groundingdino/config/GroundingDINO_SwinT_OGC.py new file mode 100644 index 0000000000000000000000000000000000000000..9158d5f6260ec74bded95377d382387430d7cd70 --- /dev/null +++ b/groundingdino/config/GroundingDINO_SwinT_OGC.py @@ -0,0 +1,43 @@ +batch_size = 1 +modelname = "groundingdino" +backbone = "swin_T_224_1k" +position_embedding = "sine" +pe_temperatureH = 20 +pe_temperatureW = 20 +return_interm_indices = [1, 2, 3] +backbone_freeze_keywords = None +enc_layers = 6 +dec_layers = 6 +pre_norm = False +dim_feedforward = 2048 +hidden_dim = 256 +dropout = 0.0 +nheads = 8 +num_queries = 900 +query_dim = 4 +num_patterns = 0 +num_feature_levels = 4 +enc_n_points = 4 +dec_n_points = 4 +two_stage_type = "standard" +two_stage_bbox_embed_share = False +two_stage_class_embed_share = False +transformer_activation = "relu" +dec_pred_bbox_embed_share = True +dn_box_noise_scale = 1.0 +dn_label_noise_ratio = 0.5 +dn_label_coef = 1.0 +dn_bbox_coef = 1.0 +embed_init_tgt = True +dn_labelbook_size = 2000 +max_text_len = 256 +text_encoder_type = "bert-base-uncased" +use_text_enhancer = True +use_fusion_layer = True +use_checkpoint = True +use_transformer_ckpt = True +use_text_cross_attention = True +text_dropout = 0.0 +fusion_dropout = 0.0 +fusion_droppath = 0.1 +sub_sentence_present = True diff --git a/groundingdino/config/__init__.py b/groundingdino/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/groundingdino/datasets/__init__.py b/groundingdino/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/groundingdino/datasets/__pycache__/__init__.cpython-310.pyc b/groundingdino/datasets/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07427fc13f16fdaea1da83b30827bd6bbedfb3c5 Binary files /dev/null and b/groundingdino/datasets/__pycache__/__init__.cpython-310.pyc differ diff --git a/groundingdino/datasets/__pycache__/transforms.cpython-310.pyc b/groundingdino/datasets/__pycache__/transforms.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d320cf2fd0f97a0572577b2ac2d477dfe867ef5a Binary files /dev/null and b/groundingdino/datasets/__pycache__/transforms.cpython-310.pyc differ diff --git a/groundingdino/datasets/cocogrounding_eval.py b/groundingdino/datasets/cocogrounding_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..7693a182d86fcb2b7f707d28371849f019b883c3 --- /dev/null +++ b/groundingdino/datasets/cocogrounding_eval.py @@ -0,0 +1,269 @@ +# ------------------------------------------------------------------------ +# Grounding DINO. Midified by Shilong Liu. +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. 
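The flat config modules above (GroundingDINO_SwinB_cfg.py / GroundingDINO_SwinT_OGC.py) are plain attribute files consumed by the model builder. A minimal loading sketch, assuming the stock upstream GroundingDINO utilities SLConfig, build_model and clean_state_dict are present unchanged elsewhere in this repo; paths are hypothetical:

```python
import torch

from groundingdino.models import build_model
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict

# Hypothetical paths; substitute the config/weights actually in use.
cfg_path = "groundingdino/config/GroundingDINO_SwinT_OGC.py"
ckpt_path = "weights/groundingdino_swint_ogc.pth"

args = SLConfig.fromfile(cfg_path)   # exposes the module attributes as a config object
args.device = "cuda" if torch.cuda.is_available() else "cpu"
model = build_model(args)            # dispatches on args.modelname via the registry

checkpoint = torch.load(ckpt_path, map_location="cpu")
model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
model.eval()
```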
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +COCO evaluator that works in distributed mode. + +Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py +The difference is that there is less copy-pasting from pycocotools +in the end of the file, as python3 can suppress prints with contextlib +""" +import contextlib +import copy +import os + +import numpy as np +import pycocotools.mask as mask_util +import torch +from pycocotools.coco import COCO +from pycocotools.cocoeval import COCOeval + +from groundingdino.util.misc import all_gather + + +class CocoGroundingEvaluator(object): + def __init__(self, coco_gt, iou_types, useCats=True): + assert isinstance(iou_types, (list, tuple)) + coco_gt = copy.deepcopy(coco_gt) + self.coco_gt = coco_gt + + self.iou_types = iou_types + self.coco_eval = {} + for iou_type in iou_types: + self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type) + self.coco_eval[iou_type].useCats = useCats + + self.img_ids = [] + self.eval_imgs = {k: [] for k in iou_types} + self.useCats = useCats + + def update(self, predictions): + img_ids = list(np.unique(list(predictions.keys()))) + self.img_ids.extend(img_ids) + + for iou_type in self.iou_types: + results = self.prepare(predictions, iou_type) + + # suppress pycocotools prints + with open(os.devnull, "w") as devnull: + with contextlib.redirect_stdout(devnull): + coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO() + + coco_eval = self.coco_eval[iou_type] + + coco_eval.cocoDt = coco_dt + coco_eval.params.imgIds = list(img_ids) + coco_eval.params.useCats = self.useCats + img_ids, eval_imgs = evaluate(coco_eval) + + self.eval_imgs[iou_type].append(eval_imgs) + + def synchronize_between_processes(self): + for iou_type in self.iou_types: + self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2) + create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type]) + + def accumulate(self): + for coco_eval in self.coco_eval.values(): + coco_eval.accumulate() + + def summarize(self): + for iou_type, coco_eval in self.coco_eval.items(): + print("IoU metric: {}".format(iou_type)) + coco_eval.summarize() + + def prepare(self, predictions, iou_type): + if iou_type == "bbox": + return self.prepare_for_coco_detection(predictions) + elif iou_type == "segm": + return self.prepare_for_coco_segmentation(predictions) + elif iou_type == "keypoints": + return self.prepare_for_coco_keypoint(predictions) + else: + raise ValueError("Unknown iou type {}".format(iou_type)) + + def prepare_for_coco_detection(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + boxes = prediction["boxes"] + boxes = convert_to_xywh(boxes).tolist() + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "bbox": box, + "score": scores[k], + } + for k, box in enumerate(boxes) + ] + ) + return coco_results + + def prepare_for_coco_segmentation(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + 
scores = prediction["scores"] + labels = prediction["labels"] + masks = prediction["masks"] + + masks = masks > 0.5 + + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + + rles = [ + mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0] + for mask in masks + ] + for rle in rles: + rle["counts"] = rle["counts"].decode("utf-8") + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "segmentation": rle, + "score": scores[k], + } + for k, rle in enumerate(rles) + ] + ) + return coco_results + + def prepare_for_coco_keypoint(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + boxes = prediction["boxes"] + boxes = convert_to_xywh(boxes).tolist() + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + keypoints = prediction["keypoints"] + keypoints = keypoints.flatten(start_dim=1).tolist() + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "keypoints": keypoint, + "score": scores[k], + } + for k, keypoint in enumerate(keypoints) + ] + ) + return coco_results + + +def convert_to_xywh(boxes): + xmin, ymin, xmax, ymax = boxes.unbind(1) + return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1) + + +def merge(img_ids, eval_imgs): + all_img_ids = all_gather(img_ids) + all_eval_imgs = all_gather(eval_imgs) + + merged_img_ids = [] + for p in all_img_ids: + merged_img_ids.extend(p) + + merged_eval_imgs = [] + for p in all_eval_imgs: + merged_eval_imgs.append(p) + + merged_img_ids = np.array(merged_img_ids) + merged_eval_imgs = np.concatenate(merged_eval_imgs, 2) + + # keep only unique (and in sorted order) images + merged_img_ids, idx = np.unique(merged_img_ids, return_index=True) + merged_eval_imgs = merged_eval_imgs[..., idx] + + return merged_img_ids, merged_eval_imgs + + +def create_common_coco_eval(coco_eval, img_ids, eval_imgs): + img_ids, eval_imgs = merge(img_ids, eval_imgs) + img_ids = list(img_ids) + eval_imgs = list(eval_imgs.flatten()) + + coco_eval.evalImgs = eval_imgs + coco_eval.params.imgIds = img_ids + coco_eval._paramsEval = copy.deepcopy(coco_eval.params) + + +################################################################# +# From pycocotools, just removed the prints and fixed +# a Python3 bug about unicode not defined +################################################################# + + +def evaluate(self): + """ + Run per image evaluation on given images and store results (a list of dict) in self.evalImgs + :return: None + """ + # tic = time.time() + # print('Running per image evaluation...') + p = self.params + # add backward compatibility if useSegm is specified in params + if p.useSegm is not None: + p.iouType = "segm" if p.useSegm == 1 else "bbox" + print("useSegm (deprecated) is not None. 
Running {} evaluation".format(p.iouType)) + # print('Evaluate annotation type *{}*'.format(p.iouType)) + p.imgIds = list(np.unique(p.imgIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + p.maxDets = sorted(p.maxDets) + self.params = p + + self._prepare() + # loop through images, area range, max detection number + catIds = p.catIds if p.useCats else [-1] + + if p.iouType == "segm" or p.iouType == "bbox": + computeIoU = self.computeIoU + elif p.iouType == "keypoints": + computeIoU = self.computeOks + self.ious = { + (imgId, catId): computeIoU(imgId, catId) + for imgId in p.imgIds + for catId in catIds} + + evaluateImg = self.evaluateImg + maxDet = p.maxDets[-1] + evalImgs = [ + evaluateImg(imgId, catId, areaRng, maxDet) + for catId in catIds + for areaRng in p.areaRng + for imgId in p.imgIds + ] + # this is NOT in the pycocotools code, but could be done outside + evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds)) + self._paramsEval = copy.deepcopy(self.params) + # toc = time.time() + # print('DONE (t={:0.2f}s).'.format(toc-tic)) + return p.imgIds, evalImgs + + +################################################################# +# end of straight copy from pycocotools, just removing the prints +################################################################# diff --git a/groundingdino/datasets/transforms.py b/groundingdino/datasets/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..91cf9269e4b31008a3ddca34a19b038a9b399991 --- /dev/null +++ b/groundingdino/datasets/transforms.py @@ -0,0 +1,311 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Transforms and data augmentation for both image + bbox. +""" +import os +import random + +import PIL +import torch +import torchvision.transforms as T +import torchvision.transforms.functional as F + +from groundingdino.util.box_ops import box_xyxy_to_cxcywh +from groundingdino.util.misc import interpolate + + +def crop(image, target, region): + cropped_image = F.crop(image, *region) + + target = target.copy() + i, j, h, w = region + + # should we do something wrt the original size? + target["size"] = torch.tensor([h, w]) + + fields = ["labels", "area", "iscrowd", "positive_map"] + + if "boxes" in target: + boxes = target["boxes"] + max_size = torch.as_tensor([w, h], dtype=torch.float32) + cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) + cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) + cropped_boxes = cropped_boxes.clamp(min=0) + area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1) + target["boxes"] = cropped_boxes.reshape(-1, 4) + target["area"] = area + fields.append("boxes") + + if "masks" in target: + # FIXME should we update the area here if there are no boxes? + target["masks"] = target["masks"][:, i : i + h, j : j + w] + fields.append("masks") + + # remove elements for which the boxes or masks that have zero area + if "boxes" in target or "masks" in target: + # favor boxes selection when defining which elements to keep + # this is compatible with previous implementation + if "boxes" in target: + cropped_boxes = target["boxes"].reshape(-1, 2, 2) + keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1) + else: + keep = target["masks"].flatten(1).any(1) + + for field in fields: + if field in target: + target[field] = target[field][keep] + + if os.environ.get("IPDB_SHILONG_DEBUG", None) == "INFO": + # for debug and visualization only. 
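The CocoGroundingEvaluator defined above follows the usual pycocotools driving pattern. A minimal sketch, assuming predictions have already been post-processed into the `{image_id: {"boxes", "scores", "labels"}}` mapping that `update()` expects, with boxes in xyxy format:

```python
# coco_gt: a pycocotools.coco.COCO object holding the ground-truth annotations.
# batched_predictions: an iterable of dicts mapping image_id -> prediction dict.
evaluator = CocoGroundingEvaluator(coco_gt, iou_types=("bbox",), useCats=True)

for predictions in batched_predictions:
    evaluator.update(predictions)          # converts to COCO results, runs per-image eval

evaluator.synchronize_between_processes()  # gathers eval images across ranks (no-op on one process)
evaluator.accumulate()
evaluator.summarize()                      # prints the standard AP/AR table per IoU type
```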
+ if "strings_positive" in target: + target["strings_positive"] = [ + _i for _i, _j in zip(target["strings_positive"], keep) if _j + ] + + return cropped_image, target + + +def hflip(image, target): + flipped_image = F.hflip(image) + + w, h = image.size + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor( + [w, 0, w, 0] + ) + target["boxes"] = boxes + + if "masks" in target: + target["masks"] = target["masks"].flip(-1) + + return flipped_image, target + + +def resize(image, target, size, max_size=None): + # size can be min_size (scalar) or (w, h) tuple + + def get_size_with_aspect_ratio(image_size, size, max_size=None): + w, h = image_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def get_size(image_size, size, max_size=None): + if isinstance(size, (list, tuple)): + return size[::-1] + else: + return get_size_with_aspect_ratio(image_size, size, max_size) + + size = get_size(image.size, size, max_size) + rescaled_image = F.resize(image, size) + + if target is None: + return rescaled_image, None + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratio_width, ratio_height = ratios + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes * torch.as_tensor( + [ratio_width, ratio_height, ratio_width, ratio_height] + ) + target["boxes"] = scaled_boxes + + if "area" in target: + area = target["area"] + scaled_area = area * (ratio_width * ratio_height) + target["area"] = scaled_area + + h, w = size + target["size"] = torch.tensor([h, w]) + + if "masks" in target: + target["masks"] = ( + interpolate(target["masks"][:, None].float(), size, mode="nearest")[:, 0] > 0.5 + ) + + return rescaled_image, target + + +def pad(image, target, padding): + # assumes that we only pad on the bottom right corners + padded_image = F.pad(image, (0, 0, padding[0], padding[1])) + if target is None: + return padded_image, None + target = target.copy() + # should we do something wrt the original size? 
+ target["size"] = torch.tensor(padded_image.size[::-1]) + if "masks" in target: + target["masks"] = torch.nn.functional.pad(target["masks"], (0, padding[0], 0, padding[1])) + return padded_image, target + + +class ResizeDebug(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + return resize(img, target, self.size) + + +class RandomCrop(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + region = T.RandomCrop.get_params(img, self.size) + return crop(img, target, region) + + +class RandomSizeCrop(object): + def __init__(self, min_size: int, max_size: int, respect_boxes: bool = False): + # respect_boxes: True to keep all boxes + # False to tolerence box filter + self.min_size = min_size + self.max_size = max_size + self.respect_boxes = respect_boxes + + def __call__(self, img: PIL.Image.Image, target: dict): + init_boxes = len(target["boxes"]) + max_patience = 10 + for i in range(max_patience): + w = random.randint(self.min_size, min(img.width, self.max_size)) + h = random.randint(self.min_size, min(img.height, self.max_size)) + region = T.RandomCrop.get_params(img, [h, w]) + result_img, result_target = crop(img, target, region) + if ( + not self.respect_boxes + or len(result_target["boxes"]) == init_boxes + or i == max_patience - 1 + ): + return result_img, result_target + return result_img, result_target + + +class CenterCrop(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + image_width, image_height = img.size + crop_height, crop_width = self.size + crop_top = int(round((image_height - crop_height) / 2.0)) + crop_left = int(round((image_width - crop_width) / 2.0)) + return crop(img, target, (crop_top, crop_left, crop_height, crop_width)) + + +class RandomHorizontalFlip(object): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, img, target): + if random.random() < self.p: + return hflip(img, target) + return img, target + + +class RandomResize(object): + def __init__(self, sizes, max_size=None): + assert isinstance(sizes, (list, tuple)) + self.sizes = sizes + self.max_size = max_size + + def __call__(self, img, target=None): + size = random.choice(self.sizes) + return resize(img, target, size, self.max_size) + + +class RandomPad(object): + def __init__(self, max_pad): + self.max_pad = max_pad + + def __call__(self, img, target): + pad_x = random.randint(0, self.max_pad) + pad_y = random.randint(0, self.max_pad) + return pad(img, target, (pad_x, pad_y)) + + +class RandomSelect(object): + """ + Randomly selects between transforms1 and transforms2, + with probability p for transforms1 and (1 - p) for transforms2 + """ + + def __init__(self, transforms1, transforms2, p=0.5): + self.transforms1 = transforms1 + self.transforms2 = transforms2 + self.p = p + + def __call__(self, img, target): + if random.random() < self.p: + return self.transforms1(img, target) + return self.transforms2(img, target) + + +class ToTensor(object): + def __call__(self, img, target): + return F.to_tensor(img), target + + +class RandomErasing(object): + def __init__(self, *args, **kwargs): + self.eraser = T.RandomErasing(*args, **kwargs) + + def __call__(self, img, target): + return self.eraser(img), target + + +class Normalize(object): + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, image, target=None): + image = F.normalize(image, mean=self.mean, std=self.std) + if target is None: + return image, None + target = target.copy() + h, 
w = image.shape[-2:] + if "boxes" in target: + boxes = target["boxes"] + boxes = box_xyxy_to_cxcywh(boxes) + boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32) + target["boxes"] = boxes + return image, target + + +class Compose(object): + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target): + for t in self.transforms: + image, target = t(image, target) + return image, target + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + for t in self.transforms: + format_string += "\n" + format_string += " {0}".format(t) + format_string += "\n)" + return format_string diff --git a/groundingdino/models/.ipynb_checkpoints/__init__-checkpoint.py b/groundingdino/models/.ipynb_checkpoints/__init__-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..e3413961d1d184b99835eb1e919b052d70298bc6 --- /dev/null +++ b/groundingdino/models/.ipynb_checkpoints/__init__-checkpoint.py @@ -0,0 +1,18 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from .GroundingDINO import build_groundingdino + + +def build_model(args): + # we use register to maintain models from catdet6 on. + from .registry import MODULE_BUILD_FUNCS + + assert args.modelname in MODULE_BUILD_FUNCS._module_dict + build_func = MODULE_BUILD_FUNCS.get(args.modelname) + model = build_func(args) + return model diff --git a/groundingdino/models/.ipynb_checkpoints/registry-checkpoint.py b/groundingdino/models/.ipynb_checkpoints/registry-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..2d22a59eec79a2a19b83fa1779f2adaf5753aec6 --- /dev/null +++ b/groundingdino/models/.ipynb_checkpoints/registry-checkpoint.py @@ -0,0 +1,66 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# -*- coding: utf-8 -*- +# @Author: Yihao Chen +# @Date: 2021-08-16 16:03:17 +# @Last Modified by: Shilong Liu +# @Last Modified time: 2022-01-23 15:26 +# modified from mmcv + +import inspect +from functools import partial + + +class Registry(object): + def __init__(self, name): + self._name = name + self._module_dict = dict() + + def __repr__(self): + format_str = self.__class__.__name__ + "(name={}, items={})".format( + self._name, list(self._module_dict.keys()) + ) + return format_str + + def __len__(self): + return len(self._module_dict) + + @property + def name(self): + return self._name + + @property + def module_dict(self): + return self._module_dict + + def get(self, key): + return self._module_dict.get(key, None) + + def registe_with_name(self, module_name=None, force=False): + return partial(self.register, module_name=module_name, force=force) + + def register(self, module_build_function, module_name=None, force=False): + """Register a module build function. + Args: + module (:obj:`nn.Module`): Module to be registered. 
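The transform classes in datasets/transforms.py above all operate on (image, target) pairs and pass `target=None` through at inference time. A typical preprocessing chain built from them, mirroring the sizes and normalization constants used by the upstream GroundingDINO demo (treat the exact values as an assumption):

```python
from PIL import Image

import groundingdino.datasets.transforms as T

transform = T.Compose(
    [
        T.RandomResize([800], max_size=1333),  # a single size makes the resize deterministic
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]
)

image_pil = Image.open("example.jpg").convert("RGB")  # hypothetical input path
image_tensor, _ = transform(image_pil, None)          # target=None at inference time
```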
+ """ + if not inspect.isfunction(module_build_function): + raise TypeError( + "module_build_function must be a function, but got {}".format( + type(module_build_function) + ) + ) + if module_name is None: + module_name = module_build_function.__name__ + if not force and module_name in self._module_dict: + raise KeyError("{} is already registered in {}".format(module_name, self.name)) + self._module_dict[module_name] = module_build_function + + return module_build_function + + +MODULE_BUILD_FUNCS = Registry("model build functions") diff --git a/groundingdino/models/GroundingDINO/.ipynb_checkpoints/fuse_modules-checkpoint.py b/groundingdino/models/GroundingDINO/.ipynb_checkpoints/fuse_modules-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..5a29bd5f7a18533e7f9133cc993b2d6994a7704c --- /dev/null +++ b/groundingdino/models/GroundingDINO/.ipynb_checkpoints/fuse_modules-checkpoint.py @@ -0,0 +1,298 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ + +import torch +import torch.nn as nn +import torch.nn.functional as F +from timm.models.layers import DropPath +import loralib as lora + +class FeatureResizer(nn.Module): + """ + This class takes as input a set of embeddings of dimension C1 and outputs a set of + embedding of dimension C2, after a linear transformation, dropout and normalization (LN). + """ + + def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True): + super().__init__() + self.do_ln = do_ln + # Object feature encoding + r = 16 + self.fc = lora.Linear(input_feat_size, output_feat_size,r=r , bias=True) + self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12) + self.dropout = nn.Dropout(dropout) + + def forward(self, encoder_features): + x = self.fc(encoder_features) + if self.do_ln: + x = self.layer_norm(x) + output = self.dropout(x) + return output + + +def l1norm(X, dim, eps=1e-8): + """L1-normalize columns of X""" + norm = torch.abs(X).sum(dim=dim, keepdim=True) + eps + X = torch.div(X, norm) + return X + + +def l2norm(X, dim, eps=1e-8): + """L2-normalize columns of X""" + norm = torch.pow(X, 2).sum(dim=dim, keepdim=True).sqrt() + eps + X = torch.div(X, norm) + return X + + +def func_attention(query, context, smooth=1, raw_feature_norm="softmax", eps=1e-8): + """ + query: (n_context, queryL, d) + context: (n_context, sourceL, d) + """ + batch_size_q, queryL = query.size(0), query.size(1) + batch_size, sourceL = context.size(0), context.size(1) + + # Get attention + # --> (batch, d, queryL) + queryT = torch.transpose(query, 1, 2) + + # (batch, sourceL, d)(batch, d, queryL) + # --> (batch, sourceL, queryL) + attn = torch.bmm(context, queryT) + if raw_feature_norm == "softmax": + # --> (batch*sourceL, queryL) + attn = attn.view(batch_size * sourceL, queryL) + attn = nn.Softmax()(attn) + # --> (batch, sourceL, queryL) + attn = attn.view(batch_size, sourceL, queryL) + elif raw_feature_norm == "l2norm": + attn = l2norm(attn, 2) + elif raw_feature_norm == "clipped_l2norm": + attn = nn.LeakyReLU(0.1)(attn) + attn = l2norm(attn, 2) + else: + raise ValueError("unknown first norm type:", raw_feature_norm) + # --> (batch, queryL, sourceL) + attn = torch.transpose(attn, 1, 2).contiguous() + # --> (batch*queryL, sourceL) + attn = 
attn.view(batch_size * queryL, sourceL) + attn = nn.Softmax()(attn * smooth) + # --> (batch, queryL, sourceL) + attn = attn.view(batch_size, queryL, sourceL) + # --> (batch, sourceL, queryL) + attnT = torch.transpose(attn, 1, 2).contiguous() + + # --> (batch, d, sourceL) + contextT = torch.transpose(context, 1, 2) + # (batch x d x sourceL)(batch x sourceL x queryL) + # --> (batch, d, queryL) + weightedContext = torch.bmm(contextT, attnT) + # --> (batch, queryL, d) + weightedContext = torch.transpose(weightedContext, 1, 2) + + return weightedContext, attnT + + +class BiMultiHeadAttention(nn.Module): + def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1, cfg=None): + super(BiMultiHeadAttention, self).__init__() + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.v_dim = v_dim + self.l_dim = l_dim + + assert ( + self.head_dim * self.num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." + self.scale = self.head_dim ** (-0.5) + self.dropout = dropout + r = 16 + self.v_proj = lora.Linear(self.v_dim, self.embed_dim , r=r) + self.l_proj = lora.Linear(self.l_dim, self.embed_dim , r=r) + self.values_v_proj = lora.Linear(self.v_dim, self.embed_dim , r=r) + self.values_l_proj = lora.Linear(self.l_dim, self.embed_dim , r=r) + + self.out_v_proj = lora.Linear(self.embed_dim, self.v_dim , r=r) + self.out_l_proj = lora.Linear(self.embed_dim, self.l_dim , r=r) + + self.stable_softmax_2d = True + self.clamp_min_for_underflow = True + self.clamp_max_for_overflow = True + + self._reset_parameters() + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def _reset_parameters(self): + nn.init.xavier_uniform_(self.v_proj.weight) + self.v_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.l_proj.weight) + self.l_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.values_v_proj.weight) + self.values_v_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.values_l_proj.weight) + self.values_l_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_v_proj.weight) + self.out_v_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_l_proj.weight) + self.out_l_proj.bias.data.fill_(0) + + def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): + """_summary_ + + Args: + v (_type_): bs, n_img, dim + l (_type_): bs, n_text, dim + attention_mask_v (_type_, optional): _description_. bs, n_img + attention_mask_l (_type_, optional): _description_. 
bs, n_text + + Returns: + _type_: _description_ + """ + # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': + # import ipdb; ipdb.set_trace() + bsz, tgt_len, _ = v.size() + + query_states = self.v_proj(v) * self.scale + key_states = self._shape(self.l_proj(l), -1, bsz) + value_v_states = self._shape(self.values_v_proj(v), -1, bsz) + value_l_states = self._shape(self.values_l_proj(l), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_v_states = value_v_states.view(*proj_shape) + value_l_states = value_l_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) # bs*nhead, nimg, ntxt + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + if self.stable_softmax_2d: + attn_weights = attn_weights - attn_weights.max() + + if self.clamp_min_for_underflow: + attn_weights = torch.clamp( + attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range + if self.clamp_max_for_overflow: + attn_weights = torch.clamp( + attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range + + attn_weights_T = attn_weights.transpose(1, 2) + attn_weights_l = attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[0] + if self.clamp_min_for_underflow: + attn_weights_l = torch.clamp( + attn_weights_l, min=-50000 + ) # Do not increase -50000, data type half has quite limited range + if self.clamp_max_for_overflow: + attn_weights_l = torch.clamp( + attn_weights_l, max=50000 + ) # Do not increase 50000, data type half has quite limited range + + # mask vison for language + if attention_mask_v is not None: + attention_mask_v = ( + attention_mask_v[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + ) + attn_weights_l.masked_fill_(attention_mask_v, float("-inf")) + + attn_weights_l = attn_weights_l.softmax(dim=-1) + + # mask language for vision + if attention_mask_l is not None: + attention_mask_l = ( + attention_mask_l[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + ) + attn_weights.masked_fill_(attention_mask_l, float("-inf")) + attn_weights_v = attn_weights.softmax(dim=-1) + + attn_probs_v = F.dropout(attn_weights_v, p=self.dropout, training=self.training) + attn_probs_l = F.dropout(attn_weights_l, p=self.dropout, training=self.training) + + attn_output_v = torch.bmm(attn_probs_v, value_l_states) + attn_output_l = torch.bmm(attn_probs_l, value_v_states) + + if attn_output_v.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output_v` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output_v.size()}" + ) + + if attn_output_l.size() != (bsz * self.num_heads, src_len, self.head_dim): + raise ValueError( + f"`attn_output_l` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {attn_output_l.size()}" + ) + + attn_output_v = attn_output_v.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output_v = attn_output_v.transpose(1, 2) + attn_output_v = attn_output_v.reshape(bsz, tgt_len, self.embed_dim) + + attn_output_l = attn_output_l.view(bsz, self.num_heads, src_len, self.head_dim) + attn_output_l = attn_output_l.transpose(1, 2) + attn_output_l = 
attn_output_l.reshape(bsz, src_len, self.embed_dim) + + attn_output_v = self.out_v_proj(attn_output_v) + attn_output_l = self.out_l_proj(attn_output_l) + + return attn_output_v, attn_output_l + + +# Bi-Direction MHA (text->image, image->text) +class BiAttentionBlock(nn.Module): + def __init__( + self, + v_dim, + l_dim, + embed_dim, + num_heads, + dropout=0.1, + drop_path=0.0, + init_values=1e-4, + cfg=None, + ): + """ + Inputs: + embed_dim - Dimensionality of input and attention feature vectors + hidden_dim - Dimensionality of hidden layer in feed-forward network + (usually 2-4x larger than embed_dim) + num_heads - Number of heads to use in the Multi-Head Attention block + dropout - Amount of dropout to apply in the feed-forward network + """ + super(BiAttentionBlock, self).__init__() + + # pre layer norm + self.layer_norm_v = nn.LayerNorm(v_dim) + self.layer_norm_l = nn.LayerNorm(l_dim) + self.attn = BiMultiHeadAttention( + v_dim=v_dim, l_dim=l_dim, embed_dim=embed_dim, num_heads=num_heads, dropout=dropout + ) + + # add layer scale for training stability + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.gamma_v = nn.Parameter(init_values * torch.ones((v_dim)), requires_grad=True) + self.gamma_l = nn.Parameter(init_values * torch.ones((l_dim)), requires_grad=True) + + def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): + v = self.layer_norm_v(v) + l = self.layer_norm_l(l) + delta_v, delta_l = self.attn( + v, l, attention_mask_v=attention_mask_v, attention_mask_l=attention_mask_l + ) + # v, l = v + delta_v, l + delta_l + v = v + self.drop_path(self.gamma_v * delta_v) + l = l + self.drop_path(self.gamma_l * delta_l) + return v, l + + # def forward(self, v:List[torch.Tensor], l, attention_mask_v=None, attention_mask_l=None) diff --git a/groundingdino/models/GroundingDINO/.ipynb_checkpoints/groundingdino-checkpoint.py b/groundingdino/models/GroundingDINO/.ipynb_checkpoints/groundingdino-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..46d17087ce7fe06b57c7697d6c3d393c3da23c5a --- /dev/null +++ b/groundingdino/models/GroundingDINO/.ipynb_checkpoints/groundingdino-checkpoint.py @@ -0,0 +1,412 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Conditional DETR model and criterion classes. +# Copyright (c) 2021 Microsoft. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Modified from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# ------------------------------------------------------------------------ +# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) +# Copyright (c) 2020 SenseTime. All Rights Reserved. 
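The fuse_modules code above replaces several `nn.Linear` projections with loralib's `lora.Linear(..., r=16)`. A minimal sketch of how such a model is typically fine-tuned and checkpointed with loralib (assuming the standard loralib API; the `bias` handling and learning rate are illustrative choices, not part of this diff):

```python
import loralib as lora
import torch

# model: any module containing lora.Linear layers, e.g. the BiMultiHeadAttention above.
lora.mark_only_lora_as_trainable(model, bias="none")   # freeze everything except LoRA params

optimizer = torch.optim.AdamW(
    [p for p in model.parameters() if p.requires_grad], lr=1e-4
)

# ... training loop ...

# Save only the small LoRA deltas instead of the full state dict.
torch.save(lora.lora_state_dict(model, bias="none"), "lora_delta.pth")

# To restore: load the base weights first, then the LoRA deltas with strict=False.
model.load_state_dict(torch.load("lora_delta.pth"), strict=False)
```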
+# ------------------------------------------------------------------------ +import copy +from typing import List +import loralib as lora +import torch +import torch.nn.functional as F +from torch import nn +from torchvision.ops.boxes import nms +from transformers import AutoTokenizer, BertModel, BertTokenizer, RobertaModel, RobertaTokenizerFast + +from groundingdino.util import box_ops, get_tokenlizer +from groundingdino.util.misc import ( + NestedTensor, + accuracy, + get_world_size, + interpolate, + inverse_sigmoid, + is_dist_avail_and_initialized, + nested_tensor_from_tensor_list, +) +from groundingdino.util.utils import get_phrases_from_posmap +from groundingdino.util.visualizer import COCOVisualizer +from groundingdino.util.vl_utils import create_positive_map_from_span + +from ..registry import MODULE_BUILD_FUNCS +from .backbone import build_backbone +from .bertwarper import ( + BertModelWarper, + generate_masks_with_special_tokens, + generate_masks_with_special_tokens_and_transfer_map, +) +from .transformer import build_transformer +from .utils import MLP, ContrastiveEmbed, sigmoid_focal_loss + + +class GroundingDINO(nn.Module): + """This is the Cross-Attention Detector module that performs object detection""" + + def __init__( + self, + backbone, + transformer, + num_queries, + aux_loss=False, + iter_update=False, + query_dim=2, + num_feature_levels=1, + nheads=8, + # two stage + two_stage_type="no", # ['no', 'standard'] + dec_pred_bbox_embed_share=True, + two_stage_class_embed_share=True, + two_stage_bbox_embed_share=True, + num_patterns=0, + dn_number=100, + dn_box_noise_scale=0.4, + dn_label_noise_ratio=0.5, + dn_labelbook_size=100, + text_encoder_type="bert-base-uncased", + sub_sentence_present=True, + max_text_len=256, + ): + """Initializes the model. + Parameters: + backbone: torch module of the backbone to be used. See backbone.py + transformer: torch module of the transformer architecture. See transformer.py + num_queries: number of object queries, ie detection slot. This is the maximal number of objects + Conditional DETR can detect in a single image. For COCO, we recommend 100 queries. + aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. 
+ """ + super().__init__() + self.num_queries = num_queries + self.transformer = transformer + self.hidden_dim = hidden_dim = transformer.d_model + self.num_feature_levels = num_feature_levels + self.nheads = nheads + self.max_text_len = 256 + self.sub_sentence_present = sub_sentence_present + + # setting query dim + self.query_dim = query_dim + assert query_dim == 4 + + # for dn training + self.num_patterns = num_patterns + self.dn_number = dn_number + self.dn_box_noise_scale = dn_box_noise_scale + self.dn_label_noise_ratio = dn_label_noise_ratio + self.dn_labelbook_size = dn_labelbook_size + + # bert + self.tokenizer = get_tokenlizer.get_tokenlizer(text_encoder_type) + self.bert = get_tokenlizer.get_pretrained_language_model(text_encoder_type) + self.bert.pooler.dense.weight.requires_grad_(False) + self.bert.pooler.dense.bias.requires_grad_(False) + self.bert = BertModelWarper(bert_model=self.bert) + + self.feat_map = nn.Linear(self.bert.config.hidden_size, self.hidden_dim, bias=True) + nn.init.constant_(self.feat_map.bias.data, 0) + nn.init.xavier_uniform_(self.feat_map.weight.data) + # freeze + + # special tokens + self.specical_tokens = self.tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) + + # prepare input projection layers + if num_feature_levels > 1: + num_backbone_outs = len(backbone.num_channels) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = backbone.num_channels[_] + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ) + ) + for _ in range(num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, hidden_dim), + ) + ) + in_channels = hidden_dim + self.input_proj = nn.ModuleList(input_proj_list) + else: + assert two_stage_type == "no", "two_stage_type should be no if num_feature_levels=1 !!!" + self.input_proj = nn.ModuleList( + [ + nn.Sequential( + nn.Conv2d(backbone.num_channels[-1], hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ) + ] + ) + + self.backbone = backbone + self.aux_loss = aux_loss + self.box_pred_damping = box_pred_damping = None + + self.iter_update = iter_update + assert iter_update, "Why not iter_update?" 
+ + # prepare pred layers + self.dec_pred_bbox_embed_share = dec_pred_bbox_embed_share + # prepare class & box embed + _class_embed = ContrastiveEmbed() + + _bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) + nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0) + + if dec_pred_bbox_embed_share: + box_embed_layerlist = [_bbox_embed for i in range(transformer.num_decoder_layers)] + else: + box_embed_layerlist = [ + copy.deepcopy(_bbox_embed) for i in range(transformer.num_decoder_layers) + ] + class_embed_layerlist = [_class_embed for i in range(transformer.num_decoder_layers)] + self.bbox_embed = nn.ModuleList(box_embed_layerlist) + self.class_embed = nn.ModuleList(class_embed_layerlist) + self.transformer.decoder.bbox_embed = self.bbox_embed + self.transformer.decoder.class_embed = self.class_embed + + # two stage + self.two_stage_type = two_stage_type + assert two_stage_type in ["no", "standard"], "unknown param {} of two_stage_type".format( + two_stage_type + ) + if two_stage_type != "no": + if two_stage_bbox_embed_share: + assert dec_pred_bbox_embed_share + self.transformer.enc_out_bbox_embed = _bbox_embed + else: + self.transformer.enc_out_bbox_embed = copy.deepcopy(_bbox_embed) + + if two_stage_class_embed_share: + assert dec_pred_bbox_embed_share + self.transformer.enc_out_class_embed = _class_embed + else: + self.transformer.enc_out_class_embed = copy.deepcopy(_class_embed) + + self.refpoint_embed = None + + self._reset_parameters() + + def _reset_parameters(self): + # init input_proj + for proj in self.input_proj: + nn.init.xavier_uniform_(proj[0].weight, gain=1) + nn.init.constant_(proj[0].bias, 0) + + def set_image_tensor(self, samples: NestedTensor): + if isinstance(samples, (list, torch.Tensor)): + samples = nested_tensor_from_tensor_list(samples) + self.features, self.poss = self.backbone(samples) + + def unset_image_tensor(self): + if hasattr(self, 'features'): + del self.features + if hasattr(self,'poss'): + del self.poss + + def set_image_features(self, features , poss): + self.features = features + self.poss = poss + + def init_ref_points(self, use_num_queries): + self.refpoint_embed = nn.Embedding(use_num_queries, self.query_dim) + + def forward(self, samples: NestedTensor, targets: List = None, **kw): + """The forward expects a NestedTensor, which consists of: + - samples.tensor: batched images, of shape [batch_size x 3 x H x W] + - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels + + It returns a dict with the following elements: + - "pred_logits": the classification logits (including no-object) for all queries. + Shape= [batch_size x num_queries x num_classes] + - "pred_boxes": The normalized boxes coordinates for all queries, represented as + (center_x, center_y, width, height). These values are normalized in [0, 1], + relative to the size of each individual image (disregarding possible padding). + See PostProcess for information on how to retrieve the unnormalized bounding box. + - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of + dictionnaries containing the two above keys for each decoder layer. 
+ """ + if targets is None: + captions = kw["captions"] + else: + captions = [t["caption"] for t in targets] + + # encoder texts + tokenized = self.tokenizer(captions, padding="longest", return_tensors="pt").to( + samples.device + ) + ( + text_self_attention_masks, + position_ids, + cate_to_token_mask_list, + ) = generate_masks_with_special_tokens_and_transfer_map( + tokenized, self.specical_tokens, self.tokenizer + ) + + if text_self_attention_masks.shape[1] > self.max_text_len: + text_self_attention_masks = text_self_attention_masks[ + :, : self.max_text_len, : self.max_text_len + ] + position_ids = position_ids[:, : self.max_text_len] + tokenized["input_ids"] = tokenized["input_ids"][:, : self.max_text_len] + tokenized["attention_mask"] = tokenized["attention_mask"][:, : self.max_text_len] + tokenized["token_type_ids"] = tokenized["token_type_ids"][:, : self.max_text_len] + + # extract text embeddings + if self.sub_sentence_present: + tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} + tokenized_for_encoder["attention_mask"] = text_self_attention_masks + tokenized_for_encoder["position_ids"] = position_ids + else: + # import ipdb; ipdb.set_trace() + tokenized_for_encoder = tokenized + + bert_output = self.bert(**tokenized_for_encoder) # bs, 195, 768 + + encoded_text = self.feat_map(bert_output["last_hidden_state"]) # bs, 195, d_model + text_token_mask = tokenized.attention_mask.bool() # bs, 195 + # text_token_mask: True for nomask, False for mask + # text_self_attention_masks: True for nomask, False for mask + + if encoded_text.shape[1] > self.max_text_len: + encoded_text = encoded_text[:, : self.max_text_len, :] + text_token_mask = text_token_mask[:, : self.max_text_len] + position_ids = position_ids[:, : self.max_text_len] + text_self_attention_masks = text_self_attention_masks[ + :, : self.max_text_len, : self.max_text_len + ] + + text_dict = { + "encoded_text": encoded_text, # bs, 195, d_model + "text_token_mask": text_token_mask, # bs, 195 + "position_ids": position_ids, # bs, 195 + "text_self_attention_masks": text_self_attention_masks, # bs, 195,195 + } + + # import ipdb; ipdb.set_trace() + if isinstance(samples, (list, torch.Tensor)): + samples = nested_tensor_from_tensor_list(samples) + if not hasattr(self, 'features') or not hasattr(self, 'poss'): + self.set_image_tensor(samples) + + srcs = [] + masks = [] + for l, feat in enumerate(self.features): + src, mask = feat.decompose() + srcs.append(self.input_proj[l](src)) + masks.append(mask) + assert mask is not None + if self.num_feature_levels > len(srcs): + _len_srcs = len(srcs) + for l in range(_len_srcs, self.num_feature_levels): + if l == _len_srcs: + src = self.input_proj[l](self.features[-1].tensors) + else: + src = self.input_proj[l](srcs[-1]) + m = samples.mask + mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0] + pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype) + srcs.append(src) + masks.append(mask) + self.poss.append(pos_l) + + input_query_bbox = input_query_label = attn_mask = dn_meta = None + hs, reference, hs_enc, ref_enc, init_box_proposal = self.transformer( + srcs, masks, input_query_bbox, self.poss, input_query_label, attn_mask, text_dict + ) + + # deformable-detr-like anchor update + outputs_coord_list = [] + for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_hs) in enumerate( + zip(reference[:-1], self.bbox_embed, hs) + ): + layer_delta_unsig = layer_bbox_embed(layer_hs) + layer_outputs_unsig = layer_delta_unsig + 
inverse_sigmoid(layer_ref_sig) + layer_outputs_unsig = layer_outputs_unsig.sigmoid() + outputs_coord_list.append(layer_outputs_unsig) + outputs_coord_list = torch.stack(outputs_coord_list) + + # output + outputs_class = torch.stack( + [ + layer_cls_embed(layer_hs, text_dict) + for layer_cls_embed, layer_hs in zip(self.class_embed, hs) + ] + ) + out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord_list[-1]} + + # # for intermediate outputs + # if self.aux_loss: + # out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord_list) + + # # for encoder output + # if hs_enc is not None: + # # prepare intermediate outputs + # interm_coord = ref_enc[-1] + # interm_class = self.transformer.enc_out_class_embed(hs_enc[-1], text_dict) + # out['interm_outputs'] = {'pred_logits': interm_class, 'pred_boxes': interm_coord} + # out['interm_outputs_for_matching_pre'] = {'pred_logits': interm_class, 'pred_boxes': init_box_proposal} + unset_image_tensor = kw.get('unset_image_tensor', True) + if unset_image_tensor: + self.unset_image_tensor() ## If necessary + return out + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [ + {"pred_logits": a, "pred_boxes": b} + for a, b in zip(outputs_class[:-1], outputs_coord[:-1]) + ] + + +@MODULE_BUILD_FUNCS.registe_with_name(module_name="groundingdino") +def build_groundingdino(args): + + backbone = build_backbone(args) + transformer = build_transformer(args) + + dn_labelbook_size = args.dn_labelbook_size + dec_pred_bbox_embed_share = args.dec_pred_bbox_embed_share + sub_sentence_present = args.sub_sentence_present + + model = GroundingDINO( + backbone, + transformer, + num_queries=args.num_queries, + aux_loss=True, + iter_update=True, + query_dim=4, + num_feature_levels=args.num_feature_levels, + nheads=args.nheads, + dec_pred_bbox_embed_share=dec_pred_bbox_embed_share, + two_stage_type=args.two_stage_type, + two_stage_bbox_embed_share=args.two_stage_bbox_embed_share, + two_stage_class_embed_share=args.two_stage_class_embed_share, + num_patterns=args.num_patterns, + dn_number=0, + dn_box_noise_scale=args.dn_box_noise_scale, + dn_label_noise_ratio=args.dn_label_noise_ratio, + dn_labelbook_size=dn_labelbook_size, + text_encoder_type=args.text_encoder_type, + sub_sentence_present=sub_sentence_present, + max_text_len=args.max_text_len, + ) + + return model + diff --git a/groundingdino/models/GroundingDINO/.ipynb_checkpoints/ms_deform_attn-checkpoint.py b/groundingdino/models/GroundingDINO/.ipynb_checkpoints/ms_deform_attn-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..5b2a08de645db11b9952171fd25d3a5a816e4fa8 --- /dev/null +++ b/groundingdino/models/GroundingDINO/.ipynb_checkpoints/ms_deform_attn-checkpoint.py @@ -0,0 +1,414 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. 
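With the model assembled by `build_groundingdino` above, a caption-conditioned forward pass looks roughly like the following sketch. The caption, threshold, and `device` are illustrative assumptions, and `image_tensor` is assumed to come from the transforms pipeline earlier in this diff:

```python
import torch

caption = "a cat . a remote control ."   # lower-case, "."-separated phrases
with torch.no_grad():
    outputs = model(image_tensor[None].to(device), captions=[caption])

logits = outputs["pred_logits"].sigmoid()[0]   # (num_queries, max_text_len)
boxes = outputs["pred_boxes"][0]               # (num_queries, 4), normalized cxcywh

keep = logits.max(dim=1)[0] > 0.35             # hypothetical box threshold
filtered_boxes = boxes[keep]
```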
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from: +# https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/functions/ms_deform_attn_func.py +# https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py +# https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/multi_scale_deform_attn.py +# ------------------------------------------------------------------------------------------------ + +import math +import warnings +from typing import Optional +import loralib as lora +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.init import constant_, xavier_uniform_ + +try: + from groundingdino import _C +except: + warnings.warn("Failed to load custom C++ ops. Running on CPU mode Only!") + + +# helpers +def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + +class MultiScaleDeformableAttnFunction(Function): + @staticmethod + def forward( + ctx, + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step, + ): + ctx.im2col_step = im2col_step + output = _C.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ctx.im2col_step, + ) + ctx.save_for_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + ( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ) = ctx.saved_tensors + grad_value, grad_sampling_loc, grad_attn_weight = _C.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output, + ctx.im2col_step, + ) + + return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None + + +def multi_scale_deformable_attn_pytorch( + value: torch.Tensor, + value_spatial_shapes: torch.Tensor, + sampling_locations: torch.Tensor, + attention_weights: torch.Tensor, +) -> torch.Tensor: + + bs, _, num_heads, embed_dims = value.shape + _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level, (H_, W_) in enumerate(value_spatial_shapes): + # bs, H_*W_, num_heads, embed_dims -> + # bs, H_*W_, num_heads*embed_dims -> + # bs, num_heads*embed_dims, H_*W_ -> + # bs*num_heads, embed_dims, H_, W_ + value_l_ = ( + value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_) + ) + # bs, num_queries, num_heads, num_points, 2 -> + # bs, num_heads, num_queries, num_points, 2 -> + # bs*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1) + # bs*num_heads, embed_dims, num_queries, num_points + sampling_value_l_ = F.grid_sample( + value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False + ) + sampling_value_list.append(sampling_value_l_) + 
# (bs, num_queries, num_heads, num_levels, num_points) -> + # (bs, num_heads, num_queries, num_levels, num_points) -> + # (bs, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + bs * num_heads, 1, num_queries, num_levels * num_points + ) + output = ( + (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) + .sum(-1) + .view(bs, num_heads * embed_dims, num_queries) + ) + return output.transpose(1, 2).contiguous() + + +class MultiScaleDeformableAttention(nn.Module): + """Multi-Scale Deformable Attention Module used in Deformable-DETR + + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + + Args: + embed_dim (int): The embedding dimension of Attention. Default: 256. + num_heads (int): The number of attention heads. Default: 8. + num_levels (int): The number of feature map used in Attention. Default: 4. + num_points (int): The number of sampling points for each query + in each head. Default: 4. + img2col_steps (int): The step used in image_to_column. Defualt: 64. + dropout (float): Dropout layer used in output. Default: 0.1. + batch_first (bool): if ``True``, then the input and output tensor will be + provided as `(bs, n, embed_dim)`. Default: False. `(n, bs, embed_dim)` + """ + + def __init__( + self, + embed_dim: int = 256, + num_heads: int = 8, + num_levels: int = 4, + num_points: int = 4, + img2col_step: int = 64, + batch_first: bool = False, + ): + super().__init__() + if embed_dim % num_heads != 0: + raise ValueError( + "embed_dim must be divisible by num_heads, but got {} and {}".format( + embed_dim, num_heads + ) + ) + head_dim = embed_dim // num_heads + + self.batch_first = batch_first + + if not _is_power_of_2(head_dim): + warnings.warn( + """ + You'd better set d_model in MSDeformAttn to make sure that + each dim of the attention head a power of 2, which is more efficient. + """ + ) + + self.im2col_step = img2col_step + self.embed_dim = embed_dim + self.num_heads = num_heads + self.num_levels = num_levels + self.num_points = num_points + r = 16 + self.sampling_offsets = lora.Linear(embed_dim, num_heads * num_levels * num_points * 2 , r=r) + self.attention_weights = lora.Linear(embed_dim, num_heads * num_levels * num_points , r=r) + self.value_proj = lora.Linear(embed_dim, embed_dim , r=r) + self.output_proj = lora.Linear(embed_dim, embed_dim , r=r) + + self.init_weights() + + def _reset_parameters(self): + return self.init_weights() + + def init_weights(self): + """ + Default initialization for Parameters of Module. 
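+        The sampling-offset weights are zeroed, so the initial offsets come from the
+        bias alone: each head starts by looking along a distinct direction on a circle
+        around the reference point, scaled by the sampling-point index. Attention-weight
+        projections start at zero and the value/output projections use Xavier init.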
+ """ + constant_(self.sampling_offsets.weight.data, 0.0) + thetas = torch.arange(self.num_heads, dtype=torch.float32) * ( + 2.0 * math.pi / self.num_heads + ) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(self.num_heads, 1, 1, 2) + .repeat(1, self.num_levels, self.num_points, 1) + ) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + constant_(self.attention_weights.weight.data, 0.0) + constant_(self.attention_weights.bias.data, 0.0) + xavier_uniform_(self.value_proj.weight.data) + constant_(self.value_proj.bias.data, 0.0) + xavier_uniform_(self.output_proj.weight.data) + constant_(self.output_proj.bias.data, 0.0) + + def freeze_sampling_offsets(self): + print("Freeze sampling offsets") + self.sampling_offsets.weight.requires_grad = False + self.sampling_offsets.bias.requires_grad = False + + def freeze_attention_weights(self): + print("Freeze attention weights") + self.attention_weights.weight.requires_grad = False + self.attention_weights.bias.requires_grad = False + + def forward( + self, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + value: Optional[torch.Tensor] = None, + query_pos: Optional[torch.Tensor] = None, + key_padding_mask: Optional[torch.Tensor] = None, + reference_points: Optional[torch.Tensor] = None, + spatial_shapes: Optional[torch.Tensor] = None, + level_start_index: Optional[torch.Tensor] = None, + **kwargs + ) -> torch.Tensor: + + """Forward Function of MultiScaleDeformableAttention + + Args: + query (torch.Tensor): Query embeddings with shape + `(num_query, bs, embed_dim)` + key (torch.Tensor): Key embeddings with shape + `(num_key, bs, embed_dim)` + value (torch.Tensor): Value embeddings with shape + `(num_key, bs, embed_dim)` + query_pos (torch.Tensor): The position embedding for `query`. Default: None. + key_padding_mask (torch.Tensor): ByteTensor for `query`, with shape `(bs, num_key)`, + indicating which elements within `key` to be ignored in attention. + reference_points (torch.Tensor): The normalized reference points + with shape `(bs, num_query, num_levels, 2)`, + all elements is range in [0, 1], top-left (0, 0), + bottom-right (1, 1), including padding are. + or `(N, Length_{query}, num_levels, 4)`, add additional + two dimensions `(h, w)` to form reference boxes. + spatial_shapes (torch.Tensor): Spatial shape of features in different levels. + With shape `(num_levels, 2)`, last dimension represents `(h, w)`. + level_start_index (torch.Tensor): The start index of each level. A tensor with + shape `(num_levels, )` which can be represented as + `[0, h_0 * w_0, h_0 * w_0 + h_1 * w_1, ...]`. 
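+
+        Note:
+            A minimal shape sketch on the CPU fallback path (illustrative only; random
+            inputs, two assumed feature levels of 8x8 and 4x4):
+
+            >>> attn = MultiScaleDeformableAttention(
+            ...     embed_dim=256, num_heads=8, num_levels=2, num_points=4, batch_first=True)
+            >>> spatial_shapes = torch.tensor([[8, 8], [4, 4]])
+            >>> level_start_index = torch.tensor([0, 64])
+            >>> query = torch.rand(1, 100, 256)              # bs, num_query, embed_dim
+            >>> value = torch.rand(1, 80, 256)               # bs, 64 + 16, embed_dim
+            >>> reference_points = torch.rand(1, 100, 2, 2)  # bs, num_query, num_levels, 2
+            >>> out = attn(query, value=value, reference_points=reference_points,
+            ...            spatial_shapes=spatial_shapes, level_start_index=level_start_index)
+            >>> out.shape
+            torch.Size([1, 100, 256])
+
+            On CPU tensors (or when the compiled op is missing) the computation falls back
+            to `multi_scale_deformable_attn_pytorch`; on CUDA the custom autograd Function
+            is used, with an fp32 round trip for fp16 inputs.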
+ + Returns: + torch.Tensor: forward results with shape `(num_query, bs, embed_dim)` + """ + + if value is None: + value = query + + if query_pos is not None: + query = query + query_pos + + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], float(0)) + value = value.view(bs, num_value, self.num_heads, -1) + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2 + ) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points + ) + attention_weights = attention_weights.softmax(-1) + attention_weights = attention_weights.view( + bs, + num_query, + self.num_heads, + self.num_levels, + self.num_points, + ) + + # bs, num_query, num_heads, num_levels, num_points, 2 + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = ( + reference_points[:, :, None, :, None, :] + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + ) + elif reference_points.shape[-1] == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets + / self.num_points + * reference_points[:, :, None, :, None, 2:] + * 0.5 + ) + else: + raise ValueError( + "Last dim of reference_points must be 2 or 4, but get {} instead.".format( + reference_points.shape[-1] + ) + ) + + if torch.cuda.is_available() and value.is_cuda: + halffloat = False + if value.dtype == torch.float16: + halffloat = True + value = value.float() + sampling_locations = sampling_locations.float() + attention_weights = attention_weights.float() + + output = MultiScaleDeformableAttnFunction.apply( + value, + spatial_shapes, + level_start_index, + sampling_locations, + attention_weights, + self.im2col_step, + ) + + if halffloat: + output = output.half() + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights + ) + + output = self.output_proj(output) + + if not self.batch_first: + output = output.permute(1, 0, 2) + + return output + + +def create_dummy_class(klass, dependency, message=""): + """ + When a dependency of a class is not available, create a dummy class which throws ImportError + when used. + + Args: + klass (str): name of the class. + dependency (str): name of the dependency. + message: extra message to print + Returns: + class: a class object + """ + err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, klass) + if message: + err = err + " " + message + + class _DummyMetaClass(type): + # throw error on class attribute access + def __getattr__(_, __): # noqa: B902 + raise ImportError(err) + + class _Dummy(object, metaclass=_DummyMetaClass): + # throw error on constructor + def __init__(self, *args, **kwargs): + raise ImportError(err) + + return _Dummy + + +def create_dummy_func(func, dependency, message=""): + """ + When a dependency of a function is not available, create a dummy function which throws + ImportError when used. + + Args: + func (str): name of the function. + dependency (str or list[str]): name(s) of the dependency. 
+ message: extra message to print + Returns: + function: a function object + """ + err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, func) + if message: + err = err + " " + message + + if isinstance(dependency, (list, tuple)): + dependency = ",".join(dependency) + + def _dummy(*args, **kwargs): + raise ImportError(err) + + return _dummy diff --git a/groundingdino/models/GroundingDINO/.ipynb_checkpoints/transformer-checkpoint.py b/groundingdino/models/GroundingDINO/.ipynb_checkpoints/transformer-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..e3da439a39c2c82ae135b36275f8b22c70fbd904 --- /dev/null +++ b/groundingdino/models/GroundingDINO/.ipynb_checkpoints/transformer-checkpoint.py @@ -0,0 +1,961 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# DINO +# Copyright (c) 2022 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Conditional DETR Transformer class. +# Copyright (c) 2021 Microsoft. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Modified from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# ------------------------------------------------------------------------ + +from typing import Optional + +import torch +import torch.utils.checkpoint as checkpoint +from torch import Tensor, nn +import loralib as lora +from groundingdino.util.misc import inverse_sigmoid + +from .fuse_modules import BiAttentionBlock +from .ms_deform_attn import MultiScaleDeformableAttention as MSDeformAttn +from .transformer_vanilla import TransformerEncoderLayer +from .utils import ( + MLP, + _get_activation_fn, + _get_clones, + gen_encoder_output_proposals, + gen_sineembed_for_position, + get_sine_pos_embed, +) + + +class Transformer(nn.Module): + def __init__( + self, + d_model=256, + nhead=8, + num_queries=300, + num_encoder_layers=6, + num_unicoder_layers=0, + num_decoder_layers=6, + dim_feedforward=2048, + dropout=0.0, + activation="relu", + normalize_before=False, + return_intermediate_dec=False, + query_dim=4, + num_patterns=0, + # for deformable encoder + num_feature_levels=1, + enc_n_points=4, + dec_n_points=4, + # init query + learnable_tgt_init=False, + # two stage + two_stage_type="no", # ['no', 'standard', 'early', 'combine', 'enceachlayer', 'enclayer1'] + embed_init_tgt=False, + # for text + use_text_enhancer=False, + use_fusion_layer=False, + use_checkpoint=False, + use_transformer_ckpt=False, + use_text_cross_attention=False, + text_dropout=0.1, + fusion_dropout=0.1, + fusion_droppath=0.0, + ): + super().__init__() + self.num_feature_levels = num_feature_levels + self.num_encoder_layers = num_encoder_layers + self.num_unicoder_layers = num_unicoder_layers + self.num_decoder_layers = num_decoder_layers + self.num_queries = num_queries + assert query_dim == 4 + + # choose encoder layer type + encoder_layer = DeformableTransformerEncoderLayer( + d_model, dim_feedforward, dropout, activation, 
num_feature_levels, nhead, enc_n_points + ) + + if use_text_enhancer: + text_enhance_layer = TransformerEncoderLayer( + d_model=d_model, + nhead=nhead // 2, + dim_feedforward=dim_feedforward // 2, + dropout=text_dropout, + ) + else: + text_enhance_layer = None + + if use_fusion_layer: + feature_fusion_layer = BiAttentionBlock( + v_dim=d_model, + l_dim=d_model, + embed_dim=dim_feedforward // 2, + num_heads=nhead // 2, + dropout=fusion_dropout, + drop_path=fusion_droppath, + ) + else: + feature_fusion_layer = None + + encoder_norm = nn.LayerNorm(d_model) if normalize_before else None + assert encoder_norm is None + self.encoder = TransformerEncoder( + encoder_layer, + num_encoder_layers, + d_model=d_model, + num_queries=num_queries, + text_enhance_layer=text_enhance_layer, + feature_fusion_layer=feature_fusion_layer, + use_checkpoint=use_checkpoint, + use_transformer_ckpt=use_transformer_ckpt, + ) + + # choose decoder layer type + decoder_layer = DeformableTransformerDecoderLayer( + d_model, + dim_feedforward, + dropout, + activation, + num_feature_levels, + nhead, + dec_n_points, + use_text_cross_attention=use_text_cross_attention, + ) + + decoder_norm = nn.LayerNorm(d_model) + self.decoder = TransformerDecoder( + decoder_layer, + num_decoder_layers, + decoder_norm, + return_intermediate=return_intermediate_dec, + d_model=d_model, + query_dim=query_dim, + num_feature_levels=num_feature_levels, + ) + + self.d_model = d_model + self.nhead = nhead + self.dec_layers = num_decoder_layers + self.num_queries = num_queries # useful for single stage model only + self.num_patterns = num_patterns + if not isinstance(num_patterns, int): + Warning("num_patterns should be int but {}".format(type(num_patterns))) + self.num_patterns = 0 + + if num_feature_levels > 1: + if self.num_encoder_layers > 0: + self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) + else: + self.level_embed = None + + self.learnable_tgt_init = learnable_tgt_init + assert learnable_tgt_init, "why not learnable_tgt_init" + self.embed_init_tgt = embed_init_tgt + if (two_stage_type != "no" and embed_init_tgt) or (two_stage_type == "no"): + self.tgt_embed = nn.Embedding(self.num_queries, d_model) + nn.init.normal_(self.tgt_embed.weight.data) + else: + self.tgt_embed = None + + # for two stage + self.two_stage_type = two_stage_type + assert two_stage_type in ["no", "standard"], "unknown param {} of two_stage_type".format( + two_stage_type + ) + if two_stage_type == "standard": + # anchor selection at the output of encoder + self.enc_output = nn.Linear(d_model, d_model) + self.enc_output_norm = nn.LayerNorm(d_model) + self.two_stage_wh_embedding = None + + if two_stage_type == "no": + self.init_ref_points(num_queries) # init self.refpoint_embed + + self.enc_out_class_embed = None + self.enc_out_bbox_embed = None + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformAttn): + m._reset_parameters() + if self.num_feature_levels > 1 and self.level_embed is not None: + nn.init.normal_(self.level_embed) + + def get_valid_ratio(self, mask): + _, H, W = mask.shape + valid_H = torch.sum(~mask[:, :, 0], 1) + valid_W = torch.sum(~mask[:, 0, :], 1) + valid_ratio_h = valid_H.float() / H + valid_ratio_w = valid_W.float() / W + valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) + return valid_ratio + + def init_ref_points(self, use_num_queries): + self.refpoint_embed = 
nn.Embedding(use_num_queries, 4) + + def forward(self, srcs, masks, refpoint_embed, pos_embeds, tgt, attn_mask=None, text_dict=None): + """ + Input: + - srcs: List of multi features [bs, ci, hi, wi] + - masks: List of multi masks [bs, hi, wi] + - refpoint_embed: [bs, num_dn, 4]. None in infer + - pos_embeds: List of multi pos embeds [bs, ci, hi, wi] + - tgt: [bs, num_dn, d_model]. None in infer + + """ + # prepare input for encoder + src_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): + bs, c, h, w = src.shape + spatial_shape = (h, w) + spatial_shapes.append(spatial_shape) + + src = src.flatten(2).transpose(1, 2) # bs, hw, c + mask = mask.flatten(1) # bs, hw + pos_embed = pos_embed.flatten(2).transpose(1, 2) # bs, hw, c + if self.num_feature_levels > 1 and self.level_embed is not None: + lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) + else: + lvl_pos_embed = pos_embed + lvl_pos_embed_flatten.append(lvl_pos_embed) + src_flatten.append(src) + mask_flatten.append(mask) + src_flatten = torch.cat(src_flatten, 1) # bs, \sum{hxw}, c + mask_flatten = torch.cat(mask_flatten, 1) # bs, \sum{hxw} + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) # bs, \sum{hxw}, c + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=src_flatten.device + ) + level_start_index = torch.cat( + (spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]) + ) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) + + # two stage + enc_topk_proposals = enc_refpoint_embed = None + + ######################################################### + # Begin Encoder + ######################################################### + memory, memory_text = self.encoder( + src_flatten, + pos=lvl_pos_embed_flatten, + level_start_index=level_start_index, + spatial_shapes=spatial_shapes, + valid_ratios=valid_ratios, + key_padding_mask=mask_flatten, + memory_text=text_dict["encoded_text"], + text_attention_mask=~text_dict["text_token_mask"], + # we ~ the mask . 
False means use the token; True means pad the token + position_ids=text_dict["position_ids"], + text_self_attention_masks=text_dict["text_self_attention_masks"], + ) + ######################################################### + # End Encoder + # - memory: bs, \sum{hw}, c + # - mask_flatten: bs, \sum{hw} + # - lvl_pos_embed_flatten: bs, \sum{hw}, c + # - enc_intermediate_output: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c) + # - enc_intermediate_refpoints: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c) + ######################################################### + text_dict["encoded_text"] = memory_text + # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': + # if memory.isnan().any() | memory.isinf().any(): + # import ipdb; ipdb.set_trace() + + if self.two_stage_type == "standard": + output_memory, output_proposals = gen_encoder_output_proposals( + memory, mask_flatten, spatial_shapes + ) + output_memory = self.enc_output_norm(self.enc_output(output_memory)) + + if text_dict is not None: + enc_outputs_class_unselected = self.enc_out_class_embed(output_memory, text_dict) + else: + enc_outputs_class_unselected = self.enc_out_class_embed(output_memory) + + topk_logits = enc_outputs_class_unselected.max(-1)[0] + enc_outputs_coord_unselected = ( + self.enc_out_bbox_embed(output_memory) + output_proposals + ) # (bs, \sum{hw}, 4) unsigmoid + topk = self.num_queries + + topk_proposals = torch.topk(topk_logits, topk, dim=1)[1] # bs, nq + + # gather boxes + refpoint_embed_undetach = torch.gather( + enc_outputs_coord_unselected, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + ) # unsigmoid + refpoint_embed_ = refpoint_embed_undetach.detach() + init_box_proposal = torch.gather( + output_proposals, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + ).sigmoid() # sigmoid + + # gather tgt + tgt_undetach = torch.gather( + output_memory, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model) + ) + if self.embed_init_tgt: + tgt_ = ( + self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1) + ) # nq, bs, d_model + else: + tgt_ = tgt_undetach.detach() + + if refpoint_embed is not None: + refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1) + tgt = torch.cat([tgt, tgt_], dim=1) + else: + refpoint_embed, tgt = refpoint_embed_, tgt_ + + elif self.two_stage_type == "no": + tgt_ = ( + self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1) + ) # nq, bs, d_model + refpoint_embed_ = ( + self.refpoint_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1) + ) # nq, bs, 4 + + if refpoint_embed is not None: + refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1) + tgt = torch.cat([tgt, tgt_], dim=1) + else: + refpoint_embed, tgt = refpoint_embed_, tgt_ + + if self.num_patterns > 0: + tgt_embed = tgt.repeat(1, self.num_patterns, 1) + refpoint_embed = refpoint_embed.repeat(1, self.num_patterns, 1) + tgt_pat = self.patterns.weight[None, :, :].repeat_interleave( + self.num_queries, 1 + ) # 1, n_q*n_pat, d_model + tgt = tgt_embed + tgt_pat + + init_box_proposal = refpoint_embed_.sigmoid() + + else: + raise NotImplementedError("unknown two_stage_type {}".format(self.two_stage_type)) + ######################################################### + # End preparing tgt + # - tgt: bs, NQ, d_model + # - refpoint_embed(unsigmoid): bs, NQ, d_model + ######################################################### + + ######################################################### + # Begin Decoder + ######################################################### + hs, references = 
self.decoder( + tgt=tgt.transpose(0, 1), + memory=memory.transpose(0, 1), + memory_key_padding_mask=mask_flatten, + pos=lvl_pos_embed_flatten.transpose(0, 1), + refpoints_unsigmoid=refpoint_embed.transpose(0, 1), + level_start_index=level_start_index, + spatial_shapes=spatial_shapes, + valid_ratios=valid_ratios, + tgt_mask=attn_mask, + memory_text=text_dict["encoded_text"], + text_attention_mask=~text_dict["text_token_mask"], + # we ~ the mask . False means use the token; True means pad the token + ) + ######################################################### + # End Decoder + # hs: n_dec, bs, nq, d_model + # references: n_dec+1, bs, nq, query_dim + ######################################################### + + ######################################################### + # Begin postprocess + ######################################################### + if self.two_stage_type == "standard": + hs_enc = tgt_undetach.unsqueeze(0) + ref_enc = refpoint_embed_undetach.sigmoid().unsqueeze(0) + else: + hs_enc = ref_enc = None + ######################################################### + # End postprocess + # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or (n_enc, bs, nq, d_model) or None + # ref_enc: (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or (n_enc, bs, nq, d_model) or None + ######################################################### + + return hs, references, hs_enc, ref_enc, init_box_proposal + # hs: (n_dec, bs, nq, d_model) + # references: sigmoid coordinates. (n_dec+1, bs, bq, 4) + # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or None + # ref_enc: sigmoid coordinates. \ + # (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or None + + +class TransformerEncoder(nn.Module): + def __init__( + self, + encoder_layer, + num_layers, + d_model=256, + num_queries=300, + enc_layer_share=False, + text_enhance_layer=None, + feature_fusion_layer=None, + use_checkpoint=False, + use_transformer_ckpt=False, + ): + """_summary_ + + Args: + encoder_layer (_type_): _description_ + num_layers (_type_): _description_ + norm (_type_, optional): _description_. Defaults to None. + d_model (int, optional): _description_. Defaults to 256. + num_queries (int, optional): _description_. Defaults to 300. + enc_layer_share (bool, optional): _description_. Defaults to False. 
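+            text_enhance_layer (optional): extra text self-attention layer applied to
+                memory_text. Defaults to None.
+            feature_fusion_layer (optional): image-text bi-attention fusion layer.
+                Defaults to None.
+            use_checkpoint (bool, optional): gradient-checkpoint the fusion layers.
+                Defaults to False.
+            use_transformer_ckpt (bool, optional): gradient-checkpoint the deformable
+                encoder layers. Defaults to False.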
+ + """ + super().__init__() + # prepare layers + self.layers = [] + self.text_layers = [] + self.fusion_layers = [] + if num_layers > 0: + self.layers = _get_clones(encoder_layer, num_layers, layer_share=enc_layer_share) + + if text_enhance_layer is not None: + self.text_layers = _get_clones( + text_enhance_layer, num_layers, layer_share=enc_layer_share + ) + if feature_fusion_layer is not None: + self.fusion_layers = _get_clones( + feature_fusion_layer, num_layers, layer_share=enc_layer_share + ) + else: + self.layers = [] + del encoder_layer + + if text_enhance_layer is not None: + self.text_layers = [] + del text_enhance_layer + if feature_fusion_layer is not None: + self.fusion_layers = [] + del feature_fusion_layer + + self.query_scale = None + self.num_queries = num_queries + self.num_layers = num_layers + self.d_model = d_model + + self.use_checkpoint = use_checkpoint + self.use_transformer_ckpt = use_transformer_ckpt + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + reference_points_list = [] + for lvl, (H_, W_) in enumerate(spatial_shapes): + + ref_y, ref_x = torch.meshgrid( + torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device), + torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device), + ) + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def forward( + self, + # for images + src: Tensor, + pos: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, + valid_ratios: Tensor, + key_padding_mask: Tensor, + # for texts + memory_text: Tensor = None, + text_attention_mask: Tensor = None, + pos_text: Tensor = None, + text_self_attention_masks: Tensor = None, + position_ids: Tensor = None, + ): + """ + Input: + - src: [bs, sum(hi*wi), 256] + - pos: pos embed for src. [bs, sum(hi*wi), 256] + - spatial_shapes: h,w of each level [num_level, 2] + - level_start_index: [num_level] start point of level in sum(hi*wi). 
+ - valid_ratios: [bs, num_level, 2] + - key_padding_mask: [bs, sum(hi*wi)] + + - memory_text: bs, n_text, 256 + - text_attention_mask: bs, n_text + False for no padding; True for padding + - pos_text: bs, n_text, 256 + + - position_ids: bs, n_text + Intermedia: + - reference_points: [bs, sum(hi*wi), num_level, 2] + Outpus: + - output: [bs, sum(hi*wi), 256] + """ + + output = src + + # preparation and reshape + if self.num_layers > 0: + reference_points = self.get_reference_points( + spatial_shapes, valid_ratios, device=src.device + ) + + if self.text_layers: + # generate pos_text + bs, n_text, text_dim = memory_text.shape + if pos_text is None and position_ids is None: + pos_text = ( + torch.arange(n_text, device=memory_text.device) + .float() + .unsqueeze(0) + .unsqueeze(-1) + .repeat(bs, 1, 1) + ) + pos_text = get_sine_pos_embed(pos_text, num_pos_feats=256, exchange_xy=False) + if position_ids is not None: + pos_text = get_sine_pos_embed( + position_ids[..., None], num_pos_feats=256, exchange_xy=False + ) + + # main process + for layer_id, layer in enumerate(self.layers): + # if output.isnan().any() or memory_text.isnan().any(): + # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': + # import ipdb; ipdb.set_trace() + if self.fusion_layers: + if self.use_checkpoint: + output, memory_text = checkpoint.checkpoint( + self.fusion_layers[layer_id], + output, + memory_text, + key_padding_mask, + text_attention_mask, + ) + else: + output, memory_text = self.fusion_layers[layer_id]( + v=output, + l=memory_text, + attention_mask_v=key_padding_mask, + attention_mask_l=text_attention_mask, + ) + + if self.text_layers: + memory_text = self.text_layers[layer_id]( + src=memory_text.transpose(0, 1), + src_mask=~text_self_attention_masks, # note we use ~ for mask here + src_key_padding_mask=text_attention_mask, + pos=(pos_text.transpose(0, 1) if pos_text is not None else None), + ).transpose(0, 1) + + # main process + if self.use_transformer_ckpt: + output = checkpoint.checkpoint( + layer, + output, + pos, + reference_points, + spatial_shapes, + level_start_index, + key_padding_mask, + ) + else: + output = layer( + src=output, + pos=pos, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + key_padding_mask=key_padding_mask, + ) + + return output, memory_text + + +class TransformerDecoder(nn.Module): + def __init__( + self, + decoder_layer, + num_layers, + norm=None, + return_intermediate=False, + d_model=256, + query_dim=4, + num_feature_levels=1, + ): + super().__init__() + if num_layers > 0: + self.layers = _get_clones(decoder_layer, num_layers) + else: + self.layers = [] + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + assert return_intermediate, "support return_intermediate only" + self.query_dim = query_dim + assert query_dim in [2, 4], "query_dim should be 2/4 but {}".format(query_dim) + self.num_feature_levels = num_feature_levels + + self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2) + self.query_pos_sine_scale = None + + self.query_scale = None + self.bbox_embed = None + self.class_embed = None + + self.d_model = d_model + + self.ref_anchor_head = None + + def forward( + self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + refpoints_unsigmoid: Optional[Tensor] = None, # 
num_queries, bs, 2 + # for memory + level_start_index: Optional[Tensor] = None, # num_levels + spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2 + valid_ratios: Optional[Tensor] = None, + # for text + memory_text: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + ): + """ + Input: + - tgt: nq, bs, d_model + - memory: hw, bs, d_model + - pos: hw, bs, d_model + - refpoints_unsigmoid: nq, bs, 2/4 + - valid_ratios/spatial_shapes: bs, nlevel, 2 + """ + output = tgt + + intermediate = [] + reference_points = refpoints_unsigmoid.sigmoid() + ref_points = [reference_points] + + for layer_id, layer in enumerate(self.layers): + + if reference_points.shape[-1] == 4: + reference_points_input = ( + reference_points[:, :, None] + * torch.cat([valid_ratios, valid_ratios], -1)[None, :] + ) # nq, bs, nlevel, 4 + else: + assert reference_points.shape[-1] == 2 + reference_points_input = reference_points[:, :, None] * valid_ratios[None, :] + query_sine_embed = gen_sineembed_for_position( + reference_points_input[:, :, 0, :] + ) # nq, bs, 256*2 + + # conditional query + raw_query_pos = self.ref_point_head(query_sine_embed) # nq, bs, 256 + pos_scale = self.query_scale(output) if self.query_scale is not None else 1 + query_pos = pos_scale * raw_query_pos + # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': + # if query_pos.isnan().any() | query_pos.isinf().any(): + # import ipdb; ipdb.set_trace() + + # main process + output = layer( + tgt=output, + tgt_query_pos=query_pos, + tgt_query_sine_embed=query_sine_embed, + tgt_key_padding_mask=tgt_key_padding_mask, + tgt_reference_points=reference_points_input, + memory_text=memory_text, + text_attention_mask=text_attention_mask, + memory=memory, + memory_key_padding_mask=memory_key_padding_mask, + memory_level_start_index=level_start_index, + memory_spatial_shapes=spatial_shapes, + memory_pos=pos, + self_attn_mask=tgt_mask, + cross_attn_mask=memory_mask, + ) + if output.isnan().any() | output.isinf().any(): + print(f"output layer_id {layer_id} is nan") + try: + num_nan = output.isnan().sum().item() + num_inf = output.isinf().sum().item() + print(f"num_nan {num_nan}, num_inf {num_inf}") + except Exception as e: + print(e) + # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': + # import ipdb; ipdb.set_trace() + + # iter update + if self.bbox_embed is not None: + # box_holder = self.bbox_embed(output) + # box_holder[..., :self.query_dim] += inverse_sigmoid(reference_points) + # new_reference_points = box_holder[..., :self.query_dim].sigmoid() + + reference_before_sigmoid = inverse_sigmoid(reference_points) + delta_unsig = self.bbox_embed[layer_id](output) + outputs_unsig = delta_unsig + reference_before_sigmoid + new_reference_points = outputs_unsig.sigmoid() + + reference_points = new_reference_points.detach() + # if layer_id != self.num_layers - 1: + ref_points.append(new_reference_points) + + intermediate.append(self.norm(output)) + + return [ + [itm_out.transpose(0, 1) for itm_out in intermediate], + [itm_refpoint.transpose(0, 1) for itm_refpoint in ref_points], + ] + + +class DeformableTransformerEncoderLayer(nn.Module): + def __init__( + self, + d_model=256, + d_ffn=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_heads=8, + n_points=4, + ): + super().__init__() + + # self attention + self.self_attn = MSDeformAttn( + embed_dim=d_model, + num_levels=n_levels, + num_heads=n_heads, + num_points=n_points, + batch_first=True, + ) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # ffn + r 
= 16 + self.linear1 = lora.Linear(d_model, d_ffn , r=r) + self.activation = _get_activation_fn(activation, d_model=d_ffn) + self.dropout2 = nn.Dropout(dropout) + self.linear2 = lora.Linear(d_ffn, d_model , r=r) + self.dropout3 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model) + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, src): + src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) + src = src + self.dropout3(src2) + src = self.norm2(src) + return src + + def forward( + self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None + ): + # self attention + # import ipdb; ipdb.set_trace() + src2 = self.self_attn( + query=self.with_pos_embed(src, pos), + reference_points=reference_points, + value=src, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + key_padding_mask=key_padding_mask, + ) + src = src + self.dropout1(src2) + src = self.norm1(src) + + # ffn + src = self.forward_ffn(src) + + return src + + +class DeformableTransformerDecoderLayer(nn.Module): + def __init__( + self, + d_model=256, + d_ffn=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_heads=8, + n_points=4, + use_text_feat_guide=False, + use_text_cross_attention=False, + ): + super().__init__() + + # cross attention + self.cross_attn = MSDeformAttn( + embed_dim=d_model, + num_levels=n_levels, + num_heads=n_heads, + num_points=n_points, + batch_first=True, + ) + self.dropout1 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.norm1 = nn.LayerNorm(d_model) + + # cross attention text + if use_text_cross_attention: + self.ca_text = nn.MultiheadAttention(d_model, n_heads, dropout=dropout) + self.catext_dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.catext_norm = nn.LayerNorm(d_model) + + # self attention + self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout) + self.dropout2 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.norm2 = nn.LayerNorm(d_model) + + # ffn + r = 16 + self.linear1 = lora.Linear(d_model, d_ffn , r=r) + self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1) + self.dropout3 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.linear2 = lora.Linear(d_ffn, d_model , r=r) + self.dropout4 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.norm3 = nn.LayerNorm(d_model) + + self.key_aware_proj = None + self.use_text_feat_guide = use_text_feat_guide + assert not use_text_feat_guide + self.use_text_cross_attention = use_text_cross_attention + + def rm_self_attn_modules(self): + self.self_attn = None + self.dropout2 = None + self.norm2 = None + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt): + with torch.cuda.amp.autocast(enabled=False): + tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout4(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward( + self, + # for tgt + tgt: Optional[Tensor], # nq, bs, d_model + tgt_query_pos: Optional[Tensor] = None, # pos for query. MLP(Sine(pos)) + tgt_query_sine_embed: Optional[Tensor] = None, # pos for query. 
Sine(pos) + tgt_key_padding_mask: Optional[Tensor] = None, + tgt_reference_points: Optional[Tensor] = None, # nq, bs, 4 + memory_text: Optional[Tensor] = None, # bs, num_token, d_model + text_attention_mask: Optional[Tensor] = None, # bs, num_token + # for memory + memory: Optional[Tensor] = None, # hw, bs, d_model + memory_key_padding_mask: Optional[Tensor] = None, + memory_level_start_index: Optional[Tensor] = None, # num_levels + memory_spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2 + memory_pos: Optional[Tensor] = None, # pos for memory + # sa + self_attn_mask: Optional[Tensor] = None, # mask used for self-attention + cross_attn_mask: Optional[Tensor] = None, # mask used for cross-attention + ): + """ + Input: + - tgt/tgt_query_pos: nq, bs, d_model + - + """ + assert cross_attn_mask is None + + # self attention + if self.self_attn is not None: + # import ipdb; ipdb.set_trace() + q = k = self.with_pos_embed(tgt, tgt_query_pos) + tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + if self.use_text_cross_attention: + tgt2 = self.ca_text( + self.with_pos_embed(tgt, tgt_query_pos), + memory_text.transpose(0, 1), + memory_text.transpose(0, 1), + key_padding_mask=text_attention_mask, + )[0] + tgt = tgt + self.catext_dropout(tgt2) + tgt = self.catext_norm(tgt) + + tgt2 = self.cross_attn( + query=self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1), + reference_points=tgt_reference_points.transpose(0, 1).contiguous(), + value=memory.transpose(0, 1), + spatial_shapes=memory_spatial_shapes, + level_start_index=memory_level_start_index, + key_padding_mask=memory_key_padding_mask, + ).transpose(0, 1) + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + + # ffn + tgt = self.forward_ffn(tgt) + + return tgt + + +def build_transformer(args): + return Transformer( + d_model=args.hidden_dim, + dropout=args.dropout, + nhead=args.nheads, + num_queries=args.num_queries, + dim_feedforward=args.dim_feedforward, + num_encoder_layers=args.enc_layers, + num_decoder_layers=args.dec_layers, + normalize_before=args.pre_norm, + return_intermediate_dec=True, + query_dim=args.query_dim, + activation=args.transformer_activation, + num_patterns=args.num_patterns, + num_feature_levels=args.num_feature_levels, + enc_n_points=args.enc_n_points, + dec_n_points=args.dec_n_points, + learnable_tgt_init=True, + # two stage + two_stage_type=args.two_stage_type, # ['no', 'standard', 'early'] + embed_init_tgt=args.embed_init_tgt, + use_text_enhancer=args.use_text_enhancer, + use_fusion_layer=args.use_fusion_layer, + use_checkpoint=args.use_checkpoint, + use_transformer_ckpt=args.use_transformer_ckpt, + use_text_cross_attention=args.use_text_cross_attention, + text_dropout=args.text_dropout, + fusion_dropout=args.fusion_dropout, + fusion_droppath=args.fusion_droppath, + ) diff --git a/groundingdino/models/GroundingDINO/.ipynb_checkpoints/transformer_vanilla-checkpoint.py b/groundingdino/models/GroundingDINO/.ipynb_checkpoints/transformer_vanilla-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..710e1c21c87281def4da9af9670e1ad880dc0f5f --- /dev/null +++ b/groundingdino/models/GroundingDINO/.ipynb_checkpoints/transformer_vanilla-checkpoint.py @@ -0,0 +1,124 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. 
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +DETR Transformer class. + +Copy-paste from torch.nn.Transformer with modifications: + * positional encodings are passed in MHattention + * extra LN at the end of encoder is removed + * decoder returns a stack of activations from all decoding layers +""" +from typing import Optional + +import torch +import torch.nn.functional as F +from torch import Tensor, nn +import loralib as lora +from .utils import ( + MLP, + _get_activation_fn, + _get_clones, + gen_encoder_output_proposals, + gen_sineembed_for_position, + sigmoid_focal_loss, +) + + +class TextTransformer(nn.Module): + def __init__(self, num_layers, d_model=256, nheads=8, dim_feedforward=2048, dropout=0.1): + super().__init__() + self.num_layers = num_layers + self.d_model = d_model + self.nheads = nheads + self.dim_feedforward = dim_feedforward + self.norm = None + + single_encoder_layer = TransformerEncoderLayer( + d_model=d_model, nhead=nheads, dim_feedforward=dim_feedforward, dropout=dropout + ) + self.layers = _get_clones(single_encoder_layer, num_layers) + + def forward(self, memory_text: torch.Tensor, text_attention_mask: torch.Tensor): + """ + + Args: + text_attention_mask: bs, num_token + memory_text: bs, num_token, d_model + + Raises: + RuntimeError: _description_ + + Returns: + output: bs, num_token, d_model + """ + + output = memory_text.transpose(0, 1) + + for layer in self.layers: + output = layer(output, src_key_padding_mask=text_attention_mask) + + if self.norm is not None: + output = self.norm(output) + + return output.transpose(0, 1) + + +class TransformerEncoderLayer(nn.Module): + def __init__( + self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + ): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + r=16 + self.linear1 = lora.Linear(d_model, dim_feedforward , r=r) + self.dropout = nn.Dropout(dropout) + self.linear2 = lora.Linear(dim_feedforward, d_model , r=r) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + self.nhead = nhead + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward( + self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + ): + # repeat attn mask + if src_mask.dim() == 3 and src_mask.shape[0] == src.shape[1]: + # bs, num_q, num_k + src_mask = src_mask.repeat(self.nhead, 1, 1) + + q = k = self.with_pos_embed(src, pos) + + src2 = self.self_attn(q, k, value=src, attn_mask=src_mask)[0] + + # src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src diff --git a/groundingdino/models/GroundingDINO/.ipynb_checkpoints/utils-checkpoint.py 
b/groundingdino/models/GroundingDINO/.ipynb_checkpoints/utils-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..319c02da18b1da17f64bd6e03372ccab8ec21c87 --- /dev/null +++ b/groundingdino/models/GroundingDINO/.ipynb_checkpoints/utils-checkpoint.py @@ -0,0 +1,269 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ + +import copy +import math + +import torch +import torch.nn.functional as F +from torch import Tensor, nn +import loralib as lora + +def _get_clones(module, N, layer_share=False): + # import ipdb; ipdb.set_trace() + if layer_share: + return nn.ModuleList([module for i in range(N)]) + else: + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def get_sine_pos_embed( + pos_tensor: torch.Tensor, + num_pos_feats: int = 128, + temperature: int = 10000, + exchange_xy: bool = True, +): + """generate sine position embedding from a position tensor + Args: + pos_tensor (torch.Tensor): shape: [..., n]. + num_pos_feats (int): projected shape for each float in the tensor. + temperature (int): temperature in the sine/cosine function. + exchange_xy (bool, optional): exchange pos x and pos y. \ + For example, input tensor is [x,y], the results will be [pos(y), pos(x)]. Defaults to True. + Returns: + pos_embed (torch.Tensor): shape: [..., n*num_pos_feats]. + """ + scale = 2 * math.pi + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + + def sine_func(x: torch.Tensor): + sin_x = x * scale / dim_t + sin_x = torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), dim=3).flatten(2) + return sin_x + + pos_res = [sine_func(x) for x in pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1)] + if exchange_xy: + pos_res[0], pos_res[1] = pos_res[1], pos_res[0] + pos_res = torch.cat(pos_res, dim=-1) + return pos_res + + +def gen_encoder_output_proposals( + memory: Tensor, memory_padding_mask: Tensor, spatial_shapes: Tensor, learnedwh=None +): + """ + Input: + - memory: bs, \sum{hw}, d_model + - memory_padding_mask: bs, \sum{hw} + - spatial_shapes: nlevel, 2 + - learnedwh: 2 + Output: + - output_memory: bs, \sum{hw}, d_model + - output_proposals: bs, \sum{hw}, 4 + """ + N_, S_, C_ = memory.shape + proposals = [] + _cur = 0 + for lvl, (H_, W_) in enumerate(spatial_shapes): + mask_flatten_ = memory_padding_mask[:, _cur : (_cur + H_ * W_)].view(N_, H_, W_, 1) + valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + # import ipdb; ipdb.set_trace() + + grid_y, grid_x = torch.meshgrid( + torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device), + torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device), + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) # H_, W_, 2 + + scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale + + if learnedwh is not None: + # import ipdb; ipdb.set_trace() + wh = torch.ones_like(grid) * learnedwh.sigmoid() * (2.0**lvl) + else: + wh = torch.ones_like(grid) * 0.05 * (2.0**lvl) + + # scale = 
torch.cat([W_[None].unsqueeze(-1), H_[None].unsqueeze(-1)], 1).view(1, 1, 1, 2).repeat(N_, 1, 1, 1) + # grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale + # wh = torch.ones_like(grid) / scale + proposal = torch.cat((grid, wh), -1).view(N_, -1, 4) + proposals.append(proposal) + _cur += H_ * W_ + # import ipdb; ipdb.set_trace() + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all( + -1, keepdim=True + ) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) # unsigmoid + output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + + output_memory = memory + output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0)) + output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) + + # output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf')) + # output_memory = output_memory.masked_fill(~output_proposals_valid, float('inf')) + + return output_memory, output_proposals + + +class RandomBoxPerturber: + def __init__( + self, x_noise_scale=0.2, y_noise_scale=0.2, w_noise_scale=0.2, h_noise_scale=0.2 + ) -> None: + self.noise_scale = torch.Tensor( + [x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale] + ) + + def __call__(self, refanchors: Tensor) -> Tensor: + nq, bs, query_dim = refanchors.shape + device = refanchors.device + + noise_raw = torch.rand_like(refanchors) + noise_scale = self.noise_scale.to(device)[:query_dim] + + new_refanchors = refanchors * (1 + (noise_raw - 0.5) * noise_scale) + return new_refanchors.clamp_(0, 1) + + +def sigmoid_focal_loss( + inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2, no_reduction=False +): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). + gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. 
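+        num_boxes: normalization factor applied to the reduced loss.
+        no_reduction: if True, return the per-element loss without reduction.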
+ Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + if no_reduction: + return loss + + return loss.mean(1).sum() / num_boxes + + +class MLP(nn.Module): + """Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + r=16 + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList( + [lora.Linear(n, k, r=r) for n, k in zip([input_dim] + h, h + [output_dim])] + ) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +def _get_activation_fn(activation, d_model=256, batch_dim=0): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + if activation == "prelu": + return nn.PReLU() + if activation == "selu": + return F.selu + + raise RuntimeError(f"activation should be relu/gelu, not {activation}.") + + +def gen_sineembed_for_position(pos_tensor): + # n_query, bs, _ = pos_tensor.size() + # sineembed_tensor = torch.zeros(n_query, bs, 256) + scale = 2 * math.pi + dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) + dim_t = 10000 ** (2 * (torch.div(dim_t, 2, rounding_mode='floor')) / 128) + x_embed = pos_tensor[:, :, 0] * scale + y_embed = pos_tensor[:, :, 1] * scale + pos_x = x_embed[:, :, None] / dim_t + pos_y = y_embed[:, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) + pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) + if pos_tensor.size(-1) == 2: + pos = torch.cat((pos_y, pos_x), dim=2) + elif pos_tensor.size(-1) == 4: + w_embed = pos_tensor[:, :, 2] * scale + pos_w = w_embed[:, :, None] / dim_t + pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) + + h_embed = pos_tensor[:, :, 3] * scale + pos_h = h_embed[:, :, None] / dim_t + pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) + + pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) + else: + raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1))) + return pos + + +class ContrastiveEmbed(nn.Module): + def __init__(self, max_text_len=256): + """ + Args: + max_text_len: max length of text. + """ + super().__init__() + self.max_text_len = max_text_len + + def forward(self, x, text_dict): + """_summary_ + + Args: + x (_type_): _description_ + text_dict (_type_): _description_ + { + 'encoded_text': encoded_text, # bs, 195, d_model + 'text_token_mask': text_token_mask, # bs, 195 + # True for used tokens. 
False for padding tokens + } + Returns: + _type_: _description_ + """ + assert isinstance(text_dict, dict) + + y = text_dict["encoded_text"] + text_token_mask = text_dict["text_token_mask"] + + res = x @ y.transpose(-1, -2) + res.masked_fill_(~text_token_mask[:, None, :], float("-inf")) + + # padding to max_text_len + new_res = torch.full((*res.shape[:-1], self.max_text_len), float("-inf"), device=res.device) + new_res[..., : res.shape[-1]] = res + + return new_res diff --git a/groundingdino/models/GroundingDINO/__pycache__/__init__.cpython-310.pyc b/groundingdino/models/GroundingDINO/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b27a34a90883ace0581c3cf53e54c122e4027060 Binary files /dev/null and b/groundingdino/models/GroundingDINO/__pycache__/__init__.cpython-310.pyc differ diff --git a/groundingdino/models/GroundingDINO/__pycache__/bertwarper.cpython-310.pyc b/groundingdino/models/GroundingDINO/__pycache__/bertwarper.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..44c68f77e5c5dfad2797910e76782a6c1b94b5f4 Binary files /dev/null and b/groundingdino/models/GroundingDINO/__pycache__/bertwarper.cpython-310.pyc differ diff --git a/groundingdino/models/GroundingDINO/__pycache__/fuse_modules.cpython-310.pyc b/groundingdino/models/GroundingDINO/__pycache__/fuse_modules.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..561ecd957e3462f9f0dbc309fd1d6c902ad10ff0 Binary files /dev/null and b/groundingdino/models/GroundingDINO/__pycache__/fuse_modules.cpython-310.pyc differ diff --git a/groundingdino/models/GroundingDINO/__pycache__/groundingdino.cpython-310.pyc b/groundingdino/models/GroundingDINO/__pycache__/groundingdino.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c5328df4104890b8f3700c46601d696dd33353f Binary files /dev/null and b/groundingdino/models/GroundingDINO/__pycache__/groundingdino.cpython-310.pyc differ diff --git a/groundingdino/models/GroundingDINO/__pycache__/ms_deform_attn.cpython-310.pyc b/groundingdino/models/GroundingDINO/__pycache__/ms_deform_attn.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91f517e3908449fe0d9f61c44da767a68f39c3ee Binary files /dev/null and b/groundingdino/models/GroundingDINO/__pycache__/ms_deform_attn.cpython-310.pyc differ diff --git a/groundingdino/models/GroundingDINO/__pycache__/transformer.cpython-310.pyc b/groundingdino/models/GroundingDINO/__pycache__/transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f9a93a09f4bf3217989d2d6b323540f5e89266e Binary files /dev/null and b/groundingdino/models/GroundingDINO/__pycache__/transformer.cpython-310.pyc differ diff --git a/groundingdino/models/GroundingDINO/__pycache__/transformer_vanilla.cpython-310.pyc b/groundingdino/models/GroundingDINO/__pycache__/transformer_vanilla.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6fdc5340968220b32f526672be6bc22e0edcd90 Binary files /dev/null and b/groundingdino/models/GroundingDINO/__pycache__/transformer_vanilla.cpython-310.pyc differ diff --git a/groundingdino/models/GroundingDINO/__pycache__/utils.cpython-310.pyc b/groundingdino/models/GroundingDINO/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d1cf0c2d69982b93dae2217ca59587a2a4fcf534 Binary files /dev/null and 
b/groundingdino/models/GroundingDINO/__pycache__/utils.cpython-310.pyc differ diff --git a/groundingdino/models/GroundingDINO/backbone/.ipynb_checkpoints/backbone-checkpoint.py b/groundingdino/models/GroundingDINO/backbone/.ipynb_checkpoints/backbone-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..59b851deacb80a9496b5082db8fb82c186c1e822 --- /dev/null +++ b/groundingdino/models/GroundingDINO/backbone/.ipynb_checkpoints/backbone-checkpoint.py @@ -0,0 +1,220 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Conditional DETR +# Copyright (c) 2021 Microsoft. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Copied from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# ------------------------------------------------------------------------ + + +# Backbone modules. + +from typing import Dict, List +import loralib as lora +import torch +import torch.nn.functional as F +import torchvision +from torch import nn +from torchvision.models._utils import IntermediateLayerGetter +import loralib as lora +from groundingdino.util.misc import NestedTensor, clean_state_dict, is_main_process + +from .position_encoding import build_position_encoding +from .swin_transformer import build_swin_transformer + + +class FrozenBatchNorm2d(torch.nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, + without which any other models than torchvision.models.resnet[18,34,50,101] + produce nans. 
+ """ + + def __init__(self, n): + super(FrozenBatchNorm2d, self).__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super(FrozenBatchNorm2d, self)._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it fuser-friendly + w = self.weight.reshape(1, -1, 1, 1) + b = self.bias.reshape(1, -1, 1, 1) + rv = self.running_var.reshape(1, -1, 1, 1) + rm = self.running_mean.reshape(1, -1, 1, 1) + eps = 1e-5 + scale = w * (rv + eps).rsqrt() + bias = b - rm * scale + return x * scale + bias + + +class BackboneBase(nn.Module): + def __init__( + self, + backbone: nn.Module, + train_backbone: bool, + num_channels: int, + return_interm_indices: list, + ): + super().__init__() + for name, parameter in backbone.named_parameters(): + if ( + not train_backbone + or "layer2" not in name + and "layer3" not in name + and "layer4" not in name + ): + parameter.requires_grad_(False) + + return_layers = {} + for idx, layer_index in enumerate(return_interm_indices): + return_layers.update( + {"layer{}".format(5 - len(return_interm_indices) + idx): "{}".format(layer_index)} + ) + + # if len: + # if use_stage1_feature: + # return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"} + # else: + # return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"} + # else: + # return_layers = {'layer4': "0"} + self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) + self.num_channels = num_channels + + def forward(self, tensor_list: NestedTensor): + xs = self.body(tensor_list.tensors) + out: Dict[str, NestedTensor] = {} + for name, x in xs.items(): + m = tensor_list.mask + assert m is not None + mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] + out[name] = NestedTensor(x, mask) + # import ipdb; ipdb.set_trace() + return out + + +class Backbone(BackboneBase): + """ResNet backbone with frozen BatchNorm.""" + + def __init__( + self, + name: str, + train_backbone: bool, + dilation: bool, + return_interm_indices: list, + batch_norm=FrozenBatchNorm2d, + ): + if name in ["resnet18", "resnet34", "resnet50", "resnet101"]: + backbone = getattr(torchvision.models, name)( + replace_stride_with_dilation=[False, False, dilation], + pretrained=is_main_process(), + norm_layer=batch_norm, + ) + else: + raise NotImplementedError("Why you can get here with name {}".format(name)) + # num_channels = 512 if name in ('resnet18', 'resnet34') else 2048 + assert name not in ("resnet18", "resnet34"), "Only resnet50 and resnet101 are available." 
+ assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]] + num_channels_all = [256, 512, 1024, 2048] + num_channels = num_channels_all[4 - len(return_interm_indices) :] + super().__init__(backbone, train_backbone, num_channels, return_interm_indices) + + +class Joiner(nn.Sequential): + def __init__(self, backbone, position_embedding): + super().__init__(backbone, position_embedding) + + def forward(self, tensor_list: NestedTensor): + xs = self[0](tensor_list) + out: List[NestedTensor] = [] + pos = [] + for name, x in xs.items(): + out.append(x) + # position encoding + pos.append(self[1](x).to(x.tensors.dtype)) + + return out, pos + + +def build_backbone(args): + """ + Useful args: + - backbone: backbone name + - lr_backbone: + - dilation + - return_interm_indices: available: [0,1,2,3], [1,2,3], [3] + - backbone_freeze_keywords: + - use_checkpoint: for swin only for now + + """ + position_embedding = build_position_encoding(args) + train_backbone = True + if not train_backbone: + raise ValueError("Please set lr_backbone > 0") + return_interm_indices = args.return_interm_indices + assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]] + args.backbone_freeze_keywords + use_checkpoint = getattr(args, "use_checkpoint", False) + + if args.backbone in ["resnet50", "resnet101"]: + backbone = Backbone( + args.backbone, + train_backbone, + args.dilation, + return_interm_indices, + batch_norm=FrozenBatchNorm2d, + ) + bb_num_channels = backbone.num_channels + elif args.backbone in [ + "swin_T_224_1k", + "swin_B_224_22k", + "swin_B_384_22k", + "swin_L_224_22k", + "swin_L_384_22k", + ]: + pretrain_img_size = int(args.backbone.split("_")[-2]) + backbone = build_swin_transformer( + args.backbone, + pretrain_img_size=pretrain_img_size, + out_indices=tuple(return_interm_indices), + dilation=False, + use_checkpoint=use_checkpoint, + ) + + bb_num_channels = backbone.num_features[4 - len(return_interm_indices) :] + else: + raise NotImplementedError("Unknown backbone {}".format(args.backbone)) + + assert len(bb_num_channels) == len( + return_interm_indices + ), f"len(bb_num_channels) {len(bb_num_channels)} != len(return_interm_indices) {len(return_interm_indices)}" + + model = Joiner(backbone, position_embedding) + model.num_channels = bb_num_channels + assert isinstance( + bb_num_channels, List + ), "bb_num_channels is expected to be a List but {}".format(type(bb_num_channels)) + # import ipdb; ipdb.set_trace() + return model diff --git a/groundingdino/models/GroundingDINO/backbone/.ipynb_checkpoints/position_encoding-checkpoint.py b/groundingdino/models/GroundingDINO/backbone/.ipynb_checkpoints/position_encoding-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..1de975730d80b78552e7345c78c923ac9fcad52d --- /dev/null +++ b/groundingdino/models/GroundingDINO/backbone/.ipynb_checkpoints/position_encoding-checkpoint.py @@ -0,0 +1,186 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# DINO +# Copyright (c) 2022 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Conditional DETR +# Copyright (c) 2021 Microsoft. All Rights Reserved. 
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Copied from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# ------------------------------------------------------------------------ + +""" +Various positional encodings for the transformer. +""" +import math + +import torch +from torch import nn +import loralib as lora +from groundingdino.util.misc import NestedTensor + + +class PositionEmbeddingSine(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. + """ + + def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + mask = tensor_list.mask + assert mask is not None + not_mask = ~mask + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + # if os.environ.get("SHILONG_AMP", None) == '1': + # eps = 1e-4 + # else: + # eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +class PositionEmbeddingSineHW(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. 
+ """ + + def __init__( + self, num_pos_feats=64, temperatureH=10000, temperatureW=10000, normalize=False, scale=None + ): + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperatureH = temperatureH + self.temperatureW = temperatureW + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + mask = tensor_list.mask + assert mask is not None + not_mask = ~mask + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + + # import ipdb; ipdb.set_trace() + + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_tx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_tx = self.temperatureW ** (2 * (torch.div(dim_tx, 2, rounding_mode='floor')) / self.num_pos_feats) + pos_x = x_embed[:, :, :, None] / dim_tx + + dim_ty = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_ty = self.temperatureH ** (2 * (torch.div(dim_ty, 2, rounding_mode='floor')) / self.num_pos_feats) + pos_y = y_embed[:, :, :, None] / dim_ty + + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + + # import ipdb; ipdb.set_trace() + + return pos + + +class PositionEmbeddingLearned(nn.Module): + """ + Absolute pos embedding, learned. + """ + + def __init__(self, num_pos_feats=256): + super().__init__() + self.row_embed = nn.Embedding(50, num_pos_feats) + self.col_embed = nn.Embedding(50, num_pos_feats) + self.reset_parameters() + + def reset_parameters(self): + nn.init.uniform_(self.row_embed.weight) + nn.init.uniform_(self.col_embed.weight) + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + h, w = x.shape[-2:] + i = torch.arange(w, device=x.device) + j = torch.arange(h, device=x.device) + x_emb = self.col_embed(i) + y_emb = self.row_embed(j) + pos = ( + torch.cat( + [ + x_emb.unsqueeze(0).repeat(h, 1, 1), + y_emb.unsqueeze(1).repeat(1, w, 1), + ], + dim=-1, + ) + .permute(2, 0, 1) + .unsqueeze(0) + .repeat(x.shape[0], 1, 1, 1) + ) + return pos + + +def build_position_encoding(args): + N_steps = args.hidden_dim // 2 + if args.position_embedding in ("v2", "sine"): + # TODO find a better way of exposing other arguments + position_embedding = PositionEmbeddingSineHW( + N_steps, + temperatureH=args.pe_temperatureH, + temperatureW=args.pe_temperatureW, + normalize=True, + ) + elif args.position_embedding in ("v3", "learned"): + position_embedding = PositionEmbeddingLearned(N_steps) + else: + raise ValueError(f"not supported {args.position_embedding}") + + return position_embedding diff --git a/groundingdino/models/GroundingDINO/backbone/.ipynb_checkpoints/swin_transformer-checkpoint.py b/groundingdino/models/GroundingDINO/backbone/.ipynb_checkpoints/swin_transformer-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..d75fe2d979f60cecc88b59711eb39accf24e6a8c --- /dev/null +++ b/groundingdino/models/GroundingDINO/backbone/.ipynb_checkpoints/swin_transformer-checkpoint.py @@ -0,0 +1,804 @@ +# 
------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# DINO +# Copyright (c) 2022 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# -------------------------------------------------------- +# modified from https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/master/mmdet/models/backbones/swin_transformer.py +# -------------------------------------------------------- + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ +import loralib as lora +from groundingdino.util.misc import NestedTensor + + +class Mlp(nn.Module): + """Multilayer perceptron.""" + + def __init__( + self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0 + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + r = 16 + self.fc1 = lora.Linear(in_features, hidden_features , r=r) + self.act = act_layer() + self.fc2 = lora.Linear(hidden_features, out_features , r=r) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowAttention(nn.Module): + """Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + """ + + def __init__( + self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + r =16 + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads) + ) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = lora.Linear(dim, dim * 3,r=r , bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = lora.Linear(dim, dim , r=r) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=0.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """Forward function. + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = ( + self.qkv(x) + .reshape(B_, N, 3, self.num_heads, C // self.num_heads) + .permute(2, 0, 3, 1, 4) + ) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = q @ k.transpose(-2, -1) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1) + ].view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 + ) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1 + ).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock(nn.Module): + """Swin Transformer Block. + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. 
Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__( + self, + dim, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop + ) + + self.H = None + self.W = None + + def forward(self, x, mask_matrix): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + mask_matrix: Attention mask for cyclic shift. + """ + B, L, C = x.shape + H, W = self.H, self.W + assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # pad feature maps to multiples of window size + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) + _, Hp, Wp, _ = x.shape + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size + ) # nW*B, window_size, window_size, C + x_windows = x_windows.view( + -1, self.window_size * self.window_size, C + ) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :].contiguous() + + x = x.view(B, H * W, C) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + +class PatchMerging(nn.Module): + """Patch Merging Layer + Args: + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + r = 16 + self.reduction = lora.Linear(4 * dim, 2 * dim, r=r, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x, H, W): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. 
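Returns:
    Tensor of shape (B, ceil(H/2)*ceil(W/2), 2*C): 2x2 neighboring patches are concatenated to 4*C channels and then reduced to 2*C by the (LoRA) linear reduction.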
+ """ + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + x = x.view(B, H, W, C) + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +class BasicLayer(nn.Module): + """A basic Swin Transformer layer for one stage. + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (int): Local window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ + + def __init__( + self, + dim, + depth, + num_heads, + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False, + ): + super().__init__() + self.window_size = window_size + self.shift_size = window_size // 2 + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList( + [ + SwinTransformerBlock( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, H, W): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. 
+ """ + + # calculate attention mask for SW-MSA + Hp = int(np.ceil(H / self.window_size)) * self.window_size + Wp = int(np.ceil(W / self.window_size)) * self.window_size + img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 + h_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + w_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size + ) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill( + attn_mask == 0, float(0.0) + ) + + for blk in self.blocks: + blk.H, blk.W = H, W + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, attn_mask) + else: + x = blk(x, attn_mask) + if self.downsample is not None: + x_down = self.downsample(x, H, W) + Wh, Ww = (H + 1) // 2, (W + 1) // 2 + return x, H, W, x_down, Wh, Ww + else: + return x, H, W, x, H, W + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding + Args: + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + patch_size = to_2tuple(patch_size) + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, H, W = x.size() + if W % self.patch_size[1] != 0: + x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1])) + if H % self.patch_size[0] != 0: + x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) + + x = self.proj(x) # B C Wh Ww + if self.norm is not None: + Wh, Ww = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww) + + return x + + +class SwinTransformer(nn.Module): + """Swin Transformer backbone. + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + Args: + pretrain_img_size (int): Input image size for training the pretrained model, + used in absolute postion embedding. Default 224. + patch_size (int | tuple(int)): Patch size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + num_heads (tuple[int]): Number of attention head of each stage. + window_size (int): Window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. 
+ attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. + patch_norm (bool): If True, add normalization after patch embedding. Default: True. + out_indices (Sequence[int]): Output from which stages. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + dilation (bool): if True, the output size if 16x downsample, ow 32x downsample. + """ + + def __init__( + self, + pretrain_img_size=224, + patch_size=4, + in_chans=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + ape=False, + patch_norm=True, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + dilation=False, + use_checkpoint=False, + ): + super().__init__() + + self.pretrain_img_size = pretrain_img_size + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.dilation = dilation + + # if use_checkpoint: + # print("use_checkpoint!!!!!!!!!!!!!!!!!!!!!!!!") + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None, + ) + + # absolute position embedding + if self.ape: + pretrain_img_size = to_2tuple(pretrain_img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + pretrain_img_size[0] // patch_size[0], + pretrain_img_size[1] // patch_size[1], + ] + + self.absolute_pos_embed = nn.Parameter( + torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]) + ) + trunc_normal_(self.absolute_pos_embed, std=0.02) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + # prepare downsample list + downsamplelist = [PatchMerging for i in range(self.num_layers)] + downsamplelist[-1] = None + num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)] + if self.dilation: + downsamplelist[-2] = None + num_features[-1] = int(embed_dim * 2 ** (self.num_layers - 1)) // 2 + for i_layer in range(self.num_layers): + layer = BasicLayer( + # dim=int(embed_dim * 2 ** i_layer), + dim=num_features[i_layer], + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])], + norm_layer=norm_layer, + # downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + downsample=downsamplelist[i_layer], + use_checkpoint=use_checkpoint, + ) + self.layers.append(layer) + + # num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)] + self.num_features = num_features + + # add a norm layer for each output + for i_layer in out_indices: + layer = norm_layer(num_features[i_layer]) + layer_name = f"norm{i_layer}" + 
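# descriptive note (not part of the diff): one LayerNorm is registered per requested output stage;
# forward() fetches it later via getattr(self, f"norm{i}") before assembling the multi-scale outputs.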
self.add_module(layer_name, layer) + + self._freeze_stages() + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + if self.frozen_stages >= 1 and self.ape: + self.absolute_pos_embed.requires_grad = False + + if self.frozen_stages >= 2: + self.pos_drop.eval() + for i in range(0, self.frozen_stages - 1): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + # def init_weights(self, pretrained=None): + # """Initialize the weights in backbone. + # Args: + # pretrained (str, optional): Path to pre-trained weights. + # Defaults to None. + # """ + + # def _init_weights(m): + # if isinstance(m, nn.Linear): + # trunc_normal_(m.weight, std=.02) + # if isinstance(m, nn.Linear) and m.bias is not None: + # nn.init.constant_(m.bias, 0) + # elif isinstance(m, nn.LayerNorm): + # nn.init.constant_(m.bias, 0) + # nn.init.constant_(m.weight, 1.0) + + # if isinstance(pretrained, str): + # self.apply(_init_weights) + # logger = get_root_logger() + # load_checkpoint(self, pretrained, strict=False, logger=logger) + # elif pretrained is None: + # self.apply(_init_weights) + # else: + # raise TypeError('pretrained must be a str or None') + + def forward_raw(self, x): + """Forward function.""" + x = self.patch_embed(x) + + Wh, Ww = x.size(2), x.size(3) + if self.ape: + # interpolate the position embedding to the corresponding size + absolute_pos_embed = F.interpolate( + self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" + ) + x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C + else: + x = x.flatten(2).transpose(1, 2) + x = self.pos_drop(x) + + outs = [] + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + # import ipdb; ipdb.set_trace() + + if i in self.out_indices: + norm_layer = getattr(self, f"norm{i}") + x_out = norm_layer(x_out) + + out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous() + outs.append(out) + # in: + # torch.Size([2, 3, 1024, 1024]) + # outs: + # [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \ + # torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])] + return tuple(outs) + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + + """Forward function.""" + x = self.patch_embed(x) + + Wh, Ww = x.size(2), x.size(3) + if self.ape: + # interpolate the position embedding to the corresponding size + absolute_pos_embed = F.interpolate( + self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" + ) + x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C + else: + x = x.flatten(2).transpose(1, 2) + x = self.pos_drop(x) + + outs = [] + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + + if i in self.out_indices: + norm_layer = getattr(self, f"norm{i}") + x_out = norm_layer(x_out) + + out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous() + outs.append(out) + # in: + # torch.Size([2, 3, 1024, 1024]) + # out: + # [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \ + # torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])] + + # collect for nesttensors + outs_dict = {} + for idx, out_i in enumerate(outs): + m = tensor_list.mask + assert m is not None + mask = F.interpolate(m[None].float(), size=out_i.shape[-2:]).to(torch.bool)[0] + outs_dict[idx] = NestedTensor(out_i, mask) + + return outs_dict + + 
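This diff replaces nn.Linear with loralib's lora.Linear (r=16) in the MLP, QKV/projection, and patch-merging layers, so only the injected low-rank adapters need to be trained. Below is a minimal sketch of that workflow, assuming the loralib package and the build_swin_transformer helper defined later in this file; the model name, learning rate, and output path are illustrative only, not part of this PR.

import torch
import loralib as lora

# build the Swin-B backbone that GroundingDINO_SwinB_cfg.py points at (swin_B_384_22k)
backbone = build_swin_transformer(
    "swin_B_384_22k", pretrain_img_size=384, out_indices=(1, 2, 3)
)

# freeze all original weights; keep only the LoRA A/B matrices trainable
lora.mark_only_lora_as_trainable(backbone)

trainable = [p for p in backbone.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable, lr=1e-4)

# after fine-tuning, save just the adapter weights instead of the full backbone
torch.save(lora.lora_state_dict(backbone), "swin_lora_adapters.pth")

Loading later mirrors this split: restore the pretrained Swin weights with strict=False, then load the saved LoRA state dict on top.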
def train(self, mode=True): + """Convert the model into training mode while keep layers freezed.""" + super(SwinTransformer, self).train(mode) + self._freeze_stages() + + +def build_swin_transformer(modelname, pretrain_img_size, **kw): + assert modelname in [ + "swin_T_224_1k", + "swin_B_224_22k", + "swin_B_384_22k", + "swin_L_224_22k", + "swin_L_384_22k", + ] + + model_para_dict = { + "swin_T_224_1k": dict( + embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7 + ), + "swin_B_224_22k": dict( + embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=7 + ), + "swin_B_384_22k": dict( + embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=12 + ), + "swin_L_224_22k": dict( + embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=7 + ), + "swin_L_384_22k": dict( + embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=12 + ), + } + kw_cgf = model_para_dict[modelname] + kw_cgf.update(kw) + model = SwinTransformer(pretrain_img_size=pretrain_img_size, **kw_cgf) + return model + + +if __name__ == "__main__": + model = build_swin_transformer("swin_L_384_22k", 384, dilation=True) + x = torch.rand(2, 3, 1024, 1024) + y = model.forward_raw(x) + import ipdb + + ipdb.set_trace() + x = torch.rand(2, 3, 384, 384) + y = model.forward_raw(x) diff --git a/groundingdino/models/GroundingDINO/backbone/__pycache__/__init__.cpython-310.pyc b/groundingdino/models/GroundingDINO/backbone/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3565744062569bb0e8076163f8cbcc8ea2891c64 Binary files /dev/null and b/groundingdino/models/GroundingDINO/backbone/__pycache__/__init__.cpython-310.pyc differ diff --git a/groundingdino/models/GroundingDINO/backbone/__pycache__/backbone.cpython-310.pyc b/groundingdino/models/GroundingDINO/backbone/__pycache__/backbone.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e855e72d35bfbee61cfb63ed91d6583117f228a Binary files /dev/null and b/groundingdino/models/GroundingDINO/backbone/__pycache__/backbone.cpython-310.pyc differ diff --git a/groundingdino/models/GroundingDINO/backbone/__pycache__/position_encoding.cpython-310.pyc b/groundingdino/models/GroundingDINO/backbone/__pycache__/position_encoding.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..10c886908eb329fe35ffa3b8897959ad6f4cd229 Binary files /dev/null and b/groundingdino/models/GroundingDINO/backbone/__pycache__/position_encoding.cpython-310.pyc differ diff --git a/groundingdino/models/GroundingDINO/backbone/__pycache__/swin_transformer.cpython-310.pyc b/groundingdino/models/GroundingDINO/backbone/__pycache__/swin_transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e16bc132f85455151acda1929671efeb789bdff Binary files /dev/null and b/groundingdino/models/GroundingDINO/backbone/__pycache__/swin_transformer.cpython-310.pyc differ diff --git a/groundingdino/models/GroundingDINO/backbone/backbone.py b/groundingdino/models/GroundingDINO/backbone/backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..59b851deacb80a9496b5082db8fb82c186c1e822 --- /dev/null +++ b/groundingdino/models/GroundingDINO/backbone/backbone.py @@ -0,0 +1,220 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. 
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Conditional DETR +# Copyright (c) 2021 Microsoft. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Copied from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# ------------------------------------------------------------------------ + + +# Backbone modules. + +from typing import Dict, List +import loralib as lora +import torch +import torch.nn.functional as F +import torchvision +from torch import nn +from torchvision.models._utils import IntermediateLayerGetter +import loralib as lora +from groundingdino.util.misc import NestedTensor, clean_state_dict, is_main_process + +from .position_encoding import build_position_encoding +from .swin_transformer import build_swin_transformer + + +class FrozenBatchNorm2d(torch.nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, + without which any other models than torchvision.models.resnet[18,34,50,101] + produce nans. + """ + + def __init__(self, n): + super(FrozenBatchNorm2d, self).__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super(FrozenBatchNorm2d, self)._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it fuser-friendly + w = self.weight.reshape(1, -1, 1, 1) + b = self.bias.reshape(1, -1, 1, 1) + rv = self.running_var.reshape(1, -1, 1, 1) + rm = self.running_mean.reshape(1, -1, 1, 1) + eps = 1e-5 + scale = w * (rv + eps).rsqrt() + bias = b - rm * scale + return x * scale + bias + + +class BackboneBase(nn.Module): + def __init__( + self, + backbone: nn.Module, + train_backbone: bool, + num_channels: int, + return_interm_indices: list, + ): + super().__init__() + for name, parameter in backbone.named_parameters(): + if ( + not train_backbone + or "layer2" not in name + and "layer3" not in name + and "layer4" not in name + ): + parameter.requires_grad_(False) + + return_layers = {} + for idx, layer_index in enumerate(return_interm_indices): + return_layers.update( + {"layer{}".format(5 - len(return_interm_indices) + idx): "{}".format(layer_index)} + ) + + # if len: + # if use_stage1_feature: + # return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"} + # else: + # return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"} + # else: + # return_layers = {'layer4': "0"} + self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) + self.num_channels = num_channels + + def forward(self, tensor_list: NestedTensor): + xs = self.body(tensor_list.tensors) + out: Dict[str, NestedTensor] = {} + for name, x in xs.items(): + m = tensor_list.mask + assert m is not None + mask = 
F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] + out[name] = NestedTensor(x, mask) + # import ipdb; ipdb.set_trace() + return out + + +class Backbone(BackboneBase): + """ResNet backbone with frozen BatchNorm.""" + + def __init__( + self, + name: str, + train_backbone: bool, + dilation: bool, + return_interm_indices: list, + batch_norm=FrozenBatchNorm2d, + ): + if name in ["resnet18", "resnet34", "resnet50", "resnet101"]: + backbone = getattr(torchvision.models, name)( + replace_stride_with_dilation=[False, False, dilation], + pretrained=is_main_process(), + norm_layer=batch_norm, + ) + else: + raise NotImplementedError("Why you can get here with name {}".format(name)) + # num_channels = 512 if name in ('resnet18', 'resnet34') else 2048 + assert name not in ("resnet18", "resnet34"), "Only resnet50 and resnet101 are available." + assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]] + num_channels_all = [256, 512, 1024, 2048] + num_channels = num_channels_all[4 - len(return_interm_indices) :] + super().__init__(backbone, train_backbone, num_channels, return_interm_indices) + + +class Joiner(nn.Sequential): + def __init__(self, backbone, position_embedding): + super().__init__(backbone, position_embedding) + + def forward(self, tensor_list: NestedTensor): + xs = self[0](tensor_list) + out: List[NestedTensor] = [] + pos = [] + for name, x in xs.items(): + out.append(x) + # position encoding + pos.append(self[1](x).to(x.tensors.dtype)) + + return out, pos + + +def build_backbone(args): + """ + Useful args: + - backbone: backbone name + - lr_backbone: + - dilation + - return_interm_indices: available: [0,1,2,3], [1,2,3], [3] + - backbone_freeze_keywords: + - use_checkpoint: for swin only for now + + """ + position_embedding = build_position_encoding(args) + train_backbone = True + if not train_backbone: + raise ValueError("Please set lr_backbone > 0") + return_interm_indices = args.return_interm_indices + assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]] + args.backbone_freeze_keywords + use_checkpoint = getattr(args, "use_checkpoint", False) + + if args.backbone in ["resnet50", "resnet101"]: + backbone = Backbone( + args.backbone, + train_backbone, + args.dilation, + return_interm_indices, + batch_norm=FrozenBatchNorm2d, + ) + bb_num_channels = backbone.num_channels + elif args.backbone in [ + "swin_T_224_1k", + "swin_B_224_22k", + "swin_B_384_22k", + "swin_L_224_22k", + "swin_L_384_22k", + ]: + pretrain_img_size = int(args.backbone.split("_")[-2]) + backbone = build_swin_transformer( + args.backbone, + pretrain_img_size=pretrain_img_size, + out_indices=tuple(return_interm_indices), + dilation=False, + use_checkpoint=use_checkpoint, + ) + + bb_num_channels = backbone.num_features[4 - len(return_interm_indices) :] + else: + raise NotImplementedError("Unknown backbone {}".format(args.backbone)) + + assert len(bb_num_channels) == len( + return_interm_indices + ), f"len(bb_num_channels) {len(bb_num_channels)} != len(return_interm_indices) {len(return_interm_indices)}" + + model = Joiner(backbone, position_embedding) + model.num_channels = bb_num_channels + assert isinstance( + bb_num_channels, List + ), "bb_num_channels is expected to be a List but {}".format(type(bb_num_channels)) + # import ipdb; ipdb.set_trace() + return model diff --git a/groundingdino/models/GroundingDINO/backbone/position_encoding.py b/groundingdino/models/GroundingDINO/backbone/position_encoding.py new file mode 100644 index 
0000000000000000000000000000000000000000..1de975730d80b78552e7345c78c923ac9fcad52d --- /dev/null +++ b/groundingdino/models/GroundingDINO/backbone/position_encoding.py @@ -0,0 +1,186 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# DINO +# Copyright (c) 2022 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Conditional DETR +# Copyright (c) 2021 Microsoft. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Copied from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# ------------------------------------------------------------------------ + +""" +Various positional encodings for the transformer. +""" +import math + +import torch +from torch import nn +import loralib as lora +from groundingdino.util.misc import NestedTensor + + +class PositionEmbeddingSine(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. + """ + + def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + mask = tensor_list.mask + assert mask is not None + not_mask = ~mask + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + # if os.environ.get("SHILONG_AMP", None) == '1': + # eps = 1e-4 + # else: + # eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +class PositionEmbeddingSineHW(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. 
+ """ + + def __init__( + self, num_pos_feats=64, temperatureH=10000, temperatureW=10000, normalize=False, scale=None + ): + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperatureH = temperatureH + self.temperatureW = temperatureW + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + mask = tensor_list.mask + assert mask is not None + not_mask = ~mask + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + + # import ipdb; ipdb.set_trace() + + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_tx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_tx = self.temperatureW ** (2 * (torch.div(dim_tx, 2, rounding_mode='floor')) / self.num_pos_feats) + pos_x = x_embed[:, :, :, None] / dim_tx + + dim_ty = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_ty = self.temperatureH ** (2 * (torch.div(dim_ty, 2, rounding_mode='floor')) / self.num_pos_feats) + pos_y = y_embed[:, :, :, None] / dim_ty + + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + + # import ipdb; ipdb.set_trace() + + return pos + + +class PositionEmbeddingLearned(nn.Module): + """ + Absolute pos embedding, learned. + """ + + def __init__(self, num_pos_feats=256): + super().__init__() + self.row_embed = nn.Embedding(50, num_pos_feats) + self.col_embed = nn.Embedding(50, num_pos_feats) + self.reset_parameters() + + def reset_parameters(self): + nn.init.uniform_(self.row_embed.weight) + nn.init.uniform_(self.col_embed.weight) + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + h, w = x.shape[-2:] + i = torch.arange(w, device=x.device) + j = torch.arange(h, device=x.device) + x_emb = self.col_embed(i) + y_emb = self.row_embed(j) + pos = ( + torch.cat( + [ + x_emb.unsqueeze(0).repeat(h, 1, 1), + y_emb.unsqueeze(1).repeat(1, w, 1), + ], + dim=-1, + ) + .permute(2, 0, 1) + .unsqueeze(0) + .repeat(x.shape[0], 1, 1, 1) + ) + return pos + + +def build_position_encoding(args): + N_steps = args.hidden_dim // 2 + if args.position_embedding in ("v2", "sine"): + # TODO find a better way of exposing other arguments + position_embedding = PositionEmbeddingSineHW( + N_steps, + temperatureH=args.pe_temperatureH, + temperatureW=args.pe_temperatureW, + normalize=True, + ) + elif args.position_embedding in ("v3", "learned"): + position_embedding = PositionEmbeddingLearned(N_steps) + else: + raise ValueError(f"not supported {args.position_embedding}") + + return position_embedding diff --git a/groundingdino/models/GroundingDINO/backbone/swin_transformer.py b/groundingdino/models/GroundingDINO/backbone/swin_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..d75fe2d979f60cecc88b59711eb39accf24e6a8c --- /dev/null +++ b/groundingdino/models/GroundingDINO/backbone/swin_transformer.py @@ -0,0 +1,804 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: 
https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# DINO +# Copyright (c) 2022 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# -------------------------------------------------------- +# modified from https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/master/mmdet/models/backbones/swin_transformer.py +# -------------------------------------------------------- + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ +import loralib as lora +from groundingdino.util.misc import NestedTensor + + +class Mlp(nn.Module): + """Multilayer perceptron.""" + + def __init__( + self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0 + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + r = 16 + self.fc1 = lora.Linear(in_features, hidden_features , r=r) + self.act = act_layer() + self.fc2 = lora.Linear(hidden_features, out_features , r=r) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowAttention(nn.Module): + """Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + """ + + def __init__( + self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + r =16 + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads) + ) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = lora.Linear(dim, dim * 3,r=r , bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = lora.Linear(dim, dim , r=r) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=0.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """Forward function. + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = ( + self.qkv(x) + .reshape(B_, N, 3, self.num_heads, C // self.num_heads) + .permute(2, 0, 3, 1, 4) + ) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = q @ k.transpose(-2, -1) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1) + ].view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 + ) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1 + ).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock(nn.Module): + """Swin Transformer Block. + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. 
Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__( + self, + dim, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop + ) + + self.H = None + self.W = None + + def forward(self, x, mask_matrix): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + mask_matrix: Attention mask for cyclic shift. + """ + B, L, C = x.shape + H, W = self.H, self.W + assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # pad feature maps to multiples of window size + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) + _, Hp, Wp, _ = x.shape + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size + ) # nW*B, window_size, window_size, C + x_windows = x_windows.view( + -1, self.window_size * self.window_size, C + ) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :].contiguous() + + x = x.view(B, H * W, C) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + +class PatchMerging(nn.Module): + """Patch Merging Layer + Args: + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + r = 16 + self.reduction = lora.Linear(4 * dim, 2 * dim, r=r, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x, H, W): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. 
+ """ + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + x = x.view(B, H, W, C) + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +class BasicLayer(nn.Module): + """A basic Swin Transformer layer for one stage. + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (int): Local window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ + + def __init__( + self, + dim, + depth, + num_heads, + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False, + ): + super().__init__() + self.window_size = window_size + self.shift_size = window_size // 2 + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList( + [ + SwinTransformerBlock( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, H, W): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. 
+ """ + + # calculate attention mask for SW-MSA + Hp = int(np.ceil(H / self.window_size)) * self.window_size + Wp = int(np.ceil(W / self.window_size)) * self.window_size + img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 + h_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + w_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size + ) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill( + attn_mask == 0, float(0.0) + ) + + for blk in self.blocks: + blk.H, blk.W = H, W + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, attn_mask) + else: + x = blk(x, attn_mask) + if self.downsample is not None: + x_down = self.downsample(x, H, W) + Wh, Ww = (H + 1) // 2, (W + 1) // 2 + return x, H, W, x_down, Wh, Ww + else: + return x, H, W, x, H, W + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding + Args: + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + patch_size = to_2tuple(patch_size) + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, H, W = x.size() + if W % self.patch_size[1] != 0: + x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1])) + if H % self.patch_size[0] != 0: + x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) + + x = self.proj(x) # B C Wh Ww + if self.norm is not None: + Wh, Ww = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww) + + return x + + +class SwinTransformer(nn.Module): + """Swin Transformer backbone. + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + Args: + pretrain_img_size (int): Input image size for training the pretrained model, + used in absolute postion embedding. Default 224. + patch_size (int | tuple(int)): Patch size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + num_heads (tuple[int]): Number of attention head of each stage. + window_size (int): Window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. 
+ attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. + patch_norm (bool): If True, add normalization after patch embedding. Default: True. + out_indices (Sequence[int]): Output from which stages. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + dilation (bool): if True, the output size if 16x downsample, ow 32x downsample. + """ + + def __init__( + self, + pretrain_img_size=224, + patch_size=4, + in_chans=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + ape=False, + patch_norm=True, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + dilation=False, + use_checkpoint=False, + ): + super().__init__() + + self.pretrain_img_size = pretrain_img_size + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.dilation = dilation + + # if use_checkpoint: + # print("use_checkpoint!!!!!!!!!!!!!!!!!!!!!!!!") + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None, + ) + + # absolute position embedding + if self.ape: + pretrain_img_size = to_2tuple(pretrain_img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + pretrain_img_size[0] // patch_size[0], + pretrain_img_size[1] // patch_size[1], + ] + + self.absolute_pos_embed = nn.Parameter( + torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]) + ) + trunc_normal_(self.absolute_pos_embed, std=0.02) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + # prepare downsample list + downsamplelist = [PatchMerging for i in range(self.num_layers)] + downsamplelist[-1] = None + num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)] + if self.dilation: + downsamplelist[-2] = None + num_features[-1] = int(embed_dim * 2 ** (self.num_layers - 1)) // 2 + for i_layer in range(self.num_layers): + layer = BasicLayer( + # dim=int(embed_dim * 2 ** i_layer), + dim=num_features[i_layer], + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])], + norm_layer=norm_layer, + # downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + downsample=downsamplelist[i_layer], + use_checkpoint=use_checkpoint, + ) + self.layers.append(layer) + + # num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)] + self.num_features = num_features + + # add a norm layer for each output + for i_layer in out_indices: + layer = norm_layer(num_features[i_layer]) + layer_name = f"norm{i_layer}" + 
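+            # Descriptive note: one LayerNorm is registered per requested output stage under
+            # the name f"norm{i_layer}"; forward()/forward_raw() look it up with
+            # getattr(self, f"norm{i}") and apply it to that stage's output before reshaping
+            # the features to (B, C, H, W).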
self.add_module(layer_name, layer) + + self._freeze_stages() + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + if self.frozen_stages >= 1 and self.ape: + self.absolute_pos_embed.requires_grad = False + + if self.frozen_stages >= 2: + self.pos_drop.eval() + for i in range(0, self.frozen_stages - 1): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + # def init_weights(self, pretrained=None): + # """Initialize the weights in backbone. + # Args: + # pretrained (str, optional): Path to pre-trained weights. + # Defaults to None. + # """ + + # def _init_weights(m): + # if isinstance(m, nn.Linear): + # trunc_normal_(m.weight, std=.02) + # if isinstance(m, nn.Linear) and m.bias is not None: + # nn.init.constant_(m.bias, 0) + # elif isinstance(m, nn.LayerNorm): + # nn.init.constant_(m.bias, 0) + # nn.init.constant_(m.weight, 1.0) + + # if isinstance(pretrained, str): + # self.apply(_init_weights) + # logger = get_root_logger() + # load_checkpoint(self, pretrained, strict=False, logger=logger) + # elif pretrained is None: + # self.apply(_init_weights) + # else: + # raise TypeError('pretrained must be a str or None') + + def forward_raw(self, x): + """Forward function.""" + x = self.patch_embed(x) + + Wh, Ww = x.size(2), x.size(3) + if self.ape: + # interpolate the position embedding to the corresponding size + absolute_pos_embed = F.interpolate( + self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" + ) + x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C + else: + x = x.flatten(2).transpose(1, 2) + x = self.pos_drop(x) + + outs = [] + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + # import ipdb; ipdb.set_trace() + + if i in self.out_indices: + norm_layer = getattr(self, f"norm{i}") + x_out = norm_layer(x_out) + + out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous() + outs.append(out) + # in: + # torch.Size([2, 3, 1024, 1024]) + # outs: + # [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \ + # torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])] + return tuple(outs) + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + + """Forward function.""" + x = self.patch_embed(x) + + Wh, Ww = x.size(2), x.size(3) + if self.ape: + # interpolate the position embedding to the corresponding size + absolute_pos_embed = F.interpolate( + self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" + ) + x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C + else: + x = x.flatten(2).transpose(1, 2) + x = self.pos_drop(x) + + outs = [] + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + + if i in self.out_indices: + norm_layer = getattr(self, f"norm{i}") + x_out = norm_layer(x_out) + + out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous() + outs.append(out) + # in: + # torch.Size([2, 3, 1024, 1024]) + # out: + # [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \ + # torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])] + + # collect for nesttensors + outs_dict = {} + for idx, out_i in enumerate(outs): + m = tensor_list.mask + assert m is not None + mask = F.interpolate(m[None].float(), size=out_i.shape[-2:]).to(torch.bool)[0] + outs_dict[idx] = NestedTensor(out_i, mask) + + return outs_dict + + 
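+    # Illustrative usage sketch (not part of the original file; `model`, `images`, `masks`,
+    # `feats` are made-up names, with `model` an instance of this SwinTransformer class).
+    # forward() wraps each stage output together with a resized padding mask back into a
+    # NestedTensor, whereas forward_raw() returns the bare feature tensors:
+    #
+    #   images = torch.rand(2, 3, 1024, 1024)
+    #   masks = torch.zeros(2, 1024, 1024, dtype=torch.bool)    # False = valid pixel
+    #   feats = model(NestedTensor(images, masks))               # {level_idx: NestedTensor}
+    #   per_level = [feats[i].tensors for i in sorted(feats)]    # (B, C_i, H_i, W_i) maps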
def train(self, mode=True): + """Convert the model into training mode while keep layers freezed.""" + super(SwinTransformer, self).train(mode) + self._freeze_stages() + + +def build_swin_transformer(modelname, pretrain_img_size, **kw): + assert modelname in [ + "swin_T_224_1k", + "swin_B_224_22k", + "swin_B_384_22k", + "swin_L_224_22k", + "swin_L_384_22k", + ] + + model_para_dict = { + "swin_T_224_1k": dict( + embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7 + ), + "swin_B_224_22k": dict( + embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=7 + ), + "swin_B_384_22k": dict( + embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=12 + ), + "swin_L_224_22k": dict( + embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=7 + ), + "swin_L_384_22k": dict( + embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=12 + ), + } + kw_cgf = model_para_dict[modelname] + kw_cgf.update(kw) + model = SwinTransformer(pretrain_img_size=pretrain_img_size, **kw_cgf) + return model + + +if __name__ == "__main__": + model = build_swin_transformer("swin_L_384_22k", 384, dilation=True) + x = torch.rand(2, 3, 1024, 1024) + y = model.forward_raw(x) + import ipdb + + ipdb.set_trace() + x = torch.rand(2, 3, 384, 384) + y = model.forward_raw(x) diff --git a/groundingdino/models/GroundingDINO/fuse_modules.py b/groundingdino/models/GroundingDINO/fuse_modules.py new file mode 100644 index 0000000000000000000000000000000000000000..5a29bd5f7a18533e7f9133cc993b2d6994a7704c --- /dev/null +++ b/groundingdino/models/GroundingDINO/fuse_modules.py @@ -0,0 +1,298 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ + +import torch +import torch.nn as nn +import torch.nn.functional as F +from timm.models.layers import DropPath +import loralib as lora + +class FeatureResizer(nn.Module): + """ + This class takes as input a set of embeddings of dimension C1 and outputs a set of + embedding of dimension C2, after a linear transformation, dropout and normalization (LN). 
+ """ + + def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True): + super().__init__() + self.do_ln = do_ln + # Object feature encoding + r = 16 + self.fc = lora.Linear(input_feat_size, output_feat_size,r=r , bias=True) + self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12) + self.dropout = nn.Dropout(dropout) + + def forward(self, encoder_features): + x = self.fc(encoder_features) + if self.do_ln: + x = self.layer_norm(x) + output = self.dropout(x) + return output + + +def l1norm(X, dim, eps=1e-8): + """L1-normalize columns of X""" + norm = torch.abs(X).sum(dim=dim, keepdim=True) + eps + X = torch.div(X, norm) + return X + + +def l2norm(X, dim, eps=1e-8): + """L2-normalize columns of X""" + norm = torch.pow(X, 2).sum(dim=dim, keepdim=True).sqrt() + eps + X = torch.div(X, norm) + return X + + +def func_attention(query, context, smooth=1, raw_feature_norm="softmax", eps=1e-8): + """ + query: (n_context, queryL, d) + context: (n_context, sourceL, d) + """ + batch_size_q, queryL = query.size(0), query.size(1) + batch_size, sourceL = context.size(0), context.size(1) + + # Get attention + # --> (batch, d, queryL) + queryT = torch.transpose(query, 1, 2) + + # (batch, sourceL, d)(batch, d, queryL) + # --> (batch, sourceL, queryL) + attn = torch.bmm(context, queryT) + if raw_feature_norm == "softmax": + # --> (batch*sourceL, queryL) + attn = attn.view(batch_size * sourceL, queryL) + attn = nn.Softmax()(attn) + # --> (batch, sourceL, queryL) + attn = attn.view(batch_size, sourceL, queryL) + elif raw_feature_norm == "l2norm": + attn = l2norm(attn, 2) + elif raw_feature_norm == "clipped_l2norm": + attn = nn.LeakyReLU(0.1)(attn) + attn = l2norm(attn, 2) + else: + raise ValueError("unknown first norm type:", raw_feature_norm) + # --> (batch, queryL, sourceL) + attn = torch.transpose(attn, 1, 2).contiguous() + # --> (batch*queryL, sourceL) + attn = attn.view(batch_size * queryL, sourceL) + attn = nn.Softmax()(attn * smooth) + # --> (batch, queryL, sourceL) + attn = attn.view(batch_size, queryL, sourceL) + # --> (batch, sourceL, queryL) + attnT = torch.transpose(attn, 1, 2).contiguous() + + # --> (batch, d, sourceL) + contextT = torch.transpose(context, 1, 2) + # (batch x d x sourceL)(batch x sourceL x queryL) + # --> (batch, d, queryL) + weightedContext = torch.bmm(contextT, attnT) + # --> (batch, queryL, d) + weightedContext = torch.transpose(weightedContext, 1, 2) + + return weightedContext, attnT + + +class BiMultiHeadAttention(nn.Module): + def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1, cfg=None): + super(BiMultiHeadAttention, self).__init__() + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.v_dim = v_dim + self.l_dim = l_dim + + assert ( + self.head_dim * self.num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." 
+ self.scale = self.head_dim ** (-0.5) + self.dropout = dropout + r = 16 + self.v_proj = lora.Linear(self.v_dim, self.embed_dim , r=r) + self.l_proj = lora.Linear(self.l_dim, self.embed_dim , r=r) + self.values_v_proj = lora.Linear(self.v_dim, self.embed_dim , r=r) + self.values_l_proj = lora.Linear(self.l_dim, self.embed_dim , r=r) + + self.out_v_proj = lora.Linear(self.embed_dim, self.v_dim , r=r) + self.out_l_proj = lora.Linear(self.embed_dim, self.l_dim , r=r) + + self.stable_softmax_2d = True + self.clamp_min_for_underflow = True + self.clamp_max_for_overflow = True + + self._reset_parameters() + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def _reset_parameters(self): + nn.init.xavier_uniform_(self.v_proj.weight) + self.v_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.l_proj.weight) + self.l_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.values_v_proj.weight) + self.values_v_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.values_l_proj.weight) + self.values_l_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_v_proj.weight) + self.out_v_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_l_proj.weight) + self.out_l_proj.bias.data.fill_(0) + + def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): + """_summary_ + + Args: + v (_type_): bs, n_img, dim + l (_type_): bs, n_text, dim + attention_mask_v (_type_, optional): _description_. bs, n_img + attention_mask_l (_type_, optional): _description_. bs, n_text + + Returns: + _type_: _description_ + """ + # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': + # import ipdb; ipdb.set_trace() + bsz, tgt_len, _ = v.size() + + query_states = self.v_proj(v) * self.scale + key_states = self._shape(self.l_proj(l), -1, bsz) + value_v_states = self._shape(self.values_v_proj(v), -1, bsz) + value_l_states = self._shape(self.values_l_proj(l), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_v_states = value_v_states.view(*proj_shape) + value_l_states = value_l_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) # bs*nhead, nimg, ntxt + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + if self.stable_softmax_2d: + attn_weights = attn_weights - attn_weights.max() + + if self.clamp_min_for_underflow: + attn_weights = torch.clamp( + attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range + if self.clamp_max_for_overflow: + attn_weights = torch.clamp( + attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range + + attn_weights_T = attn_weights.transpose(1, 2) + attn_weights_l = attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[0] + if self.clamp_min_for_underflow: + attn_weights_l = torch.clamp( + attn_weights_l, min=-50000 + ) # Do not increase -50000, data type half has quite limited range + if self.clamp_max_for_overflow: + attn_weights_l = torch.clamp( + attn_weights_l, max=50000 + ) # Do not increase 50000, data type half has quite limited range + + # mask vison for language + if attention_mask_v is not None: + 
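+            # attention_mask_v has shape (bs, n_img) and flags visual positions to ignore; it
+            # is broadcast over heads and applied to attn_weights_l (text -> image attention),
+            # so text tokens cannot attend to the flagged visual tokens.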
attention_mask_v = ( + attention_mask_v[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + ) + attn_weights_l.masked_fill_(attention_mask_v, float("-inf")) + + attn_weights_l = attn_weights_l.softmax(dim=-1) + + # mask language for vision + if attention_mask_l is not None: + attention_mask_l = ( + attention_mask_l[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + ) + attn_weights.masked_fill_(attention_mask_l, float("-inf")) + attn_weights_v = attn_weights.softmax(dim=-1) + + attn_probs_v = F.dropout(attn_weights_v, p=self.dropout, training=self.training) + attn_probs_l = F.dropout(attn_weights_l, p=self.dropout, training=self.training) + + attn_output_v = torch.bmm(attn_probs_v, value_l_states) + attn_output_l = torch.bmm(attn_probs_l, value_v_states) + + if attn_output_v.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output_v` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output_v.size()}" + ) + + if attn_output_l.size() != (bsz * self.num_heads, src_len, self.head_dim): + raise ValueError( + f"`attn_output_l` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {attn_output_l.size()}" + ) + + attn_output_v = attn_output_v.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output_v = attn_output_v.transpose(1, 2) + attn_output_v = attn_output_v.reshape(bsz, tgt_len, self.embed_dim) + + attn_output_l = attn_output_l.view(bsz, self.num_heads, src_len, self.head_dim) + attn_output_l = attn_output_l.transpose(1, 2) + attn_output_l = attn_output_l.reshape(bsz, src_len, self.embed_dim) + + attn_output_v = self.out_v_proj(attn_output_v) + attn_output_l = self.out_l_proj(attn_output_l) + + return attn_output_v, attn_output_l + + +# Bi-Direction MHA (text->image, image->text) +class BiAttentionBlock(nn.Module): + def __init__( + self, + v_dim, + l_dim, + embed_dim, + num_heads, + dropout=0.1, + drop_path=0.0, + init_values=1e-4, + cfg=None, + ): + """ + Inputs: + embed_dim - Dimensionality of input and attention feature vectors + hidden_dim - Dimensionality of hidden layer in feed-forward network + (usually 2-4x larger than embed_dim) + num_heads - Number of heads to use in the Multi-Head Attention block + dropout - Amount of dropout to apply in the feed-forward network + """ + super(BiAttentionBlock, self).__init__() + + # pre layer norm + self.layer_norm_v = nn.LayerNorm(v_dim) + self.layer_norm_l = nn.LayerNorm(l_dim) + self.attn = BiMultiHeadAttention( + v_dim=v_dim, l_dim=l_dim, embed_dim=embed_dim, num_heads=num_heads, dropout=dropout + ) + + # add layer scale for training stability + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.gamma_v = nn.Parameter(init_values * torch.ones((v_dim)), requires_grad=True) + self.gamma_l = nn.Parameter(init_values * torch.ones((l_dim)), requires_grad=True) + + def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): + v = self.layer_norm_v(v) + l = self.layer_norm_l(l) + delta_v, delta_l = self.attn( + v, l, attention_mask_v=attention_mask_v, attention_mask_l=attention_mask_l + ) + # v, l = v + delta_v, l + delta_l + v = v + self.drop_path(self.gamma_v * delta_v) + l = l + self.drop_path(self.gamma_l * delta_l) + return v, l + + # def forward(self, v:List[torch.Tensor], l, attention_mask_v=None, attention_mask_l=None) diff --git a/groundingdino/models/GroundingDINO/groundingdino.py b/groundingdino/models/GroundingDINO/groundingdino.py new file mode 100644 index 
0000000000000000000000000000000000000000..46d17087ce7fe06b57c7697d6c3d393c3da23c5a --- /dev/null +++ b/groundingdino/models/GroundingDINO/groundingdino.py @@ -0,0 +1,412 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Conditional DETR model and criterion classes. +# Copyright (c) 2021 Microsoft. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Modified from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# ------------------------------------------------------------------------ +# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# ------------------------------------------------------------------------ +import copy +from typing import List +import loralib as lora +import torch +import torch.nn.functional as F +from torch import nn +from torchvision.ops.boxes import nms +from transformers import AutoTokenizer, BertModel, BertTokenizer, RobertaModel, RobertaTokenizerFast + +from groundingdino.util import box_ops, get_tokenlizer +from groundingdino.util.misc import ( + NestedTensor, + accuracy, + get_world_size, + interpolate, + inverse_sigmoid, + is_dist_avail_and_initialized, + nested_tensor_from_tensor_list, +) +from groundingdino.util.utils import get_phrases_from_posmap +from groundingdino.util.visualizer import COCOVisualizer +from groundingdino.util.vl_utils import create_positive_map_from_span + +from ..registry import MODULE_BUILD_FUNCS +from .backbone import build_backbone +from .bertwarper import ( + BertModelWarper, + generate_masks_with_special_tokens, + generate_masks_with_special_tokens_and_transfer_map, +) +from .transformer import build_transformer +from .utils import MLP, ContrastiveEmbed, sigmoid_focal_loss + + +class GroundingDINO(nn.Module): + """This is the Cross-Attention Detector module that performs object detection""" + + def __init__( + self, + backbone, + transformer, + num_queries, + aux_loss=False, + iter_update=False, + query_dim=2, + num_feature_levels=1, + nheads=8, + # two stage + two_stage_type="no", # ['no', 'standard'] + dec_pred_bbox_embed_share=True, + two_stage_class_embed_share=True, + two_stage_bbox_embed_share=True, + num_patterns=0, + dn_number=100, + dn_box_noise_scale=0.4, + dn_label_noise_ratio=0.5, + dn_labelbook_size=100, + text_encoder_type="bert-base-uncased", + sub_sentence_present=True, + max_text_len=256, + ): + """Initializes the model. + Parameters: + backbone: torch module of the backbone to be used. See backbone.py + transformer: torch module of the transformer architecture. See transformer.py + num_queries: number of object queries, ie detection slot. This is the maximal number of objects + Conditional DETR can detect in a single image. For COCO, we recommend 100 queries. + aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. 
+ """ + super().__init__() + self.num_queries = num_queries + self.transformer = transformer + self.hidden_dim = hidden_dim = transformer.d_model + self.num_feature_levels = num_feature_levels + self.nheads = nheads + self.max_text_len = 256 + self.sub_sentence_present = sub_sentence_present + + # setting query dim + self.query_dim = query_dim + assert query_dim == 4 + + # for dn training + self.num_patterns = num_patterns + self.dn_number = dn_number + self.dn_box_noise_scale = dn_box_noise_scale + self.dn_label_noise_ratio = dn_label_noise_ratio + self.dn_labelbook_size = dn_labelbook_size + + # bert + self.tokenizer = get_tokenlizer.get_tokenlizer(text_encoder_type) + self.bert = get_tokenlizer.get_pretrained_language_model(text_encoder_type) + self.bert.pooler.dense.weight.requires_grad_(False) + self.bert.pooler.dense.bias.requires_grad_(False) + self.bert = BertModelWarper(bert_model=self.bert) + + self.feat_map = nn.Linear(self.bert.config.hidden_size, self.hidden_dim, bias=True) + nn.init.constant_(self.feat_map.bias.data, 0) + nn.init.xavier_uniform_(self.feat_map.weight.data) + # freeze + + # special tokens + self.specical_tokens = self.tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) + + # prepare input projection layers + if num_feature_levels > 1: + num_backbone_outs = len(backbone.num_channels) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = backbone.num_channels[_] + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ) + ) + for _ in range(num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, hidden_dim), + ) + ) + in_channels = hidden_dim + self.input_proj = nn.ModuleList(input_proj_list) + else: + assert two_stage_type == "no", "two_stage_type should be no if num_feature_levels=1 !!!" + self.input_proj = nn.ModuleList( + [ + nn.Sequential( + nn.Conv2d(backbone.num_channels[-1], hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ) + ] + ) + + self.backbone = backbone + self.aux_loss = aux_loss + self.box_pred_damping = box_pred_damping = None + + self.iter_update = iter_update + assert iter_update, "Why not iter_update?" 
+ + # prepare pred layers + self.dec_pred_bbox_embed_share = dec_pred_bbox_embed_share + # prepare class & box embed + _class_embed = ContrastiveEmbed() + + _bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) + nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0) + + if dec_pred_bbox_embed_share: + box_embed_layerlist = [_bbox_embed for i in range(transformer.num_decoder_layers)] + else: + box_embed_layerlist = [ + copy.deepcopy(_bbox_embed) for i in range(transformer.num_decoder_layers) + ] + class_embed_layerlist = [_class_embed for i in range(transformer.num_decoder_layers)] + self.bbox_embed = nn.ModuleList(box_embed_layerlist) + self.class_embed = nn.ModuleList(class_embed_layerlist) + self.transformer.decoder.bbox_embed = self.bbox_embed + self.transformer.decoder.class_embed = self.class_embed + + # two stage + self.two_stage_type = two_stage_type + assert two_stage_type in ["no", "standard"], "unknown param {} of two_stage_type".format( + two_stage_type + ) + if two_stage_type != "no": + if two_stage_bbox_embed_share: + assert dec_pred_bbox_embed_share + self.transformer.enc_out_bbox_embed = _bbox_embed + else: + self.transformer.enc_out_bbox_embed = copy.deepcopy(_bbox_embed) + + if two_stage_class_embed_share: + assert dec_pred_bbox_embed_share + self.transformer.enc_out_class_embed = _class_embed + else: + self.transformer.enc_out_class_embed = copy.deepcopy(_class_embed) + + self.refpoint_embed = None + + self._reset_parameters() + + def _reset_parameters(self): + # init input_proj + for proj in self.input_proj: + nn.init.xavier_uniform_(proj[0].weight, gain=1) + nn.init.constant_(proj[0].bias, 0) + + def set_image_tensor(self, samples: NestedTensor): + if isinstance(samples, (list, torch.Tensor)): + samples = nested_tensor_from_tensor_list(samples) + self.features, self.poss = self.backbone(samples) + + def unset_image_tensor(self): + if hasattr(self, 'features'): + del self.features + if hasattr(self,'poss'): + del self.poss + + def set_image_features(self, features , poss): + self.features = features + self.poss = poss + + def init_ref_points(self, use_num_queries): + self.refpoint_embed = nn.Embedding(use_num_queries, self.query_dim) + + def forward(self, samples: NestedTensor, targets: List = None, **kw): + """The forward expects a NestedTensor, which consists of: + - samples.tensor: batched images, of shape [batch_size x 3 x H x W] + - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels + + It returns a dict with the following elements: + - "pred_logits": the classification logits (including no-object) for all queries. + Shape= [batch_size x num_queries x num_classes] + - "pred_boxes": The normalized boxes coordinates for all queries, represented as + (center_x, center_y, width, height). These values are normalized in [0, 1], + relative to the size of each individual image (disregarding possible padding). + See PostProcess for information on how to retrieve the unnormalized bounding box. + - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of + dictionnaries containing the two above keys for each decoder layer. 
+ """ + if targets is None: + captions = kw["captions"] + else: + captions = [t["caption"] for t in targets] + + # encoder texts + tokenized = self.tokenizer(captions, padding="longest", return_tensors="pt").to( + samples.device + ) + ( + text_self_attention_masks, + position_ids, + cate_to_token_mask_list, + ) = generate_masks_with_special_tokens_and_transfer_map( + tokenized, self.specical_tokens, self.tokenizer + ) + + if text_self_attention_masks.shape[1] > self.max_text_len: + text_self_attention_masks = text_self_attention_masks[ + :, : self.max_text_len, : self.max_text_len + ] + position_ids = position_ids[:, : self.max_text_len] + tokenized["input_ids"] = tokenized["input_ids"][:, : self.max_text_len] + tokenized["attention_mask"] = tokenized["attention_mask"][:, : self.max_text_len] + tokenized["token_type_ids"] = tokenized["token_type_ids"][:, : self.max_text_len] + + # extract text embeddings + if self.sub_sentence_present: + tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} + tokenized_for_encoder["attention_mask"] = text_self_attention_masks + tokenized_for_encoder["position_ids"] = position_ids + else: + # import ipdb; ipdb.set_trace() + tokenized_for_encoder = tokenized + + bert_output = self.bert(**tokenized_for_encoder) # bs, 195, 768 + + encoded_text = self.feat_map(bert_output["last_hidden_state"]) # bs, 195, d_model + text_token_mask = tokenized.attention_mask.bool() # bs, 195 + # text_token_mask: True for nomask, False for mask + # text_self_attention_masks: True for nomask, False for mask + + if encoded_text.shape[1] > self.max_text_len: + encoded_text = encoded_text[:, : self.max_text_len, :] + text_token_mask = text_token_mask[:, : self.max_text_len] + position_ids = position_ids[:, : self.max_text_len] + text_self_attention_masks = text_self_attention_masks[ + :, : self.max_text_len, : self.max_text_len + ] + + text_dict = { + "encoded_text": encoded_text, # bs, 195, d_model + "text_token_mask": text_token_mask, # bs, 195 + "position_ids": position_ids, # bs, 195 + "text_self_attention_masks": text_self_attention_masks, # bs, 195,195 + } + + # import ipdb; ipdb.set_trace() + if isinstance(samples, (list, torch.Tensor)): + samples = nested_tensor_from_tensor_list(samples) + if not hasattr(self, 'features') or not hasattr(self, 'poss'): + self.set_image_tensor(samples) + + srcs = [] + masks = [] + for l, feat in enumerate(self.features): + src, mask = feat.decompose() + srcs.append(self.input_proj[l](src)) + masks.append(mask) + assert mask is not None + if self.num_feature_levels > len(srcs): + _len_srcs = len(srcs) + for l in range(_len_srcs, self.num_feature_levels): + if l == _len_srcs: + src = self.input_proj[l](self.features[-1].tensors) + else: + src = self.input_proj[l](srcs[-1]) + m = samples.mask + mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0] + pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype) + srcs.append(src) + masks.append(mask) + self.poss.append(pos_l) + + input_query_bbox = input_query_label = attn_mask = dn_meta = None + hs, reference, hs_enc, ref_enc, init_box_proposal = self.transformer( + srcs, masks, input_query_bbox, self.poss, input_query_label, attn_mask, text_dict + ) + + # deformable-detr-like anchor update + outputs_coord_list = [] + for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_hs) in enumerate( + zip(reference[:-1], self.bbox_embed, hs) + ): + layer_delta_unsig = layer_bbox_embed(layer_hs) + layer_outputs_unsig = layer_delta_unsig + 
inverse_sigmoid(layer_ref_sig) + layer_outputs_unsig = layer_outputs_unsig.sigmoid() + outputs_coord_list.append(layer_outputs_unsig) + outputs_coord_list = torch.stack(outputs_coord_list) + + # output + outputs_class = torch.stack( + [ + layer_cls_embed(layer_hs, text_dict) + for layer_cls_embed, layer_hs in zip(self.class_embed, hs) + ] + ) + out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord_list[-1]} + + # # for intermediate outputs + # if self.aux_loss: + # out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord_list) + + # # for encoder output + # if hs_enc is not None: + # # prepare intermediate outputs + # interm_coord = ref_enc[-1] + # interm_class = self.transformer.enc_out_class_embed(hs_enc[-1], text_dict) + # out['interm_outputs'] = {'pred_logits': interm_class, 'pred_boxes': interm_coord} + # out['interm_outputs_for_matching_pre'] = {'pred_logits': interm_class, 'pred_boxes': init_box_proposal} + unset_image_tensor = kw.get('unset_image_tensor', True) + if unset_image_tensor: + self.unset_image_tensor() ## If necessary + return out + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [ + {"pred_logits": a, "pred_boxes": b} + for a, b in zip(outputs_class[:-1], outputs_coord[:-1]) + ] + + +@MODULE_BUILD_FUNCS.registe_with_name(module_name="groundingdino") +def build_groundingdino(args): + + backbone = build_backbone(args) + transformer = build_transformer(args) + + dn_labelbook_size = args.dn_labelbook_size + dec_pred_bbox_embed_share = args.dec_pred_bbox_embed_share + sub_sentence_present = args.sub_sentence_present + + model = GroundingDINO( + backbone, + transformer, + num_queries=args.num_queries, + aux_loss=True, + iter_update=True, + query_dim=4, + num_feature_levels=args.num_feature_levels, + nheads=args.nheads, + dec_pred_bbox_embed_share=dec_pred_bbox_embed_share, + two_stage_type=args.two_stage_type, + two_stage_bbox_embed_share=args.two_stage_bbox_embed_share, + two_stage_class_embed_share=args.two_stage_class_embed_share, + num_patterns=args.num_patterns, + dn_number=0, + dn_box_noise_scale=args.dn_box_noise_scale, + dn_label_noise_ratio=args.dn_label_noise_ratio, + dn_labelbook_size=dn_labelbook_size, + text_encoder_type=args.text_encoder_type, + sub_sentence_present=sub_sentence_present, + max_text_len=args.max_text_len, + ) + + return model + diff --git a/groundingdino/models/GroundingDINO/ms_deform_attn.py b/groundingdino/models/GroundingDINO/ms_deform_attn.py new file mode 100644 index 0000000000000000000000000000000000000000..5b2a08de645db11b9952171fd25d3a5a816e4fa8 --- /dev/null +++ b/groundingdino/models/GroundingDINO/ms_deform_attn.py @@ -0,0 +1,414 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. 
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from: +# https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/functions/ms_deform_attn_func.py +# https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py +# https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/multi_scale_deform_attn.py +# ------------------------------------------------------------------------------------------------ + +import math +import warnings +from typing import Optional +import loralib as lora +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.init import constant_, xavier_uniform_ + +try: + from groundingdino import _C +except: + warnings.warn("Failed to load custom C++ ops. Running on CPU mode Only!") + + +# helpers +def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + +class MultiScaleDeformableAttnFunction(Function): + @staticmethod + def forward( + ctx, + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step, + ): + ctx.im2col_step = im2col_step + output = _C.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ctx.im2col_step, + ) + ctx.save_for_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + ( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ) = ctx.saved_tensors + grad_value, grad_sampling_loc, grad_attn_weight = _C.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output, + ctx.im2col_step, + ) + + return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None + + +def multi_scale_deformable_attn_pytorch( + value: torch.Tensor, + value_spatial_shapes: torch.Tensor, + sampling_locations: torch.Tensor, + attention_weights: torch.Tensor, +) -> torch.Tensor: + + bs, _, num_heads, embed_dims = value.shape + _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level, (H_, W_) in enumerate(value_spatial_shapes): + # bs, H_*W_, num_heads, embed_dims -> + # bs, H_*W_, num_heads*embed_dims -> + # bs, num_heads*embed_dims, H_*W_ -> + # bs*num_heads, embed_dims, H_, W_ + value_l_ = ( + value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_) + ) + # bs, num_queries, num_heads, num_points, 2 -> + # bs, num_heads, num_queries, num_points, 2 -> + # bs*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1) + # bs*num_heads, embed_dims, num_queries, num_points + sampling_value_l_ = F.grid_sample( + value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False + ) + sampling_value_list.append(sampling_value_l_) + 
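+    # Each entry of sampling_value_list has shape (bs*num_heads, embed_dims, num_queries,
+    # num_points); the levels are stacked and combined with attention_weights below. This
+    # pure-PyTorch path mirrors the fused _C.ms_deform_attn_forward kernel and serves as the
+    # fallback when the compiled CUDA op cannot be used.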
# (bs, num_queries, num_heads, num_levels, num_points) -> + # (bs, num_heads, num_queries, num_levels, num_points) -> + # (bs, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + bs * num_heads, 1, num_queries, num_levels * num_points + ) + output = ( + (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) + .sum(-1) + .view(bs, num_heads * embed_dims, num_queries) + ) + return output.transpose(1, 2).contiguous() + + +class MultiScaleDeformableAttention(nn.Module): + """Multi-Scale Deformable Attention Module used in Deformable-DETR + + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + + Args: + embed_dim (int): The embedding dimension of Attention. Default: 256. + num_heads (int): The number of attention heads. Default: 8. + num_levels (int): The number of feature map used in Attention. Default: 4. + num_points (int): The number of sampling points for each query + in each head. Default: 4. + img2col_steps (int): The step used in image_to_column. Defualt: 64. + dropout (float): Dropout layer used in output. Default: 0.1. + batch_first (bool): if ``True``, then the input and output tensor will be + provided as `(bs, n, embed_dim)`. Default: False. `(n, bs, embed_dim)` + """ + + def __init__( + self, + embed_dim: int = 256, + num_heads: int = 8, + num_levels: int = 4, + num_points: int = 4, + img2col_step: int = 64, + batch_first: bool = False, + ): + super().__init__() + if embed_dim % num_heads != 0: + raise ValueError( + "embed_dim must be divisible by num_heads, but got {} and {}".format( + embed_dim, num_heads + ) + ) + head_dim = embed_dim // num_heads + + self.batch_first = batch_first + + if not _is_power_of_2(head_dim): + warnings.warn( + """ + You'd better set d_model in MSDeformAttn to make sure that + each dim of the attention head a power of 2, which is more efficient. + """ + ) + + self.im2col_step = img2col_step + self.embed_dim = embed_dim + self.num_heads = num_heads + self.num_levels = num_levels + self.num_points = num_points + r = 16 + self.sampling_offsets = lora.Linear(embed_dim, num_heads * num_levels * num_points * 2 , r=r) + self.attention_weights = lora.Linear(embed_dim, num_heads * num_levels * num_points , r=r) + self.value_proj = lora.Linear(embed_dim, embed_dim , r=r) + self.output_proj = lora.Linear(embed_dim, embed_dim , r=r) + + self.init_weights() + + def _reset_parameters(self): + return self.init_weights() + + def init_weights(self): + """ + Default initialization for Parameters of Module. 
+ """ + constant_(self.sampling_offsets.weight.data, 0.0) + thetas = torch.arange(self.num_heads, dtype=torch.float32) * ( + 2.0 * math.pi / self.num_heads + ) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(self.num_heads, 1, 1, 2) + .repeat(1, self.num_levels, self.num_points, 1) + ) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + constant_(self.attention_weights.weight.data, 0.0) + constant_(self.attention_weights.bias.data, 0.0) + xavier_uniform_(self.value_proj.weight.data) + constant_(self.value_proj.bias.data, 0.0) + xavier_uniform_(self.output_proj.weight.data) + constant_(self.output_proj.bias.data, 0.0) + + def freeze_sampling_offsets(self): + print("Freeze sampling offsets") + self.sampling_offsets.weight.requires_grad = False + self.sampling_offsets.bias.requires_grad = False + + def freeze_attention_weights(self): + print("Freeze attention weights") + self.attention_weights.weight.requires_grad = False + self.attention_weights.bias.requires_grad = False + + def forward( + self, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + value: Optional[torch.Tensor] = None, + query_pos: Optional[torch.Tensor] = None, + key_padding_mask: Optional[torch.Tensor] = None, + reference_points: Optional[torch.Tensor] = None, + spatial_shapes: Optional[torch.Tensor] = None, + level_start_index: Optional[torch.Tensor] = None, + **kwargs + ) -> torch.Tensor: + + """Forward Function of MultiScaleDeformableAttention + + Args: + query (torch.Tensor): Query embeddings with shape + `(num_query, bs, embed_dim)` + key (torch.Tensor): Key embeddings with shape + `(num_key, bs, embed_dim)` + value (torch.Tensor): Value embeddings with shape + `(num_key, bs, embed_dim)` + query_pos (torch.Tensor): The position embedding for `query`. Default: None. + key_padding_mask (torch.Tensor): ByteTensor for `query`, with shape `(bs, num_key)`, + indicating which elements within `key` to be ignored in attention. + reference_points (torch.Tensor): The normalized reference points + with shape `(bs, num_query, num_levels, 2)`, + all elements is range in [0, 1], top-left (0, 0), + bottom-right (1, 1), including padding are. + or `(N, Length_{query}, num_levels, 4)`, add additional + two dimensions `(h, w)` to form reference boxes. + spatial_shapes (torch.Tensor): Spatial shape of features in different levels. + With shape `(num_levels, 2)`, last dimension represents `(h, w)`. + level_start_index (torch.Tensor): The start index of each level. A tensor with + shape `(num_levels, )` which can be represented as + `[0, h_0 * w_0, h_0 * w_0 + h_1 * w_1, ...]`. 
+ + Returns: + torch.Tensor: forward results with shape `(num_query, bs, embed_dim)` + """ + + if value is None: + value = query + + if query_pos is not None: + query = query + query_pos + + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], float(0)) + value = value.view(bs, num_value, self.num_heads, -1) + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2 + ) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points + ) + attention_weights = attention_weights.softmax(-1) + attention_weights = attention_weights.view( + bs, + num_query, + self.num_heads, + self.num_levels, + self.num_points, + ) + + # bs, num_query, num_heads, num_levels, num_points, 2 + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = ( + reference_points[:, :, None, :, None, :] + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + ) + elif reference_points.shape[-1] == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets + / self.num_points + * reference_points[:, :, None, :, None, 2:] + * 0.5 + ) + else: + raise ValueError( + "Last dim of reference_points must be 2 or 4, but get {} instead.".format( + reference_points.shape[-1] + ) + ) + + if torch.cuda.is_available() and value.is_cuda: + halffloat = False + if value.dtype == torch.float16: + halffloat = True + value = value.float() + sampling_locations = sampling_locations.float() + attention_weights = attention_weights.float() + + output = MultiScaleDeformableAttnFunction.apply( + value, + spatial_shapes, + level_start_index, + sampling_locations, + attention_weights, + self.im2col_step, + ) + + if halffloat: + output = output.half() + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights + ) + + output = self.output_proj(output) + + if not self.batch_first: + output = output.permute(1, 0, 2) + + return output + + +def create_dummy_class(klass, dependency, message=""): + """ + When a dependency of a class is not available, create a dummy class which throws ImportError + when used. + + Args: + klass (str): name of the class. + dependency (str): name of the dependency. + message: extra message to print + Returns: + class: a class object + """ + err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, klass) + if message: + err = err + " " + message + + class _DummyMetaClass(type): + # throw error on class attribute access + def __getattr__(_, __): # noqa: B902 + raise ImportError(err) + + class _Dummy(object, metaclass=_DummyMetaClass): + # throw error on constructor + def __init__(self, *args, **kwargs): + raise ImportError(err) + + return _Dummy + + +def create_dummy_func(func, dependency, message=""): + """ + When a dependency of a function is not available, create a dummy function which throws + ImportError when used. + + Args: + func (str): name of the function. + dependency (str or list[str]): name(s) of the dependency. 
+ message: extra message to print + Returns: + function: a function object + """ + err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, func) + if message: + err = err + " " + message + + if isinstance(dependency, (list, tuple)): + dependency = ",".join(dependency) + + def _dummy(*args, **kwargs): + raise ImportError(err) + + return _dummy diff --git a/groundingdino/models/GroundingDINO/transformer.py b/groundingdino/models/GroundingDINO/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..e3da439a39c2c82ae135b36275f8b22c70fbd904 --- /dev/null +++ b/groundingdino/models/GroundingDINO/transformer.py @@ -0,0 +1,961 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# DINO +# Copyright (c) 2022 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Conditional DETR Transformer class. +# Copyright (c) 2021 Microsoft. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Modified from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# ------------------------------------------------------------------------ + +from typing import Optional + +import torch +import torch.utils.checkpoint as checkpoint +from torch import Tensor, nn +import loralib as lora +from groundingdino.util.misc import inverse_sigmoid + +from .fuse_modules import BiAttentionBlock +from .ms_deform_attn import MultiScaleDeformableAttention as MSDeformAttn +from .transformer_vanilla import TransformerEncoderLayer +from .utils import ( + MLP, + _get_activation_fn, + _get_clones, + gen_encoder_output_proposals, + gen_sineembed_for_position, + get_sine_pos_embed, +) + + +class Transformer(nn.Module): + def __init__( + self, + d_model=256, + nhead=8, + num_queries=300, + num_encoder_layers=6, + num_unicoder_layers=0, + num_decoder_layers=6, + dim_feedforward=2048, + dropout=0.0, + activation="relu", + normalize_before=False, + return_intermediate_dec=False, + query_dim=4, + num_patterns=0, + # for deformable encoder + num_feature_levels=1, + enc_n_points=4, + dec_n_points=4, + # init query + learnable_tgt_init=False, + # two stage + two_stage_type="no", # ['no', 'standard', 'early', 'combine', 'enceachlayer', 'enclayer1'] + embed_init_tgt=False, + # for text + use_text_enhancer=False, + use_fusion_layer=False, + use_checkpoint=False, + use_transformer_ckpt=False, + use_text_cross_attention=False, + text_dropout=0.1, + fusion_dropout=0.1, + fusion_droppath=0.0, + ): + super().__init__() + self.num_feature_levels = num_feature_levels + self.num_encoder_layers = num_encoder_layers + self.num_unicoder_layers = num_unicoder_layers + self.num_decoder_layers = num_decoder_layers + self.num_queries = num_queries + assert query_dim == 4 + + # choose encoder layer type + encoder_layer = DeformableTransformerEncoderLayer( + d_model, dim_feedforward, dropout, activation, num_feature_levels, nhead, enc_n_points + ) + + if use_text_enhancer: + text_enhance_layer = 
TransformerEncoderLayer( + d_model=d_model, + nhead=nhead // 2, + dim_feedforward=dim_feedforward // 2, + dropout=text_dropout, + ) + else: + text_enhance_layer = None + + if use_fusion_layer: + feature_fusion_layer = BiAttentionBlock( + v_dim=d_model, + l_dim=d_model, + embed_dim=dim_feedforward // 2, + num_heads=nhead // 2, + dropout=fusion_dropout, + drop_path=fusion_droppath, + ) + else: + feature_fusion_layer = None + + encoder_norm = nn.LayerNorm(d_model) if normalize_before else None + assert encoder_norm is None + self.encoder = TransformerEncoder( + encoder_layer, + num_encoder_layers, + d_model=d_model, + num_queries=num_queries, + text_enhance_layer=text_enhance_layer, + feature_fusion_layer=feature_fusion_layer, + use_checkpoint=use_checkpoint, + use_transformer_ckpt=use_transformer_ckpt, + ) + + # choose decoder layer type + decoder_layer = DeformableTransformerDecoderLayer( + d_model, + dim_feedforward, + dropout, + activation, + num_feature_levels, + nhead, + dec_n_points, + use_text_cross_attention=use_text_cross_attention, + ) + + decoder_norm = nn.LayerNorm(d_model) + self.decoder = TransformerDecoder( + decoder_layer, + num_decoder_layers, + decoder_norm, + return_intermediate=return_intermediate_dec, + d_model=d_model, + query_dim=query_dim, + num_feature_levels=num_feature_levels, + ) + + self.d_model = d_model + self.nhead = nhead + self.dec_layers = num_decoder_layers + self.num_queries = num_queries # useful for single stage model only + self.num_patterns = num_patterns + if not isinstance(num_patterns, int): + Warning("num_patterns should be int but {}".format(type(num_patterns))) + self.num_patterns = 0 + + if num_feature_levels > 1: + if self.num_encoder_layers > 0: + self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) + else: + self.level_embed = None + + self.learnable_tgt_init = learnable_tgt_init + assert learnable_tgt_init, "why not learnable_tgt_init" + self.embed_init_tgt = embed_init_tgt + if (two_stage_type != "no" and embed_init_tgt) or (two_stage_type == "no"): + self.tgt_embed = nn.Embedding(self.num_queries, d_model) + nn.init.normal_(self.tgt_embed.weight.data) + else: + self.tgt_embed = None + + # for two stage + self.two_stage_type = two_stage_type + assert two_stage_type in ["no", "standard"], "unknown param {} of two_stage_type".format( + two_stage_type + ) + if two_stage_type == "standard": + # anchor selection at the output of encoder + self.enc_output = nn.Linear(d_model, d_model) + self.enc_output_norm = nn.LayerNorm(d_model) + self.two_stage_wh_embedding = None + + if two_stage_type == "no": + self.init_ref_points(num_queries) # init self.refpoint_embed + + self.enc_out_class_embed = None + self.enc_out_bbox_embed = None + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformAttn): + m._reset_parameters() + if self.num_feature_levels > 1 and self.level_embed is not None: + nn.init.normal_(self.level_embed) + + def get_valid_ratio(self, mask): + _, H, W = mask.shape + valid_H = torch.sum(~mask[:, :, 0], 1) + valid_W = torch.sum(~mask[:, 0, :], 1) + valid_ratio_h = valid_H.float() / H + valid_ratio_w = valid_W.float() / W + valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) + return valid_ratio + + def init_ref_points(self, use_num_queries): + self.refpoint_embed = nn.Embedding(use_num_queries, 4) + + def forward(self, srcs, masks, refpoint_embed, pos_embeds, tgt, 
attn_mask=None, text_dict=None): + """ + Input: + - srcs: List of multi features [bs, ci, hi, wi] + - masks: List of multi masks [bs, hi, wi] + - refpoint_embed: [bs, num_dn, 4]. None in infer + - pos_embeds: List of multi pos embeds [bs, ci, hi, wi] + - tgt: [bs, num_dn, d_model]. None in infer + + """ + # prepare input for encoder + src_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): + bs, c, h, w = src.shape + spatial_shape = (h, w) + spatial_shapes.append(spatial_shape) + + src = src.flatten(2).transpose(1, 2) # bs, hw, c + mask = mask.flatten(1) # bs, hw + pos_embed = pos_embed.flatten(2).transpose(1, 2) # bs, hw, c + if self.num_feature_levels > 1 and self.level_embed is not None: + lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) + else: + lvl_pos_embed = pos_embed + lvl_pos_embed_flatten.append(lvl_pos_embed) + src_flatten.append(src) + mask_flatten.append(mask) + src_flatten = torch.cat(src_flatten, 1) # bs, \sum{hxw}, c + mask_flatten = torch.cat(mask_flatten, 1) # bs, \sum{hxw} + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) # bs, \sum{hxw}, c + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=src_flatten.device + ) + level_start_index = torch.cat( + (spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]) + ) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) + + # two stage + enc_topk_proposals = enc_refpoint_embed = None + + ######################################################### + # Begin Encoder + ######################################################### + memory, memory_text = self.encoder( + src_flatten, + pos=lvl_pos_embed_flatten, + level_start_index=level_start_index, + spatial_shapes=spatial_shapes, + valid_ratios=valid_ratios, + key_padding_mask=mask_flatten, + memory_text=text_dict["encoded_text"], + text_attention_mask=~text_dict["text_token_mask"], + # we ~ the mask . 
False means use the token; True means pad the token + position_ids=text_dict["position_ids"], + text_self_attention_masks=text_dict["text_self_attention_masks"], + ) + ######################################################### + # End Encoder + # - memory: bs, \sum{hw}, c + # - mask_flatten: bs, \sum{hw} + # - lvl_pos_embed_flatten: bs, \sum{hw}, c + # - enc_intermediate_output: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c) + # - enc_intermediate_refpoints: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c) + ######################################################### + text_dict["encoded_text"] = memory_text + # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': + # if memory.isnan().any() | memory.isinf().any(): + # import ipdb; ipdb.set_trace() + + if self.two_stage_type == "standard": + output_memory, output_proposals = gen_encoder_output_proposals( + memory, mask_flatten, spatial_shapes + ) + output_memory = self.enc_output_norm(self.enc_output(output_memory)) + + if text_dict is not None: + enc_outputs_class_unselected = self.enc_out_class_embed(output_memory, text_dict) + else: + enc_outputs_class_unselected = self.enc_out_class_embed(output_memory) + + topk_logits = enc_outputs_class_unselected.max(-1)[0] + enc_outputs_coord_unselected = ( + self.enc_out_bbox_embed(output_memory) + output_proposals + ) # (bs, \sum{hw}, 4) unsigmoid + topk = self.num_queries + + topk_proposals = torch.topk(topk_logits, topk, dim=1)[1] # bs, nq + + # gather boxes + refpoint_embed_undetach = torch.gather( + enc_outputs_coord_unselected, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + ) # unsigmoid + refpoint_embed_ = refpoint_embed_undetach.detach() + init_box_proposal = torch.gather( + output_proposals, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + ).sigmoid() # sigmoid + + # gather tgt + tgt_undetach = torch.gather( + output_memory, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model) + ) + if self.embed_init_tgt: + tgt_ = ( + self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1) + ) # nq, bs, d_model + else: + tgt_ = tgt_undetach.detach() + + if refpoint_embed is not None: + refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1) + tgt = torch.cat([tgt, tgt_], dim=1) + else: + refpoint_embed, tgt = refpoint_embed_, tgt_ + + elif self.two_stage_type == "no": + tgt_ = ( + self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1) + ) # nq, bs, d_model + refpoint_embed_ = ( + self.refpoint_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1) + ) # nq, bs, 4 + + if refpoint_embed is not None: + refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1) + tgt = torch.cat([tgt, tgt_], dim=1) + else: + refpoint_embed, tgt = refpoint_embed_, tgt_ + + if self.num_patterns > 0: + tgt_embed = tgt.repeat(1, self.num_patterns, 1) + refpoint_embed = refpoint_embed.repeat(1, self.num_patterns, 1) + tgt_pat = self.patterns.weight[None, :, :].repeat_interleave( + self.num_queries, 1 + ) # 1, n_q*n_pat, d_model + tgt = tgt_embed + tgt_pat + + init_box_proposal = refpoint_embed_.sigmoid() + + else: + raise NotImplementedError("unknown two_stage_type {}".format(self.two_stage_type)) + ######################################################### + # End preparing tgt + # - tgt: bs, NQ, d_model + # - refpoint_embed(unsigmoid): bs, NQ, d_model + ######################################################### + + ######################################################### + # Begin Decoder + ######################################################### + hs, references = 
self.decoder( + tgt=tgt.transpose(0, 1), + memory=memory.transpose(0, 1), + memory_key_padding_mask=mask_flatten, + pos=lvl_pos_embed_flatten.transpose(0, 1), + refpoints_unsigmoid=refpoint_embed.transpose(0, 1), + level_start_index=level_start_index, + spatial_shapes=spatial_shapes, + valid_ratios=valid_ratios, + tgt_mask=attn_mask, + memory_text=text_dict["encoded_text"], + text_attention_mask=~text_dict["text_token_mask"], + # we ~ the mask . False means use the token; True means pad the token + ) + ######################################################### + # End Decoder + # hs: n_dec, bs, nq, d_model + # references: n_dec+1, bs, nq, query_dim + ######################################################### + + ######################################################### + # Begin postprocess + ######################################################### + if self.two_stage_type == "standard": + hs_enc = tgt_undetach.unsqueeze(0) + ref_enc = refpoint_embed_undetach.sigmoid().unsqueeze(0) + else: + hs_enc = ref_enc = None + ######################################################### + # End postprocess + # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or (n_enc, bs, nq, d_model) or None + # ref_enc: (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or (n_enc, bs, nq, d_model) or None + ######################################################### + + return hs, references, hs_enc, ref_enc, init_box_proposal + # hs: (n_dec, bs, nq, d_model) + # references: sigmoid coordinates. (n_dec+1, bs, bq, 4) + # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or None + # ref_enc: sigmoid coordinates. \ + # (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or None + + +class TransformerEncoder(nn.Module): + def __init__( + self, + encoder_layer, + num_layers, + d_model=256, + num_queries=300, + enc_layer_share=False, + text_enhance_layer=None, + feature_fusion_layer=None, + use_checkpoint=False, + use_transformer_ckpt=False, + ): + """_summary_ + + Args: + encoder_layer (_type_): _description_ + num_layers (_type_): _description_ + norm (_type_, optional): _description_. Defaults to None. + d_model (int, optional): _description_. Defaults to 256. + num_queries (int, optional): _description_. Defaults to 300. + enc_layer_share (bool, optional): _description_. Defaults to False. 
+ + """ + super().__init__() + # prepare layers + self.layers = [] + self.text_layers = [] + self.fusion_layers = [] + if num_layers > 0: + self.layers = _get_clones(encoder_layer, num_layers, layer_share=enc_layer_share) + + if text_enhance_layer is not None: + self.text_layers = _get_clones( + text_enhance_layer, num_layers, layer_share=enc_layer_share + ) + if feature_fusion_layer is not None: + self.fusion_layers = _get_clones( + feature_fusion_layer, num_layers, layer_share=enc_layer_share + ) + else: + self.layers = [] + del encoder_layer + + if text_enhance_layer is not None: + self.text_layers = [] + del text_enhance_layer + if feature_fusion_layer is not None: + self.fusion_layers = [] + del feature_fusion_layer + + self.query_scale = None + self.num_queries = num_queries + self.num_layers = num_layers + self.d_model = d_model + + self.use_checkpoint = use_checkpoint + self.use_transformer_ckpt = use_transformer_ckpt + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + reference_points_list = [] + for lvl, (H_, W_) in enumerate(spatial_shapes): + + ref_y, ref_x = torch.meshgrid( + torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device), + torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device), + ) + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def forward( + self, + # for images + src: Tensor, + pos: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, + valid_ratios: Tensor, + key_padding_mask: Tensor, + # for texts + memory_text: Tensor = None, + text_attention_mask: Tensor = None, + pos_text: Tensor = None, + text_self_attention_masks: Tensor = None, + position_ids: Tensor = None, + ): + """ + Input: + - src: [bs, sum(hi*wi), 256] + - pos: pos embed for src. [bs, sum(hi*wi), 256] + - spatial_shapes: h,w of each level [num_level, 2] + - level_start_index: [num_level] start point of level in sum(hi*wi). 
+ - valid_ratios: [bs, num_level, 2] + - key_padding_mask: [bs, sum(hi*wi)] + + - memory_text: bs, n_text, 256 + - text_attention_mask: bs, n_text + False for no padding; True for padding + - pos_text: bs, n_text, 256 + + - position_ids: bs, n_text + Intermedia: + - reference_points: [bs, sum(hi*wi), num_level, 2] + Outpus: + - output: [bs, sum(hi*wi), 256] + """ + + output = src + + # preparation and reshape + if self.num_layers > 0: + reference_points = self.get_reference_points( + spatial_shapes, valid_ratios, device=src.device + ) + + if self.text_layers: + # generate pos_text + bs, n_text, text_dim = memory_text.shape + if pos_text is None and position_ids is None: + pos_text = ( + torch.arange(n_text, device=memory_text.device) + .float() + .unsqueeze(0) + .unsqueeze(-1) + .repeat(bs, 1, 1) + ) + pos_text = get_sine_pos_embed(pos_text, num_pos_feats=256, exchange_xy=False) + if position_ids is not None: + pos_text = get_sine_pos_embed( + position_ids[..., None], num_pos_feats=256, exchange_xy=False + ) + + # main process + for layer_id, layer in enumerate(self.layers): + # if output.isnan().any() or memory_text.isnan().any(): + # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': + # import ipdb; ipdb.set_trace() + if self.fusion_layers: + if self.use_checkpoint: + output, memory_text = checkpoint.checkpoint( + self.fusion_layers[layer_id], + output, + memory_text, + key_padding_mask, + text_attention_mask, + ) + else: + output, memory_text = self.fusion_layers[layer_id]( + v=output, + l=memory_text, + attention_mask_v=key_padding_mask, + attention_mask_l=text_attention_mask, + ) + + if self.text_layers: + memory_text = self.text_layers[layer_id]( + src=memory_text.transpose(0, 1), + src_mask=~text_self_attention_masks, # note we use ~ for mask here + src_key_padding_mask=text_attention_mask, + pos=(pos_text.transpose(0, 1) if pos_text is not None else None), + ).transpose(0, 1) + + # main process + if self.use_transformer_ckpt: + output = checkpoint.checkpoint( + layer, + output, + pos, + reference_points, + spatial_shapes, + level_start_index, + key_padding_mask, + ) + else: + output = layer( + src=output, + pos=pos, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + key_padding_mask=key_padding_mask, + ) + + return output, memory_text + + +class TransformerDecoder(nn.Module): + def __init__( + self, + decoder_layer, + num_layers, + norm=None, + return_intermediate=False, + d_model=256, + query_dim=4, + num_feature_levels=1, + ): + super().__init__() + if num_layers > 0: + self.layers = _get_clones(decoder_layer, num_layers) + else: + self.layers = [] + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + assert return_intermediate, "support return_intermediate only" + self.query_dim = query_dim + assert query_dim in [2, 4], "query_dim should be 2/4 but {}".format(query_dim) + self.num_feature_levels = num_feature_levels + + self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2) + self.query_pos_sine_scale = None + + self.query_scale = None + self.bbox_embed = None + self.class_embed = None + + self.d_model = d_model + + self.ref_anchor_head = None + + def forward( + self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + refpoints_unsigmoid: Optional[Tensor] = None, # 
num_queries, bs, 2 + # for memory + level_start_index: Optional[Tensor] = None, # num_levels + spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2 + valid_ratios: Optional[Tensor] = None, + # for text + memory_text: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + ): + """ + Input: + - tgt: nq, bs, d_model + - memory: hw, bs, d_model + - pos: hw, bs, d_model + - refpoints_unsigmoid: nq, bs, 2/4 + - valid_ratios/spatial_shapes: bs, nlevel, 2 + """ + output = tgt + + intermediate = [] + reference_points = refpoints_unsigmoid.sigmoid() + ref_points = [reference_points] + + for layer_id, layer in enumerate(self.layers): + + if reference_points.shape[-1] == 4: + reference_points_input = ( + reference_points[:, :, None] + * torch.cat([valid_ratios, valid_ratios], -1)[None, :] + ) # nq, bs, nlevel, 4 + else: + assert reference_points.shape[-1] == 2 + reference_points_input = reference_points[:, :, None] * valid_ratios[None, :] + query_sine_embed = gen_sineembed_for_position( + reference_points_input[:, :, 0, :] + ) # nq, bs, 256*2 + + # conditional query + raw_query_pos = self.ref_point_head(query_sine_embed) # nq, bs, 256 + pos_scale = self.query_scale(output) if self.query_scale is not None else 1 + query_pos = pos_scale * raw_query_pos + # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': + # if query_pos.isnan().any() | query_pos.isinf().any(): + # import ipdb; ipdb.set_trace() + + # main process + output = layer( + tgt=output, + tgt_query_pos=query_pos, + tgt_query_sine_embed=query_sine_embed, + tgt_key_padding_mask=tgt_key_padding_mask, + tgt_reference_points=reference_points_input, + memory_text=memory_text, + text_attention_mask=text_attention_mask, + memory=memory, + memory_key_padding_mask=memory_key_padding_mask, + memory_level_start_index=level_start_index, + memory_spatial_shapes=spatial_shapes, + memory_pos=pos, + self_attn_mask=tgt_mask, + cross_attn_mask=memory_mask, + ) + if output.isnan().any() | output.isinf().any(): + print(f"output layer_id {layer_id} is nan") + try: + num_nan = output.isnan().sum().item() + num_inf = output.isinf().sum().item() + print(f"num_nan {num_nan}, num_inf {num_inf}") + except Exception as e: + print(e) + # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': + # import ipdb; ipdb.set_trace() + + # iter update + if self.bbox_embed is not None: + # box_holder = self.bbox_embed(output) + # box_holder[..., :self.query_dim] += inverse_sigmoid(reference_points) + # new_reference_points = box_holder[..., :self.query_dim].sigmoid() + + reference_before_sigmoid = inverse_sigmoid(reference_points) + delta_unsig = self.bbox_embed[layer_id](output) + outputs_unsig = delta_unsig + reference_before_sigmoid + new_reference_points = outputs_unsig.sigmoid() + + reference_points = new_reference_points.detach() + # if layer_id != self.num_layers - 1: + ref_points.append(new_reference_points) + + intermediate.append(self.norm(output)) + + return [ + [itm_out.transpose(0, 1) for itm_out in intermediate], + [itm_refpoint.transpose(0, 1) for itm_refpoint in ref_points], + ] + + +class DeformableTransformerEncoderLayer(nn.Module): + def __init__( + self, + d_model=256, + d_ffn=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_heads=8, + n_points=4, + ): + super().__init__() + + # self attention + self.self_attn = MSDeformAttn( + embed_dim=d_model, + num_levels=n_levels, + num_heads=n_heads, + num_points=n_points, + batch_first=True, + ) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # ffn + r 
= 16 + self.linear1 = lora.Linear(d_model, d_ffn , r=r) + self.activation = _get_activation_fn(activation, d_model=d_ffn) + self.dropout2 = nn.Dropout(dropout) + self.linear2 = lora.Linear(d_ffn, d_model , r=r) + self.dropout3 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model) + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, src): + src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) + src = src + self.dropout3(src2) + src = self.norm2(src) + return src + + def forward( + self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None + ): + # self attention + # import ipdb; ipdb.set_trace() + src2 = self.self_attn( + query=self.with_pos_embed(src, pos), + reference_points=reference_points, + value=src, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + key_padding_mask=key_padding_mask, + ) + src = src + self.dropout1(src2) + src = self.norm1(src) + + # ffn + src = self.forward_ffn(src) + + return src + + +class DeformableTransformerDecoderLayer(nn.Module): + def __init__( + self, + d_model=256, + d_ffn=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_heads=8, + n_points=4, + use_text_feat_guide=False, + use_text_cross_attention=False, + ): + super().__init__() + + # cross attention + self.cross_attn = MSDeformAttn( + embed_dim=d_model, + num_levels=n_levels, + num_heads=n_heads, + num_points=n_points, + batch_first=True, + ) + self.dropout1 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.norm1 = nn.LayerNorm(d_model) + + # cross attention text + if use_text_cross_attention: + self.ca_text = nn.MultiheadAttention(d_model, n_heads, dropout=dropout) + self.catext_dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.catext_norm = nn.LayerNorm(d_model) + + # self attention + self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout) + self.dropout2 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.norm2 = nn.LayerNorm(d_model) + + # ffn + r = 16 + self.linear1 = lora.Linear(d_model, d_ffn , r=r) + self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1) + self.dropout3 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.linear2 = lora.Linear(d_ffn, d_model , r=r) + self.dropout4 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.norm3 = nn.LayerNorm(d_model) + + self.key_aware_proj = None + self.use_text_feat_guide = use_text_feat_guide + assert not use_text_feat_guide + self.use_text_cross_attention = use_text_cross_attention + + def rm_self_attn_modules(self): + self.self_attn = None + self.dropout2 = None + self.norm2 = None + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt): + with torch.cuda.amp.autocast(enabled=False): + tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout4(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward( + self, + # for tgt + tgt: Optional[Tensor], # nq, bs, d_model + tgt_query_pos: Optional[Tensor] = None, # pos for query. MLP(Sine(pos)) + tgt_query_sine_embed: Optional[Tensor] = None, # pos for query. 
Sine(pos) + tgt_key_padding_mask: Optional[Tensor] = None, + tgt_reference_points: Optional[Tensor] = None, # nq, bs, 4 + memory_text: Optional[Tensor] = None, # bs, num_token, d_model + text_attention_mask: Optional[Tensor] = None, # bs, num_token + # for memory + memory: Optional[Tensor] = None, # hw, bs, d_model + memory_key_padding_mask: Optional[Tensor] = None, + memory_level_start_index: Optional[Tensor] = None, # num_levels + memory_spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2 + memory_pos: Optional[Tensor] = None, # pos for memory + # sa + self_attn_mask: Optional[Tensor] = None, # mask used for self-attention + cross_attn_mask: Optional[Tensor] = None, # mask used for cross-attention + ): + """ + Input: + - tgt/tgt_query_pos: nq, bs, d_model + - + """ + assert cross_attn_mask is None + + # self attention + if self.self_attn is not None: + # import ipdb; ipdb.set_trace() + q = k = self.with_pos_embed(tgt, tgt_query_pos) + tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + if self.use_text_cross_attention: + tgt2 = self.ca_text( + self.with_pos_embed(tgt, tgt_query_pos), + memory_text.transpose(0, 1), + memory_text.transpose(0, 1), + key_padding_mask=text_attention_mask, + )[0] + tgt = tgt + self.catext_dropout(tgt2) + tgt = self.catext_norm(tgt) + + tgt2 = self.cross_attn( + query=self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1), + reference_points=tgt_reference_points.transpose(0, 1).contiguous(), + value=memory.transpose(0, 1), + spatial_shapes=memory_spatial_shapes, + level_start_index=memory_level_start_index, + key_padding_mask=memory_key_padding_mask, + ).transpose(0, 1) + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + + # ffn + tgt = self.forward_ffn(tgt) + + return tgt + + +def build_transformer(args): + return Transformer( + d_model=args.hidden_dim, + dropout=args.dropout, + nhead=args.nheads, + num_queries=args.num_queries, + dim_feedforward=args.dim_feedforward, + num_encoder_layers=args.enc_layers, + num_decoder_layers=args.dec_layers, + normalize_before=args.pre_norm, + return_intermediate_dec=True, + query_dim=args.query_dim, + activation=args.transformer_activation, + num_patterns=args.num_patterns, + num_feature_levels=args.num_feature_levels, + enc_n_points=args.enc_n_points, + dec_n_points=args.dec_n_points, + learnable_tgt_init=True, + # two stage + two_stage_type=args.two_stage_type, # ['no', 'standard', 'early'] + embed_init_tgt=args.embed_init_tgt, + use_text_enhancer=args.use_text_enhancer, + use_fusion_layer=args.use_fusion_layer, + use_checkpoint=args.use_checkpoint, + use_transformer_ckpt=args.use_transformer_ckpt, + use_text_cross_attention=args.use_text_cross_attention, + text_dropout=args.text_dropout, + fusion_dropout=args.fusion_dropout, + fusion_droppath=args.fusion_droppath, + ) diff --git a/groundingdino/models/GroundingDINO/transformer_vanilla.py b/groundingdino/models/GroundingDINO/transformer_vanilla.py new file mode 100644 index 0000000000000000000000000000000000000000..710e1c21c87281def4da9af9670e1ad880dc0f5f --- /dev/null +++ b/groundingdino/models/GroundingDINO/transformer_vanilla.py @@ -0,0 +1,124 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. 
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +DETR Transformer class. + +Copy-paste from torch.nn.Transformer with modifications: + * positional encodings are passed in MHattention + * extra LN at the end of encoder is removed + * decoder returns a stack of activations from all decoding layers +""" +from typing import Optional + +import torch +import torch.nn.functional as F +from torch import Tensor, nn +import loralib as lora +from .utils import ( + MLP, + _get_activation_fn, + _get_clones, + gen_encoder_output_proposals, + gen_sineembed_for_position, + sigmoid_focal_loss, +) + + +class TextTransformer(nn.Module): + def __init__(self, num_layers, d_model=256, nheads=8, dim_feedforward=2048, dropout=0.1): + super().__init__() + self.num_layers = num_layers + self.d_model = d_model + self.nheads = nheads + self.dim_feedforward = dim_feedforward + self.norm = None + + single_encoder_layer = TransformerEncoderLayer( + d_model=d_model, nhead=nheads, dim_feedforward=dim_feedforward, dropout=dropout + ) + self.layers = _get_clones(single_encoder_layer, num_layers) + + def forward(self, memory_text: torch.Tensor, text_attention_mask: torch.Tensor): + """ + + Args: + text_attention_mask: bs, num_token + memory_text: bs, num_token, d_model + + Raises: + RuntimeError: _description_ + + Returns: + output: bs, num_token, d_model + """ + + output = memory_text.transpose(0, 1) + + for layer in self.layers: + output = layer(output, src_key_padding_mask=text_attention_mask) + + if self.norm is not None: + output = self.norm(output) + + return output.transpose(0, 1) + + +class TransformerEncoderLayer(nn.Module): + def __init__( + self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + ): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + r=16 + self.linear1 = lora.Linear(d_model, dim_feedforward , r=r) + self.dropout = nn.Dropout(dropout) + self.linear2 = lora.Linear(dim_feedforward, d_model , r=r) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + self.nhead = nhead + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward( + self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + ): + # repeat attn mask + if src_mask.dim() == 3 and src_mask.shape[0] == src.shape[1]: + # bs, num_q, num_k + src_mask = src_mask.repeat(self.nhead, 1, 1) + + q = k = self.with_pos_embed(src, pos) + + src2 = self.self_attn(q, k, value=src, attn_mask=src_mask)[0] + + # src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src diff --git a/groundingdino/models/GroundingDINO/utils.py 
b/groundingdino/models/GroundingDINO/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..319c02da18b1da17f64bd6e03372ccab8ec21c87 --- /dev/null +++ b/groundingdino/models/GroundingDINO/utils.py @@ -0,0 +1,269 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ + +import copy +import math + +import torch +import torch.nn.functional as F +from torch import Tensor, nn +import loralib as lora + +def _get_clones(module, N, layer_share=False): + # import ipdb; ipdb.set_trace() + if layer_share: + return nn.ModuleList([module for i in range(N)]) + else: + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def get_sine_pos_embed( + pos_tensor: torch.Tensor, + num_pos_feats: int = 128, + temperature: int = 10000, + exchange_xy: bool = True, +): + """generate sine position embedding from a position tensor + Args: + pos_tensor (torch.Tensor): shape: [..., n]. + num_pos_feats (int): projected shape for each float in the tensor. + temperature (int): temperature in the sine/cosine function. + exchange_xy (bool, optional): exchange pos x and pos y. \ + For example, input tensor is [x,y], the results will be [pos(y), pos(x)]. Defaults to True. + Returns: + pos_embed (torch.Tensor): shape: [..., n*num_pos_feats]. + """ + scale = 2 * math.pi + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + + def sine_func(x: torch.Tensor): + sin_x = x * scale / dim_t + sin_x = torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), dim=3).flatten(2) + return sin_x + + pos_res = [sine_func(x) for x in pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1)] + if exchange_xy: + pos_res[0], pos_res[1] = pos_res[1], pos_res[0] + pos_res = torch.cat(pos_res, dim=-1) + return pos_res + + +def gen_encoder_output_proposals( + memory: Tensor, memory_padding_mask: Tensor, spatial_shapes: Tensor, learnedwh=None +): + """ + Input: + - memory: bs, \sum{hw}, d_model + - memory_padding_mask: bs, \sum{hw} + - spatial_shapes: nlevel, 2 + - learnedwh: 2 + Output: + - output_memory: bs, \sum{hw}, d_model + - output_proposals: bs, \sum{hw}, 4 + """ + N_, S_, C_ = memory.shape + proposals = [] + _cur = 0 + for lvl, (H_, W_) in enumerate(spatial_shapes): + mask_flatten_ = memory_padding_mask[:, _cur : (_cur + H_ * W_)].view(N_, H_, W_, 1) + valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + # import ipdb; ipdb.set_trace() + + grid_y, grid_x = torch.meshgrid( + torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device), + torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device), + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) # H_, W_, 2 + + scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale + + if learnedwh is not None: + # import ipdb; ipdb.set_trace() + wh = torch.ones_like(grid) * learnedwh.sigmoid() * (2.0**lvl) + else: + wh = torch.ones_like(grid) * 0.05 * (2.0**lvl) + + # scale = torch.cat([W_[None].unsqueeze(-1), H_[None].unsqueeze(-1)], 1).view(1, 1, 1, 
2).repeat(N_, 1, 1, 1) + # grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale + # wh = torch.ones_like(grid) / scale + proposal = torch.cat((grid, wh), -1).view(N_, -1, 4) + proposals.append(proposal) + _cur += H_ * W_ + # import ipdb; ipdb.set_trace() + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all( + -1, keepdim=True + ) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) # unsigmoid + output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + + output_memory = memory + output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0)) + output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) + + # output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf')) + # output_memory = output_memory.masked_fill(~output_proposals_valid, float('inf')) + + return output_memory, output_proposals + + +class RandomBoxPerturber: + def __init__( + self, x_noise_scale=0.2, y_noise_scale=0.2, w_noise_scale=0.2, h_noise_scale=0.2 + ) -> None: + self.noise_scale = torch.Tensor( + [x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale] + ) + + def __call__(self, refanchors: Tensor) -> Tensor: + nq, bs, query_dim = refanchors.shape + device = refanchors.device + + noise_raw = torch.rand_like(refanchors) + noise_scale = self.noise_scale.to(device)[:query_dim] + + new_refanchors = refanchors * (1 + (noise_raw - 0.5) * noise_scale) + return new_refanchors.clamp_(0, 1) + + +def sigmoid_focal_loss( + inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2, no_reduction=False +): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). + gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. 
+ Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + if no_reduction: + return loss + + return loss.mean(1).sum() / num_boxes + + +class MLP(nn.Module): + """Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + r=16 + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList( + [lora.Linear(n, k, r=r) for n, k in zip([input_dim] + h, h + [output_dim])] + ) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +def _get_activation_fn(activation, d_model=256, batch_dim=0): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + if activation == "prelu": + return nn.PReLU() + if activation == "selu": + return F.selu + + raise RuntimeError(f"activation should be relu/gelu, not {activation}.") + + +def gen_sineembed_for_position(pos_tensor): + # n_query, bs, _ = pos_tensor.size() + # sineembed_tensor = torch.zeros(n_query, bs, 256) + scale = 2 * math.pi + dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) + dim_t = 10000 ** (2 * (torch.div(dim_t, 2, rounding_mode='floor')) / 128) + x_embed = pos_tensor[:, :, 0] * scale + y_embed = pos_tensor[:, :, 1] * scale + pos_x = x_embed[:, :, None] / dim_t + pos_y = y_embed[:, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) + pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) + if pos_tensor.size(-1) == 2: + pos = torch.cat((pos_y, pos_x), dim=2) + elif pos_tensor.size(-1) == 4: + w_embed = pos_tensor[:, :, 2] * scale + pos_w = w_embed[:, :, None] / dim_t + pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) + + h_embed = pos_tensor[:, :, 3] * scale + pos_h = h_embed[:, :, None] / dim_t + pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) + + pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) + else: + raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1))) + return pos + + +class ContrastiveEmbed(nn.Module): + def __init__(self, max_text_len=256): + """ + Args: + max_text_len: max length of text. + """ + super().__init__() + self.max_text_len = max_text_len + + def forward(self, x, text_dict): + """_summary_ + + Args: + x (_type_): _description_ + text_dict (_type_): _description_ + { + 'encoded_text': encoded_text, # bs, 195, d_model + 'text_token_mask': text_token_mask, # bs, 195 + # True for used tokens. 
False for padding tokens + } + Returns: + _type_: _description_ + """ + assert isinstance(text_dict, dict) + + y = text_dict["encoded_text"] + text_token_mask = text_dict["text_token_mask"] + + res = x @ y.transpose(-1, -2) + res.masked_fill_(~text_token_mask[:, None, :], float("-inf")) + + # padding to max_text_len + new_res = torch.full((*res.shape[:-1], self.max_text_len), float("-inf"), device=res.device) + new_res[..., : res.shape[-1]] = res + + return new_res diff --git a/groundingdino/models/__init__.py b/groundingdino/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e3413961d1d184b99835eb1e919b052d70298bc6 --- /dev/null +++ b/groundingdino/models/__init__.py @@ -0,0 +1,18 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from .GroundingDINO import build_groundingdino + + +def build_model(args): + # we use register to maintain models from catdet6 on. + from .registry import MODULE_BUILD_FUNCS + + assert args.modelname in MODULE_BUILD_FUNCS._module_dict + build_func = MODULE_BUILD_FUNCS.get(args.modelname) + model = build_func(args) + return model diff --git a/groundingdino/models/__pycache__/__init__.cpython-310.pyc b/groundingdino/models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c95eabc05ff2b971fe109f8ea61ed7e890e3aaca Binary files /dev/null and b/groundingdino/models/__pycache__/__init__.cpython-310.pyc differ diff --git a/groundingdino/models/__pycache__/registry.cpython-310.pyc b/groundingdino/models/__pycache__/registry.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d02ec1ce4fd095816eb446f309d71aac0286fd21 Binary files /dev/null and b/groundingdino/models/__pycache__/registry.cpython-310.pyc differ diff --git a/groundingdino/models/registry.py b/groundingdino/models/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..2d22a59eec79a2a19b83fa1779f2adaf5753aec6 --- /dev/null +++ b/groundingdino/models/registry.py @@ -0,0 +1,66 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. 
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# -*- coding: utf-8 -*- +# @Author: Yihao Chen +# @Date: 2021-08-16 16:03:17 +# @Last Modified by: Shilong Liu +# @Last Modified time: 2022-01-23 15:26 +# modified from mmcv + +import inspect +from functools import partial + + +class Registry(object): + def __init__(self, name): + self._name = name + self._module_dict = dict() + + def __repr__(self): + format_str = self.__class__.__name__ + "(name={}, items={})".format( + self._name, list(self._module_dict.keys()) + ) + return format_str + + def __len__(self): + return len(self._module_dict) + + @property + def name(self): + return self._name + + @property + def module_dict(self): + return self._module_dict + + def get(self, key): + return self._module_dict.get(key, None) + + def registe_with_name(self, module_name=None, force=False): + return partial(self.register, module_name=module_name, force=force) + + def register(self, module_build_function, module_name=None, force=False): + """Register a module build function. + Args: + module (:obj:`nn.Module`): Module to be registered. + """ + if not inspect.isfunction(module_build_function): + raise TypeError( + "module_build_function must be a function, but got {}".format( + type(module_build_function) + ) + ) + if module_name is None: + module_name = module_build_function.__name__ + if not force and module_name in self._module_dict: + raise KeyError("{} is already registered in {}".format(module_name, self.name)) + self._module_dict[module_name] = module_build_function + + return module_build_function + + +MODULE_BUILD_FUNCS = Registry("model build functions") diff --git a/groundingdino/util/__pycache__/__init__.cpython-310.pyc b/groundingdino/util/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce387dfadd7477794d67e6eedf0eddc016842c7e Binary files /dev/null and b/groundingdino/util/__pycache__/__init__.cpython-310.pyc differ diff --git a/groundingdino/util/__pycache__/box_ops.cpython-310.pyc b/groundingdino/util/__pycache__/box_ops.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..13628860081b7d499973b6c880a9353d357f70b1 Binary files /dev/null and b/groundingdino/util/__pycache__/box_ops.cpython-310.pyc differ diff --git a/groundingdino/util/__pycache__/get_tokenlizer.cpython-310.pyc b/groundingdino/util/__pycache__/get_tokenlizer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ad4b6248b5c5f091caf6fbd0eff172a8edda60dd Binary files /dev/null and b/groundingdino/util/__pycache__/get_tokenlizer.cpython-310.pyc differ diff --git a/groundingdino/util/__pycache__/misc.cpython-310.pyc b/groundingdino/util/__pycache__/misc.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a15fdbb4627cb09e7d5c95dacf06d25636a37d31 Binary files /dev/null and b/groundingdino/util/__pycache__/misc.cpython-310.pyc differ diff --git a/groundingdino/util/__pycache__/slconfig.cpython-310.pyc b/groundingdino/util/__pycache__/slconfig.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a12f5dc0208a5b85ec8553c39867cf32d394626d Binary files /dev/null and b/groundingdino/util/__pycache__/slconfig.cpython-310.pyc differ diff --git a/groundingdino/util/__pycache__/utils.cpython-310.pyc b/groundingdino/util/__pycache__/utils.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..0e971c877dcdc5806ad2c179adb479b95bf2fa19 Binary files /dev/null and b/groundingdino/util/__pycache__/utils.cpython-310.pyc differ diff --git a/groundingdino/util/__pycache__/visualizer.cpython-310.pyc b/groundingdino/util/__pycache__/visualizer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e09b7736c2156631e6c9a9be51c69c8a589e35d8 Binary files /dev/null and b/groundingdino/util/__pycache__/visualizer.cpython-310.pyc differ diff --git a/groundingdino/util/__pycache__/vl_utils.cpython-310.pyc b/groundingdino/util/__pycache__/vl_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4838e8a99590901eb05a5e0b05ef9b7537b11733 Binary files /dev/null and b/groundingdino/util/__pycache__/vl_utils.cpython-310.pyc differ diff --git a/groundingdino/util/get_tokenlizer.py b/groundingdino/util/get_tokenlizer.py new file mode 100644 index 0000000000000000000000000000000000000000..dd2d972b4278e04a1ebef7d5e77aecd4eaf4205b --- /dev/null +++ b/groundingdino/util/get_tokenlizer.py @@ -0,0 +1,29 @@ +from transformers import AutoTokenizer, BertModel, BertTokenizer, RobertaModel, RobertaTokenizerFast +import os + +def get_tokenlizer(text_encoder_type): + if not isinstance(text_encoder_type, str): + # print("text_encoder_type is not a str") + if hasattr(text_encoder_type, "text_encoder_type"): + text_encoder_type = text_encoder_type.text_encoder_type + elif text_encoder_type.get("text_encoder_type", False): + text_encoder_type = text_encoder_type.get("text_encoder_type") + elif os.path.isdir(text_encoder_type) and os.path.exists(text_encoder_type): + pass + else: + raise ValueError( + "Unknown type of text_encoder_type: {}".format(type(text_encoder_type)) + ) + print("final text_encoder_type: {}".format(text_encoder_type)) + + tokenizer = AutoTokenizer.from_pretrained(text_encoder_type) + return tokenizer + + +def get_pretrained_language_model(text_encoder_type): + if text_encoder_type == "bert-base-uncased" or (os.path.isdir(text_encoder_type) and os.path.exists(text_encoder_type)): + return BertModel.from_pretrained(text_encoder_type) + if text_encoder_type == "roberta-base": + return RobertaModel.from_pretrained(text_encoder_type) + + raise ValueError("Unknown text_encoder_type {}".format(text_encoder_type)) diff --git a/groundingdino/version.py b/groundingdino/version.py new file mode 100644 index 0000000000000000000000000000000000000000..b794fd409a5e3b3b65ad76a43d6a01a318877640 --- /dev/null +++ b/groundingdino/version.py @@ -0,0 +1 @@ +__version__ = '0.1.0'
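The substantive change running through transformer.py, transformer_vanilla.py, and utils.py above is that the feed-forward projections (and the MLP head) are built from loralib's lora.Linear with rank r = 16 instead of plain nn.Linear. The patch itself does not show a training loop, so the following is only a minimal sketch of how such LoRA-injected layers are typically fine-tuned and checkpointed; the layer hyper-parameters mirror the configs above, but the dropout value, tensor shapes, and output path are illustrative and not part of the diff.

# Minimal sketch (not part of the diff): fine-tune only the LoRA adapters
# that this patch injects into the transformer FFNs. Assumes torch, loralib,
# and the patched groundingdino package are importable.
import torch
import loralib as lora
from groundingdino.models.GroundingDINO.transformer_vanilla import TransformerEncoderLayer

# d_model / nhead / dim_feedforward mirror the configs; dropout here is illustrative.
layer = TransformerEncoderLayer(d_model=256, nhead=8, dim_feedforward=2048, dropout=0.0)

# Freeze everything except the low-rank A/B matrices added by lora.Linear(..., r=16).
lora.mark_only_lora_as_trainable(layer)
print([n for n, p in layer.named_parameters() if p.requires_grad])
# -> ['linear1.lora_A', 'linear1.lora_B', 'linear2.lora_A', 'linear2.lora_B']

# Dummy forward/backward pass: (num_token, batch, d_model) plus a 2-D attention mask.
src = torch.randn(195, 2, 256)
attn_mask = torch.zeros(195, 195, dtype=torch.bool)  # False = attend everywhere
out = layer(src, src_mask=attn_mask)
out.sum().backward()  # gradients flow only into the LoRA parameters

# Checkpoint just the adapters instead of the full model state dict.
torch.save(lora.lora_state_dict(layer), "lora_adapters_example.pth")

The same recipe applies to the full Transformer built by build_transformer, since its DeformableTransformerEncoderLayer / DeformableTransformerDecoderLayer FFNs and the MLP in utils.py use the identical lora.Linear(r=16) pattern; only the low-rank matrices need to be trained and saved.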
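For completeness, the registry added in groundingdino/models/registry.py is what lets build_model in groundingdino/models/__init__.py resolve the configs' modelname = "groundingdino" to a build function (presumably build_groundingdino is registered via the registe_with_name decorator in a file outside this section). A minimal sketch of that registration/lookup round trip follows; build_toy_model and the args namespace are hypothetical, used only to illustrate the mechanism.

# Illustrative sketch (hypothetical model name and build function; not part of the diff):
# how MODULE_BUILD_FUNCS maps a config's `modelname` to a build function,
# mirroring build_model() in groundingdino/models/__init__.py.
from types import SimpleNamespace

import torch.nn as nn
from groundingdino.models.registry import MODULE_BUILD_FUNCS

@MODULE_BUILD_FUNCS.registe_with_name(module_name="toy_model")
def build_toy_model(args):
    # Trivial stand-in for build_groundingdino(args).
    return nn.Linear(args.hidden_dim, args.hidden_dim)

args = SimpleNamespace(modelname="toy_model", hidden_dim=256)

# Same lookup that build_model() performs.
assert args.modelname in MODULE_BUILD_FUNCS._module_dict
model = MODULE_BUILD_FUNCS.get(args.modelname)(args)
print(model)  # Linear(in_features=256, out_features=256, bias=True)

On the text side, get_tokenlizer.py plays the analogous role: get_tokenlizer("bert-base-uncased") simply delegates to AutoTokenizer.from_pretrained, and get_pretrained_language_model returns the matching BertModel or RobertaModel for the text_encoder_type set in the configs.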