Spaces:
Runtime error
update
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff.
- .ipynb_checkpoints/README-checkpoint.md +11 -0
- .ipynb_checkpoints/README-checkpoint.txt +0 -1
- .ipynb_checkpoints/requirements-checkpoint.txt +0 -17
- .ipynb_checkpoints/test-checkpoint.ipynb +0 -113
- README.md +10 -0
- README.txt +0 -1
- app.py +0 -125
- groundingdino.egg-info/PKG-INFO +0 -213
- groundingdino.egg-info/SOURCES.txt +0 -46
- groundingdino.egg-info/requires.txt +0 -10
- groundingdino.egg-info/top_level.txt +0 -1
- groundingdino/.ipynb_checkpoints/__init__-checkpoint.py +0 -0
- groundingdino/.ipynb_checkpoints/version-checkpoint.py +0 -1
- groundingdino/__init__.py +0 -0
- groundingdino/__pycache__/__init__.cpython-310.pyc +0 -0
- groundingdino/config/.ipynb_checkpoints/GroundingDINO_SwinB_cfg-checkpoint.py +0 -43
- groundingdino/config/GroundingDINO_SwinB_cfg.py +0 -43
- groundingdino/config/GroundingDINO_SwinT_OGC.py +0 -43
- groundingdino/config/__init__.py +0 -0
- groundingdino/datasets/.ipynb_checkpoints/__init__-checkpoint.py +23 -0
- groundingdino/datasets/.ipynb_checkpoints/coco-checkpoint.py +649 -0
- groundingdino/datasets/.ipynb_checkpoints/dataset-checkpoint.py +44 -0
- groundingdino/datasets/.ipynb_checkpoints/odvg-checkpoint.py +258 -0
- groundingdino/datasets/.ipynb_checkpoints/transforms-checkpoint.py +285 -0
- groundingdino/datasets/__init__.py +23 -0
- groundingdino/datasets/__pycache__/__init__.cpython-310.pyc +0 -0
- groundingdino/datasets/__pycache__/coco.cpython-310.pyc +0 -0
- groundingdino/datasets/__pycache__/coco_eval.cpython-310.pyc +0 -0
- groundingdino/datasets/__pycache__/cocogrounding_eval.cpython-310.pyc +0 -0
- groundingdino/datasets/__pycache__/data_util.cpython-310.pyc +0 -0
- groundingdino/datasets/__pycache__/odvg.cpython-310.pyc +0 -0
- groundingdino/datasets/__pycache__/panoptic_eval.cpython-310.pyc +0 -0
- groundingdino/datasets/__pycache__/random_crop.cpython-310.pyc +0 -0
- groundingdino/datasets/__pycache__/sltransform.cpython-310.pyc +0 -0
- groundingdino/datasets/__pycache__/transforms.cpython-310.pyc +0 -0
- groundingdino/datasets/coco.py +649 -0
- groundingdino/datasets/coco_eval.py +266 -0
- groundingdino/datasets/coco_panoptic.py +99 -0
- groundingdino/datasets/cocogrounding_eval.py +3 -1
- groundingdino/datasets/data_util.py +170 -0
- groundingdino/datasets/dataset.py +44 -0
- groundingdino/datasets/odvg.py +258 -0
- groundingdino/datasets/panoptic_eval.py +44 -0
- groundingdino/datasets/random_crop.py +135 -0
- groundingdino/datasets/sltransform.py +247 -0
- groundingdino/datasets/transforms.py +22 -48
- groundingdino/models/.ipynb_checkpoints/__init__-checkpoint.py +0 -18
- groundingdino/models/.ipynb_checkpoints/registry-checkpoint.py +0 -66
- groundingdino/models/GroundingDINO/.ipynb_checkpoints/bertwarper-checkpoint.py +273 -0
- groundingdino/models/GroundingDINO/.ipynb_checkpoints/fuse_modules-checkpoint.py +8 -8
.ipynb_checkpoints/README-checkpoint.md
ADDED
@@ -0,0 +1,11 @@
+---
+title: My Awesome Space
+emoji: 🚀
+colorFrom: red
+colorTo: blue
+sdk: gradio
+sdk_version: 3.9.0
+app_file: app.py
+pinned: false
+---
+
.ipynb_checkpoints/README-checkpoint.txt
DELETED
@@ -1 +0,0 @@
-Peft-ed Grounding DINO on RSVG dataset
.ipynb_checkpoints/requirements-checkpoint.txt
DELETED
@@ -1,17 +0,0 @@
-cython
-submitit
-scipy
-termcolor
-addict
-yapf==0.40.1
-timm
-torch
-torchvision
-transformers
-numpy
-opencv-python
-supervision==0.6.0
-pycocotools
-pyyaml>3.10
-colorlog
-loralib
.ipynb_checkpoints/test-checkpoint.ipynb
DELETED
@@ -1,113 +0,0 @@
-{
-"cells": [
-{
-"cell_type": "code",
-"execution_count": 2,
-"metadata": {},
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"final text_encoder_type: bert-base-uncased\n"
-]
-},
-{
-"data": {
-"application/json": {
-"ascii": false,
-"bar_format": null,
-"colour": null,
-"elapsed": 0.014210224151611328,
-"initial": 0,
-"n": 0,
-"ncols": null,
-"nrows": null,
-"postfix": null,
-"prefix": "Downloading model.safetensors",
-"rate": null,
-"total": 440449768,
-"unit": "B",
-"unit_divisor": 1000,
-"unit_scale": true
-},
-"application/vnd.jupyter.widget-view+json": {
-"model_id": "5922f34578364d36afa13de9f01254bd",
-"version_major": 2,
-"version_minor": 0
-},
-"text/plain": [
-"Downloading model.safetensors: 0%| | 0.00/440M [00:00<?, ?B/s]"
-]
-},
-"metadata": {},
-"output_type": "display_data"
-},
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"/root/miniconda3/lib/python3.8/site-packages/transformers/modeling_utils.py:881: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-" warnings.warn(\n",
-"/root/miniconda3/lib/python3.8/site-packages/torch/utils/checkpoint.py:31: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n",
-" warnings.warn(\"None of the inputs have requires_grad=True. Gradients will be None\")\n"
-]
-},
-{
-"data": {
-"text/plain": [
-"True"
-]
-},
-"execution_count": 2,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"from groundingdino.util.inference import load_model, load_image, predict, annotate\n",
-"import cv2\n",
-"\n",
-"model = load_model(\"groundingdino/config/GroundingDINO_SwinT_OGC.py\", \"../04-06-segment-anything/weights/groundingdino_swint_ogc.pth\")\n",
-"IMAGE_PATH = \".asset/cat_dog.jpeg\"\n",
-"TEXT_PROMPT = \"chair . person . dog .\"\n",
-"BOX_TRESHOLD = 0.35\n",
-"TEXT_TRESHOLD = 0.25\n",
-"\n",
-"image_source, image = load_image(IMAGE_PATH)\n",
-"\n",
-"boxes, logits, phrases = predict(\n",
-" model=model,\n",
-" image=image,\n",
-" caption=TEXT_PROMPT,\n",
-" box_threshold=BOX_TRESHOLD,\n",
-" text_threshold=TEXT_TRESHOLD\n",
-")\n",
-"\n",
-"annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)\n",
-"cv2.imwrite(\"annotated_image.jpg\", annotated_frame)"
-]
-}
-],
-"metadata": {
-"kernelspec": {
-"display_name": "base",
-"language": "python",
-"name": "python3"
-},
-"language_info": {
-"codemirror_mode": {
-"name": "ipython",
-"version": 3
-},
-"file_extension": ".py",
-"mimetype": "text/x-python",
-"name": "python",
-"nbconvert_exporter": "python",
-"pygments_lexer": "ipython3",
-"version": "3.8.10"
-}
-},
-"nbformat": 4,
-"nbformat_minor": 2
-}
README.md
ADDED
@@ -0,0 +1,10 @@
+---
+title: My Awesome Space
+emoji: 🚀
+colorFrom: red
+colorTo: blue
+sdk: gradio
+sdk_version: 4.36.1
+app_file: app.py
+pinned: false
+---
README.txt
DELETED
@@ -1 +0,0 @@
-Peft-ed Grounding DINO on RSVG dataset
app.py
DELETED
@@ -1,125 +0,0 @@
-import argparse
-from functools import partial
-import cv2
-import requests
-import os
-from io import BytesIO
-from PIL import Image
-import numpy as np
-from pathlib import Path
-
-
-import warnings
-
-import torch
-
-# prepare the environment
-os.system("python setup.py build develop --user")
-os.system("pip install packaging==21.3")
-os.system("pip install gradio")
-
-
-warnings.filterwarnings("ignore")
-
-import gradio as gr
-
-from groundingdino.models import build_model
-from groundingdino.util.slconfig import SLConfig
-from groundingdino.util.utils import clean_state_dict
-from groundingdino.util.inference import annotate, load_image, predict
-import groundingdino.datasets.transforms as T
-
-from huggingface_hub import hf_hub_download
-
-
-
-# Use this command for evaluate the Grounding DINO model
-config_file = "groundingdino/config/GroundingDINO_SwinB_OGC.py"
-ckpt_repo_id = "Hasanmog/Peft-GroundingDINO"
-ckpt_filenmae = "Best.pth"
-
-
-def load_model_hf(model_config_path, repo_id, filename, device='cpu'):
-    args = SLConfig.fromfile(model_config_path)
-    model = build_model(args)
-    args.device = device
-
-    cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
-    checkpoint = torch.load(cache_file, map_location='cpu')
-    log = model.load_state_dict(clean_state_dict(checkpoint['model']), strict=False)
-    print("Model loaded from {} \n => {}".format(cache_file, log))
-    _ = model.eval()
-    return model
-
-def image_transform_grounding(init_image):
-    transform = T.Compose([
-        T.RandomResize([800], max_size=1333),
-        T.ToTensor(),
-        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
-    ])
-    image, _ = transform(init_image, None) # 3, h, w
-    return init_image, image
-
-def image_transform_grounding_for_vis(init_image):
-    transform = T.Compose([
-        T.RandomResize([800], max_size=1333),
-    ])
-    image, _ = transform(init_image, None) # 3, h, w
-    return image
-
-model = load_model_hf(config_file, ckpt_repo_id, ckpt_filenmae)
-
-def run_grounding(input_image, grounding_caption, box_threshold, text_threshold):
-    init_image = input_image.convert("RGB")
-    original_size = init_image.size
-
-    _, image_tensor = image_transform_grounding(init_image)
-    image_pil: Image = image_transform_grounding_for_vis(init_image)
-
-    # run grounding
-    boxes, logits, phrases = predict(model, image_tensor, grounding_caption, box_threshold, text_threshold, device='cpu')
-    annotated_frame = annotate(image_source=np.asarray(image_pil), boxes=boxes, logits=logits, phrases=phrases)
-    image_with_box = Image.fromarray(cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB))
-
-
-    return image_with_box
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser("Grounding DINO demo", add_help=True)
-    parser.add_argument("--debug", action="store_true", help="using debug mode")
-    parser.add_argument("--share", action="store_true", help="share the app")
-    args = parser.parse_args()
-
-    block = gr.Blocks().queue()
-    with block:
-        gr.Markdown("# [Grounding DINO](https://github.com/IDEA-Research/GroundingDINO)")
-        gr.Markdown("### Open-World Detection with Grounding DINO")
-
-        with gr.Row():
-            with gr.Column():
-                input_image = gr.Image(source='upload', type="pil")
-                grounding_caption = gr.Textbox(label="Detection Prompt")
-                run_button = gr.Button(label="Run")
-                with gr.Accordion("Advanced options", open=False):
-                    box_threshold = gr.Slider(
-                        label="Box Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
-                    )
-                    text_threshold = gr.Slider(
-                        label="Text Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
-                    )
-
-            with gr.Column():
-                gallery = gr.outputs.Image(
-                    type="pil",
-                    # label="grounding results"
-                ).style(full_width=True, full_height=True)
-                # gallery = gr.Gallery(label="Generated images", show_label=False).style(
-                #     grid=[1], height="auto", container=True, full_width=True, full_height=True)
-
-        run_button.click(fn=run_grounding, inputs=[
-                input_image, grounding_caption, box_threshold, text_threshold], outputs=[gallery])
-
-
-    block.launch(server_name='0.0.0.0', server_port=7579, debug=args.debug, share=args.share)
-
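Note: the deleted app.py above builds its interface against the Gradio 3.x API (gr.Image(source='upload'), gr.Button(label=...), gr.outputs.Image with .style(...)), while the new README.md pins sdk_version: 4.36.1; those calls no longer exist in Gradio 4.x, which is a plausible source of the Space's "Runtime error". Below is a minimal, hypothetical sketch of how the same Blocks layout could be written for Gradio 4.x, reusing the run_grounding function defined in the deleted file; it is not the repository's actual replacement app.py (none is shown in this truncated diff).

    # Hypothetical Gradio 4.x rewrite of the UI section above -- a sketch, not the repo's actual code.
    import gradio as gr

    # run_grounding is assumed to be the same function defined in the deleted app.py above.
    with gr.Blocks() as demo:
        gr.Markdown("# [Grounding DINO](https://github.com/IDEA-Research/GroundingDINO)")
        gr.Markdown("### Open-World Detection with Grounding DINO")
        with gr.Row():
            with gr.Column():
                input_image = gr.Image(type="pil")   # the 'source=' kwarg was removed in 4.x
                grounding_caption = gr.Textbox(label="Detection Prompt")
                run_button = gr.Button("Run")        # the button label is now the positional value
                with gr.Accordion("Advanced options", open=False):
                    box_threshold = gr.Slider(0.0, 1.0, value=0.25, step=0.001, label="Box Threshold")
                    text_threshold = gr.Slider(0.0, 1.0, value=0.25, step=0.001, label="Text Threshold")
            with gr.Column():
                # gr.outputs.* and .style() are gone in 4.x; a plain Image component replaces them
                gallery = gr.Image(type="pil", label="grounding results")
        run_button.click(fn=run_grounding,
                         inputs=[input_image, grounding_caption, box_threshold, text_threshold],
                         outputs=[gallery])

    demo.queue().launch(server_name="0.0.0.0")
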
groundingdino.egg-info/PKG-INFO
DELETED
@@ -1,213 +0,0 @@
-Metadata-Version: 2.1
-Name: groundingdino
-Version: 0.1.0
-Summary: open-set object detector
-Home-page: https://github.com/IDEA-Research/GroundingDINO
-Author: International Digital Economy Academy, Shilong Liu
-License: Apache License
-Version 2.0, January 2004
-http://www.apache.org/licenses/
-
-TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-1. Definitions.
-
-"License" shall mean the terms and conditions for use, reproduction,
-and distribution as defined by Sections 1 through 9 of this document.
-
-"Licensor" shall mean the copyright owner or entity authorized by
-the copyright owner that is granting the License.
-
-"Legal Entity" shall mean the union of the acting entity and all
-other entities that control, are controlled by, or are under common
-control with that entity. For the purposes of this definition,
-"control" means (i) the power, direct or indirect, to cause the
-direction or management of such entity, whether by contract or
-otherwise, or (ii) ownership of fifty percent (50%) or more of the
-outstanding shares, or (iii) beneficial ownership of such entity.
-
-"You" (or "Your") shall mean an individual or Legal Entity
-exercising permissions granted by this License.
-
-"Source" form shall mean the preferred form for making modifications,
-including but not limited to software source code, documentation
-source, and configuration files.
-
-"Object" form shall mean any form resulting from mechanical
-transformation or translation of a Source form, including but
-not limited to compiled object code, generated documentation,
-and conversions to other media types.
-
-"Work" shall mean the work of authorship, whether in Source or
-Object form, made available under the License, as indicated by a
-copyright notice that is included in or attached to the work
-(an example is provided in the Appendix below).
-
-"Derivative Works" shall mean any work, whether in Source or Object
-form, that is based on (or derived from) the Work and for which the
-editorial revisions, annotations, elaborations, or other modifications
-represent, as a whole, an original work of authorship. For the purposes
-of this License, Derivative Works shall not include works that remain
-separable from, or merely link (or bind by name) to the interfaces of,
-the Work and Derivative Works thereof.
-
-"Contribution" shall mean any work of authorship, including
-the original version of the Work and any modifications or additions
-to that Work or Derivative Works thereof, that is intentionally
-submitted to Licensor for inclusion in the Work by the copyright owner
-or by an individual or Legal Entity authorized to submit on behalf of
-the copyright owner. For the purposes of this definition, "submitted"
-means any form of electronic, verbal, or written communication sent
-to the Licensor or its representatives, including but not limited to
-communication on electronic mailing lists, source code control systems,
-and issue tracking systems that are managed by, or on behalf of, the
-Licensor for the purpose of discussing and improving the Work, but
-excluding communication that is conspicuously marked or otherwise
-designated in writing by the copyright owner as "Not a Contribution."
-
-"Contributor" shall mean Licensor and any individual or Legal Entity
-on behalf of whom a Contribution has been received by Licensor and
-subsequently incorporated within the Work.
-
-2. Grant of Copyright License. Subject to the terms and conditions of
-this License, each Contributor hereby grants to You a perpetual,
-worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-copyright license to reproduce, prepare Derivative Works of,
-publicly display, publicly perform, sublicense, and distribute the
-Work and such Derivative Works in Source or Object form.
-
-3. Grant of Patent License. Subject to the terms and conditions of
-this License, each Contributor hereby grants to You a perpetual,
-worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-(except as stated in this section) patent license to make, have made,
-use, offer to sell, sell, import, and otherwise transfer the Work,
-where such license applies only to those patent claims licensable
-by such Contributor that are necessarily infringed by their
-Contribution(s) alone or by combination of their Contribution(s)
-with the Work to which such Contribution(s) was submitted. If You
-institute patent litigation against any entity (including a
-cross-claim or counterclaim in a lawsuit) alleging that the Work
-or a Contribution incorporated within the Work constitutes direct
-or contributory patent infringement, then any patent licenses
-granted to You under this License for that Work shall terminate
-as of the date such litigation is filed.
-
-4. Redistribution. You may reproduce and distribute copies of the
-Work or Derivative Works thereof in any medium, with or without
-modifications, and in Source or Object form, provided that You
-meet the following conditions:
-
-(a) You must give any other recipients of the Work or
-Derivative Works a copy of this License; and
-
-(b) You must cause any modified files to carry prominent notices
-stating that You changed the files; and
-
-(c) You must retain, in the Source form of any Derivative Works
-that You distribute, all copyright, patent, trademark, and
-attribution notices from the Source form of the Work,
-excluding those notices that do not pertain to any part of
-the Derivative Works; and
-
-(d) If the Work includes a "NOTICE" text file as part of its
-distribution, then any Derivative Works that You distribute must
-include a readable copy of the attribution notices contained
-within such NOTICE file, excluding those notices that do not
-pertain to any part of the Derivative Works, in at least one
-of the following places: within a NOTICE text file distributed
-as part of the Derivative Works; within the Source form or
-documentation, if provided along with the Derivative Works; or,
-within a display generated by the Derivative Works, if and
-wherever such third-party notices normally appear. The contents
-of the NOTICE file are for informational purposes only and
-do not modify the License. You may add Your own attribution
-notices within Derivative Works that You distribute, alongside
-or as an addendum to the NOTICE text from the Work, provided
-that such additional attribution notices cannot be construed
-as modifying the License.
-
-You may add Your own copyright statement to Your modifications and
-may provide additional or different license terms and conditions
-for use, reproduction, or distribution of Your modifications, or
-for any such Derivative Works as a whole, provided Your use,
-reproduction, and distribution of the Work otherwise complies with
-the conditions stated in this License.
-
-5. Submission of Contributions. Unless You explicitly state otherwise,
-any Contribution intentionally submitted for inclusion in the Work
-by You to the Licensor shall be under the terms and conditions of
-this License, without any additional terms or conditions.
-Notwithstanding the above, nothing herein shall supersede or modify
-the terms of any separate license agreement you may have executed
-with Licensor regarding such Contributions.
-
-6. Trademarks. This License does not grant permission to use the trade
-names, trademarks, service marks, or product names of the Licensor,
-except as required for reasonable and customary use in describing the
-origin of the Work and reproducing the content of the NOTICE file.
-
-7. Disclaimer of Warranty. Unless required by applicable law or
-agreed to in writing, Licensor provides the Work (and each
-Contributor provides its Contributions) on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-implied, including, without limitation, any warranties or conditions
-of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-PARTICULAR PURPOSE. You are solely responsible for determining the
-appropriateness of using or redistributing the Work and assume any
-risks associated with Your exercise of permissions under this License.
-
-8. Limitation of Liability. In no event and under no legal theory,
-whether in tort (including negligence), contract, or otherwise,
-unless required by applicable law (such as deliberate and grossly
-negligent acts) or agreed to in writing, shall any Contributor be
-liable to You for damages, including any direct, indirect, special,
-incidental, or consequential damages of any character arising as a
-result of this License or out of the use or inability to use the
-Work (including but not limited to damages for loss of goodwill,
-work stoppage, computer failure or malfunction, or any and all
-other commercial damages or losses), even if such Contributor
-has been advised of the possibility of such damages.
-
-9. Accepting Warranty or Additional Liability. While redistributing
-the Work or Derivative Works thereof, You may choose to offer,
-and charge a fee for, acceptance of support, warranty, indemnity,
-or other liability obligations and/or rights consistent with this
-License. However, in accepting such obligations, You may act only
-on Your own behalf and on Your sole responsibility, not on behalf
-of any other Contributor, and only if You agree to indemnify,
-defend, and hold each Contributor harmless for any liability
-incurred by, or claims asserted against, such Contributor by reason
-of your accepting any such warranty or additional liability.
-
-END OF TERMS AND CONDITIONS
-
-APPENDIX: How to apply the Apache License to your work.
-
-To apply the Apache License to your work, attach the following
-boilerplate notice, with the fields enclosed by brackets "[]"
-replaced with your own identifying information. (Don't include
-the brackets!) The text should be enclosed in the appropriate
-comment syntax for the file format. We also recommend that a
-file or class name and description of purpose be included on the
-same "printed page" as the copyright notice for easier
-identification within third-party archives.
-
-Copyright 2023 - present, IDEA Research.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
-Platform: UNKNOWN
-License-File: LICENSE
-
-UNKNOWN
-
groundingdino.egg-info/SOURCES.txt
DELETED
@@ -1,46 +0,0 @@
-LICENSE
-README.md
-setup.py
-/home/jamada/jupyterlab/projects/gdino-peft/gdino-official/GroundingDINO/groundingdino/models/GroundingDINO/csrc/cuda_version.cu
-/home/jamada/jupyterlab/projects/gdino-peft/gdino-official/GroundingDINO/groundingdino/models/GroundingDINO/csrc/vision.cpp
-/home/jamada/jupyterlab/projects/gdino-peft/gdino-official/GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cpu.cpp
-/home/jamada/jupyterlab/projects/gdino-peft/gdino-official/GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cuda.cu
-groundingdino/__init__.py
-groundingdino/version.py
-groundingdino.egg-info/PKG-INFO
-groundingdino.egg-info/SOURCES.txt
-groundingdino.egg-info/dependency_links.txt
-groundingdino.egg-info/requires.txt
-groundingdino.egg-info/top_level.txt
-groundingdino/config/GroundingDINO_SwinB_cfg.py
-groundingdino/config/GroundingDINO_SwinT_OGC.py
-groundingdino/config/__init__.py
-groundingdino/datasets/__init__.py
-groundingdino/datasets/cocogrounding_eval.py
-groundingdino/datasets/transforms.py
-groundingdino/models/__init__.py
-groundingdino/models/registry.py
-groundingdino/models/GroundingDINO/__init__.py
-groundingdino/models/GroundingDINO/bertwarper.py
-groundingdino/models/GroundingDINO/fuse_modules.py
-groundingdino/models/GroundingDINO/groundingdino.py
-groundingdino/models/GroundingDINO/ms_deform_attn.py
-groundingdino/models/GroundingDINO/transformer.py
-groundingdino/models/GroundingDINO/transformer_vanilla.py
-groundingdino/models/GroundingDINO/utils.py
-groundingdino/models/GroundingDINO/backbone/__init__.py
-groundingdino/models/GroundingDINO/backbone/backbone.py
-groundingdino/models/GroundingDINO/backbone/position_encoding.py
-groundingdino/models/GroundingDINO/backbone/swin_transformer.py
-groundingdino/util/__init__.py
-groundingdino/util/box_ops.py
-groundingdino/util/get_tokenlizer.py
-groundingdino/util/inference.py
-groundingdino/util/logger.py
-groundingdino/util/misc.py
-groundingdino/util/slconfig.py
-groundingdino/util/slio.py
-groundingdino/util/time_counter.py
-groundingdino/util/utils.py
-groundingdino/util/visualizer.py
-groundingdino/util/vl_utils.py
groundingdino.egg-info/requires.txt
DELETED
@@ -1,10 +0,0 @@
-addict
-numpy
-opencv-python
-pycocotools
-supervision
-timm
-torch
-torchvision
-transformers
-yapf
groundingdino.egg-info/top_level.txt
DELETED
@@ -1 +0,0 @@
-groundingdino
groundingdino/.ipynb_checkpoints/__init__-checkpoint.py
DELETED
File without changes
groundingdino/.ipynb_checkpoints/version-checkpoint.py
DELETED
@@ -1 +0,0 @@
-__version__ = '0.1.0'
groundingdino/__init__.py
DELETED
File without changes
groundingdino/__pycache__/__init__.cpython-310.pyc
DELETED
Binary file (182 Bytes)
groundingdino/config/.ipynb_checkpoints/GroundingDINO_SwinB_cfg-checkpoint.py
DELETED
@@ -1,43 +0,0 @@
-batch_size = 1
-modelname = "groundingdino"
-backbone = "swin_B_384_22k"
-position_embedding = "sine"
-pe_temperatureH = 20
-pe_temperatureW = 20
-return_interm_indices = [1, 2, 3]
-backbone_freeze_keywords = None
-enc_layers = 6
-dec_layers = 6
-pre_norm = False
-dim_feedforward = 2048
-hidden_dim = 256
-dropout = 0.0
-nheads = 8
-num_queries = 900
-query_dim = 4
-num_patterns = 0
-num_feature_levels = 4
-enc_n_points = 4
-dec_n_points = 4
-two_stage_type = "standard"
-two_stage_bbox_embed_share = False
-two_stage_class_embed_share = False
-transformer_activation = "relu"
-dec_pred_bbox_embed_share = True
-dn_box_noise_scale = 1.0
-dn_label_noise_ratio = 0.5
-dn_label_coef = 1.0
-dn_bbox_coef = 1.0
-embed_init_tgt = True
-dn_labelbook_size = 2000
-max_text_len = 256
-text_encoder_type = "bert-base-uncased"
-use_text_enhancer = True
-use_fusion_layer = True
-use_checkpoint = True
-use_transformer_ckpt = True
-use_text_cross_attention = True
-text_dropout = 0.0
-fusion_dropout = 0.0
-fusion_droppath = 0.1
-sub_sentence_present = True
groundingdino/config/GroundingDINO_SwinB_cfg.py
DELETED
@@ -1,43 +0,0 @@
-batch_size = 1
-modelname = "groundingdino"
-backbone = "swin_B_384_22k"
-position_embedding = "sine"
-pe_temperatureH = 20
-pe_temperatureW = 20
-return_interm_indices = [1, 2, 3]
-backbone_freeze_keywords = None
-enc_layers = 6
-dec_layers = 6
-pre_norm = False
-dim_feedforward = 2048
-hidden_dim = 256
-dropout = 0.0
-nheads = 8
-num_queries = 900
-query_dim = 4
-num_patterns = 0
-num_feature_levels = 4
-enc_n_points = 4
-dec_n_points = 4
-two_stage_type = "standard"
-two_stage_bbox_embed_share = False
-two_stage_class_embed_share = False
-transformer_activation = "relu"
-dec_pred_bbox_embed_share = True
-dn_box_noise_scale = 1.0
-dn_label_noise_ratio = 0.5
-dn_label_coef = 1.0
-dn_bbox_coef = 1.0
-embed_init_tgt = True
-dn_labelbook_size = 2000
-max_text_len = 256
-text_encoder_type = "bert-base-uncased"
-use_text_enhancer = True
-use_fusion_layer = True
-use_checkpoint = True
-use_transformer_ckpt = True
-use_text_cross_attention = True
-text_dropout = 0.0
-fusion_dropout = 0.0
-fusion_droppath = 0.1
-sub_sentence_present = True
groundingdino/config/GroundingDINO_SwinT_OGC.py
DELETED
@@ -1,43 +0,0 @@
-batch_size = 1
-modelname = "groundingdino"
-backbone = "swin_T_224_1k"
-position_embedding = "sine"
-pe_temperatureH = 20
-pe_temperatureW = 20
-return_interm_indices = [1, 2, 3]
-backbone_freeze_keywords = None
-enc_layers = 6
-dec_layers = 6
-pre_norm = False
-dim_feedforward = 2048
-hidden_dim = 256
-dropout = 0.0
-nheads = 8
-num_queries = 900
-query_dim = 4
-num_patterns = 0
-num_feature_levels = 4
-enc_n_points = 4
-dec_n_points = 4
-two_stage_type = "standard"
-two_stage_bbox_embed_share = False
-two_stage_class_embed_share = False
-transformer_activation = "relu"
-dec_pred_bbox_embed_share = True
-dn_box_noise_scale = 1.0
-dn_label_noise_ratio = 0.5
-dn_label_coef = 1.0
-dn_bbox_coef = 1.0
-embed_init_tgt = True
-dn_labelbook_size = 2000
-max_text_len = 256
-text_encoder_type = "bert-base-uncased"
-use_text_enhancer = True
-use_fusion_layer = True
-use_checkpoint = True
-use_transformer_ckpt = True
-use_text_cross_attention = True
-text_dropout = 0.0
-fusion_dropout = 0.0
-fusion_droppath = 0.1
-sub_sentence_present = True
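Note: the config modules above (the checkpoint copy and the SwinB/SwinT variants) are flat attribute files rather than executable scripts; in this codebase they are consumed through SLConfig, exactly as the deleted app.py does with SLConfig.fromfile followed by build_model. A minimal sketch of that loading path follows; the local checkpoint path is illustrative and not taken from this diff.

    # Sketch: turning a flat config module like GroundingDINO_SwinT_OGC.py into a model.
    # Mirrors load_model_hf() from the deleted app.py above; the weights path is a placeholder.
    import torch

    from groundingdino.models import build_model
    from groundingdino.util.slconfig import SLConfig
    from groundingdino.util.utils import clean_state_dict

    config_path = "groundingdino/config/GroundingDINO_SwinT_OGC.py"   # any of the configs above
    checkpoint_path = "weights/groundingdino_swint_ogc.pth"           # illustrative local path

    args = SLConfig.fromfile(config_path)   # exposes batch_size, backbone, hidden_dim, ... as attributes
    args.device = "cpu"
    model = build_model(args)

    state = torch.load(checkpoint_path, map_location="cpu")
    model.load_state_dict(clean_state_dict(state["model"]), strict=False)
    model.eval()
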
groundingdino/config/__init__.py
DELETED
File without changes
groundingdino/datasets/.ipynb_checkpoints/__init__-checkpoint.py
ADDED
@@ -0,0 +1,23 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import torch.utils.data
+import torchvision
+from .coco import build as build_coco
+
+
+def get_coco_api_from_dataset(dataset):
+    for _ in range(10):
+        # if isinstance(dataset, torchvision.datasets.CocoDetection):
+        #     break
+        if isinstance(dataset, torch.utils.data.Subset):
+            dataset = dataset.dataset
+    if isinstance(dataset, torchvision.datasets.CocoDetection):
+        return dataset.coco
+
+
+def build_dataset(image_set, args, datasetinfo):
+    if datasetinfo["dataset_mode"] == 'coco':
+        return build_coco(image_set, args, datasetinfo)
+    if datasetinfo["dataset_mode"] == 'odvg':
+        from .odvg import build_odvg
+        return build_odvg(image_set, args, datasetinfo)
+    raise ValueError(f'dataset {args.dataset_file} not supported')
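Note: build_dataset above dispatches purely on datasetinfo["dataset_mode"], sending 'coco' to build_coco, 'odvg' to build_odvg, and raising for anything else. A rough, hypothetical sketch of the call shape follows; only the "dataset_mode" key comes from the code above, while the other keys and the args namespace are placeholders, since the concrete fields expected by build_coco/build_odvg live in training configs that are not part of this diff.

    # Hypothetical call shape for build_dataset(); keys other than "dataset_mode" are placeholders.
    from argparse import Namespace

    from groundingdino.datasets import build_dataset

    args = Namespace(dataset_file="rsvg_grounding")   # placeholder training options
    datasetinfo = {
        "dataset_mode": "coco",                       # or "odvg" for grounding-style annotations
        "root": "data/RSVG/images",                   # placeholder image root
        "anno": "data/RSVG/annotations/train.json",   # placeholder annotation file
    }

    train_dataset = build_dataset("train", args, datasetinfo)
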
groundingdino/datasets/.ipynb_checkpoints/coco-checkpoint.py
ADDED
@@ -0,0 +1,649 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+COCO dataset which returns image_id for evaluation.
+
+Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
+"""
+if __name__=="__main__":
+    # for debug only
+    import os, sys
+    sys.path.append(os.path.dirname(sys.path[0]))
+from torchvision.datasets.vision import VisionDataset
+
+import json
+from pathlib import Path
+import random
+import os
+from typing import Any, Callable, List, Optional, Tuple
+
+from PIL import Image
+
+import torch
+import torch.utils.data
+import torchvision
+from pycocotools import mask as coco_mask
+
+from datasets.data_util import preparing_dataset
+import datasets.transforms as T
+from util.box_ops import box_cxcywh_to_xyxy, box_iou
+
+__all__ = ['build']
+
+
+class label2compat():
+    def __init__(self) -> None:
+        self.category_map_str = {"1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "10": 10, "11": 11, "13": 12, "14": 13, "15": 14, "16": 15, "17": 16, "18": 17, "19": 18, "20": 19, "21": 20, "22": 21, "23": 22, "24": 23, "25": 24, "27": 25, "28": 26, "31": 27, "32": 28, "33": 29, "34": 30, "35": 31, "36": 32, "37": 33, "38": 34, "39": 35, "40": 36, "41": 37, "42": 38, "43": 39, "44": 40, "46": 41, "47": 42, "48": 43, "49": 44, "50": 45, "51": 46, "52": 47, "53": 48, "54": 49, "55": 50, "56": 51, "57": 52, "58": 53, "59": 54, "60": 55, "61": 56, "62": 57, "63": 58, "64": 59, "65": 60, "67": 61, "70": 62, "72": 63, "73": 64, "74": 65, "75": 66, "76": 67, "77": 68, "78": 69, "79": 70, "80": 71, "81": 72, "82": 73, "84": 74, "85": 75, "86": 76, "87": 77, "88": 78, "89": 79, "90": 80}
+        self.category_map = {int(k):v for k,v in self.category_map_str.items()}
+
+    def __call__(self, target, img=None):
+        labels = target['labels']
+        res = torch.zeros(labels.shape, dtype=labels.dtype)
+        for idx, item in enumerate(labels):
+            res[idx] = self.category_map[item.item()] - 1
+        target['label_compat'] = res
+        if img is not None:
+            return target, img
+        else:
+            return target
+
+
+class label_compat2onehot():
+    def __init__(self, num_class=80, num_output_objs=1):
+        self.num_class = num_class
+        self.num_output_objs = num_output_objs
+        if num_output_objs != 1:
+            raise DeprecationWarning("num_output_objs!=1, which is only used for comparison")
+
+    def __call__(self, target, img=None):
+        labels = target['label_compat']
+        place_dict = {k:0 for k in range(self.num_class)}
+        if self.num_output_objs == 1:
+            res = torch.zeros(self.num_class)
+            for i in labels:
+                itm = i.item()
+                res[itm] = 1.0
+        else:
+            # compat with baseline
+            res = torch.zeros(self.num_class, self.num_output_objs)
+            for i in labels:
+                itm = i.item()
+                res[itm][place_dict[itm]] = 1.0
+                place_dict[itm] += 1
+        target['label_compat_onehot'] = res
+        if img is not None:
+            return target, img
+        else:
+            return target
+
+
+class box_label_catter():
+    def __init__(self):
+        pass
+
+    def __call__(self, target, img=None):
+        labels = target['label_compat']
+        boxes = target['boxes']
+        box_label = torch.cat((boxes, labels.unsqueeze(-1)), 1)
+        target['box_label'] = box_label
+        if img is not None:
+            return target, img
+        else:
+            return target
+
+
+class RandomSelectBoxlabels():
+    def __init__(self, num_classes, leave_one_out=False, blank_prob=0.8,
+                    prob_first_item = 0.0,
+                    prob_random_item = 0.0,
+                    prob_last_item = 0.8,
+                    prob_stop_sign = 0.2
+                ) -> None:
+        self.num_classes = num_classes
+        self.leave_one_out = leave_one_out
+        self.blank_prob = blank_prob
+
+        self.set_state(prob_first_item, prob_random_item, prob_last_item, prob_stop_sign)
+
+
+    def get_state(self):
+        return [self.prob_first_item, self.prob_random_item, self.prob_last_item, self.prob_stop_sign]
+
+    def set_state(self, prob_first_item, prob_random_item, prob_last_item, prob_stop_sign):
+        sum_prob = prob_first_item + prob_random_item + prob_last_item + prob_stop_sign
+        assert sum_prob - 1 < 1e-6, \
+            f"Sum up all prob = {sum_prob}. prob_first_item:{prob_first_item}" \
+            + f"prob_random_item:{prob_random_item}, prob_last_item:{prob_last_item}" \
+            + f"prob_stop_sign:{prob_stop_sign}"
+
+        self.prob_first_item = prob_first_item
+        self.prob_random_item = prob_random_item
+        self.prob_last_item = prob_last_item
+        self.prob_stop_sign = prob_stop_sign
+
+
+    def sample_for_pred_first_item(self, box_label: torch.FloatTensor):
+        box_label_known = torch.Tensor(0,5)
+        box_label_unknown = box_label
+        return box_label_known, box_label_unknown
+
+    def sample_for_pred_random_item(self, box_label: torch.FloatTensor):
+        n_select = int(random.random() * box_label.shape[0])
+        box_label = box_label[torch.randperm(box_label.shape[0])]
+        box_label_known = box_label[:n_select]
+        box_label_unknown = box_label[n_select:]
+        return box_label_known, box_label_unknown
+
+    def sample_for_pred_last_item(self, box_label: torch.FloatTensor):
+        box_label_perm = box_label[torch.randperm(box_label.shape[0])]
+        known_label_list = []
+        box_label_known = []
+        box_label_unknown = []
+        for item in box_label_perm:
+            label_i = item[4].item()
+            if label_i in known_label_list:
+                box_label_known.append(item)
+            else:
+                # first item
+                box_label_unknown.append(item)
+                known_label_list.append(label_i)
+        box_label_known = torch.stack(box_label_known) if len(box_label_known) > 0 else torch.Tensor(0,5)
+        box_label_unknown = torch.stack(box_label_unknown) if len(box_label_unknown) > 0 else torch.Tensor(0,5)
+        return box_label_known, box_label_unknown
+
+    def sample_for_pred_stop_sign(self, box_label: torch.FloatTensor):
+        box_label_unknown = torch.Tensor(0,5)
+        box_label_known = box_label
+        return box_label_known, box_label_unknown
+
+    def __call__(self, target, img=None):
+        box_label = target['box_label'] # K, 5
+
+        dice_number = random.random()
+
+        if dice_number < self.prob_first_item:
+            box_label_known, box_label_unknown = self.sample_for_pred_first_item(box_label)
+        elif dice_number < self.prob_first_item + self.prob_random_item:
+            box_label_known, box_label_unknown = self.sample_for_pred_random_item(box_label)
+        elif dice_number < self.prob_first_item + self.prob_random_item + self.prob_last_item:
+            box_label_known, box_label_unknown = self.sample_for_pred_last_item(box_label)
+        else:
+            box_label_known, box_label_unknown = self.sample_for_pred_stop_sign(box_label)
+
+        target['label_onehot_known'] = label2onehot(box_label_known[:,-1], self.num_classes)
+        target['label_onehot_unknown'] = label2onehot(box_label_unknown[:, -1], self.num_classes)
+        target['box_label_known'] = box_label_known
+        target['box_label_unknown'] = box_label_unknown
+
+        return target, img
+
+
+class RandomDrop():
+    def __init__(self, p=0.2) -> None:
+        self.p = p
+
+    def __call__(self, target, img=None):
+        known_box = target['box_label_known']
+        num_known_box = known_box.size(0)
+        idxs = torch.rand(num_known_box)
+        # indices = torch.randperm(num_known_box)[:int((1-self).p*num_known_box + 0.5 + random.random())]
+        target['box_label_known'] = known_box[idxs > self.p]
+        return target, img
+
+
+class BboxPertuber():
+    def __init__(self, max_ratio = 0.02, generate_samples = 1000) -> None:
+        self.max_ratio = max_ratio
+        self.generate_samples = generate_samples
+        self.samples = self.generate_pertube_samples()
+        self.idx = 0
+
+    def generate_pertube_samples(self):
+        import torch
+        samples = (torch.rand(self.generate_samples, 5) - 0.5) * 2 * self.max_ratio
+        return samples
+
+    def __call__(self, target, img):
+        known_box = target['box_label_known'] # Tensor(K,5), K known bbox
+        K = known_box.shape[0]
+        known_box_pertube = torch.zeros(K, 6) # 4:bbox, 1:prob, 1:label
+        if K == 0:
+            pass
+        else:
+            if self.idx + K > self.generate_samples:
+                self.idx = 0
+            delta = self.samples[self.idx: self.idx + K, :]
+            known_box_pertube[:, :4] = known_box[:, :4] + delta[:, :4]
+            iou = (torch.diag(box_iou(box_cxcywh_to_xyxy(known_box[:, :4]), box_cxcywh_to_xyxy(known_box_pertube[:, :4]))[0])) * (1 + delta[:, -1])
+            known_box_pertube[:, 4].copy_(iou)
+            known_box_pertube[:, -1].copy_(known_box[:, -1])
+
+        target['box_label_known_pertube'] = known_box_pertube
+        return target, img
+
+
+class RandomCutout():
+    def __init__(self, factor=0.5) -> None:
+        self.factor = factor
+
+    def __call__(self, target, img=None):
+        unknown_box = target['box_label_unknown'] # Ku, 5
+        known_box = target['box_label_known_pertube'] # Kk, 6
+        Ku = unknown_box.size(0)
+
+        known_box_add = torch.zeros(Ku, 6) # Ku, 6
+        known_box_add[:, :5] = unknown_box
+        known_box_add[:, 5].uniform_(0.5, 1)
+
+
+        known_box_add[:, :2] += known_box_add[:, 2:4] * (torch.rand(Ku, 2) - 0.5) / 2
+        known_box_add[:, 2:4] /= 2
+
+        target['box_label_known_pertube'] = torch.cat((known_box, known_box_add))
+        return target, img
+
+
+class RandomSelectBoxes():
+    def __init__(self, num_class=80) -> None:
+        Warning("This is such a slow function and will be deprecated soon!!!")
+        self.num_class = num_class
+
+    def __call__(self, target, img=None):
+        boxes = target['boxes']
+        labels = target['label_compat']
+
+        # transform to list of tensors
+        boxs_list = [[] for i in range(self.num_class)]
+        for idx, item in enumerate(boxes):
+            label = labels[idx].item()
+            boxs_list[label].append(item)
+        boxs_list_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in boxs_list]
+
+        # random selection
+        box_known = []
+        box_unknown = []
+        for idx, item in enumerate(boxs_list_tensor):
+            ncnt = item.shape[0]
+            nselect = int(random.random() * ncnt) # close in both sides, much faster than random.randint
+
+            item = item[torch.randperm(ncnt)]
+            # random.shuffle(item)
+            box_known.append(item[:nselect])
+            box_unknown.append(item[nselect:])
+
+        # box_known_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in box_known]
+        # box_unknown_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in box_unknown]
+        # print('box_unknown_tensor:', box_unknown_tensor)
+        target['known_box'] = box_known
+        target['unknown_box'] = box_unknown
+        return target, img
+
+
+def label2onehot(label, num_classes):
+    """
+    label: Tensor(K)
+    """
+    res = torch.zeros(num_classes)
+    for i in label:
+        itm = int(i.item())
+        res[itm] = 1.0
+    return res
+
+
+class MaskCrop():
+    def __init__(self) -> None:
+        pass
+
+    def __call__(self, target, img):
+        known_box = target['known_box']
+        h,w = img.shape[1:] # h,w
+        # imgsize = target['orig_size'] # h,w
+
+        scale = torch.Tensor([w, h, w, h])
+
+        # _cnt = 0
+        for boxes in known_box:
+            if boxes.shape[0] == 0:
+                continue
+            box_xyxy = box_cxcywh_to_xyxy(boxes) * scale
+            for box in box_xyxy:
+                x1, y1, x2, y2 = [int(i) for i in box.tolist()]
+                img[:, y1:y2, x1:x2] = 0
+                # _cnt += 1
+        # print("_cnt:", _cnt)
+        return target, img
+
+
+dataset_hook_register = {
+    'label2compat': label2compat,
+    'label_compat2onehot': label_compat2onehot,
+    'box_label_catter': box_label_catter,
+    'RandomSelectBoxlabels': RandomSelectBoxlabels,
+    'RandomSelectBoxes': RandomSelectBoxes,
+    'MaskCrop': MaskCrop,
+    'BboxPertuber': BboxPertuber,
+}
+
+
+class CocoDetection(torchvision.datasets.CocoDetection):
+    def __init__(self, img_folder, ann_file, transforms, return_masks, aux_target_hacks=None):
+        super(CocoDetection, self).__init__(img_folder, ann_file)
+        self._transforms = transforms
+        self.prepare = ConvertCocoPolysToMask(return_masks)
+        self.aux_target_hacks = aux_target_hacks
+
+    def change_hack_attr(self, hackclassname, attrkv_dict):
+        target_class = dataset_hook_register[hackclassname]
+        for item in self.aux_target_hacks:
+            if isinstance(item, target_class):
+                for k,v in attrkv_dict.items():
+                    setattr(item, k, v)
+
+    def get_hack(self, hackclassname):
+        target_class = dataset_hook_register[hackclassname]
+        for item in self.aux_target_hacks:
+            if isinstance(item, target_class):
+                return item
+
+    def _load_image(self, id: int) -> Image.Image:
+        path = self.coco.loadImgs(id)[0]["file_name"]
+        abs_path = os.path.join(self.root, path)
+        return Image.open(abs_path).convert("RGB")
+
+    def __getitem__(self, idx):
+        """
+        Output:
+            - target: dict of multiple items
+                - boxes: Tensor[num_box, 4]. \
+                    Init type: x0,y0,x1,y1. unnormalized data.
+                    Final type: cx,cy,w,h. normalized data.
+        """
+        try:
+            img, target = super(CocoDetection, self).__getitem__(idx)
+        except:
+            print("Error idx: {}".format(idx))
+            idx += 1
+            img, target = super(CocoDetection, self).__getitem__(idx)
+        image_id = self.ids[idx]
+        target = {'image_id': image_id, 'annotations': target}
+        img, target = self.prepare(img, target)
+
+        if self._transforms is not None:
+            img, target = self._transforms(img, target)
+
+        # convert to needed format
+        if self.aux_target_hacks is not None:
+            for hack_runner in self.aux_target_hacks:
+                target, img = hack_runner(target, img=img)
+
+        return img, target
+
+
+def convert_coco_poly_to_mask(segmentations, height, width):
+    masks = []
+
for polygons in segmentations:
|
| 384 |
+
rles = coco_mask.frPyObjects(polygons, height, width)
|
| 385 |
+
mask = coco_mask.decode(rles)
|
| 386 |
+
if len(mask.shape) < 3:
|
| 387 |
+
mask = mask[..., None]
|
| 388 |
+
mask = torch.as_tensor(mask, dtype=torch.uint8)
|
| 389 |
+
mask = mask.any(dim=2)
|
| 390 |
+
masks.append(mask)
|
| 391 |
+
if masks:
|
| 392 |
+
masks = torch.stack(masks, dim=0)
|
| 393 |
+
else:
|
| 394 |
+
masks = torch.zeros((0, height, width), dtype=torch.uint8)
|
| 395 |
+
return masks
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
class ConvertCocoPolysToMask(object):
|
| 399 |
+
def __init__(self, return_masks=False):
|
| 400 |
+
self.return_masks = return_masks
|
| 401 |
+
|
| 402 |
+
def __call__(self, image, target):
|
| 403 |
+
w, h = image.size
|
| 404 |
+
|
| 405 |
+
image_id = target["image_id"]
|
| 406 |
+
image_id = torch.tensor([image_id])
|
| 407 |
+
|
| 408 |
+
anno = target["annotations"]
|
| 409 |
+
|
| 410 |
+
anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0]
|
| 411 |
+
|
| 412 |
+
boxes = [obj["bbox"] for obj in anno]
|
| 413 |
+
# guard against no boxes via resizing
|
| 414 |
+
boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
|
| 415 |
+
boxes[:, 2:] += boxes[:, :2]
|
| 416 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
| 417 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
| 418 |
+
|
| 419 |
+
classes = [obj["category_id"] for obj in anno]
|
| 420 |
+
classes = torch.tensor(classes, dtype=torch.int64)
|
| 421 |
+
|
| 422 |
+
if self.return_masks:
|
| 423 |
+
segmentations = [obj["segmentation"] for obj in anno]
|
| 424 |
+
masks = convert_coco_poly_to_mask(segmentations, h, w)
|
| 425 |
+
|
| 426 |
+
keypoints = None
|
| 427 |
+
if anno and "keypoints" in anno[0]:
|
| 428 |
+
keypoints = [obj["keypoints"] for obj in anno]
|
| 429 |
+
keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
|
| 430 |
+
num_keypoints = keypoints.shape[0]
|
| 431 |
+
if num_keypoints:
|
| 432 |
+
keypoints = keypoints.view(num_keypoints, -1, 3)
|
| 433 |
+
|
| 434 |
+
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
|
| 435 |
+
boxes = boxes[keep]
|
| 436 |
+
classes = classes[keep]
|
| 437 |
+
if self.return_masks:
|
| 438 |
+
masks = masks[keep]
|
| 439 |
+
if keypoints is not None:
|
| 440 |
+
keypoints = keypoints[keep]
|
| 441 |
+
|
| 442 |
+
target = {}
|
| 443 |
+
target["boxes"] = boxes
|
| 444 |
+
target["labels"] = classes
|
| 445 |
+
if self.return_masks:
|
| 446 |
+
target["masks"] = masks
|
| 447 |
+
target["image_id"] = image_id
|
| 448 |
+
if keypoints is not None:
|
| 449 |
+
target["keypoints"] = keypoints
|
| 450 |
+
|
| 451 |
+
# for conversion to coco api
|
| 452 |
+
area = torch.tensor([obj["area"] for obj in anno])
|
| 453 |
+
iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
|
| 454 |
+
target["area"] = area[keep]
|
| 455 |
+
target["iscrowd"] = iscrowd[keep]
|
| 456 |
+
|
| 457 |
+
target["orig_size"] = torch.as_tensor([int(h), int(w)])
|
| 458 |
+
target["size"] = torch.as_tensor([int(h), int(w)])
|
| 459 |
+
|
| 460 |
+
return image, target
|
| 461 |
+
|
| 462 |
+
|
| 463 |
+
def make_coco_transforms(image_set, fix_size=False, strong_aug=False, args=None):
|
| 464 |
+
|
| 465 |
+
normalize = T.Compose([
|
| 466 |
+
T.ToTensor(),
|
| 467 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
| 468 |
+
])
|
| 469 |
+
|
| 470 |
+
# config the params for data aug
|
| 471 |
+
scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
|
| 472 |
+
max_size = 1333
|
| 473 |
+
scales2_resize = [400, 500, 600]
|
| 474 |
+
scales2_crop = [384, 600]
|
| 475 |
+
|
| 476 |
+
# update args from config files
|
| 477 |
+
scales = getattr(args, 'data_aug_scales', scales)
|
| 478 |
+
max_size = getattr(args, 'data_aug_max_size', max_size)
|
| 479 |
+
scales2_resize = getattr(args, 'data_aug_scales2_resize', scales2_resize)
|
| 480 |
+
scales2_crop = getattr(args, 'data_aug_scales2_crop', scales2_crop)
|
| 481 |
+
|
| 482 |
+
# resize them
|
| 483 |
+
data_aug_scale_overlap = getattr(args, 'data_aug_scale_overlap', None)
|
| 484 |
+
if data_aug_scale_overlap is not None and data_aug_scale_overlap > 0:
|
| 485 |
+
data_aug_scale_overlap = float(data_aug_scale_overlap)
|
| 486 |
+
scales = [int(i*data_aug_scale_overlap) for i in scales]
|
| 487 |
+
max_size = int(max_size*data_aug_scale_overlap)
|
| 488 |
+
scales2_resize = [int(i*data_aug_scale_overlap) for i in scales2_resize]
|
| 489 |
+
scales2_crop = [int(i*data_aug_scale_overlap) for i in scales2_crop]
|
| 490 |
+
|
| 491 |
+
datadict_for_print = {
|
| 492 |
+
'scales': scales,
|
| 493 |
+
'max_size': max_size,
|
| 494 |
+
'scales2_resize': scales2_resize,
|
| 495 |
+
'scales2_crop': scales2_crop
|
| 496 |
+
}
|
| 497 |
+
# print("data_aug_params:", json.dumps(datadict_for_print, indent=2))
|
| 498 |
+
|
| 499 |
+
if image_set == 'train':
|
| 500 |
+
if fix_size:
|
| 501 |
+
return T.Compose([
|
| 502 |
+
T.RandomHorizontalFlip(),
|
| 503 |
+
T.RandomResize([(max_size, max(scales))]),
|
| 504 |
+
# T.RandomResize([(512, 512)]),
|
| 505 |
+
normalize,
|
| 506 |
+
])
|
| 507 |
+
|
| 508 |
+
if strong_aug:
|
| 509 |
+
import datasets.sltransform as SLT
|
| 510 |
+
|
| 511 |
+
return T.Compose([
|
| 512 |
+
T.RandomHorizontalFlip(),
|
| 513 |
+
T.RandomSelect(
|
| 514 |
+
T.RandomResize(scales, max_size=max_size),
|
| 515 |
+
T.Compose([
|
| 516 |
+
T.RandomResize(scales2_resize),
|
| 517 |
+
T.RandomSizeCrop(*scales2_crop),
|
| 518 |
+
T.RandomResize(scales, max_size=max_size),
|
| 519 |
+
])
|
| 520 |
+
),
|
| 521 |
+
SLT.RandomSelectMulti([
|
| 522 |
+
SLT.RandomCrop(),
|
| 523 |
+
SLT.LightingNoise(),
|
| 524 |
+
SLT.AdjustBrightness(2),
|
| 525 |
+
SLT.AdjustContrast(2),
|
| 526 |
+
]),
|
| 527 |
+
normalize,
|
| 528 |
+
])
|
| 529 |
+
|
| 530 |
+
return T.Compose([
|
| 531 |
+
T.RandomHorizontalFlip(),
|
| 532 |
+
T.RandomSelect(
|
| 533 |
+
T.RandomResize(scales, max_size=max_size),
|
| 534 |
+
T.Compose([
|
| 535 |
+
T.RandomResize(scales2_resize),
|
| 536 |
+
T.RandomSizeCrop(*scales2_crop),
|
| 537 |
+
T.RandomResize(scales, max_size=max_size),
|
| 538 |
+
])
|
| 539 |
+
),
|
| 540 |
+
normalize,
|
| 541 |
+
])
|
| 542 |
+
|
| 543 |
+
if image_set in ['val', 'eval_debug', 'train_reg', 'test']:
|
| 544 |
+
|
| 545 |
+
if os.environ.get("GFLOPS_DEBUG_SHILONG", False) == 'INFO':
|
| 546 |
+
print("Under debug mode for flops calculation only!!!!!!!!!!!!!!!!")
|
| 547 |
+
return T.Compose([
|
| 548 |
+
T.ResizeDebug((1280, 800)),
|
| 549 |
+
normalize,
|
| 550 |
+
])
|
| 551 |
+
|
| 552 |
+
return T.Compose([
|
| 553 |
+
T.RandomResize([max(scales)], max_size=max_size),
|
| 554 |
+
normalize,
|
| 555 |
+
])
|
| 556 |
+
|
| 557 |
+
|
| 558 |
+
|
| 559 |
+
raise ValueError(f'unknown {image_set}')
|
| 560 |
+
|
| 561 |
+
|
| 562 |
+
def get_aux_target_hacks_list(image_set, args):
|
| 563 |
+
if args.modelname in ['q2bs_mask', 'q2bs']:
|
| 564 |
+
aux_target_hacks_list = [
|
| 565 |
+
label2compat(),
|
| 566 |
+
label_compat2onehot(),
|
| 567 |
+
RandomSelectBoxes(num_class=args.num_classes)
|
| 568 |
+
]
|
| 569 |
+
if args.masked_data and image_set == 'train':
|
| 570 |
+
# aux_target_hacks_list.append()
|
| 571 |
+
aux_target_hacks_list.append(MaskCrop())
|
| 572 |
+
elif args.modelname in ['q2bm_v2', 'q2bs_ce', 'q2op', 'q2ofocal', 'q2opclip', 'q2ocqonly']:
|
| 573 |
+
aux_target_hacks_list = [
|
| 574 |
+
label2compat(),
|
| 575 |
+
label_compat2onehot(),
|
| 576 |
+
box_label_catter(),
|
| 577 |
+
RandomSelectBoxlabels(num_classes=args.num_classes,
|
| 578 |
+
prob_first_item=args.prob_first_item,
|
| 579 |
+
prob_random_item=args.prob_random_item,
|
| 580 |
+
prob_last_item=args.prob_last_item,
|
| 581 |
+
prob_stop_sign=args.prob_stop_sign,
|
| 582 |
+
),
|
| 583 |
+
BboxPertuber(max_ratio=0.02, generate_samples=1000),
|
| 584 |
+
]
|
| 585 |
+
elif args.modelname in ['q2omask', 'q2osa']:
|
| 586 |
+
if args.coco_aug:
|
| 587 |
+
aux_target_hacks_list = [
|
| 588 |
+
label2compat(),
|
| 589 |
+
label_compat2onehot(),
|
| 590 |
+
box_label_catter(),
|
| 591 |
+
RandomSelectBoxlabels(num_classes=args.num_classes,
|
| 592 |
+
prob_first_item=args.prob_first_item,
|
| 593 |
+
prob_random_item=args.prob_random_item,
|
| 594 |
+
prob_last_item=args.prob_last_item,
|
| 595 |
+
prob_stop_sign=args.prob_stop_sign,
|
| 596 |
+
),
|
| 597 |
+
RandomDrop(p=0.2),
|
| 598 |
+
BboxPertuber(max_ratio=0.02, generate_samples=1000),
|
| 599 |
+
RandomCutout(factor=0.5)
|
| 600 |
+
]
|
| 601 |
+
else:
|
| 602 |
+
aux_target_hacks_list = [
|
| 603 |
+
label2compat(),
|
| 604 |
+
label_compat2onehot(),
|
| 605 |
+
box_label_catter(),
|
| 606 |
+
RandomSelectBoxlabels(num_classes=args.num_classes,
|
| 607 |
+
prob_first_item=args.prob_first_item,
|
| 608 |
+
prob_random_item=args.prob_random_item,
|
| 609 |
+
prob_last_item=args.prob_last_item,
|
| 610 |
+
prob_stop_sign=args.prob_stop_sign,
|
| 611 |
+
),
|
| 612 |
+
BboxPertuber(max_ratio=0.02, generate_samples=1000),
|
| 613 |
+
]
|
| 614 |
+
else:
|
| 615 |
+
aux_target_hacks_list = None
|
| 616 |
+
|
| 617 |
+
return aux_target_hacks_list
|
| 618 |
+
|
| 619 |
+
|
| 620 |
+
def build(image_set, args, datasetinfo):
|
| 621 |
+
img_folder = datasetinfo["root"]
|
| 622 |
+
ann_file = datasetinfo["anno"]
|
| 623 |
+
|
| 624 |
+
# copy to local path
|
| 625 |
+
if os.environ.get('DATA_COPY_SHILONG') == 'INFO':
|
| 626 |
+
preparing_dataset(dict(img_folder=img_folder, ann_file=ann_file), image_set, args)
|
| 627 |
+
|
| 628 |
+
try:
|
| 629 |
+
strong_aug = args.strong_aug
|
| 630 |
+
except:
|
| 631 |
+
strong_aug = False
|
| 632 |
+
print(img_folder, ann_file)
|
| 633 |
+
dataset = CocoDetection(img_folder, ann_file,
|
| 634 |
+
transforms=make_coco_transforms(image_set, fix_size=args.fix_size, strong_aug=strong_aug, args=args),
|
| 635 |
+
return_masks=args.masks,
|
| 636 |
+
aux_target_hacks=None,
|
| 637 |
+
)
|
| 638 |
+
return dataset
|
| 639 |
+
|
| 640 |
+
|
| 641 |
+
if __name__ == "__main__":
|
| 642 |
+
# Objects365 Val example
|
| 643 |
+
dataset_o365 = CocoDetection(
|
| 644 |
+
'/path/Objects365/train/',
|
| 645 |
+
"/path/Objects365/slannos/anno_preprocess_train_v2.json",
|
| 646 |
+
transforms=None,
|
| 647 |
+
return_masks=False,
|
| 648 |
+
)
|
| 649 |
+
print('len(dataset_o365):', len(dataset_o365))
|
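Note (not part of the commit): a minimal usage sketch for the build() helper above. The paths are placeholders, and the Namespace only carries the fields that build() and make_coco_transforms() actually read; everything else falls back to defaults.

from argparse import Namespace

args = Namespace(fix_size=False, masks=False, strong_aug=False)
datasetinfo = {
    "root": "/path/to/images",                    # placeholder image folder
    "anno": "/path/to/instances_train.json",      # placeholder COCO-style annotation file
}
dataset = build("train", args, datasetinfo)
img, target = dataset[0]  # img: normalized CHW tensor; target["boxes"]: cx,cy,w,h in [0, 1]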
groundingdino/datasets/.ipynb_checkpoints/dataset-checkpoint.py ADDED
@@ -0,0 +1,44 @@
from __future__ import print_function

import torch
import torchvision.datasets as datasets
from torch.utils.data import Dataset
from PIL import Image
from .tsv_io import TSVFile
import numpy as np
import base64
import io


class TSVDataset(Dataset):
    """ TSV dataset for ImageNet 1K training
    """
    def __init__(self, tsv_file, transform=None, target_transform=None):
        self.tsv = TSVFile(tsv_file)
        self.transform = transform
        self.target_transform = target_transform

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: (image, target) where target is class_index of the target class.
        """
        row = self.tsv.seek(index)
        image_data = base64.b64decode(row[-1])
        image = Image.open(io.BytesIO(image_data))
        image = image.convert('RGB')
        target = int(row[1])

        if self.transform is not None:
            img = self.transform(image)
        else:
            img = image
        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

    def __len__(self):
        return self.tsv.num_rows()
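Note (not part of the commit): a sketch of the row layout TSVDataset.__getitem__ assumes, with the column positions inferred from the code above; the key and label values here are made up. The snippet round-trips a tiny image through the same base64 decoding path.

import base64, io
from PIL import Image

img = Image.new("RGB", (8, 8), color=(255, 0, 0))
buf = io.BytesIO()
img.save(buf, format="JPEG")
# one TSV row: [image_key, class_index, ..., base64-encoded image bytes]
row = ["img_0001", "3", base64.b64encode(buf.getvalue()).decode("ascii")]

decoded = Image.open(io.BytesIO(base64.b64decode(row[-1]))).convert("RGB")
label = int(row[1])
print(decoded.size, label)  # (8, 8) 3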
groundingdino/datasets/.ipynb_checkpoints/odvg-checkpoint.py ADDED
@@ -0,0 +1,258 @@
from torchvision.datasets.vision import VisionDataset
import os.path
from typing import Callable, Optional
import json
from PIL import Image
import torch
import random
import os, sys
sys.path.append(os.path.dirname(sys.path[0]))

import datasets.transforms as T

class ODVGDataset(VisionDataset):
    """
    Args:
        root (string): Root directory where images are downloaded to.
        anno (string): Path to json annotation file.
        label_map_anno (string): Path to json label mapping file. Only for Object Detection
        transform (callable, optional): A function/transform that takes in an PIL image
            and returns a transformed version. E.g, ``transforms.PILToTensor``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        transforms (callable, optional): A function/transform that takes input sample and its target as entry
            and returns a transformed version.
    """

    def __init__(
        self,
        root: str,
        anno: str,
        label_map_anno: str = None,
        max_labels: int = 80,
        transform: Optional[Callable] = None,
        target_transform: Optional[Callable] = None,
        transforms: Optional[Callable] = None,
    ) -> None:
        super().__init__(root, transforms, transform, target_transform)
        self.root = root
        self.dataset_mode = "OD" if label_map_anno else "VG"
        self.max_labels = max_labels
        if self.dataset_mode == "OD":
            self.load_label_map(label_map_anno)
        self._load_metas(anno)
        self.get_dataset_info()

    def load_label_map(self, label_map_anno):
        with open(label_map_anno, 'r') as file:
            self.label_map = json.load(file)
        self.label_index = set(self.label_map.keys())

    def _load_metas(self, anno):
        with open(anno, 'r') as f:
            self.metas = json.load(f)

    def get_dataset_info(self):
        print(f"  == total images: {len(self)}")
        if self.dataset_mode == "OD":
            print(f"  == total labels: {len(self.label_map)}")

    def __getitem__(self, index: int):
        meta = self.metas[index]
        rel_path = meta["filename"]
        abs_path = os.path.join(self.root, rel_path)
        if not os.path.exists(abs_path):
            raise FileNotFoundError(f"{abs_path} not found.")
        image = Image.open(abs_path).convert('RGB')
        w, h = image.size
        if self.dataset_mode == "OD":
            anno = meta["detection"]
            instances = [obj for obj in anno["instances"]]
            boxes = [obj["bbox"] for obj in instances]
            # generate vg_labels
            # pos bbox labels
            ori_classes = [str(obj["label"]) for obj in instances]
            pos_labels = set(ori_classes)
            # neg bbox labels
            neg_labels = self.label_index.difference(pos_labels)

            vg_labels = list(pos_labels)
            num_to_add = min(len(neg_labels), self.max_labels-len(pos_labels))
            if num_to_add > 0:
                vg_labels.extend(random.sample(neg_labels, num_to_add))

            # shuffle
            for i in range(len(vg_labels)-1, 0, -1):
                j = random.randint(0, i)
                vg_labels[i], vg_labels[j] = vg_labels[j], vg_labels[i]

            caption_list = [self.label_map[lb] for lb in vg_labels]
            caption_dict = {item:index for index, item in enumerate(caption_list)}

            caption = ' . '.join(caption_list) + ' .'
            classes = [caption_dict[self.label_map[str(obj["label"])]] for obj in instances]
            boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
            classes = torch.tensor(classes, dtype=torch.int64)
        elif self.dataset_mode == "VG":
            anno = meta["Grounding"]
            instances = [obj for obj in anno["regions"]]
            boxes = [obj["bbox"] for obj in instances]
            caption_list = [obj["phrase"] for obj in instances]
            c = list(zip(boxes, caption_list))
            random.shuffle(c)
            boxes[:], caption_list[:] = zip(*c)
            uni_caption_list = list(set(caption_list))
            label_map = {}
            for idx in range(len(uni_caption_list)):
                label_map[uni_caption_list[idx]] = idx
            classes = [label_map[cap] for cap in caption_list]
            caption = ' . '.join(uni_caption_list) + ' .'
            boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
            classes = torch.tensor(classes, dtype=torch.int64)
            caption_list = uni_caption_list
        # print("caption_list" , caption_list)
        # print("caption" , caption)
        # print("boxes" , boxes)
        target = {}
        target["image_id"] = rel_path.strip(".jpg")
        target["size"] = torch.as_tensor([int(h), int(w)])
        target["cap_list"] = caption_list
        target["caption"] = caption
        target["boxes"] = boxes
        target["labels"] = classes
        # print(" image_id " , target["image_id"])
        # size, cap_list, caption, bboxes, labels

        if self.transforms is not None:
            image, target = self.transforms(image, target)

        return image, target


    def __len__(self) -> int:
        return len(self.metas)


def make_coco_transforms(image_set, fix_size=False, strong_aug=False, args=None):

    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    # config the params for data aug
    scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
    max_size = 1333
    scales2_resize = [400, 500, 600]
    scales2_crop = [384, 600]

    # update args from config files
    scales = getattr(args, 'data_aug_scales', scales)
    max_size = getattr(args, 'data_aug_max_size', max_size)
    scales2_resize = getattr(args, 'data_aug_scales2_resize', scales2_resize)
    scales2_crop = getattr(args, 'data_aug_scales2_crop', scales2_crop)

    # resize them
    data_aug_scale_overlap = getattr(args, 'data_aug_scale_overlap', None)
    if data_aug_scale_overlap is not None and data_aug_scale_overlap > 0:
        data_aug_scale_overlap = float(data_aug_scale_overlap)
        scales = [int(i*data_aug_scale_overlap) for i in scales]
        max_size = int(max_size*data_aug_scale_overlap)
        scales2_resize = [int(i*data_aug_scale_overlap) for i in scales2_resize]
        scales2_crop = [int(i*data_aug_scale_overlap) for i in scales2_crop]

    # datadict_for_print = {
    #     'scales': scales,
    #     'max_size': max_size,
    #     'scales2_resize': scales2_resize,
    #     'scales2_crop': scales2_crop
    # }
    # print("data_aug_params:", json.dumps(datadict_for_print, indent=2))

    if image_set == 'train':
        if fix_size:
            return T.Compose([
                T.RandomHorizontalFlip(),
                T.RandomResize([(max_size, max(scales))]),
                normalize,
            ])

        if strong_aug:
            import datasets.sltransform as SLT

            return T.Compose([
                T.RandomHorizontalFlip(),
                T.RandomSelect(
                    T.RandomResize(scales, max_size=max_size),
                    T.Compose([
                        T.RandomResize(scales2_resize),
                        T.RandomSizeCrop(*scales2_crop),
                        T.RandomResize(scales, max_size=max_size),
                    ])
                ),
                SLT.RandomSelectMulti([
                    SLT.RandomCrop(),
                    SLT.LightingNoise(),
                    SLT.AdjustBrightness(2),
                    SLT.AdjustContrast(2),
                ]),
                normalize,
            ])

        return T.Compose([
            T.RandomHorizontalFlip(),
            T.RandomSelect(
                T.RandomResize(scales, max_size=max_size),
                T.Compose([
                    T.RandomResize(scales2_resize),
                    T.RandomSizeCrop(*scales2_crop),
                    T.RandomResize(scales, max_size=max_size),
                ])
            ),
            normalize,
        ])

    if image_set in ['val', 'eval_debug', 'train_reg', 'test']:

        if os.environ.get("GFLOPS_DEBUG_SHILONG", False) == 'INFO':
            print("Under debug mode for flops calculation only!!!!!!!!!!!!!!!!")
            return T.Compose([
                T.ResizeDebug((1280, 800)),
                normalize,
            ])

        return T.Compose([
            T.RandomResize([max(scales)], max_size=max_size),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')

def build_odvg(image_set, args, datasetinfo):
    img_folder = datasetinfo["root"]
    ann_file = datasetinfo["anno"]
    label_map = datasetinfo["label_map"] if "label_map" in datasetinfo else None
    try:
        strong_aug = args.strong_aug
    except:
        strong_aug = False  # False originally
    print(img_folder, ann_file, label_map)
    dataset = ODVGDataset(img_folder, ann_file, label_map, max_labels=args.max_labels,
            transforms=make_coco_transforms(image_set, fix_size=args.fix_size, strong_aug=strong_aug, args=args),
    )
    return dataset


if __name__=="__main__":
    dataset_vg = ODVGDataset("path/GRIT-20M/data/","path/GRIT-20M/anno/grit_odvg_10k.jsonl",)
    print(len(dataset_vg))
    data = dataset_vg[random.randint(0, 100)]
    print(data)
    dataset_od = ODVGDataset("pathl/V3Det/",
        "path/V3Det/annotations/v3det_2023_v1_all_odvg.jsonl",
        "path/V3Det/annotations/v3det_label_map.json",
    )
    print(len(dataset_od))
    data = dataset_od[random.randint(0, 100)]
    print(data)
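Note (not part of the commit): a sketch of the OD-mode record shape that ODVGDataset.__getitem__ reads above; the file name, category names, and boxes below are made up for illustration.

label_map = {"0": "airplane", "1": "storage tank", "2": "baseball field"}

meta = {
    "filename": "P0001.jpg",
    "detection": {
        "instances": [
            {"bbox": [10.0, 20.0, 110.0, 120.0], "label": 0},
            {"bbox": [200.0, 40.0, 260.0, 90.0], "label": 2},
        ]
    },
}
# Positive labels for this record are {"0", "2"}; negatives are sampled from the remaining
# label_map keys (up to max_labels), shuffled, and joined into a caption such as
# "airplane . baseball field . storage tank ."; each box's class index then points into that caption list.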
groundingdino/datasets/.ipynb_checkpoints/transforms-checkpoint.py ADDED
@@ -0,0 +1,285 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Transforms and data augmentation for both image + bbox.
"""
import random

import PIL
import torch
import torchvision.transforms as T
import torchvision.transforms.functional as F

from util.box_ops import box_xyxy_to_cxcywh
from util.misc import interpolate


def crop(image, target, region):
    cropped_image = F.crop(image, *region)

    target = target.copy()
    i, j, h, w = region

    # should we do something wrt the original size?
    target["size"] = torch.tensor([h, w])

    fields = ["labels", "area"]

    if "boxes" in target:
        boxes = target["boxes"]
        max_size = torch.as_tensor([w, h], dtype=torch.float32)
        cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
        cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
        cropped_boxes = cropped_boxes.clamp(min=0)
        area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
        target["boxes"] = cropped_boxes.reshape(-1, 4)
        target["area"] = area
        fields.append("boxes")

    if "masks" in target:
        # FIXME should we update the area here if there are no boxes?
        target['masks'] = target['masks'][:, i:i + h, j:j + w]
        fields.append("masks")

    # remove elements for which the boxes or masks that have zero area
    if "boxes" in target or "masks" in target:
        # favor boxes selection when defining which elements to keep
        # this is compatible with previous implementation
        if "boxes" in target:
            cropped_boxes = target['boxes'].reshape(-1, 2, 2)
            keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
        else:
            keep = target['masks'].flatten(1).any(1)

        for field in fields:
            target[field] = target[field][keep]

    return cropped_image, target


def hflip(image, target):
    flipped_image = F.hflip(image)

    w, h = image.size

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
        target["boxes"] = boxes

    if "masks" in target:
        target['masks'] = target['masks'].flip(-1)

    return flipped_image, target


def resize(image, target, size, max_size=None):
    # size can be min_size (scalar) or (w, h) tuple

    def get_size_with_aspect_ratio(image_size, size, max_size=None):
        w, h = image_size
        if max_size is not None:
            min_original_size = float(min((w, h)))
            max_original_size = float(max((w, h)))
            if max_original_size / min_original_size * size > max_size:
                size = int(round(max_size * min_original_size / max_original_size))

        if (w <= h and w == size) or (h <= w and h == size):
            return (h, w)

        if w < h:
            ow = size
            oh = int(size * h / w)
        else:
            oh = size
            ow = int(size * w / h)

        return (oh, ow)

    def get_size(image_size, size, max_size=None):
        if isinstance(size, (list, tuple)):
            return size[::-1]
        else:
            return get_size_with_aspect_ratio(image_size, size, max_size)

    size = get_size(image.size, size, max_size)
    rescaled_image = F.resize(image, size)

    if target is None:
        return rescaled_image, None

    ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size))
    ratio_width, ratio_height = ratios

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
        target["boxes"] = scaled_boxes

    if "area" in target:
        area = target["area"]
        scaled_area = area * (ratio_width * ratio_height)
        target["area"] = scaled_area

    h, w = size
    target["size"] = torch.tensor([h, w])

    if "masks" in target:
        target['masks'] = interpolate(
            target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5

    return rescaled_image, target


def pad(image, target, padding):
    # assumes that we only pad on the bottom right corners
    padded_image = F.pad(image, (0, 0, padding[0], padding[1]))
    if target is None:
        return padded_image, None
    target = target.copy()
    # should we do something wrt the original size?
    target["size"] = torch.tensor(padded_image.size[::-1])
    if "masks" in target:
        target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1]))
    return padded_image, target


class ResizeDebug(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        return resize(img, target, self.size)


class RandomCrop(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        region = T.RandomCrop.get_params(img, self.size)
        return crop(img, target, region)


class RandomSizeCrop(object):
    def __init__(self, min_size: int, max_size: int):
        self.min_size = min_size
        self.max_size = max_size

    def __call__(self, img: PIL.Image.Image, target: dict):
        w = random.randint(self.min_size, min(img.width, self.max_size))
        h = random.randint(self.min_size, min(img.height, self.max_size))
        region = T.RandomCrop.get_params(img, [h, w])
        return crop(img, target, region)


class CenterCrop(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        image_width, image_height = img.size
        crop_height, crop_width = self.size
        crop_top = int(round((image_height - crop_height) / 2.))
        crop_left = int(round((image_width - crop_width) / 2.))
        return crop(img, target, (crop_top, crop_left, crop_height, crop_width))


class RandomHorizontalFlip(object):
    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            return hflip(img, target)
        return img, target


class RandomResize(object):
    def __init__(self, sizes, max_size=None):
        assert isinstance(sizes, (list, tuple))
        self.sizes = sizes
        self.max_size = max_size

    def __call__(self, img, target=None):
        size = random.choice(self.sizes)
        return resize(img, target, size, self.max_size)


class RandomPad(object):
    def __init__(self, max_pad):
        self.max_pad = max_pad

    def __call__(self, img, target):
        pad_x = random.randint(0, self.max_pad)
        pad_y = random.randint(0, self.max_pad)
        return pad(img, target, (pad_x, pad_y))


class RandomSelect(object):
    """
    Randomly selects between transforms1 and transforms2,
    with probability p for transforms1 and (1 - p) for transforms2
    """
    def __init__(self, transforms1, transforms2, p=0.5):
        self.transforms1 = transforms1
        self.transforms2 = transforms2
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            return self.transforms1(img, target)
        return self.transforms2(img, target)


class ToTensor(object):
    def __call__(self, img, target):
        return F.to_tensor(img), target


class RandomErasing(object):

    def __init__(self, *args, **kwargs):
        self.eraser = T.RandomErasing(*args, **kwargs)

    def __call__(self, img, target):
        return self.eraser(img), target


class Normalize(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, image, target=None):
        image = F.normalize(image, mean=self.mean, std=self.std)
        if target is None:
            return image, None
        target = target.copy()
        h, w = image.shape[-2:]
        if "boxes" in target:
            boxes = target["boxes"]
            boxes = box_xyxy_to_cxcywh(boxes)
            boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
            target["boxes"] = boxes
        return image, target


class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        for t in self.transforms:
            image, target = t(image, target)
        return image, target

    def __repr__(self):
        format_string = self.__class__.__name__ + "("
        for t in self.transforms:
            format_string += "\n"
            format_string += "    {0}".format(t)
        format_string += "\n)"
        return format_string
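Note (not part of the commit): a self-contained sketch of the box bookkeeping the Normalize transform above performs, using the same xyxy-to-cxcywh formula that the util.box_ops helper implements; the image size and box values are arbitrary.

import torch

def box_xyxy_to_cxcywh(b):
    # convert corner boxes to center/size form
    x0, y0, x1, y1 = b.unbind(-1)
    return torch.stack([(x0 + x1) / 2, (y0 + y1) / 2, x1 - x0, y1 - y0], dim=-1)

h, w = 480, 640
boxes_xyxy = torch.tensor([[100.0, 150.0, 300.0, 350.0]])
boxes_norm = box_xyxy_to_cxcywh(boxes_xyxy) / torch.tensor([w, h, w, h], dtype=torch.float32)
print(boxes_norm)  # tensor([[0.3125, 0.5208, 0.3125, 0.4167]])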
groundingdino/datasets/__init__.py CHANGED
@@ -0,0 +1,23 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch.utils.data
import torchvision
from .coco import build as build_coco


def get_coco_api_from_dataset(dataset):
    for _ in range(10):
        # if isinstance(dataset, torchvision.datasets.CocoDetection):
        #     break
        if isinstance(dataset, torch.utils.data.Subset):
            dataset = dataset.dataset
    if isinstance(dataset, torchvision.datasets.CocoDetection):
        return dataset.coco


def build_dataset(image_set, args, datasetinfo):
    if datasetinfo["dataset_mode"] == 'coco':
        return build_coco(image_set, args, datasetinfo)
    if datasetinfo["dataset_mode"] == 'odvg':
        from .odvg import build_odvg
        return build_odvg(image_set, args, datasetinfo)
    raise ValueError(f'dataset {args.dataset_file} not supported')
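Note (not part of the commit): the two datasetinfo shapes build_dataset dispatches on, with keys taken from what build() and build_odvg() read; the paths are placeholders.

coco_info = {
    "dataset_mode": "coco",
    "root": "/path/to/images",
    "anno": "/path/to/annotations.json",
}
odvg_info = {
    "dataset_mode": "odvg",
    "root": "/path/to/images",
    "anno": "/path/to/annotations_odvg.jsonl",
    "label_map": "/path/to/label_map.json",  # only for OD-style data; omit for VG (grounding) data
}
# train_set = build_dataset("train", args, coco_info)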
groundingdino/datasets/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/groundingdino/datasets/__pycache__/__init__.cpython-310.pyc and b/groundingdino/datasets/__pycache__/__init__.cpython-310.pyc differ
groundingdino/datasets/__pycache__/coco.cpython-310.pyc ADDED
Binary file (20.2 kB)
groundingdino/datasets/__pycache__/coco_eval.cpython-310.pyc ADDED
Binary file (7.42 kB)
groundingdino/datasets/__pycache__/cocogrounding_eval.cpython-310.pyc ADDED
Binary file (7.44 kB)
groundingdino/datasets/__pycache__/data_util.cpython-310.pyc ADDED
Binary file (4.55 kB)
groundingdino/datasets/__pycache__/odvg.cpython-310.pyc ADDED
Binary file (8.21 kB)
groundingdino/datasets/__pycache__/panoptic_eval.cpython-310.pyc ADDED
Binary file (1.87 kB)
groundingdino/datasets/__pycache__/random_crop.cpython-310.pyc ADDED
Binary file (3.69 kB)
groundingdino/datasets/__pycache__/sltransform.cpython-310.pyc ADDED
Binary file (7.68 kB)
groundingdino/datasets/__pycache__/transforms.cpython-310.pyc CHANGED
Binary files a/groundingdino/datasets/__pycache__/transforms.cpython-310.pyc and b/groundingdino/datasets/__pycache__/transforms.cpython-310.pyc differ
groundingdino/datasets/coco.py ADDED
@@ -0,0 +1,649 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
COCO dataset which returns image_id for evaluation.

Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
"""
if __name__=="__main__":
    # for debug only
    import os, sys
    sys.path.append(os.path.dirname(sys.path[0]))
from torchvision.datasets.vision import VisionDataset

import json
from pathlib import Path
import random
import os
from typing import Any, Callable, List, Optional, Tuple

from PIL import Image

import torch
import torch.utils.data
import torchvision
from pycocotools import mask as coco_mask

from datasets.data_util import preparing_dataset
import datasets.transforms as T
from util.box_ops import box_cxcywh_to_xyxy, box_iou

__all__ = ['build']


class label2compat():
    def __init__(self) -> None:
        self.category_map_str = {"1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "10": 10, "11": 11, "13": 12, "14": 13, "15": 14, "16": 15, "17": 16, "18": 17, "19": 18, "20": 19, "21": 20, "22": 21, "23": 22, "24": 23, "25": 24, "27": 25, "28": 26, "31": 27, "32": 28, "33": 29, "34": 30, "35": 31, "36": 32, "37": 33, "38": 34, "39": 35, "40": 36, "41": 37, "42": 38, "43": 39, "44": 40, "46": 41, "47": 42, "48": 43, "49": 44, "50": 45, "51": 46, "52": 47, "53": 48, "54": 49, "55": 50, "56": 51, "57": 52, "58": 53, "59": 54, "60": 55, "61": 56, "62": 57, "63": 58, "64": 59, "65": 60, "67": 61, "70": 62, "72": 63, "73": 64, "74": 65, "75": 66, "76": 67, "77": 68, "78": 69, "79": 70, "80": 71, "81": 72, "82": 73, "84": 74, "85": 75, "86": 76, "87": 77, "88": 78, "89": 79, "90": 80}
        self.category_map = {int(k):v for k,v in self.category_map_str.items()}

    def __call__(self, target, img=None):
        labels = target['labels']
        res = torch.zeros(labels.shape, dtype=labels.dtype)
        for idx, item in enumerate(labels):
            res[idx] = self.category_map[item.item()] - 1
        target['label_compat'] = res
        if img is not None:
            return target, img
        else:
            return target


class label_compat2onehot():
    def __init__(self, num_class=80, num_output_objs=1):
        self.num_class = num_class
        self.num_output_objs = num_output_objs
        if num_output_objs != 1:
            raise DeprecationWarning("num_output_objs!=1, which is only used for comparison")

    def __call__(self, target, img=None):
        labels = target['label_compat']
        place_dict = {k:0 for k in range(self.num_class)}
        if self.num_output_objs == 1:
            res = torch.zeros(self.num_class)
            for i in labels:
                itm = i.item()
                res[itm] = 1.0
        else:
            # compat with baseline
            res = torch.zeros(self.num_class, self.num_output_objs)
            for i in labels:
                itm = i.item()
                res[itm][place_dict[itm]] = 1.0
                place_dict[itm] += 1
        target['label_compat_onehot'] = res
        if img is not None:
            return target, img
        else:
            return target


class box_label_catter():
    def __init__(self):
        pass

    def __call__(self, target, img=None):
        labels = target['label_compat']
        boxes = target['boxes']
        box_label = torch.cat((boxes, labels.unsqueeze(-1)), 1)
        target['box_label'] = box_label
        if img is not None:
            return target, img
        else:
            return target


class RandomSelectBoxlabels():
    def __init__(self, num_classes, leave_one_out=False, blank_prob=0.8,
                    prob_first_item = 0.0,
                    prob_random_item = 0.0,
                    prob_last_item = 0.8,
                    prob_stop_sign = 0.2
                ) -> None:
        self.num_classes = num_classes
        self.leave_one_out = leave_one_out
        self.blank_prob = blank_prob

        self.set_state(prob_first_item, prob_random_item, prob_last_item, prob_stop_sign)


    def get_state(self):
        return [self.prob_first_item, self.prob_random_item, self.prob_last_item, self.prob_stop_sign]

    def set_state(self, prob_first_item, prob_random_item, prob_last_item, prob_stop_sign):
        sum_prob = prob_first_item + prob_random_item + prob_last_item + prob_stop_sign
        assert sum_prob - 1 < 1e-6, \
            f"Sum up all prob = {sum_prob}. prob_first_item:{prob_first_item}" \
            + f"prob_random_item:{prob_random_item}, prob_last_item:{prob_last_item}" \
            + f"prob_stop_sign:{prob_stop_sign}"

        self.prob_first_item = prob_first_item
        self.prob_random_item = prob_random_item
        self.prob_last_item = prob_last_item
        self.prob_stop_sign = prob_stop_sign


    def sample_for_pred_first_item(self, box_label: torch.FloatTensor):
        box_label_known = torch.Tensor(0,5)
        box_label_unknown = box_label
        return box_label_known, box_label_unknown

    def sample_for_pred_random_item(self, box_label: torch.FloatTensor):
        n_select = int(random.random() * box_label.shape[0])
        box_label = box_label[torch.randperm(box_label.shape[0])]
        box_label_known = box_label[:n_select]
        box_label_unknown = box_label[n_select:]
        return box_label_known, box_label_unknown

    def sample_for_pred_last_item(self, box_label: torch.FloatTensor):
        box_label_perm = box_label[torch.randperm(box_label.shape[0])]
        known_label_list = []
        box_label_known = []
        box_label_unknown = []
        for item in box_label_perm:
            label_i = item[4].item()
            if label_i in known_label_list:
                box_label_known.append(item)
            else:
                # first item
                box_label_unknown.append(item)
                known_label_list.append(label_i)
        box_label_known = torch.stack(box_label_known) if len(box_label_known) > 0 else torch.Tensor(0,5)
        box_label_unknown = torch.stack(box_label_unknown) if len(box_label_unknown) > 0 else torch.Tensor(0,5)
        return box_label_known, box_label_unknown

    def sample_for_pred_stop_sign(self, box_label: torch.FloatTensor):
        box_label_unknown = torch.Tensor(0,5)
        box_label_known = box_label
        return box_label_known, box_label_unknown

    def __call__(self, target, img=None):
        box_label = target['box_label'] # K, 5

        dice_number = random.random()

        if dice_number < self.prob_first_item:
            box_label_known, box_label_unknown = self.sample_for_pred_first_item(box_label)
        elif dice_number < self.prob_first_item + self.prob_random_item:
            box_label_known, box_label_unknown = self.sample_for_pred_random_item(box_label)
        elif dice_number < self.prob_first_item + self.prob_random_item + self.prob_last_item:
            box_label_known, box_label_unknown = self.sample_for_pred_last_item(box_label)
        else:
            box_label_known, box_label_unknown = self.sample_for_pred_stop_sign(box_label)

        target['label_onehot_known'] = label2onehot(box_label_known[:,-1], self.num_classes)
        target['label_onehot_unknown'] = label2onehot(box_label_unknown[:, -1], self.num_classes)
        target['box_label_known'] = box_label_known
        target['box_label_unknown'] = box_label_unknown

        return target, img


class RandomDrop():
    def __init__(self, p=0.2) -> None:
        self.p = p

    def __call__(self, target, img=None):
        known_box = target['box_label_known']
        num_known_box = known_box.size(0)
        idxs = torch.rand(num_known_box)
        # indices = torch.randperm(num_known_box)[:int((1-self).p*num_known_box + 0.5 + random.random())]
        target['box_label_known'] = known_box[idxs > self.p]
        return target, img


class BboxPertuber():
    def __init__(self, max_ratio = 0.02, generate_samples = 1000) -> None:
        self.max_ratio = max_ratio
        self.generate_samples = generate_samples
        self.samples = self.generate_pertube_samples()
        self.idx = 0

    def generate_pertube_samples(self):
        import torch
        samples = (torch.rand(self.generate_samples, 5) - 0.5) * 2 * self.max_ratio
        return samples

    def __call__(self, target, img):
        known_box = target['box_label_known'] # Tensor(K,5), K known bbox
        K = known_box.shape[0]
        known_box_pertube = torch.zeros(K, 6) # 4:bbox, 1:prob, 1:label
        if K == 0:
            pass
        else:
            if self.idx + K > self.generate_samples:
                self.idx = 0
            delta = self.samples[self.idx: self.idx + K, :]
            known_box_pertube[:, :4] = known_box[:, :4] + delta[:, :4]
            iou = (torch.diag(box_iou(box_cxcywh_to_xyxy(known_box[:, :4]), box_cxcywh_to_xyxy(known_box_pertube[:, :4]))[0])) * (1 + delta[:, -1])
            known_box_pertube[:, 4].copy_(iou)
            known_box_pertube[:, -1].copy_(known_box[:, -1])

        target['box_label_known_pertube'] = known_box_pertube
        return target, img


class RandomCutout():
    def __init__(self, factor=0.5) -> None:
        self.factor = factor

    def __call__(self, target, img=None):
        unknown_box = target['box_label_unknown'] # Ku, 5
        known_box = target['box_label_known_pertube'] # Kk, 6
        Ku = unknown_box.size(0)

        known_box_add = torch.zeros(Ku, 6) # Ku, 6
        known_box_add[:, :5] = unknown_box
        known_box_add[:, 5].uniform_(0.5, 1)

        known_box_add[:, :2] += known_box_add[:, 2:4] * (torch.rand(Ku, 2) - 0.5) / 2
        known_box_add[:, 2:4] /= 2

        target['box_label_known_pertube'] = torch.cat((known_box, known_box_add))
        return target, img


class RandomSelectBoxes():
    def __init__(self, num_class=80) -> None:
        Warning("This is such a slow function and will be deprecated soon!!!")
        self.num_class = num_class

    def __call__(self, target, img=None):
        boxes = target['boxes']
        labels = target['label_compat']

        # transform to list of tensors
        boxs_list = [[] for i in range(self.num_class)]
        for idx, item in enumerate(boxes):
            label = labels[idx].item()
            boxs_list[label].append(item)
        boxs_list_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in boxs_list]

        # random selection
        box_known = []
        box_unknown = []
        for idx, item in enumerate(boxs_list_tensor):
            ncnt = item.shape[0]
            nselect = int(random.random() * ncnt) # close in both sides, much faster than random.randint

            item = item[torch.randperm(ncnt)]
            # random.shuffle(item)
            box_known.append(item[:nselect])
            box_unknown.append(item[nselect:])

        # box_known_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in box_known]
        # box_unknown_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in box_unknown]
        # print('box_unknown_tensor:', box_unknown_tensor)
        target['known_box'] = box_known
        target['unknown_box'] = box_unknown
        return target, img


def label2onehot(label, num_classes):
    """
    label: Tensor(K)
    """
    res = torch.zeros(num_classes)
    for i in label:
        itm = int(i.item())
        res[itm] = 1.0
    return res


class MaskCrop():
    def __init__(self) -> None:
        pass

    def __call__(self, target, img):
        known_box = target['known_box']
        h,w = img.shape[1:] # h,w
        # imgsize = target['orig_size'] # h,w

        scale = torch.Tensor([w, h, w, h])

        # _cnt = 0
        for boxes in known_box:
            if boxes.shape[0] == 0:
                continue
            box_xyxy = box_cxcywh_to_xyxy(boxes) * scale
            for box in box_xyxy:
                x1, y1, x2, y2 = [int(i) for i in box.tolist()]
|
| 310 |
+
img[:, y1:y2, x1:x2] = 0
|
| 311 |
+
# _cnt += 1
|
| 312 |
+
# print("_cnt:", _cnt)
|
| 313 |
+
return target, img
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
dataset_hook_register = {
|
| 317 |
+
'label2compat': label2compat,
|
| 318 |
+
'label_compat2onehot': label_compat2onehot,
|
| 319 |
+
'box_label_catter': box_label_catter,
|
| 320 |
+
'RandomSelectBoxlabels': RandomSelectBoxlabels,
|
| 321 |
+
'RandomSelectBoxes': RandomSelectBoxes,
|
| 322 |
+
'MaskCrop': MaskCrop,
|
| 323 |
+
'BboxPertuber': BboxPertuber,
|
| 324 |
+
}
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
class CocoDetection(torchvision.datasets.CocoDetection):
|
| 328 |
+
def __init__(self, img_folder, ann_file, transforms, return_masks, aux_target_hacks=None):
|
| 329 |
+
super(CocoDetection, self).__init__(img_folder, ann_file)
|
| 330 |
+
self._transforms = transforms
|
| 331 |
+
self.prepare = ConvertCocoPolysToMask(return_masks)
|
| 332 |
+
self.aux_target_hacks = aux_target_hacks
|
| 333 |
+
|
| 334 |
+
def change_hack_attr(self, hackclassname, attrkv_dict):
|
| 335 |
+
target_class = dataset_hook_register[hackclassname]
|
| 336 |
+
for item in self.aux_target_hacks:
|
| 337 |
+
if isinstance(item, target_class):
|
| 338 |
+
for k,v in attrkv_dict.items():
|
| 339 |
+
setattr(item, k, v)
|
| 340 |
+
|
| 341 |
+
def get_hack(self, hackclassname):
|
| 342 |
+
target_class = dataset_hook_register[hackclassname]
|
| 343 |
+
for item in self.aux_target_hacks:
|
| 344 |
+
if isinstance(item, target_class):
|
| 345 |
+
return item
|
| 346 |
+
|
| 347 |
+
def _load_image(self, id: int) -> Image.Image:
|
| 348 |
+
path = self.coco.loadImgs(id)[0]["file_name"]
|
| 349 |
+
abs_path = os.path.join(self.root, path)
|
| 350 |
+
return Image.open(abs_path).convert("RGB")
|
| 351 |
+
|
| 352 |
+
def __getitem__(self, idx):
|
| 353 |
+
"""
|
| 354 |
+
Output:
|
| 355 |
+
- target: dict of multiple items
|
| 356 |
+
- boxes: Tensor[num_box, 4]. \
|
| 357 |
+
Init type: x0,y0,x1,y1. unnormalized data.
|
| 358 |
+
Final type: cx,cy,w,h. normalized data.
|
| 359 |
+
"""
|
| 360 |
+
try:
|
| 361 |
+
img, target = super(CocoDetection, self).__getitem__(idx)
|
| 362 |
+
except:
|
| 363 |
+
print("Error idx: {}".format(idx))
|
| 364 |
+
idx += 1
|
| 365 |
+
img, target = super(CocoDetection, self).__getitem__(idx)
|
| 366 |
+
image_id = self.ids[idx]
|
| 367 |
+
target = {'image_id': image_id, 'annotations': target}
|
| 368 |
+
img, target = self.prepare(img, target)
|
| 369 |
+
|
| 370 |
+
if self._transforms is not None:
|
| 371 |
+
img, target = self._transforms(img, target)
|
| 372 |
+
|
| 373 |
+
# convert to needed format
|
| 374 |
+
if self.aux_target_hacks is not None:
|
| 375 |
+
for hack_runner in self.aux_target_hacks:
|
| 376 |
+
target, img = hack_runner(target, img=img)
|
| 377 |
+
|
| 378 |
+
return img, target
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
def convert_coco_poly_to_mask(segmentations, height, width):
|
| 382 |
+
masks = []
|
| 383 |
+
for polygons in segmentations:
|
| 384 |
+
rles = coco_mask.frPyObjects(polygons, height, width)
|
| 385 |
+
mask = coco_mask.decode(rles)
|
| 386 |
+
if len(mask.shape) < 3:
|
| 387 |
+
mask = mask[..., None]
|
| 388 |
+
mask = torch.as_tensor(mask, dtype=torch.uint8)
|
| 389 |
+
mask = mask.any(dim=2)
|
| 390 |
+
masks.append(mask)
|
| 391 |
+
if masks:
|
| 392 |
+
masks = torch.stack(masks, dim=0)
|
| 393 |
+
else:
|
| 394 |
+
masks = torch.zeros((0, height, width), dtype=torch.uint8)
|
| 395 |
+
return masks
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
class ConvertCocoPolysToMask(object):
|
| 399 |
+
def __init__(self, return_masks=False):
|
| 400 |
+
self.return_masks = return_masks
|
| 401 |
+
|
| 402 |
+
def __call__(self, image, target):
|
| 403 |
+
w, h = image.size
|
| 404 |
+
|
| 405 |
+
image_id = target["image_id"]
|
| 406 |
+
image_id = torch.tensor([image_id])
|
| 407 |
+
|
| 408 |
+
anno = target["annotations"]
|
| 409 |
+
|
| 410 |
+
anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0]
|
| 411 |
+
|
| 412 |
+
boxes = [obj["bbox"] for obj in anno]
|
| 413 |
+
# guard against no boxes via resizing
|
| 414 |
+
boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
|
| 415 |
+
boxes[:, 2:] += boxes[:, :2]
|
| 416 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
| 417 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
| 418 |
+
|
| 419 |
+
classes = [obj["category_id"] for obj in anno]
|
| 420 |
+
classes = torch.tensor(classes, dtype=torch.int64)
|
| 421 |
+
|
| 422 |
+
if self.return_masks:
|
| 423 |
+
segmentations = [obj["segmentation"] for obj in anno]
|
| 424 |
+
masks = convert_coco_poly_to_mask(segmentations, h, w)
|
| 425 |
+
|
| 426 |
+
keypoints = None
|
| 427 |
+
if anno and "keypoints" in anno[0]:
|
| 428 |
+
keypoints = [obj["keypoints"] for obj in anno]
|
| 429 |
+
keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
|
| 430 |
+
num_keypoints = keypoints.shape[0]
|
| 431 |
+
if num_keypoints:
|
| 432 |
+
keypoints = keypoints.view(num_keypoints, -1, 3)
|
| 433 |
+
|
| 434 |
+
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
|
| 435 |
+
boxes = boxes[keep]
|
| 436 |
+
classes = classes[keep]
|
| 437 |
+
if self.return_masks:
|
| 438 |
+
masks = masks[keep]
|
| 439 |
+
if keypoints is not None:
|
| 440 |
+
keypoints = keypoints[keep]
|
| 441 |
+
|
| 442 |
+
target = {}
|
| 443 |
+
target["boxes"] = boxes
|
| 444 |
+
target["labels"] = classes
|
| 445 |
+
if self.return_masks:
|
| 446 |
+
target["masks"] = masks
|
| 447 |
+
target["image_id"] = image_id
|
| 448 |
+
if keypoints is not None:
|
| 449 |
+
target["keypoints"] = keypoints
|
| 450 |
+
|
| 451 |
+
# for conversion to coco api
|
| 452 |
+
area = torch.tensor([obj["area"] for obj in anno])
|
| 453 |
+
iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
|
| 454 |
+
target["area"] = area[keep]
|
| 455 |
+
target["iscrowd"] = iscrowd[keep]
|
| 456 |
+
|
| 457 |
+
target["orig_size"] = torch.as_tensor([int(h), int(w)])
|
| 458 |
+
target["size"] = torch.as_tensor([int(h), int(w)])
|
| 459 |
+
|
| 460 |
+
return image, target
|
| 461 |
+
|
| 462 |
+
|
| 463 |
+
def make_coco_transforms(image_set, fix_size=False, strong_aug=False, args=None):
|
| 464 |
+
|
| 465 |
+
normalize = T.Compose([
|
| 466 |
+
T.ToTensor(),
|
| 467 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
| 468 |
+
])
|
| 469 |
+
|
| 470 |
+
# config the params for data aug
|
| 471 |
+
scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
|
| 472 |
+
max_size = 1333
|
| 473 |
+
scales2_resize = [400, 500, 600]
|
| 474 |
+
scales2_crop = [384, 600]
|
| 475 |
+
|
| 476 |
+
# update args from config files
|
| 477 |
+
scales = getattr(args, 'data_aug_scales', scales)
|
| 478 |
+
max_size = getattr(args, 'data_aug_max_size', max_size)
|
| 479 |
+
scales2_resize = getattr(args, 'data_aug_scales2_resize', scales2_resize)
|
| 480 |
+
scales2_crop = getattr(args, 'data_aug_scales2_crop', scales2_crop)
|
| 481 |
+
|
| 482 |
+
# resize them
|
| 483 |
+
data_aug_scale_overlap = getattr(args, 'data_aug_scale_overlap', None)
|
| 484 |
+
if data_aug_scale_overlap is not None and data_aug_scale_overlap > 0:
|
| 485 |
+
data_aug_scale_overlap = float(data_aug_scale_overlap)
|
| 486 |
+
scales = [int(i*data_aug_scale_overlap) for i in scales]
|
| 487 |
+
max_size = int(max_size*data_aug_scale_overlap)
|
| 488 |
+
scales2_resize = [int(i*data_aug_scale_overlap) for i in scales2_resize]
|
| 489 |
+
scales2_crop = [int(i*data_aug_scale_overlap) for i in scales2_crop]
|
| 490 |
+
|
| 491 |
+
datadict_for_print = {
|
| 492 |
+
'scales': scales,
|
| 493 |
+
'max_size': max_size,
|
| 494 |
+
'scales2_resize': scales2_resize,
|
| 495 |
+
'scales2_crop': scales2_crop
|
| 496 |
+
}
|
| 497 |
+
# print("data_aug_params:", json.dumps(datadict_for_print, indent=2))
|
| 498 |
+
|
| 499 |
+
if image_set == 'train':
|
| 500 |
+
if fix_size:
|
| 501 |
+
return T.Compose([
|
| 502 |
+
T.RandomHorizontalFlip(),
|
| 503 |
+
T.RandomResize([(max_size, max(scales))]),
|
| 504 |
+
# T.RandomResize([(512, 512)]),
|
| 505 |
+
normalize,
|
| 506 |
+
])
|
| 507 |
+
|
| 508 |
+
if strong_aug:
|
| 509 |
+
import datasets.sltransform as SLT
|
| 510 |
+
|
| 511 |
+
return T.Compose([
|
| 512 |
+
T.RandomHorizontalFlip(),
|
| 513 |
+
T.RandomSelect(
|
| 514 |
+
T.RandomResize(scales, max_size=max_size),
|
| 515 |
+
T.Compose([
|
| 516 |
+
T.RandomResize(scales2_resize),
|
| 517 |
+
T.RandomSizeCrop(*scales2_crop),
|
| 518 |
+
T.RandomResize(scales, max_size=max_size),
|
| 519 |
+
])
|
| 520 |
+
),
|
| 521 |
+
SLT.RandomSelectMulti([
|
| 522 |
+
SLT.RandomCrop(),
|
| 523 |
+
SLT.LightingNoise(),
|
| 524 |
+
SLT.AdjustBrightness(2),
|
| 525 |
+
SLT.AdjustContrast(2),
|
| 526 |
+
]),
|
| 527 |
+
normalize,
|
| 528 |
+
])
|
| 529 |
+
|
| 530 |
+
return T.Compose([
|
| 531 |
+
T.RandomHorizontalFlip(),
|
| 532 |
+
T.RandomSelect(
|
| 533 |
+
T.RandomResize(scales, max_size=max_size),
|
| 534 |
+
T.Compose([
|
| 535 |
+
T.RandomResize(scales2_resize),
|
| 536 |
+
T.RandomSizeCrop(*scales2_crop),
|
| 537 |
+
T.RandomResize(scales, max_size=max_size),
|
| 538 |
+
])
|
| 539 |
+
),
|
| 540 |
+
normalize,
|
| 541 |
+
])
|
| 542 |
+
|
| 543 |
+
if image_set in ['val', 'eval_debug', 'train_reg', 'test']:
|
| 544 |
+
|
| 545 |
+
if os.environ.get("GFLOPS_DEBUG_SHILONG", False) == 'INFO':
|
| 546 |
+
print("Under debug mode for flops calculation only!!!!!!!!!!!!!!!!")
|
| 547 |
+
return T.Compose([
|
| 548 |
+
T.ResizeDebug((1280, 800)),
|
| 549 |
+
normalize,
|
| 550 |
+
])
|
| 551 |
+
|
| 552 |
+
return T.Compose([
|
| 553 |
+
T.RandomResize([max(scales)], max_size=max_size),
|
| 554 |
+
normalize,
|
| 555 |
+
])
|
| 556 |
+
|
| 557 |
+
|
| 558 |
+
|
| 559 |
+
raise ValueError(f'unknown {image_set}')
|
| 560 |
+
|
| 561 |
+
|
| 562 |
+
def get_aux_target_hacks_list(image_set, args):
|
| 563 |
+
if args.modelname in ['q2bs_mask', 'q2bs']:
|
| 564 |
+
aux_target_hacks_list = [
|
| 565 |
+
label2compat(),
|
| 566 |
+
label_compat2onehot(),
|
| 567 |
+
RandomSelectBoxes(num_class=args.num_classes)
|
| 568 |
+
]
|
| 569 |
+
if args.masked_data and image_set == 'train':
|
| 570 |
+
# aux_target_hacks_list.append()
|
| 571 |
+
aux_target_hacks_list.append(MaskCrop())
|
| 572 |
+
elif args.modelname in ['q2bm_v2', 'q2bs_ce', 'q2op', 'q2ofocal', 'q2opclip', 'q2ocqonly']:
|
| 573 |
+
aux_target_hacks_list = [
|
| 574 |
+
label2compat(),
|
| 575 |
+
label_compat2onehot(),
|
| 576 |
+
box_label_catter(),
|
| 577 |
+
RandomSelectBoxlabels(num_classes=args.num_classes,
|
| 578 |
+
prob_first_item=args.prob_first_item,
|
| 579 |
+
prob_random_item=args.prob_random_item,
|
| 580 |
+
prob_last_item=args.prob_last_item,
|
| 581 |
+
prob_stop_sign=args.prob_stop_sign,
|
| 582 |
+
),
|
| 583 |
+
BboxPertuber(max_ratio=0.02, generate_samples=1000),
|
| 584 |
+
]
|
| 585 |
+
elif args.modelname in ['q2omask', 'q2osa']:
|
| 586 |
+
if args.coco_aug:
|
| 587 |
+
aux_target_hacks_list = [
|
| 588 |
+
label2compat(),
|
| 589 |
+
label_compat2onehot(),
|
| 590 |
+
box_label_catter(),
|
| 591 |
+
RandomSelectBoxlabels(num_classes=args.num_classes,
|
| 592 |
+
prob_first_item=args.prob_first_item,
|
| 593 |
+
prob_random_item=args.prob_random_item,
|
| 594 |
+
prob_last_item=args.prob_last_item,
|
| 595 |
+
prob_stop_sign=args.prob_stop_sign,
|
| 596 |
+
),
|
| 597 |
+
RandomDrop(p=0.2),
|
| 598 |
+
BboxPertuber(max_ratio=0.02, generate_samples=1000),
|
| 599 |
+
RandomCutout(factor=0.5)
|
| 600 |
+
]
|
| 601 |
+
else:
|
| 602 |
+
aux_target_hacks_list = [
|
| 603 |
+
label2compat(),
|
| 604 |
+
label_compat2onehot(),
|
| 605 |
+
box_label_catter(),
|
| 606 |
+
RandomSelectBoxlabels(num_classes=args.num_classes,
|
| 607 |
+
prob_first_item=args.prob_first_item,
|
| 608 |
+
prob_random_item=args.prob_random_item,
|
| 609 |
+
prob_last_item=args.prob_last_item,
|
| 610 |
+
prob_stop_sign=args.prob_stop_sign,
|
| 611 |
+
),
|
| 612 |
+
BboxPertuber(max_ratio=0.02, generate_samples=1000),
|
| 613 |
+
]
|
| 614 |
+
else:
|
| 615 |
+
aux_target_hacks_list = None
|
| 616 |
+
|
| 617 |
+
return aux_target_hacks_list
|
| 618 |
+
|
| 619 |
+
|
| 620 |
+
def build(image_set, args, datasetinfo):
|
| 621 |
+
img_folder = datasetinfo["root"]
|
| 622 |
+
ann_file = datasetinfo["anno"]
|
| 623 |
+
|
| 624 |
+
# copy to local path
|
| 625 |
+
if os.environ.get('DATA_COPY_SHILONG') == 'INFO':
|
| 626 |
+
preparing_dataset(dict(img_folder=img_folder, ann_file=ann_file), image_set, args)
|
| 627 |
+
|
| 628 |
+
try:
|
| 629 |
+
strong_aug = args.strong_aug
|
| 630 |
+
except:
|
| 631 |
+
strong_aug = False
|
| 632 |
+
print(img_folder, ann_file)
|
| 633 |
+
dataset = CocoDetection(img_folder, ann_file,
|
| 634 |
+
transforms=make_coco_transforms(image_set, fix_size=args.fix_size, strong_aug=strong_aug, args=args),
|
| 635 |
+
return_masks=args.masks,
|
| 636 |
+
aux_target_hacks=None,
|
| 637 |
+
)
|
| 638 |
+
return dataset
|
| 639 |
+
|
| 640 |
+
|
| 641 |
+
if __name__ == "__main__":
|
| 642 |
+
# Objects365 Val example
|
| 643 |
+
dataset_o365 = CocoDetection(
|
| 644 |
+
'/path/Objects365/train/',
|
| 645 |
+
"/path/Objects365/slannos/anno_preprocess_train_v2.json",
|
| 646 |
+
transforms=None,
|
| 647 |
+
return_masks=False,
|
| 648 |
+
)
|
| 649 |
+
print('len(dataset_o365):', len(dataset_o365))
|
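For readers skimming the diff, here is a minimal usage sketch of the `build` entry point above. It is not part of the commit: the paths are placeholders and the `Namespace` fields are only the ones `build`/`make_coco_transforms` actually read; a real run also needs pycocotools and the annotation file on disk.

    from argparse import Namespace

    # Hypothetical arguments; real values come from the training config files.
    args = Namespace(fix_size=False, strong_aug=False, masks=False)
    datasetinfo = {
        "root": "/data/coco/val2017",                                   # placeholder path
        "anno": "/data/coco/annotations/instances_val2017.json",       # placeholder path
    }

    dataset = build("val", args, datasetinfo)   # CocoDetection with the 'val' transform stack
    img, target = dataset[0]                    # normalized image tensor + target dict (boxes, labels, ...)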
groundingdino/datasets/coco_eval.py
ADDED
@@ -0,0 +1,266 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+COCO evaluator that works in distributed mode.
+
+Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py
+The difference is that there is less copy-pasting from pycocotools
+in the end of the file, as python3 can suppress prints with contextlib
+"""
+import os
+import contextlib
+import copy
+import numpy as np
+import torch
+
+from pycocotools.cocoeval import COCOeval
+from pycocotools.coco import COCO
+import pycocotools.mask as mask_util
+
+from util.misc import all_gather
+
+
+class CocoEvaluator(object):
+    def __init__(self, coco_gt, iou_types, useCats=True):
+        assert isinstance(iou_types, (list, tuple))
+        coco_gt = copy.deepcopy(coco_gt)
+        self.coco_gt = coco_gt
+
+        self.iou_types = iou_types
+        self.coco_eval = {}
+        for iou_type in iou_types:
+            self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)
+            self.coco_eval[iou_type].useCats = useCats
+
+        self.img_ids = []
+        self.eval_imgs = {k: [] for k in iou_types}
+        self.useCats = useCats
+
+    def update(self, predictions):
+        img_ids = list(np.unique(list(predictions.keys())))
+        self.img_ids.extend(img_ids)
+
+        for iou_type in self.iou_types:
+            results = self.prepare(predictions, iou_type)
+
+            # suppress pycocotools prints
+            with open(os.devnull, 'w') as devnull:
+                with contextlib.redirect_stdout(devnull):
+                    coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO()
+            coco_eval = self.coco_eval[iou_type]
+
+            coco_eval.cocoDt = coco_dt
+            coco_eval.params.imgIds = list(img_ids)
+            coco_eval.params.useCats = self.useCats
+            img_ids, eval_imgs = evaluate(coco_eval)
+
+            self.eval_imgs[iou_type].append(eval_imgs)
+
+    def synchronize_between_processes(self):
+        for iou_type in self.iou_types:
+            self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
+            create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])
+
+    def accumulate(self):
+        for coco_eval in self.coco_eval.values():
+            coco_eval.accumulate()
+
+    def summarize(self):
+        for iou_type, coco_eval in self.coco_eval.items():
+            print("IoU metric: {}".format(iou_type))
+            coco_eval.summarize()
+
+    def prepare(self, predictions, iou_type):
+        if iou_type == "bbox":
+            return self.prepare_for_coco_detection(predictions)
+        elif iou_type == "segm":
+            return self.prepare_for_coco_segmentation(predictions)
+        elif iou_type == "keypoints":
+            return self.prepare_for_coco_keypoint(predictions)
+        else:
+            raise ValueError("Unknown iou type {}".format(iou_type))
+
+    def prepare_for_coco_detection(self, predictions):
+        coco_results = []
+        for original_id, prediction in predictions.items():
+            if len(prediction) == 0:
+                continue
+
+            boxes = prediction["boxes"]
+            boxes = convert_to_xywh(boxes).tolist()
+            if not isinstance(prediction["scores"], list):
+                scores = prediction["scores"].tolist()
+            else:
+                scores = prediction["scores"]
+            if not isinstance(prediction["labels"], list):
+                labels = prediction["labels"].tolist()
+            else:
+                labels = prediction["labels"]
+
+            try:
+                coco_results.extend(
+                    [
+                        {
+                            "image_id": original_id,
+                            "category_id": labels[k],
+                            "bbox": box,
+                            "score": scores[k],
+                        }
+                        for k, box in enumerate(boxes)
+                    ]
+                )
+            except:
+                import ipdb; ipdb.set_trace()
+        return coco_results
+
+    def prepare_for_coco_segmentation(self, predictions):
+        coco_results = []
+        for original_id, prediction in predictions.items():
+            if len(prediction) == 0:
+                continue
+
+            scores = prediction["scores"]
+            labels = prediction["labels"]
+            masks = prediction["masks"]
+
+            masks = masks > 0.5
+
+            scores = prediction["scores"].tolist()
+            labels = prediction["labels"].tolist()
+
+            rles = [
+                mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0]
+                for mask in masks
+            ]
+            for rle in rles:
+                rle["counts"] = rle["counts"].decode("utf-8")
+
+            coco_results.extend(
+                [
+                    {
+                        "image_id": original_id,
+                        "category_id": labels[k],
+                        "segmentation": rle,
+                        "score": scores[k],
+                    }
+                    for k, rle in enumerate(rles)
+                ]
+            )
+        return coco_results
+
+    def prepare_for_coco_keypoint(self, predictions):
+        coco_results = []
+        for original_id, prediction in predictions.items():
+            if len(prediction) == 0:
+                continue
+
+            boxes = prediction["boxes"]
+            boxes = convert_to_xywh(boxes).tolist()
+            scores = prediction["scores"].tolist()
+            labels = prediction["labels"].tolist()
+            keypoints = prediction["keypoints"]
+            keypoints = keypoints.flatten(start_dim=1).tolist()
+
+            coco_results.extend(
+                [
+                    {
+                        "image_id": original_id,
+                        "category_id": labels[k],
+                        'keypoints': keypoint,
+                        "score": scores[k],
+                    }
+                    for k, keypoint in enumerate(keypoints)
+                ]
+            )
+        return coco_results
+
+
+def convert_to_xywh(boxes):
+    xmin, ymin, xmax, ymax = boxes.unbind(1)
+    return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
+
+
+def merge(img_ids, eval_imgs):
+    all_img_ids = all_gather(img_ids)
+    all_eval_imgs = all_gather(eval_imgs)
+
+    merged_img_ids = []
+    for p in all_img_ids:
+        merged_img_ids.extend(p)
+
+    merged_eval_imgs = []
+    for p in all_eval_imgs:
+        merged_eval_imgs.append(p)
+
+    merged_img_ids = np.array(merged_img_ids)
+    merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)
+
+    # keep only unique (and in sorted order) images
+    merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
+    merged_eval_imgs = merged_eval_imgs[..., idx]
+
+    return merged_img_ids, merged_eval_imgs
+
+
+def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
+    img_ids, eval_imgs = merge(img_ids, eval_imgs)
+    img_ids = list(img_ids)
+    eval_imgs = list(eval_imgs.flatten())
+
+    coco_eval.evalImgs = eval_imgs
+    coco_eval.params.imgIds = img_ids
+    coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
+
+
+#################################################################
+# From pycocotools, just removed the prints and fixed
+# a Python3 bug about unicode not defined
+#################################################################
+
+
+def evaluate(self):
+    '''
+    Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
+    :return: None
+    '''
+    p = self.params
+    # add backward compatibility if useSegm is specified in params
+    if p.useSegm is not None:
+        p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
+        print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
+    p.imgIds = list(np.unique(p.imgIds))
+    if p.useCats:
+        p.catIds = list(np.unique(p.catIds))
+    p.maxDets = sorted(p.maxDets)
+    self.params = p
+
+    self._prepare()
+    # loop through images, area range, max detection number
+    catIds = p.catIds if p.useCats else [-1]
+
+    if p.iouType == 'segm' or p.iouType == 'bbox':
+        computeIoU = self.computeIoU
+    elif p.iouType == 'keypoints':
+        computeIoU = self.computeOks
+    self.ious = {
+        (imgId, catId): computeIoU(imgId, catId)
+        for imgId in p.imgIds
+        for catId in catIds}
+
+    evaluateImg = self.evaluateImg
+    maxDet = p.maxDets[-1]
+    evalImgs = [
+        evaluateImg(imgId, catId, areaRng, maxDet)
+        for catId in catIds
+        for areaRng in p.areaRng
+        for imgId in p.imgIds
+    ]
+    # this is NOT in the pycocotools code, but could be done outside
+    evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds))
+    self._paramsEval = copy.deepcopy(self.params)
+
+    return p.imgIds, evalImgs
+
+#################################################################
+# end of straight copy from pycocotools, just removing the prints
+#################################################################
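A hedged usage sketch of the evaluator's update/accumulate/summarize cycle (not part of the commit): the annotation path, image id 42, and category 1 are placeholders and must exist in the ground-truth file for COCO.loadRes to accept the results.

    import torch
    from pycocotools.coco import COCO

    coco_gt = COCO("annotations/instances_val2017.json")   # placeholder path
    evaluator = CocoEvaluator(coco_gt, iou_types=("bbox",))

    # Predictions are keyed by image id; boxes are xyxy and converted to xywh internally.
    predictions = {
        42: {"boxes": torch.tensor([[10.0, 20.0, 110.0, 220.0]]),
             "scores": torch.tensor([0.9]),
             "labels": torch.tensor([1])},
    }
    evaluator.update(predictions)
    evaluator.synchronize_between_processes()   # gathers results across ranks via util.misc.all_gather
    evaluator.accumulate()
    evaluator.summarize()                       # prints the standard COCO AP table per IoU type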
groundingdino/datasets/coco_panoptic.py
ADDED
@@ -0,0 +1,99 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import json
+from pathlib import Path
+
+import numpy as np
+import torch
+from PIL import Image
+
+from panopticapi.utils import rgb2id
+from util.box_ops import masks_to_boxes
+
+from .coco import make_coco_transforms
+
+
+class CocoPanoptic:
+    def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_masks=True):
+        with open(ann_file, 'r') as f:
+            self.coco = json.load(f)
+
+        # sort 'images' field so that they are aligned with 'annotations'
+        # i.e., in alphabetical order
+        self.coco['images'] = sorted(self.coco['images'], key=lambda x: x['id'])
+        # sanity check
+        if "annotations" in self.coco:
+            for img, ann in zip(self.coco['images'], self.coco['annotations']):
+                assert img['file_name'][:-4] == ann['file_name'][:-4]
+
+        self.img_folder = img_folder
+        self.ann_folder = ann_folder
+        self.ann_file = ann_file
+        self.transforms = transforms
+        self.return_masks = return_masks
+
+    def __getitem__(self, idx):
+        ann_info = self.coco['annotations'][idx] if "annotations" in self.coco else self.coco['images'][idx]
+        img_path = Path(self.img_folder) / ann_info['file_name'].replace('.png', '.jpg')
+        ann_path = Path(self.ann_folder) / ann_info['file_name']
+
+        img = Image.open(img_path).convert('RGB')
+        w, h = img.size
+        if "segments_info" in ann_info:
+            masks = np.asarray(Image.open(ann_path), dtype=np.uint32)
+            masks = rgb2id(masks)
+
+            ids = np.array([ann['id'] for ann in ann_info['segments_info']])
+            masks = masks == ids[:, None, None]
+
+            masks = torch.as_tensor(masks, dtype=torch.uint8)
+            labels = torch.tensor([ann['category_id'] for ann in ann_info['segments_info']], dtype=torch.int64)
+
+        target = {}
+        target['image_id'] = torch.tensor([ann_info['image_id'] if "image_id" in ann_info else ann_info["id"]])
+        if self.return_masks:
+            target['masks'] = masks
+        target['labels'] = labels
+
+        target["boxes"] = masks_to_boxes(masks)
+
+        target['size'] = torch.as_tensor([int(h), int(w)])
+        target['orig_size'] = torch.as_tensor([int(h), int(w)])
+        if "segments_info" in ann_info:
+            for name in ['iscrowd', 'area']:
+                target[name] = torch.tensor([ann[name] for ann in ann_info['segments_info']])
+
+        if self.transforms is not None:
+            img, target = self.transforms(img, target)
+
+        return img, target
+
+    def __len__(self):
+        return len(self.coco['images'])
+
+    def get_height_and_width(self, idx):
+        img_info = self.coco['images'][idx]
+        height = img_info['height']
+        width = img_info['width']
+        return height, width
+
+
+def build(image_set, args):
+    img_folder_root = Path(args.coco_path)
+    ann_folder_root = Path(args.coco_panoptic_path)
+    assert img_folder_root.exists(), f'provided COCO path {img_folder_root} does not exist'
+    assert ann_folder_root.exists(), f'provided COCO path {ann_folder_root} does not exist'
+    mode = 'panoptic'
+    PATHS = {
+        "train": ("train2017", Path("annotations") / f'{mode}_train2017.json'),
+        "val": ("val2017", Path("annotations") / f'{mode}_val2017.json'),
+    }
+
+    img_folder, ann_file = PATHS[image_set]
+    img_folder_path = img_folder_root / img_folder
+    ann_folder = ann_folder_root / f'{mode}_{img_folder}'
+    ann_file = ann_folder_root / ann_file
+
+    dataset = CocoPanoptic(img_folder_path, ann_folder, ann_file,
+                           transforms=make_coco_transforms(image_set), return_masks=args.masks)
+
+    return dataset
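For context on the mask-building step above, here is a tiny sketch (values invented, not from the commit) of what the rgb2id output and the id comparison produce: one boolean mask per segment id, which masks_to_boxes then turns into xyxy boxes.

    import numpy as np
    import torch

    # A 2x2 "panoptic PNG" already decoded to per-pixel segment ids (i.e. rgb2id output).
    ids_map = np.array([[7, 7],
                        [7, 3]], dtype=np.uint32)
    segment_ids = np.array([7, 3])

    masks = torch.as_tensor(ids_map == segment_ids[:, None, None], dtype=torch.uint8)
    print(masks.shape)   # torch.Size([2, 2, 2]) -- one binary mask per segment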
groundingdino/datasets/cocogrounding_eval.py
CHANGED
@@ -45,7 +45,7 @@ class CocoGroundingEvaluator(object):
     def update(self, predictions):
         img_ids = list(np.unique(list(predictions.keys())))
         self.img_ids.extend(img_ids)
-
+        # import pdb;pdb.set_trace()
         for iou_type in self.iou_types:
             results = self.prepare(predictions, iou_type)
 
@@ -223,6 +223,8 @@ def evaluate(self):
     """
     # tic = time.time()
     # print('Running per image evaluation...')
+
+    # import pdb;pdb.set_trace()
     p = self.params
     # add backward compatibility if useSegm is specified in params
     if p.useSegm is not None:
groundingdino/datasets/data_util.py
ADDED
@@ -0,0 +1,170 @@
+import os
+import os.path as osp
+import shutil
+import time
+import datetime
+
+import torch
+
+from util.slconfig import SLConfig
+
+class Error(OSError):
+    pass
+
+def slcopytree(src, dst, symlinks=False, ignore=None, copy_function=shutil.copyfile,
+             ignore_dangling_symlinks=False):
+    """
+    modified from shutil.copytree without copystat.
+
+    Recursively copy a directory tree.
+
+    The destination directory must not already exist.
+    If exception(s) occur, an Error is raised with a list of reasons.
+
+    If the optional symlinks flag is true, symbolic links in the
+    source tree result in symbolic links in the destination tree; if
+    it is false, the contents of the files pointed to by symbolic
+    links are copied. If the file pointed by the symlink doesn't
+    exist, an exception will be added in the list of errors raised in
+    an Error exception at the end of the copy process.
+
+    You can set the optional ignore_dangling_symlinks flag to true if you
+    want to silence this exception. Notice that this has no effect on
+    platforms that don't support os.symlink.
+
+    The optional ignore argument is a callable. If given, it
+    is called with the `src` parameter, which is the directory
+    being visited by copytree(), and `names` which is the list of
+    `src` contents, as returned by os.listdir():
+
+        callable(src, names) -> ignored_names
+
+    Since copytree() is called recursively, the callable will be
+    called once for each directory that is copied. It returns a
+    list of names relative to the `src` directory that should
+    not be copied.
+
+    The optional copy_function argument is a callable that will be used
+    to copy each file. It will be called with the source path and the
+    destination path as arguments. By default, copy2() is used, but any
+    function that supports the same signature (like copy()) can be used.
+
+    """
+    errors = []
+    if os.path.isdir(src):
+        names = os.listdir(src)
+        if ignore is not None:
+            ignored_names = ignore(src, names)
+        else:
+            ignored_names = set()
+
+        os.makedirs(dst)
+        for name in names:
+            if name in ignored_names:
+                continue
+            srcname = os.path.join(src, name)
+            dstname = os.path.join(dst, name)
+            try:
+                if os.path.islink(srcname):
+                    linkto = os.readlink(srcname)
+                    if symlinks:
+                        # We can't just leave it to `copy_function` because legacy
+                        # code with a custom `copy_function` may rely on copytree
+                        # doing the right thing.
+                        os.symlink(linkto, dstname)
+                    else:
+                        # ignore dangling symlink if the flag is on
+                        if not os.path.exists(linkto) and ignore_dangling_symlinks:
+                            continue
+                        # otherwise let the copy occurs. copy2 will raise an error
+                        if os.path.isdir(srcname):
+                            slcopytree(srcname, dstname, symlinks, ignore,
+                                     copy_function)
+                        else:
+                            copy_function(srcname, dstname)
+                elif os.path.isdir(srcname):
+                    slcopytree(srcname, dstname, symlinks, ignore, copy_function)
+                else:
+                    # Will raise a SpecialFileError for unsupported file types
+                    copy_function(srcname, dstname)
+            # catch the Error from the recursive copytree so that we can
+            # continue with other files
+            except Error as err:
+                errors.extend(err.args[0])
+            except OSError as why:
+                errors.append((srcname, dstname, str(why)))
+    else:
+        copy_function(src, dst)
+
+    if errors:
+        raise Error(errors)
+    return dst
+
+def check_and_copy(src_path, tgt_path):
+    if os.path.exists(tgt_path):
+        return None
+
+    return slcopytree(src_path, tgt_path)
+
+
+def remove(srcpath):
+    if os.path.isdir(srcpath):
+        return shutil.rmtree(srcpath)
+    else:
+        return os.remove(srcpath)
+
+
+def preparing_dataset(pathdict, image_set, args):
+    start_time = time.time()
+    dataset_file = args.dataset_file
+    data_static_info = SLConfig.fromfile('util/static_data_path.py')
+    static_dict = data_static_info[dataset_file][image_set]
+
+    copyfilelist = []
+    for k,tgt_v in pathdict.items():
+        if os.path.exists(tgt_v):
+            if args.local_rank == 0:
+                print("path <{}> exist. remove it!".format(tgt_v))
+                remove(tgt_v)
+            # continue
+
+        if args.local_rank == 0:
+            src_v = static_dict[k]
+            assert isinstance(src_v, str)
+            if src_v.endswith('.zip'):
+                # copy
+                cp_tgt_dir = os.path.dirname(tgt_v)
+                filename = os.path.basename(src_v)
+                cp_tgt_path = os.path.join(cp_tgt_dir, filename)
+                print('Copy from <{}> to <{}>.'.format(src_v, cp_tgt_path))
+                os.makedirs(cp_tgt_dir, exist_ok=True)
+                check_and_copy(src_v, cp_tgt_path)
+
+                # unzip
+                import zipfile
+                print("Starting unzip <{}>".format(cp_tgt_path))
+                with zipfile.ZipFile(cp_tgt_path, 'r') as zip_ref:
+                    zip_ref.extractall(os.path.dirname(cp_tgt_path))
+
+                copyfilelist.append(cp_tgt_path)
+                copyfilelist.append(tgt_v)
+            else:
+                print('Copy from <{}> to <{}>.'.format(src_v, tgt_v))
+                os.makedirs(os.path.dirname(tgt_v), exist_ok=True)
+                check_and_copy(src_v, tgt_v)
+                copyfilelist.append(tgt_v)
+
+    if len(copyfilelist) == 0:
+        copyfilelist = None
+    args.copyfilelist = copyfilelist
+
+    if args.distributed:
+        torch.distributed.barrier()
+    total_time = time.time() - start_time
+    if copyfilelist:
+        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+        print('Data copy time {}'.format(total_time_str))
+    return copyfilelist
groundingdino/datasets/dataset.py
ADDED
@@ -0,0 +1,44 @@
+from __future__ import print_function
+
+import torch
+import torchvision.datasets as datasets
+from torch.utils.data import Dataset
+from PIL import Image
+from .tsv_io import TSVFile
+import numpy as np
+import base64
+import io
+
+
+class TSVDataset(Dataset):
+    """ TSV dataset for ImageNet 1K training
+    """
+    def __init__(self, tsv_file, transform=None, target_transform=None):
+        self.tsv = TSVFile(tsv_file)
+        self.transform = transform
+        self.target_transform = target_transform
+
+    def __getitem__(self, index):
+        """
+        Args:
+            index (int): Index
+        Returns:
+            tuple: (image, target) where target is class_index of the target class.
+        """
+        row = self.tsv.seek(index)
+        image_data = base64.b64decode(row[-1])
+        image = Image.open(io.BytesIO(image_data))
+        image = image.convert('RGB')
+        target = int(row[1])
+
+        if self.transform is not None:
+            img = self.transform(image)
+        else:
+            img = image
+        if self.target_transform is not None:
+            target = self.target_transform(target)
+
+        return img, target
+
+    def __len__(self):
+        return self.tsv.num_rows()
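A brief sketch of the TSV row layout that TSVDataset.__getitem__ implies (the file name is a placeholder and the exact column order is inferred from the code, not documented in the commit): the second column is the integer class label and the last column is a base64-encoded image.

    import base64

    # Build one row the way __getitem__ decodes it: key \t label \t base64(image bytes)
    with open("dog.jpg", "rb") as f:                     # placeholder image file
        b64 = base64.b64encode(f.read()).decode("utf-8")
    row = "\t".join(["img_0001", "3", b64])
    # TSVFile.seek(index) returns the split row; the dataset then does roughly:
    #   image = Image.open(io.BytesIO(base64.b64decode(row[-1]))).convert('RGB')
    #   target = int(row[1])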
groundingdino/datasets/odvg.py
ADDED
@@ -0,0 +1,258 @@
+from torchvision.datasets.vision import VisionDataset
+import os.path
+from typing import Callable, Optional
+import json
+from PIL import Image
+import torch
+import random
+import os, sys
+sys.path.append(os.path.dirname(sys.path[0]))
+
+import datasets.transforms as T
+
+class ODVGDataset(VisionDataset):
+    """
+    Args:
+        root (string): Root directory where images are downloaded to.
+        anno (string): Path to json annotation file.
+        label_map_anno (string): Path to json label mapping file. Only for Object Detection
+        transform (callable, optional): A function/transform that takes in an PIL image
+            and returns a transformed version. E.g, ``transforms.PILToTensor``
+        target_transform (callable, optional): A function/transform that takes in the
+            target and transforms it.
+        transforms (callable, optional): A function/transform that takes input sample and its target as entry
+            and returns a transformed version.
+    """
+
+    def __init__(
+        self,
+        root: str,
+        anno: str,
+        label_map_anno: str = None,
+        max_labels: int = 80,
+        transform: Optional[Callable] = None,
+        target_transform: Optional[Callable] = None,
+        transforms: Optional[Callable] = None,
+    ) -> None:
+        super().__init__(root, transforms, transform, target_transform)
+        self.root = root
+        self.dataset_mode = "OD" if label_map_anno else "VG"
+        self.max_labels = max_labels
+        if self.dataset_mode == "OD":
+            self.load_label_map(label_map_anno)
+        self._load_metas(anno)
+        self.get_dataset_info()
+
+    def load_label_map(self, label_map_anno):
+        with open(label_map_anno, 'r') as file:
+            self.label_map = json.load(file)
+        self.label_index = set(self.label_map.keys())
+
+    def _load_metas(self, anno):
+        with open(anno, 'r') as f:
+            self.metas = json.load(f)
+
+    def get_dataset_info(self):
+        print(f"  == total images: {len(self)}")
+        if self.dataset_mode == "OD":
+            print(f"  == total labels: {len(self.label_map)}")
+
+    def __getitem__(self, index: int):
+        meta = self.metas[index]
+        rel_path = meta["filename"]
+        abs_path = os.path.join(self.root, rel_path)
+        if not os.path.exists(abs_path):
+            raise FileNotFoundError(f"{abs_path} not found.")
+        image = Image.open(abs_path).convert('RGB')
+        w, h = image.size
+        if self.dataset_mode == "OD":
+            anno = meta["detection"]
+            instances = [obj for obj in anno["instances"]]
+            boxes = [obj["bbox"] for obj in instances]
+            # generate vg_labels
+            # pos bbox labels
+            ori_classes = [str(obj["label"]) for obj in instances]
+            pos_labels = set(ori_classes)
+            # neg bbox labels
+            neg_labels = self.label_index.difference(pos_labels)
+
+            vg_labels = list(pos_labels)
+            num_to_add = min(len(neg_labels), self.max_labels-len(pos_labels))
+            if num_to_add > 0:
+                vg_labels.extend(random.sample(neg_labels, num_to_add))
+
+            # shuffle
+            for i in range(len(vg_labels)-1, 0, -1):
+                j = random.randint(0, i)
+                vg_labels[i], vg_labels[j] = vg_labels[j], vg_labels[i]
+
+            caption_list = [self.label_map[lb] for lb in vg_labels]
+            caption_dict = {item:index for index, item in enumerate(caption_list)}
+
+            caption = ' . '.join(caption_list) + ' .'
+            classes = [caption_dict[self.label_map[str(obj["label"])]] for obj in instances]
+            boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
+            classes = torch.tensor(classes, dtype=torch.int64)
+        elif self.dataset_mode == "VG":
+            anno = meta["Grounding"]
+            instances = [obj for obj in anno["regions"]]
+            boxes = [obj["bbox"] for obj in instances]
+            caption_list = [obj["phrase"] for obj in instances]
+            c = list(zip(boxes, caption_list))
+            random.shuffle(c)
+            boxes[:], caption_list[:] = zip(*c)
+            uni_caption_list = list(set(caption_list))
+            label_map = {}
+            for idx in range(len(uni_caption_list)):
+                label_map[uni_caption_list[idx]] = idx
+            classes = [label_map[cap] for cap in caption_list]
+            caption = ' . '.join(uni_caption_list) + ' .'
+            boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
+            classes = torch.tensor(classes, dtype=torch.int64)
+            caption_list = uni_caption_list
+        # print("caption_list" , caption_list)
+        # print("caption" , caption)
+        # print("boxes" , boxes)
+        target = {}
+        target["image_id"] = rel_path.strip(".jpg")
+        target["size"] = torch.as_tensor([int(h), int(w)])
+        target["cap_list"] = caption_list
+        target["caption"] = caption
+        target["boxes"] = boxes
+        target["labels"] = classes
+        # print(" image_id " , target["image_id"])
+        # size, cap_list, caption, bboxes, labels
+
+        if self.transforms is not None:
+            image, target = self.transforms(image, target)
+
+        return image, target
+
+    def __len__(self) -> int:
+        return len(self.metas)
+
+
+def make_coco_transforms(image_set, fix_size=False, strong_aug=False, args=None):
+
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+
+    # config the params for data aug
+    scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
+    max_size = 1333
+    scales2_resize = [400, 500, 600]
+    scales2_crop = [384, 600]
+
+    # update args from config files
+    scales = getattr(args, 'data_aug_scales', scales)
+    max_size = getattr(args, 'data_aug_max_size', max_size)
+    scales2_resize = getattr(args, 'data_aug_scales2_resize', scales2_resize)
+    scales2_crop = getattr(args, 'data_aug_scales2_crop', scales2_crop)
+
+    # resize them
+    data_aug_scale_overlap = getattr(args, 'data_aug_scale_overlap', None)
+    if data_aug_scale_overlap is not None and data_aug_scale_overlap > 0:
+        data_aug_scale_overlap = float(data_aug_scale_overlap)
+        scales = [int(i*data_aug_scale_overlap) for i in scales]
+        max_size = int(max_size*data_aug_scale_overlap)
+        scales2_resize = [int(i*data_aug_scale_overlap) for i in scales2_resize]
+        scales2_crop = [int(i*data_aug_scale_overlap) for i in scales2_crop]
+
+    # datadict_for_print = {
+    #     'scales': scales,
+    #     'max_size': max_size,
+    #     'scales2_resize': scales2_resize,
+    #     'scales2_crop': scales2_crop
+    # }
+    # print("data_aug_params:", json.dumps(datadict_for_print, indent=2))
+
+    if image_set == 'train':
+        if fix_size:
+            return T.Compose([
+                T.RandomHorizontalFlip(),
+                T.RandomResize([(max_size, max(scales))]),
+                normalize,
+            ])
+
+        if strong_aug:
+            import datasets.sltransform as SLT
+
+            return T.Compose([
+                T.RandomHorizontalFlip(),
+                T.RandomSelect(
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Compose([
+                        T.RandomResize(scales2_resize),
+                        T.RandomSizeCrop(*scales2_crop),
+                        T.RandomResize(scales, max_size=max_size),
+                    ])
+                ),
+                SLT.RandomSelectMulti([
+                    SLT.RandomCrop(),
+                    SLT.LightingNoise(),
+                    SLT.AdjustBrightness(2),
+                    SLT.AdjustContrast(2),
+                ]),
+                normalize,
+            ])
+
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.RandomSelect(
+                T.RandomResize(scales, max_size=max_size),
+                T.Compose([
+                    T.RandomResize(scales2_resize),
+                    T.RandomSizeCrop(*scales2_crop),
+                    T.RandomResize(scales, max_size=max_size),
+                ])
+            ),
+            normalize,
+        ])
+
+    if image_set in ['val', 'eval_debug', 'train_reg', 'test']:
+
+        if os.environ.get("GFLOPS_DEBUG_SHILONG", False) == 'INFO':
+            print("Under debug mode for flops calculation only!!!!!!!!!!!!!!!!")
+            return T.Compose([
+                T.ResizeDebug((1280, 800)),
+                normalize,
+            ])
+
+        return T.Compose([
+            T.RandomResize([max(scales)], max_size=max_size),
+            normalize,
+        ])
+
+    raise ValueError(f'unknown {image_set}')
+
+def build_odvg(image_set, args, datasetinfo):
+    img_folder = datasetinfo["root"]
+    ann_file = datasetinfo["anno"]
+    label_map = datasetinfo["label_map"] if "label_map" in datasetinfo else None
+    try:
+        strong_aug = args.strong_aug
+    except:
+        strong_aug = False  # False originally
+    print(img_folder, ann_file, label_map)
+    dataset = ODVGDataset(img_folder, ann_file, label_map, max_labels=args.max_labels,
+            transforms=make_coco_transforms(image_set, fix_size=args.fix_size, strong_aug=strong_aug, args=args),
+    )
+    return dataset
+
+
+if __name__=="__main__":
+    dataset_vg = ODVGDataset("path/GRIT-20M/data/","path/GRIT-20M/anno/grit_odvg_10k.jsonl",)
+    print(len(dataset_vg))
+    data = dataset_vg[random.randint(0, 100)]
+    print(data)
+    dataset_od = ODVGDataset("pathl/V3Det/",
+        "path/V3Det/annotations/v3det_2023_v1_all_odvg.jsonl",
+        "path/V3Det/annotations/v3det_label_map.json",
+    )
+    print(len(dataset_od))
+    data = dataset_od[random.randint(0, 100)]
+    print(data)
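To clarify how the OD branch of ODVGDataset turns a label map into a grounding caption, here is a toy walk-through (the label map and instances are invented, and the negative-label sampling and shuffling done in the real __getitem__ are omitted for brevity):

    import torch

    label_map = {"1": "person", "2": "dog", "3": "kite"}
    instances = [{"bbox": [10, 10, 50, 80], "label": 1},
                 {"bbox": [60, 20, 90, 40], "label": 3}]

    # Unique phrases, in order of first appearance, become the caption.
    phrases = [label_map[str(obj["label"])] for obj in instances]
    caption_dict = {item: idx for idx, item in enumerate(dict.fromkeys(phrases))}
    caption = " . ".join(caption_dict) + " ."          # "person . kite ."
    classes = torch.tensor([caption_dict[label_map[str(obj["label"])]] for obj in instances])
    print(caption, classes)                            # "person . kite ." tensor([0, 1])

The per-box class index therefore points at the position of that box's phrase inside the caption string, which is what the grounding head consumes.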
groundingdino/datasets/panoptic_eval.py
ADDED
@@ -0,0 +1,44 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import json
import os

import util.misc as utils

try:
    from panopticapi.evaluation import pq_compute
except ImportError:
    pass


class PanopticEvaluator(object):
    def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"):
        self.gt_json = ann_file
        self.gt_folder = ann_folder
        if utils.is_main_process():
            if not os.path.exists(output_dir):
                os.mkdir(output_dir)
        self.output_dir = output_dir
        self.predictions = []

    def update(self, predictions):
        for p in predictions:
            with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f:
                f.write(p.pop("png_string"))

        self.predictions += predictions

    def synchronize_between_processes(self):
        all_predictions = utils.all_gather(self.predictions)
        merged_predictions = []
        for p in all_predictions:
            merged_predictions += p
        self.predictions = merged_predictions

    def summarize(self):
        if utils.is_main_process():
            json_data = {"annotations": self.predictions}
            predictions_json = os.path.join(self.output_dir, "predictions.json")
            with open(predictions_json, "w") as f:
                f.write(json.dumps(json_data))
            return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir)
        return None
groundingdino/datasets/random_crop.py
ADDED
@@ -0,0 +1,135 @@
import PIL  # version 1.2.0
import torch
import os
import torchvision.transforms.functional as F
import numpy as np
import random


def intersect(boxes1, boxes2):
    '''
    Find intersection of every box combination between two sets of boxes
    boxes1: bounding boxes 1, a tensor of dimensions (n1, 4)
    boxes2: bounding boxes 2, a tensor of dimensions (n2, 4)

    Out: Intersection of each of boxes1 with respect to each of boxes2,
         a tensor of dimensions (n1, n2)
    '''
    n1 = boxes1.size(0)
    n2 = boxes2.size(0)
    max_xy = torch.min(boxes1[:, 2:].unsqueeze(1).expand(n1, n2, 2),
                       boxes2[:, 2:].unsqueeze(0).expand(n1, n2, 2))

    min_xy = torch.max(boxes1[:, :2].unsqueeze(1).expand(n1, n2, 2),
                       boxes2[:, :2].unsqueeze(0).expand(n1, n2, 2))
    inter = torch.clamp(max_xy - min_xy, min=0)  # (n1, n2, 2)
    return inter[:, :, 0] * inter[:, :, 1]  # (n1, n2)

def find_IoU(boxes1, boxes2):
    '''
    Find IoU between every pair of boxes from two sets of boxes
    boxes1: a tensor of dimensions (n1, 4) (left, top, right, bottom)
    boxes2: a tensor of dimensions (n2, 4)

    Out: IoU of each of boxes1 with respect to each of boxes2, a tensor of
         dimensions (n1, n2)

    Formula:
    (box1 ∩ box2) / (box1 ∪ box2) = (box1 ∩ box2) / (area(box1) + area(box2) - (box1 ∩ box2))
    '''
    inter = intersect(boxes1, boxes2)
    area_boxes1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area_boxes2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])

    area_boxes1 = area_boxes1.unsqueeze(1).expand_as(inter)  # (n1, n2)
    area_boxes2 = area_boxes2.unsqueeze(0).expand_as(inter)  # (n1, n2)
    union = (area_boxes1 + area_boxes2 - inter)
    return inter / union


def random_crop(image, boxes, labels, difficulties=None):
    '''
    image: A PIL image
    boxes: Bounding boxes, a tensor of dimensions (#objects, 4)
    labels: labels of objects, a tensor of dimensions (#objects)
    difficulties: difficulties of detected objects, a tensor of dimensions (#objects)

    Out: cropped image, new boxes, new labels, new difficulties
    '''
    if type(image) == PIL.Image.Image:
        image = F.to_tensor(image)
    original_h = image.size(1)
    original_w = image.size(2)

    while True:
        mode = random.choice([0.1, 0.3, 0.5, 0.9, None])

        if mode is None:
            return F.to_pil_image(image), boxes, labels, difficulties

        new_image = image
        new_boxes = boxes
        new_difficulties = difficulties
        new_labels = labels
        for _ in range(50):
            # Crop dimensions: [0.3, 1] of original dimensions
            new_h = random.uniform(0.3 * original_h, original_h)
            new_w = random.uniform(0.3 * original_w, original_w)

            # Aspect ratio constraint b/t .5 & 2
            if new_h / new_w < 0.5 or new_h / new_w > 2:
                continue

            # Crop coordinates
            left = random.uniform(0, original_w - new_w)
            right = left + new_w
            top = random.uniform(0, original_h - new_h)
            bottom = top + new_h
            crop = torch.FloatTensor([int(left), int(top), int(right), int(bottom)])

            # Calculate IoU between the crop and the bounding boxes
            overlap = find_IoU(crop.unsqueeze(0), boxes)  # (1, #objects)
            overlap = overlap.squeeze(0)

            # If not a single bounding box has an IoU greater than the minimum, try again
            if overlap.shape[0] == 0:
                continue
            if overlap.max().item() < mode:
                continue

            # Crop
            new_image = image[:, int(top):int(bottom), int(left):int(right)]  # (3, new_h, new_w)

            # Center of bounding boxes
            center_bb = (boxes[:, :2] + boxes[:, 2:]) / 2.0

            # Find bounding boxes whose center lies inside the crop
            center_in_crop = (center_bb[:, 0] > left) * (center_bb[:, 0] < right
                             ) * (center_bb[:, 1] > top) * (center_bb[:, 1] < bottom)  # (#objects)

            if not center_in_crop.any():
                continue

            # Take matching bounding boxes
            new_boxes = boxes[center_in_crop, :]

            # Take matching labels
            new_labels = labels[center_in_crop]

            # Take matching difficulties
            if difficulties is not None:
                new_difficulties = difficulties[center_in_crop]
            else:
                new_difficulties = None

            # Use the box left and top corner or the crop's
            new_boxes[:, :2] = torch.max(new_boxes[:, :2], crop[:2])

            # Adjust to crop
            new_boxes[:, :2] -= crop[:2]

            new_boxes[:, 2:] = torch.min(new_boxes[:, 2:], crop[2:])

            # Adjust to crop
            new_boxes[:, 2:] -= crop[:2]

            return F.to_pil_image(new_image), new_boxes, new_labels, new_difficulties
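A quick illustrative check of the helpers above (not part of the commit); the IoU value follows directly from the formula in the docstring, and the import path is assumed from this repo layout.

import torch
from PIL import Image

from groundingdino.datasets.random_crop import find_IoU, random_crop  # import path assumed

boxes1 = torch.FloatTensor([[0., 0., 10., 10.]])
boxes2 = torch.FloatTensor([[5., 5., 15., 15.], [20., 20., 30., 30.]])
print(find_IoU(boxes1, boxes2))  # tensor([[0.1429, 0.0000]]): 25 / (100 + 100 - 25), then no overlap

img = Image.new("RGB", (100, 100))
boxes = torch.FloatTensor([[10., 10., 40., 40.], [50., 50., 90., 90.]])
labels = torch.tensor([0, 1])
new_img, new_boxes, new_labels, _ = random_crop(img, boxes, labels)  # boxes re-expressed in crop coords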
groundingdino/datasets/sltransform.py
ADDED
@@ -0,0 +1,247 @@
# modified from https://github.com/anhtuan85/Data-Augmentation-for-Object-Detection/blob/master/augmentation.ipynb

import PIL  # version 1.2.0
from PIL import Image  # version 6.1.0
import torch
import os
import torchvision.transforms.functional as F
import numpy as np
import random

from .random_crop import random_crop
from util.box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh

class AdjustContrast:
    def __init__(self, contrast_factor):
        self.contrast_factor = contrast_factor

    def __call__(self, img, target):
        """
        img (PIL Image or Tensor): Image to be adjusted.
        """
        _contrast_factor = ((random.random() + 1.0) / 2.0) * self.contrast_factor
        img = F.adjust_contrast(img, _contrast_factor)
        return img, target

class AdjustBrightness:
    def __init__(self, brightness_factor):
        self.brightness_factor = brightness_factor

    def __call__(self, img, target):
        """
        img (PIL Image or Tensor): Image to be adjusted.
        """
        _brightness_factor = ((random.random() + 1.0) / 2.0) * self.brightness_factor
        img = F.adjust_brightness(img, _brightness_factor)
        return img, target

def lighting_noise(image):
    '''
    color channel swap in image
    image: A PIL image
    '''
    new_image = image
    perms = ((0, 1, 2), (0, 2, 1), (1, 0, 2),
             (1, 2, 0), (2, 0, 1), (2, 1, 0))
    swap = perms[random.randint(0, len(perms) - 1)]
    new_image = F.to_tensor(new_image)
    new_image = new_image[swap, :, :]
    new_image = F.to_pil_image(new_image)
    return new_image

class LightingNoise:
    def __init__(self) -> None:
        pass

    def __call__(self, img, target):
        return lighting_noise(img), target


def rotate(image, boxes, angle):
    '''
    Rotate image and bounding boxes
    image: A PIL image (w, h)
    boxes: A tensor of dimensions (#objects, 4)

    Out: rotated image (w, h), rotated boxes
    '''
    new_image = image.copy()
    new_boxes = boxes.clone()

    # Rotate image, expand = True
    w = image.width
    h = image.height
    cx = w / 2
    cy = h / 2
    new_image = new_image.rotate(angle, expand=True)
    angle = np.radians(angle)
    alpha = np.cos(angle)
    beta = np.sin(angle)
    # Get affine matrix
    AffineMatrix = torch.tensor([[alpha, beta, (1 - alpha) * cx - beta * cy],
                                 [-beta, alpha, beta * cx + (1 - alpha) * cy]])

    # Rotate boxes
    box_width = (boxes[:, 2] - boxes[:, 0]).reshape(-1, 1)
    box_height = (boxes[:, 3] - boxes[:, 1]).reshape(-1, 1)

    # Get corners for boxes
    x1 = boxes[:, 0].reshape(-1, 1)
    y1 = boxes[:, 1].reshape(-1, 1)

    x2 = x1 + box_width
    y2 = y1

    x3 = x1
    y3 = y1 + box_height

    x4 = boxes[:, 2].reshape(-1, 1)
    y4 = boxes[:, 3].reshape(-1, 1)

    corners = torch.stack((x1, y1, x2, y2, x3, y3, x4, y4), dim=1)
    # corners.reshape(-1, 8)  # Tensors of dimensions (#objects, 8)
    corners = corners.reshape(-1, 2)  # Tensors of dimension (4 * #objects, 2)
    corners = torch.cat((corners, torch.ones(corners.shape[0], 1)), dim=1)  # Tensors of dimension (4 * #objects, 3)

    cos = np.abs(AffineMatrix[0, 0])
    sin = np.abs(AffineMatrix[0, 1])

    nW = int((h * sin) + (w * cos))
    nH = int((h * cos) + (w * sin))
    AffineMatrix[0, 2] += (nW / 2) - cx
    AffineMatrix[1, 2] += (nH / 2) - cy

    # Apply affine transform
    rotate_corners = torch.mm(AffineMatrix, corners.t().to(torch.float64)).t()
    rotate_corners = rotate_corners.reshape(-1, 8)

    x_corners = rotate_corners[:, [0, 2, 4, 6]]
    y_corners = rotate_corners[:, [1, 3, 5, 7]]

    # Get (x_min, y_min, x_max, y_max)
    x_min, _ = torch.min(x_corners, dim=1)
    x_min = x_min.reshape(-1, 1)
    y_min, _ = torch.min(y_corners, dim=1)
    y_min = y_min.reshape(-1, 1)
    x_max, _ = torch.max(x_corners, dim=1)
    x_max = x_max.reshape(-1, 1)
    y_max, _ = torch.max(y_corners, dim=1)
    y_max = y_max.reshape(-1, 1)

    new_boxes = torch.cat((x_min, y_min, x_max, y_max), dim=1)

    scale_x = new_image.width / w
    scale_y = new_image.height / h

    # Resize new image to (w, h)
    new_image = new_image.resize((w, h))

    # Resize boxes
    new_boxes /= torch.Tensor([scale_x, scale_y, scale_x, scale_y])
    new_boxes[:, 0] = torch.clamp(new_boxes[:, 0], 0, w)
    new_boxes[:, 1] = torch.clamp(new_boxes[:, 1], 0, h)
    new_boxes[:, 2] = torch.clamp(new_boxes[:, 2], 0, w)
    new_boxes[:, 3] = torch.clamp(new_boxes[:, 3], 0, h)
    return new_image, new_boxes

# def convert_xywh_to_xyxy(boxes: torch.Tensor):
#     _boxes = boxes.clone()
#     box_xy = _boxes[:, :2]
#     box_wh = _boxes[:, 2:]
#     box_x1y1 = box_xy - box_wh/2
#     box_x2y2 = box_xy + box_wh/2
#     box_xyxy = torch.cat((box_x1y1, box_x2y2), dim=-1)
#     return box_xyxy

class Rotate:
    def __init__(self, angle=10) -> None:
        self.angle = angle

    def __call__(self, img, target):
        w, h = img.size
        whwh = torch.Tensor([w, h, w, h])
        boxes_xyxy = box_cxcywh_to_xyxy(target['boxes']) * whwh
        img, boxes_new = rotate(img, boxes_xyxy, self.angle)
        target['boxes'] = box_xyxy_to_cxcywh(boxes_new).to(boxes_xyxy.dtype) / (whwh + 1e-3)
        return img, target


class RandomCrop:
    def __init__(self) -> None:
        pass

    def __call__(self, img, target):
        w, h = img.size
        try:
            boxes_xyxy = target['boxes']
            labels = target['labels']
            img, new_boxes, new_labels, _ = random_crop(img, boxes_xyxy, labels)
            target['boxes'] = new_boxes
            target['labels'] = new_labels
        except Exception as e:
            pass
        return img, target


class RandomCropDebug:
    def __init__(self) -> None:
        pass

    def __call__(self, img, target):
        boxes_xyxy = target['boxes'].clone()
        labels = target['labels'].clone()
        img, new_boxes, new_labels, _ = random_crop(img, boxes_xyxy, labels)
        target['boxes'] = new_boxes
        target['labels'] = new_labels

        return img, target

class RandomSelectMulti(object):
    """
    Randomly selects one transform from transformslist.
    """
    def __init__(self, transformslist, p=-1):
        self.transformslist = transformslist
        self.p = p
        assert p == -1

    def __call__(self, img, target):
        if self.p == -1:
            return random.choice(self.transformslist)(img, target)


class Albumentations:
    def __init__(self):
        import albumentations as A
        self.transform = A.Compose([
            A.Blur(p=0.01),
            A.MedianBlur(p=0.01),
            A.ToGray(p=0.01),
            A.CLAHE(p=0.01),
            A.RandomBrightnessContrast(p=0.005),
            A.RandomGamma(p=0.005),
            A.ImageCompression(quality_lower=75, p=0.005)],
            bbox_params=A.BboxParams(format='pascal_voc', label_fields=['class_labels']))

    def __call__(self, img, target, p=1.0):
        """
        Input:
            target['boxes']: xyxy, unnormalized data.
        """
        boxes_raw = target['boxes']
        labels_raw = target['labels']
        img_np = np.array(img)
        img_new = img  # fall back to the original image if the transform is not applied
        if self.transform and random.random() < p:
            new_res = self.transform(image=img_np, bboxes=boxes_raw, class_labels=labels_raw)  # transformed
            boxes_new = torch.Tensor(new_res['bboxes']).to(boxes_raw.dtype).reshape_as(boxes_raw)
            img_np = new_res['image']
            labels_new = torch.Tensor(new_res['class_labels']).to(labels_raw.dtype)
            img_new = Image.fromarray(img_np)
            target['boxes'] = boxes_new
            target['labels'] = labels_new

        return img_new, target
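Not part of the commit: a small sketch showing how these ops compose, mirroring the strong_aug branch of make_coco_transforms earlier in this diff. Each op takes and returns an (img, target) pair; the import path and target layout are assumptions based on the code above.

import torch
from PIL import Image

import groundingdino.datasets.sltransform as SLT  # import path assumed from this repo layout

aug = SLT.RandomSelectMulti([
    SLT.RandomCrop(),
    SLT.LightingNoise(),
    SLT.AdjustBrightness(2),
    SLT.AdjustContrast(2),
])

img = Image.new("RGB", (640, 480), color=(128, 128, 128))
target = {"boxes": torch.FloatTensor([[100., 100., 200., 200.]]),  # xyxy pixels at this stage
          "labels": torch.tensor([3])}
img, target = aug(img, target)  # exactly one of the four ops is applied per call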
groundingdino/datasets/transforms.py
CHANGED
@@ -2,7 +2,6 @@
 """
 Transforms and data augmentation for both image + bbox.
 """
-import os
 import random
 
 import PIL
@@ -10,8 +9,8 @@ import torch
 import torchvision.transforms as T
 import torchvision.transforms.functional as F
 
-from …
-from …
+from util.box_ops import box_xyxy_to_cxcywh
+from util.misc import interpolate
 
 
 def crop(image, target, region):
@@ -23,7 +22,7 @@ def crop(image, target, region):
     # should we do something wrt the original size?
     target["size"] = torch.tensor([h, w])
 
-    fields = ["labels", "area" …
+    fields = ["labels", "area"]
 
     if "boxes" in target:
         boxes = target["boxes"]
@@ -38,29 +37,22 @@ def crop(image, target, region):
 
     if "masks" in target:
         # FIXME should we update the area here if there are no boxes?
-        target[ …
+        target['masks'] = target['masks'][:, i:i + h, j:j + w]
         fields.append("masks")
 
+
     # remove elements for which the boxes or masks that have zero area
     if "boxes" in target or "masks" in target:
         # favor boxes selection when defining which elements to keep
         # this is compatible with previous implementation
         if "boxes" in target:
-            cropped_boxes = target[ …
+            cropped_boxes = target['boxes'].reshape(-1, 2, 2)
             keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
         else:
-            keep = target[ …
+            keep = target['masks'].flatten(1).any(1)
 
         for field in fields:
-            …
-            target[field] = target[field][keep]
-
-        if os.environ.get("IPDB_SHILONG_DEBUG", None) == "INFO":
-            # for debug and visualization only.
-            if "strings_positive" in target:
-                target["strings_positive"] = [
-                    _i for _i, _j in zip(target["strings_positive"], keep) if _j
-                ]
+            target[field] = target[field][keep]
 
     return cropped_image, target
 
@@ -73,13 +65,11 @@ def hflip(image, target):
     target = target.copy()
     if "boxes" in target:
         boxes = target["boxes"]
-        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor(
-            [w, 0, w, 0]
-        )
+        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
         target["boxes"] = boxes
 
     if "masks" in target:
-        target[ …
+        target['masks'] = target['masks'].flip(-1)
 
     return flipped_image, target
 
@@ -125,9 +115,7 @@ def resize(image, target, size, max_size=None):
     target = target.copy()
     if "boxes" in target:
         boxes = target["boxes"]
-        scaled_boxes = boxes * torch.as_tensor(
-            [ratio_width, ratio_height, ratio_width, ratio_height]
-        )
+        scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
         target["boxes"] = scaled_boxes
 
     if "area" in target:
@@ -139,9 +127,8 @@ def resize(image, target, size, max_size=None):
     target["size"] = torch.tensor([h, w])
 
     if "masks" in target:
-        target[ …
-        …
-        )
+        target['masks'] = interpolate(
+            target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5
 
     return rescaled_image, target
 
@@ -155,7 +142,7 @@ def pad(image, target, padding):
     # should we do something wrt the original size?
     target["size"] = torch.tensor(padded_image.size[::-1])
     if "masks" in target:
-        target[ …
+        target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1]))
     return padded_image, target
 
 
@@ -177,28 +164,15 @@ class RandomCrop(object):
 
 
 class RandomSizeCrop(object):
-    def __init__(self, min_size: int, max_size: int …
-        # respect_boxes: True to keep all boxes
-        #                False to tolerence box filter
+    def __init__(self, min_size: int, max_size: int):
         self.min_size = min_size
         self.max_size = max_size
-        self.respect_boxes = respect_boxes
 
     def __call__(self, img: PIL.Image.Image, target: dict):
-        …
-        h = random.randint(self.min_size, min(img.height, self.max_size))
-        region = T.RandomCrop.get_params(img, [h, w])
-        result_img, result_target = crop(img, target, region)
-        if (
-            not self.respect_boxes
-            or len(result_target["boxes"]) == init_boxes
-            or i == max_patience - 1
-        ):
-            return result_img, result_target
-        return result_img, result_target
+        w = random.randint(self.min_size, min(img.width, self.max_size))
+        h = random.randint(self.min_size, min(img.height, self.max_size))
+        region = T.RandomCrop.get_params(img, [h, w])
+        return crop(img, target, region)
 
 
 class CenterCrop(object):
@@ -208,8 +182,8 @@ class CenterCrop(object):
     def __call__(self, img, target):
         image_width, image_height = img.size
         crop_height, crop_width = self.size
-        crop_top = int(round((image_height - crop_height) / 2. …
-        crop_left = int(round((image_width - crop_width) / 2. …
+        crop_top = int(round((image_height - crop_height) / 2.))
+        crop_left = int(round((image_width - crop_width) / 2.))
         return crop(img, target, (crop_top, crop_left, crop_height, crop_width))
 
 
@@ -249,7 +223,6 @@ class RandomSelect(object):
     Randomly selects between transforms1 and transforms2,
     with probability p for transforms1 and (1 - p) for transforms2
     """
-
     def __init__(self, transforms1, transforms2, p=0.5):
         self.transforms1 = transforms1
         self.transforms2 = transforms2
@@ -267,6 +240,7 @@ class ToTensor(object):
 
 
 class RandomErasing(object):
+
     def __init__(self, *args, **kwargs):
         self.eraser = T.RandomErasing(*args, **kwargs)
 
groundingdino/models/.ipynb_checkpoints/__init__-checkpoint.py
DELETED
@@ -1,18 +0,0 @@
# ------------------------------------------------------------------------
# Grounding DINO
# url: https://github.com/IDEA-Research/GroundingDINO
# Copyright (c) 2023 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .GroundingDINO import build_groundingdino


def build_model(args):
    # we use register to maintain models from catdet6 on.
    from .registry import MODULE_BUILD_FUNCS

    assert args.modelname in MODULE_BUILD_FUNCS._module_dict
    build_func = MODULE_BUILD_FUNCS.get(args.modelname)
    model = build_func(args)
    return model
groundingdino/models/.ipynb_checkpoints/registry-checkpoint.py
DELETED
@@ -1,66 +0,0 @@
# ------------------------------------------------------------------------
# Grounding DINO
# url: https://github.com/IDEA-Research/GroundingDINO
# Copyright (c) 2023 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Author: Yihao Chen
# @Date: 2021-08-16 16:03:17
# @Last Modified by: Shilong Liu
# @Last Modified time: 2022-01-23 15:26
# modified from mmcv

import inspect
from functools import partial


class Registry(object):
    def __init__(self, name):
        self._name = name
        self._module_dict = dict()

    def __repr__(self):
        format_str = self.__class__.__name__ + "(name={}, items={})".format(
            self._name, list(self._module_dict.keys())
        )
        return format_str

    def __len__(self):
        return len(self._module_dict)

    @property
    def name(self):
        return self._name

    @property
    def module_dict(self):
        return self._module_dict

    def get(self, key):
        return self._module_dict.get(key, None)

    def registe_with_name(self, module_name=None, force=False):
        return partial(self.register, module_name=module_name, force=force)

    def register(self, module_build_function, module_name=None, force=False):
        """Register a module build function.
        Args:
            module (:obj:`nn.Module`): Module to be registered.
        """
        if not inspect.isfunction(module_build_function):
            raise TypeError(
                "module_build_function must be a function, but got {}".format(
                    type(module_build_function)
                )
            )
        if module_name is None:
            module_name = module_build_function.__name__
        if not force and module_name in self._module_dict:
            raise KeyError("{} is already registered in {}".format(module_name, self.name))
        self._module_dict[module_name] = module_build_function

        return module_build_function


MODULE_BUILD_FUNCS = Registry("model build functions")
groundingdino/models/GroundingDINO/.ipynb_checkpoints/bertwarper-checkpoint.py
ADDED
@@ -0,0 +1,273 @@
# ------------------------------------------------------------------------
# Grounding DINO
# url: https://github.com/IDEA-Research/GroundingDINO
# Copyright (c) 2023 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------

import torch
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from torch import Tensor, nn
from torchvision.ops.boxes import nms
from transformers import BertConfig, BertModel, BertPreTrainedModel
from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions


class BertModelWarper(nn.Module):
    def __init__(self, bert_model):
        super().__init__()
        # self.bert = bert_modelc

        self.config = bert_model.config
        self.embeddings = bert_model.embeddings
        self.encoder = bert_model.encoder
        self.pooler = bert_model.pooler

        self.get_extended_attention_mask = bert_model.get_extended_attention_mask
        self.invert_attention_mask = bert_model.invert_attention_mask
        self.get_head_mask = bert_model.get_head_mask

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        """
        output_attentions = (
            output_attentions if output_attentions is not None else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            batch_size, seq_length = input_shape
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size, seq_length = input_shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # past_key_values_length
        past_key_values_length = (
            past_key_values[0][0].shape[2] if past_key_values is not None else 0
        )

        if attention_mask is None:
            attention_mask = torch.ones(
                ((batch_size, seq_length + past_key_values_length)), device=device
            )
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
            attention_mask, input_shape, device
        )

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None
        # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
        #     import ipdb; ipdb.set_trace()

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )


class TextEncoderShell(nn.Module):
    def __init__(self, text_encoder):
        super().__init__()
        self.text_encoder = text_encoder
        self.config = self.text_encoder.config

    def forward(self, **kw):
        # feed into text encoder
        return self.text_encoder(**kw)


def generate_masks_with_special_tokens(tokenized, special_tokens_list, tokenizer):
    """Generate attention mask between each pair of special tokens
    Args:
        input_ids (torch.Tensor): input ids. Shape: [bs, num_token]
        special_tokens_mask (list): special tokens mask.
    Returns:
        torch.Tensor: attention mask between each special tokens.
    """
    input_ids = tokenized["input_ids"]
    bs, num_token = input_ids.shape
    # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens
    special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool()
    for special_token in special_tokens_list:
        special_tokens_mask |= input_ids == special_token

    # idxs: each row is a list of indices of special tokens
    idxs = torch.nonzero(special_tokens_mask)

    # generate attention mask and positional ids
    attention_mask = (
        torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1)
    )
    position_ids = torch.zeros((bs, num_token), device=input_ids.device)
    previous_col = 0
    for i in range(idxs.shape[0]):
        row, col = idxs[i]
        if (col == 0) or (col == num_token - 1):
            attention_mask[row, col, col] = True
            position_ids[row, col] = 0
        else:
            attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True
            position_ids[row, previous_col + 1 : col + 1] = torch.arange(
                0, col - previous_col, device=input_ids.device
            )

        previous_col = col

    # # padding mask
    # padding_mask = tokenized['attention_mask']
    # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool()

    return attention_mask, position_ids.to(torch.long)


def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list, tokenizer):
    """Generate attention mask between each pair of special tokens
    Args:
        input_ids (torch.Tensor): input ids. Shape: [bs, num_token]
        special_tokens_mask (list): special tokens mask.
    Returns:
        torch.Tensor: attention mask between each special tokens.
    """
    input_ids = tokenized["input_ids"]
    bs, num_token = input_ids.shape
    # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens
    special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool()
    for special_token in special_tokens_list:
        special_tokens_mask |= input_ids == special_token

    # idxs: each row is a list of indices of special tokens
    idxs = torch.nonzero(special_tokens_mask)

    # generate attention mask and positional ids
    attention_mask = (
        torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1)
    )
    position_ids = torch.zeros((bs, num_token), device=input_ids.device)
    cate_to_token_mask_list = [[] for _ in range(bs)]
    previous_col = 0
    for i in range(idxs.shape[0]):
        row, col = idxs[i]
        if (col == 0) or (col == num_token - 1):
            attention_mask[row, col, col] = True
            position_ids[row, col] = 0
        else:
            attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True
            position_ids[row, previous_col + 1 : col + 1] = torch.arange(
                0, col - previous_col, device=input_ids.device
            )
            c2t_maski = torch.zeros((num_token), device=input_ids.device).bool()
            c2t_maski[previous_col + 1 : col] = True
            cate_to_token_mask_list[row].append(c2t_maski)
        previous_col = col

    cate_to_token_mask_list = [
        torch.stack(cate_to_token_mask_listi, dim=0)
        for cate_to_token_mask_listi in cate_to_token_mask_list
    ]

    # # padding mask
    # padding_mask = tokenized['attention_mask']
    # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool()

    return attention_mask, position_ids.to(torch.long), cate_to_token_mask_list
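Not part of the commit: a sketch of how the mask generator above is typically called. The bert-base-uncased tokenizer and the choice of special tokens ([CLS], [SEP], ".") are assumptions for illustration, and the non-checkpoint import path is assumed from this repo layout.

from transformers import BertTokenizer

from groundingdino.models.GroundingDINO.bertwarper import (
    generate_masks_with_special_tokens_and_transfer_map,  # import path assumed
)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # assumed text backbone
tokenized = tokenizer(["stadium . baseball field ."], padding="longest", return_tensors="pt")
special_ids = [tokenizer.cls_token_id, tokenizer.sep_token_id,
               tokenizer.convert_tokens_to_ids(".")]            # assumed phrase separators

attn, pos_ids, cate_to_token = generate_masks_with_special_tokens_and_transfer_map(
    tokenized, special_ids, tokenizer)
# attn is [1, num_token, num_token] and block-diagonal over the two phrases;
# pos_ids restart from 0 inside each phrase; cate_to_token[0] maps each phrase to its tokens.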
groundingdino/models/GroundingDINO/.ipynb_checkpoints/fuse_modules-checkpoint.py
CHANGED
@@ -20,9 +20,9 @@ class FeatureResizer(nn.Module):
     def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True):
         super().__init__()
         self.do_ln = do_ln
+        r = 12
         # Object feature encoding
-        r = …
-        self.fc = lora.Linear(input_feat_size, output_feat_size,r=r , bias=True)
+        self.fc = lora.Linear(input_feat_size, output_feat_size,r=r, bias=True)
         self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12)
         self.dropout = nn.Dropout(dropout)
 
@@ -112,14 +112,14 @@ class BiMultiHeadAttention(nn.Module):
         ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
         self.scale = self.head_dim ** (-0.5)
         self.dropout = dropout
-        r = …
+        r = 12
         self.v_proj = lora.Linear(self.v_dim, self.embed_dim , r=r)
-        self.l_proj = lora.Linear(self.l_dim, self.embed_dim , r=r)
-        self.values_v_proj = lora.Linear(self.v_dim, self.embed_dim , r=r)
-        self.values_l_proj = lora.Linear(self.l_dim, self.embed_dim , r=r)
+        self.l_proj = lora.Linear(self.l_dim, self.embed_dim , r=r )
+        self.values_v_proj = lora.Linear(self.v_dim, self.embed_dim , r=r )
+        self.values_l_proj = lora.Linear(self.l_dim, self.embed_dim , r=r )
 
-        self.out_v_proj = lora.Linear(self.embed_dim, self.v_dim , r=r)
-        self.out_l_proj = lora.Linear(self.embed_dim, self.l_dim , r=r)
+        self.out_v_proj = lora.Linear(self.embed_dim, self.v_dim , r=r )
+        self.out_l_proj = lora.Linear(self.embed_dim, self.l_dim , r=r )
 
         self.stable_softmax_2d = True
         self.clamp_min_for_underflow = True
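Not part of the commit: the swap from nn.Linear to lora.Linear with r = 12 in the fusion projections above is the PEFT hook for this Space. Assuming the lora module used here is Microsoft's loralib package, training then usually freezes everything except the low-rank adapter matrices, roughly as in this sketch; build_groundingdino(args) stands in for however the Space actually constructs the model.

import torch
import loralib as lora  # assumption: the `lora` used above is https://github.com/microsoft/LoRA

model = build_groundingdino(args)            # hypothetical: however this Space builds GroundingDINO
lora.mark_only_lora_as_trainable(model)      # freeze base weights; leave lora_A / lora_B trainable
print(sum(p.numel() for p in model.parameters() if p.requires_grad), "trainable parameters")
torch.save(lora.lora_state_dict(model), "lora_rsvg.pt")  # checkpoint only the adapter weights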