Upload model
- README.md +199 -0
- config.json +41 -0
- configuration_rf_detr.py +112 -0
- model.safetensors +3 -0
- modeling_rf_detr.py +249 -0
README.md
ADDED
@@ -0,0 +1,199 @@
---
library_name: transformers
tags: []
---

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->

## Model Details

### Model Description

<!-- Provide a longer summary of what this model is. -->

This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.

- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]

### Model Sources [optional]

<!-- Provide the basic links for the model. -->

- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]

## Uses

<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->

### Direct Use

<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->

[More Information Needed]

### Downstream Use [optional]

<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->

[More Information Needed]

### Out-of-Scope Use

<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->

[More Information Needed]

## Bias, Risks, and Limitations

<!-- This section is meant to convey both technical and sociotechnical limitations. -->

[More Information Needed]

### Recommendations

<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->

Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.

## How to Get Started with the Model

Use the code below to get started with the model.

[More Information Needed]
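This repository ships its own `configuration_rf_detr.py` and `modeling_rf_detr.py` and registers them through `auto_map` in `config.json`, so loading goes through `trust_remote_code`. A minimal, untested sketch (the repository id is a placeholder, and the `rfdetr` package plus `torch`, `torchvision`, and `optimum` must be installed so the remote code can import):

```python
from transformers import AutoConfig, AutoModelForObjectDetection

repo_id = "<this-repo-id>"  # placeholder: replace with the actual Hub repository id

# auto_map in config.json resolves these Auto classes to the custom RF-DETR classes in this repo.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForObjectDetection.from_pretrained(repo_id, trust_remote_code=True)
model.eval()
```

The custom `forward` expects `pixel_values` and a `pixel_mask` (and optionally `labels`); see `modeling_rf_detr.py` below for the exact signature and a usage sketch.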
## Training Details

### Training Data

<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->

[More Information Needed]

### Training Procedure

<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->

#### Preprocessing [optional]

[More Information Needed]

#### Training Hyperparameters

- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->

#### Speeds, Sizes, Times [optional]

<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->

[More Information Needed]

## Evaluation

<!-- This section describes the evaluation protocols and provides the results. -->

### Testing Data, Factors & Metrics

#### Testing Data

<!-- This should link to a Dataset Card if possible. -->

[More Information Needed]

#### Factors

<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->

[More Information Needed]

#### Metrics

<!-- These are the evaluation metrics being used, ideally with a description of why. -->

[More Information Needed]

### Results

[More Information Needed]

#### Summary

## Model Examination [optional]

<!-- Relevant interpretability work for the model goes here -->

[More Information Needed]

## Environmental Impact

<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->

Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).

- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]

## Technical Specifications [optional]

### Model Architecture and Objective

[More Information Needed]

### Compute Infrastructure

[More Information Needed]

#### Hardware

[More Information Needed]

#### Software

[More Information Needed]

## Citation [optional]

<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->

**BibTeX:**

[More Information Needed]

**APA:**

[More Information Needed]

## Glossary [optional]

<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->

[More Information Needed]

## More Information [optional]

[More Information Needed]

## Model Card Authors [optional]

[More Information Needed]

## Model Card Contact

[More Information Needed]
config.json
ADDED
@@ -0,0 +1,41 @@
{
  "amp": true,
  "architectures": [
    "RFDetrModelForObjectDetection"
  ],
  "auto_map": {
    "AutoConfig": "configuration_rf_detr.RFDetrConfig",
    "AutoModelForObjectDetection": "modeling_rf_detr.RFDetrModelForObjectDetection"
  },
  "bbox_reparam": true,
  "ca_nheads": 16,
  "dec_layers": 3,
  "dec_n_points": 2,
  "device": "cpu",
  "encoder": "dinov2_windowed_small",
  "gradient_checkpointing": false,
  "group_detr": 13,
  "hidden_dim": 256,
  "layer_norm": true,
  "lite_refpoint_refine": true,
  "model_name": "RFDETRBase",
  "model_type": "rf-detr",
  "num_classes": 90,
  "num_queries": 300,
  "out_feature_indexes": [
    2,
    5,
    8,
    11
  ],
  "pretrain_weights": "rf-detr-base.pth",
  "pretrained": true,
  "projector_scale": [
    "P4"
  ],
  "resolution": 560,
  "sa_nheads": 8,
  "torch_dtype": "float32",
  "transformers_version": "4.50.3",
  "two_stage": true
}
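The `auto_map` block above is what lets `AutoConfig` materialize this JSON as the custom `RFDetrConfig`; other fields can be overridden at load time. A small, untested sketch (the repository id is a placeholder):

```python
from transformers import AutoConfig

repo_id = "<this-repo-id>"  # placeholder for this repository's Hub id

config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(type(config).__name__)              # RFDetrConfig
print(config.encoder, config.resolution)  # dinov2_windowed_small 560

# Keyword arguments matching config attributes override the stored values,
# e.g. a different detection head size:
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True, num_classes=20)
```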
configuration_rf_detr.py
ADDED
@@ -0,0 +1,112 @@
from typing import Dict, Literal, List
from collections import OrderedDict

import torch
from transformers.configuration_utils import PretrainedConfig
from optimum.exporters.onnx.model_configs import ViTOnnxConfig
from optimum.utils import DummyVisionInputGenerator

### modified from https://github.com/roboflow/rf-detr/blob/main/rfdetr/config.py

DEVICE = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"


class RFDetrConfig(PretrainedConfig):
    model_type = 'rf-detr'

    def __init__(
        self,
        model_name: Literal['RFDETRBase', 'RFDETRLarge'] = 'RFDETRBase',
        pretrained: bool = False,
        out_feature_indexes: List[int] = [2, 5, 8, 11],
        dec_layers: int = 3,
        two_stage: bool = True,
        bbox_reparam: bool = True,
        lite_refpoint_refine: bool = True,
        layer_norm: bool = True,
        amp: bool = True,
        num_classes: int = 90,
        num_queries: int = 300,
        device: Literal["cpu", "cuda", "mps"] = DEVICE,
        resolution: int = 560,
        group_detr: int = 13,
        gradient_checkpointing: bool = False,
        **kwargs
    ):
        self.model_name = model_name
        self.pretrained = pretrained
        self.out_feature_indexes = out_feature_indexes
        self.dec_layers = dec_layers
        self.two_stage = two_stage
        self.bbox_reparam = bbox_reparam
        self.lite_refpoint_refine = lite_refpoint_refine
        self.layer_norm = layer_norm
        self.amp = amp
        self.num_classes = num_classes
        self.device = device
        self.resolution = resolution
        self.group_detr = group_detr
        self.gradient_checkpointing = gradient_checkpointing
        self.num_queries = num_queries
        # Architecture presets tied to the chosen model name.
        if self.model_name == 'RFDETRBase':
            self.encoder = "dinov2_windowed_small"
            self.hidden_dim = 256
            self.sa_nheads = 8
            self.ca_nheads = 16
            self.dec_n_points = 2
            self.projector_scale = ["P4"]
            self.pretrain_weights = "rf-detr-base.pth"
        elif self.model_name == 'RFDETRLarge':
            self.encoder = "dinov2_windowed_base"
            self.hidden_dim = 384
            self.sa_nheads = 12
            self.ca_nheads = 24
            self.dec_n_points = 4
            self.projector_scale = ["P3", "P5"]
            self.pretrain_weights = "rf-detr-large.pth"
        if not self.pretrained:
            self.pretrain_weights = ""
        super().__init__(**kwargs)


class RFDetrDummyInputGenerator(DummyVisionInputGenerator):
    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
        if input_name == "pixel_mask":
            return self.random_mask_tensor(
                shape=[self.batch_size, self.height, self.width],
                framework=framework,
                dtype="bool",
            )
        else:
            return self.random_float_tensor(
                shape=[self.batch_size, self.num_channels, self.height, self.width],
                framework=framework,
                dtype=float_dtype,
            )


class RFDetrOnnxConfig(ViTOnnxConfig):
    DUMMY_INPUT_GENERATOR_CLASSES = (RFDetrDummyInputGenerator,)

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        return OrderedDict(
            {
                "pixel_values": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"},
                "pixel_mask": {0: "batch_size", 1: "height", 2: "width"},
            }
        )

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        common_outputs = super().outputs

        if self.task == "object-detection":
            common_outputs["logits"] = {0: "batch_size", 1: "num_queries", 2: "num_classes"}
            common_outputs["pred_boxes"] = {0: "batch_size", 1: "num_queries", 2: "4"}

        return common_outputs


__all__ = [
    'RFDetrConfig',
    'RFDetrOnnxConfig'
]
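The two `model_name` presets above pin the backbone and head hyperparameters. A small sketch of instantiating the config directly (this assumes the file is importable from the working directory and that `torch` and `optimum` are installed):

```python
from configuration_rf_detr import RFDetrConfig

base = RFDetrConfig(model_name="RFDETRBase")
large = RFDetrConfig(model_name="RFDETRLarge")

print(base.encoder, base.hidden_dim, base.projector_scale)     # dinov2_windowed_small 256 ['P4']
print(large.encoder, large.hidden_dim, large.projector_scale)  # dinov2_windowed_base 384 ['P3', 'P5']

# pretrained defaults to False here, which clears pretrain_weights:
print(repr(base.pretrain_weights))  # ''
```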
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e111471a1b37b21f6970075eb663e383b63cf99585968e3f67c2cc1507511a02
size 128760872
modeling_rf_detr.py
ADDED
@@ -0,0 +1,249 @@
from dataclasses import dataclass
from typing import List, Dict

import torch
from torchvision.transforms import Resize
from transformers import PreTrainedModel
from transformers.utils import ModelOutput, torch_int
from rfdetr import RFDETRBase, RFDETRLarge
from rfdetr.util.misc import NestedTensor

from .configuration_rf_detr import RFDetrConfig

### ONLY WORKS WITH Transformers version 4.50.3 and python 3.11

@dataclass
class RFDetrObjectDetectionOutput(ModelOutput):
    loss: torch.Tensor = None
    loss_dict: Dict[str, torch.Tensor] = None
    logits: torch.FloatTensor = None
    pred_boxes: torch.FloatTensor = None
    aux_outputs: List[Dict[str, torch.Tensor]] = None
    enc_outputs: Dict[str, torch.Tensor] = None


class RFDetrModelForObjectDetection(PreTrainedModel):
    config_class = RFDetrConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        models = {
            'RFDETRBase': RFDETRBase,
            'RFDETRLarge': RFDETRLarge,
        }
        rf_detr_model = models[config.model_name](
            out_feature_indexes = config.out_feature_indexes,
            dec_layers = config.dec_layers,
            two_stage = config.two_stage,
            bbox_reparam = config.bbox_reparam,
            lite_refpoint_refine = config.lite_refpoint_refine,
            layer_norm = config.layer_norm,
            amp = config.amp,
            num_classes = config.num_classes,
            device = config.device,
            resolution = config.resolution,
            group_detr = config.group_detr,
            gradient_checkpointing = config.gradient_checkpointing,
            num_queries = config.num_queries,
            encoder = config.encoder,
            hidden_dim = config.hidden_dim,
            sa_nheads = config.sa_nheads,
            ca_nheads = config.ca_nheads,
            dec_n_points = config.dec_n_points,
            projector_scale = config.projector_scale,
            pretrain_weights = config.pretrain_weights,
        )
        self.model = rf_detr_model.model.model
        self.criterion = rf_detr_model.model.criterion

    def compute_loss(self, outputs, labels=None):
        """
        Parameters
        ----------
        labels: list[Dict[str, torch.Tensor]]
            list of bounding boxes and labels for each image in the batch.
        outputs:
            outputs from rfdetr model
        """
        loss = None
        loss_dict = None
        #if self.model.training:
        if labels is None:
            #torch._assert(False, "targets should not be none when in training mode")
            pass
        else:
            losses = self.criterion(outputs, targets=labels)
            loss_dict = {
                'loss_fl': losses["loss_ce"],
                ### class error and cardinality error is for logging purposes only, no back propagation
                'class_error': losses["class_error"],
                'cardinality_error': losses["cardinality_error"],
                'loss_bbox': losses["loss_bbox"],
                'loss_giou': losses["loss_giou"],
            }
            loss = sum(loss_dict[k] for k in ['loss_fl', 'loss_bbox', 'loss_giou'])

        return loss, loss_dict

    def validate_labels(self, labels):
        # Check for degenerate boxes
        for label_idx, label in enumerate(labels):
            boxes = label["boxes"]
            degenerate_boxes = boxes[:, 2:] <= 0
            if degenerate_boxes.any():
                # print the first degenerate box
                bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0]
                degen_bb: List[float] = boxes[bb_idx].tolist()
                torch._assert(
                    False,
                    "All bounding boxes should have positive height and width."
                    f" Found invalid box {degen_bb} for target at index {label_idx}.",
                )
            # rename key class_labels to labels for compute_loss
            if 'class_labels' in label.keys():
                label['labels'] = label.pop('class_labels')

    def resize_labels(self, labels, h, w):
        """
        Resize boxes coordinates to model's resolution
        """
        hr = self.config.resolution / float(h)
        wr = self.config.resolution / float(w)

        for label in labels:
            boxes = label["boxes"].to(device=self.config.device, dtype=torch.float32)
            # resize boxes to model's resolution
            boxes[:, [0, 2]] *= wr
            boxes[:, [1, 3]] *= hr
            # normalize to [0, 1] by model's resolution
            boxes[:] /= self.config.resolution
            label["boxes"] = boxes
            if "labels" in label:
                label["labels"] = label["labels"].to(self.config.device)

    ### modified from https://github.com/roboflow/rf-detr/blob/develop/rfdetr/models/backbone/dinov2_with_windowed_attn.py
    def _onnx_interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images. This implementation supports torch.jit tracing while maintaining backwards compatibility
        with the original implementation.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
        - https://github.com/facebookresearch/dinov2/blob/main/dinov2/models/vision_transformer.py
        """
        position_embeddings = self.model.backbone[0].encoder.encoder.embeddings.position_embeddings
        config = self.model.backbone[0].encoder.encoder.embeddings.config

        num_patches = embeddings.shape[1] - 1
        num_positions = position_embeddings.shape[1] - 1

        # Skip interpolation for matching dimensions (unless tracing)
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return position_embeddings

        # Handle class token and patch embeddings separately
        class_pos_embed = position_embeddings[:, 0]
        patch_pos_embed = position_embeddings[:, 1:]
        dim = embeddings.shape[-1]

        # Calculate new dimensions
        height = height // config.patch_size
        width = width // config.patch_size

        # Reshape for interpolation
        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        # Store original dtype for restoration after interpolation
        target_dtype = patch_pos_embed.dtype

        # Interpolate at float32 precision
        ### disable antialiasing for ONNX export
        patch_pos_embed = torch.nn.functional.interpolate(
            patch_pos_embed.to(dtype=torch.float32),
            size=(torch_int(height), torch_int(width)),  # Explicit size instead of scale_factor
            mode="bicubic",
            align_corners=False,
            antialias=False,
        ).to(dtype=target_dtype)

        # Validate output dimensions if not tracing
        if not torch.jit.is_tracing():
            if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]:
                raise ValueError("Width or height does not match with the interpolated position embeddings")

        # Reshape back to original format
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        # Combine class and patch embeddings
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor, labels=None, **kwargs) -> ModelOutput:
        """
        Parameters
        ----------
        pixel_values : torch.Tensor
            Input tensor representing image pixel values.
        labels : Optional[List[Dict[str, torch.Tensor | List]]]
            List of annotations associated with the image or batch of images. If annotation is for object
            detection, the annotations should be a dictionary with the following keys:
            - boxes (FloatTensor[N, 4]): the ground-truth boxes in format [center_x, center_y, width, height]
            - class_labels (Int64Tensor[N]): the class label for each ground-truth box

        Returns
        -------
        RFDetrObjectDetectionOutput
            Object containing
            - loss: sum of focal loss, bounding box loss, and generalized iou loss
            - loss_dict: dictionary of losses
            - logits
            - pred_boxes
            - aux_outputs
            - enc_outputs
        """
        if torch.jit.is_tracing():
            ### disable antialiasing for ONNX export
            resize = Resize((self.config.resolution, self.config.resolution), antialias=False)
            self.model.backbone[0].encoder.encoder.embeddings.interpolate_pos_encoding = self._onnx_interpolate_pos_encoding
        else:
            resize = Resize((self.config.resolution, self.config.resolution))

        if labels is not None:
            self.validate_labels(labels)
            _, _, h, w = pixel_values.shape
            self.resize_labels(labels, h, w)  # reshape labels with model's resolution
        else:
            self.model.training = False
            self.model.transformer.training = False
            for layer in self.model.transformer.decoder.layers:
                layer.training = False
            self.criterion.training = False

        # resize pixel values and mask to model's resolution
        pixel_values = pixel_values.to(self.config.device)
        pixel_mask = pixel_mask.to(self.config.device)
        pixel_values = resize(pixel_values)
        pixel_mask = resize(pixel_mask)

        samples = NestedTensor(pixel_values, pixel_mask)
        outputs = self.model(samples)

        # compute loss, return none and empty dict if not training
        loss, loss_dict = self.compute_loss(outputs, labels)

        return RFDetrObjectDetectionOutput(
            loss=loss,
            loss_dict=loss_dict,
            logits=outputs["pred_logits"],
            pred_boxes=outputs["pred_boxes"],
            aux_outputs=outputs["aux_outputs"],
            enc_outputs=outputs["enc_outputs"],
        )


__all__ = [
    "RFDetrModelForObjectDetection"
]
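To round out the upload, a minimal, untested sketch of calling the wrapper with labels, following the `forward` docstring above (boxes are absolute `[center_x, center_y, width, height]` in the input image; the repository id is a placeholder, and the `rfdetr` package must be installed):

```python
import torch
from transformers import AutoModelForObjectDetection

repo_id = "<this-repo-id>"  # placeholder for this repository's Hub id
model = AutoModelForObjectDetection.from_pretrained(repo_id, trust_remote_code=True)

pixel_values = torch.rand(2, 3, 560, 560)
pixel_mask = torch.ones(2, 560, 560, dtype=torch.bool)

# One annotation dict per image, as described in the forward() docstring.
labels = [
    {"boxes": torch.tensor([[280.0, 280.0, 100.0, 80.0]]),
     "class_labels": torch.tensor([3])}
    for _ in range(2)
]

outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels)
print(outputs.loss)               # sum of focal, bbox, and giou losses
print(sorted(outputs.loss_dict))  # individual loss terms plus logging-only errors

# Omitting labels runs the same call in inference mode; loss comes back as None.
with torch.no_grad():
    preds = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
print(preds.logits.shape, preds.pred_boxes.shape)
```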