Upload 5 files
- LLMEyeCap_01.bin +3 -0
- model.py +893 -0
- train.py +642 -0
- tuto.ipynb +0 -0
- utils.py +569 -0
LLMEyeCap_01.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d53f80ed02bdee05882919aa81232ebf8e1af0510bfb6388dc6e616ce57db2a3
size 445770457
model.py
ADDED
@@ -0,0 +1,893 @@
import torch
from torch import nn, Tensor  # Tensor is used in the type hints below
from torch.utils.data import Dataset, DataLoader
from torchvision.models import resnet50
from torchvision import transforms
from PIL import Image
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertModel
import os
import json
import numpy as np
from collections import defaultdict
import random
from tqdm.notebook import tqdm
from torchvision import models
from torch.nn.utils.rnn import pad_sequence
import matplotlib.patches as patches

import math
import time
import os
from PIL import Image
import requests
import nltk

import os
import cv2
import colorsys
from numpy import asarray
import math


from transformers import GPT2LMHeadModel, GPT2Config

from scipy.optimize import linear_sum_assignment

import sys
sys.path.append("../src")

from utils import *

NUM_QUERIES = 40
feature_size = 256   # for ResNet50
token_size = 256     # for GPT-2

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# minimal updates here

"""
Various positional encodings for the transformer.
"""

class PositionEmbeddingSine(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention is all you need paper, generalized to work on images.
    """
    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        mask = tensor_list.mask
        assert mask is not None
        not_mask = ~mask
        y_embed = not_mask.cumsum(1, dtype=torch.float32)
        x_embed = not_mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos


class PositionEmbeddingLearned(nn.Module):
    """
    Absolute pos embedding, learned.
    """
    def __init__(self, num_pos_feats=256):
        super().__init__()
        self.row_embed = nn.Embedding(50, num_pos_feats)
        self.col_embed = nn.Embedding(50, num_pos_feats)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.uniform_(self.row_embed.weight)
        nn.init.uniform_(self.col_embed.weight)

    def forward(self, tensor_list: NestedTensor):
        x = tensor_list.tensors
        h, w = x.shape[-2:]
        i = torch.arange(w, device=x.device)
        j = torch.arange(h, device=x.device)
        x_emb = self.col_embed(i)
        y_emb = self.row_embed(j)
        pos = torch.cat([
            x_emb.unsqueeze(0).repeat(h, 1, 1),
            y_emb.unsqueeze(1).repeat(1, w, 1),
        ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
        return pos


def build_position_encoding(args):
    N_steps = args.hidden_dim // 2
    if args.position_embedding in ('v2', 'sine'):
        # TODO find a better way of exposing other arguments
        position_embedding = PositionEmbeddingSine(N_steps, normalize=True)
    elif args.position_embedding in ('v3', 'learned'):
        position_embedding = PositionEmbeddingLearned(N_steps)
    else:
        raise ValueError(f"not supported {args.position_embedding}")

    return position_embedding

from collections import OrderedDict

import torch
import torch.nn.functional as F
import torchvision
from torch import nn
from torchvision.models._utils import IntermediateLayerGetter
from typing import Dict, List


class FrozenBatchNorm2d(torch.nn.Module):
    """
    BatchNorm2d where the batch statistics and the affine parameters are fixed.

    Copy-paste from torchvision.misc.ops with added eps before rsqrt,
    without which any other models than torchvision.models.resnet[18,34,50,101]
    produce nans.
    """

    def __init__(self, n):
        super(FrozenBatchNorm2d, self).__init__()
        self.register_buffer("weight", torch.ones(n))
        self.register_buffer("bias", torch.zeros(n))
        self.register_buffer("running_mean", torch.zeros(n))
        self.register_buffer("running_var", torch.ones(n))

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        num_batches_tracked_key = prefix + 'num_batches_tracked'
        if num_batches_tracked_key in state_dict:
            del state_dict[num_batches_tracked_key]

        super(FrozenBatchNorm2d, self)._load_from_state_dict(
            state_dict, prefix, local_metadata, strict,
            missing_keys, unexpected_keys, error_msgs)

    def forward(self, x):
        # move reshapes to the beginning
        # to make it fuser-friendly
        w = self.weight.reshape(1, -1, 1, 1)
        b = self.bias.reshape(1, -1, 1, 1)
        rv = self.running_var.reshape(1, -1, 1, 1)
        rm = self.running_mean.reshape(1, -1, 1, 1)
        eps = 1e-5
        scale = w * (rv + eps).rsqrt()
        bias = b - rm * scale
        return x * scale + bias


class BackboneBase(nn.Module):

    def __init__(self, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_layers: bool):
        super().__init__()
        for name, parameter in backbone.named_parameters():
            if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
                parameter.requires_grad_(False)
        if return_interm_layers:
            return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"}
        else:
            return_layers = {'layer4': "0"}
        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
        self.num_channels = num_channels

    def forward(self, tensor_list: NestedTensor):
        xs = self.body(tensor_list.tensors)
        out: Dict[str, NestedTensor] = {}
        for name, x in xs.items():
            m = tensor_list.mask
            assert m is not None
            mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
            out[name] = NestedTensor(x, mask)
        return out

'''
The line mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] applies a mask to the output
features from the backbone. The mask is used to indicate which pixels in the image are valid.

The mask is a tensor of the same size as the output features. The m[None].float()
operation adds a leading dimension, giving a tensor of size 1 x B x H x W. The F.interpolate()
operation then resizes the mask to the same spatial size as the output features. The to(torch.bool) operation converts the
mask back to a binary tensor, and the [0] operation drops the leading dimension again, yielding one mask per output
feature map.

The mask of a feature extracted from a ResNet50 backbone is a binary tensor that indicates which pixels in the image
are valid. The valid pixels are those that are not padded. The mask lets the rest of the model ignore the padded
pixels when attending over the extracted features.
'''
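# Editor's illustrative sketch (not part of the original file): the mask resizing
# described above, on made-up shapes. A 256x256 padding mask is shrunk to the 8x8
# resolution of a ResNet50 layer4 feature map with the same F.interpolate call.
def _demo_mask_interpolation():
    m = torch.zeros(2, 256, 256, dtype=torch.bool)      # True marks padded pixels
    m[:, :, 200:] = True                                 # right-hand columns are padding
    x = torch.randn(2, 2048, 8, 8)                       # stand-in backbone feature map
    mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
    return mask.shape                                    # torch.Size([2, 8, 8])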

class Backbone(BackboneBase):
    """ResNet backbone with frozen BatchNorm."""
    def __init__(self, name: str,
                 train_backbone: bool,
                 return_interm_layers: bool,
                 dilation: bool):
        backbone = getattr(torchvision.models, name)(
            replace_stride_with_dilation=[False, False, dilation],
            pretrained=is_main_process(), norm_layer=FrozenBatchNorm2d)
        # ==> todo weights=ResNet50_Weights.DEFAULT)
        num_channels = 512 if name in ('resnet18', 'resnet34') else 2048
        super().__init__(backbone, train_backbone, num_channels, return_interm_layers)


class Joiner(nn.Sequential):
    def __init__(self, backbone, position_embedding):
        super().__init__(backbone, position_embedding)

    def forward(self, tensor_list: NestedTensor):
        xs = self[0](tensor_list)
        out: List[NestedTensor] = []
        pos = []
        for name, x in xs.items():
            out.append(x)
            # position encoding
            pos.append(self[1](x).to(x.tensors.dtype))

        return out, pos


def build_backbone(args):
    position_embedding = build_position_encoding(args)
    train_backbone = args.lr_backbone > 0
    return_interm_layers = args.masks
    backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation)
    model = Joiner(backbone, position_embedding)
    model.num_channels = backbone.num_channels
    return model

def get_sinusoid_encoding_table(n_position, d_hid):
    def cal_angle(position, hid_idx):
        return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)

    def get_posi_angle_vec(position):
        return [cal_angle(position, hid_j) for hid_j in range(d_hid)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
    return torch.FloatTensor(sinusoid_table)

class PostProcess(nn.Module):
    """ This module converts the model's output into the format expected by the coco api"""
    @torch.no_grad()
    def forward(self, outputs, target_sizes):
        """ Perform the computation
        Parameters:
            outputs: raw outputs of the model
            target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
                          For evaluation, this must be the original image size (before any data augmentation)
                          For visualization, this should be the image size after data augment, but before padding
        """
        out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']

        assert len(out_logits) == len(target_sizes)
        assert target_sizes.shape[1] == 2

        prob = F.softmax(out_logits, -1)
        scores, labels = prob[..., :-1].max(-1)

        # convert to [x0, y0, x1, y1] format
        boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
        # and from relative [0, 1] to absolute [0, height] coordinates
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]

        results = [{'scores': s, 'labels': l, 'boxes': b} for s, l, b in zip(scores, labels, boxes)]

        return results


class MLP(nn.Module):
    """ Very simple multi-layer perceptron (also called FFN)"""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x


def build(args):
    # the `num_classes` naming here is somewhat misleading.
    # it indeed corresponds to `max_obj_id + 1`, where max_obj_id
    # is the maximum id for a class in your dataset. For example,
    # COCO has a max_obj_id of 90, so we pass `num_classes` to be 91.
    # As another example, for a dataset that has a single class with id 1,
    # you should pass `num_classes` to be 2 (max_obj_id + 1).
    # For more details on this, check the following discussion
    # https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223
    num_classes = 20 if args.dataset_file != 'coco' else 91
    if args.dataset_file == "coco_panoptic":
        # for panoptic, we just add a num_classes that is large enough to hold
        # max_obj_id + 1, but the exact value doesn't really matter
        num_classes = 250
    device = torch.device(args.device)

    backbone = build_backbone(args)

    transformer = build_transformer(args)

    model = DETR(
        backbone,
        transformer,
        num_classes=num_classes,
        num_queries=args.num_queries,
        aux_loss=args.aux_loss,
    )
    if args.masks:
        model = DETRsegm(model, freeze_detr=(args.frozen_weights is not None))
    matcher = build_matcher(args)
    weight_dict = {'loss_ce': 1, 'loss_bbox': args.bbox_loss_coef}
    weight_dict['loss_giou'] = args.giou_loss_coef
    if args.masks:
        weight_dict["loss_mask"] = args.mask_loss_coef
        weight_dict["loss_dice"] = args.dice_loss_coef
    # TODO this is a hack
    if args.aux_loss:
        aux_weight_dict = {}
        for i in range(args.dec_layers - 1):
            aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()})
        weight_dict.update(aux_weight_dict)

    losses = ['labels', 'boxes', 'cardinality']
    if args.masks:
        losses += ["masks"]
    criterion = SetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict,
                             eos_coef=args.eos_coef, losses=losses)
    criterion.to(device)
    postprocessors = {'bbox': PostProcess()}
    if args.masks:
        postprocessors['segm'] = PostProcessSegm()
        if args.dataset_file == "coco_panoptic":
            is_thing_map = {i: i <= 90 for i in range(201)}
            postprocessors["panoptic"] = PostProcessPanoptic(is_thing_map, threshold=0.85)

    return model, criterion, postprocessors

class Parameters:
    def __init__(self):
        self.lr = 1e-4
        self.lr_backbone = 1e-5
        self.batch_size = 2
        self.weight_decay = 1e-4
        self.epochs = 300
        self.lr_drop = 200
        self.clip_max_norm = 0.1

args = Parameters()

args.lr = 1e-4
args.lr_backbone = 1e-5
args.batch_size = 32
args.weight_decay = 1e-4
args.epochs = 300
args.lr_drop = 200
args.clip_max_norm = 0.1  # type=float, help='gradient clipping max norm')

# Model parameters
args.frozen_weights = False  # type=str, default=None, help="Path to the pretrained model. If set, only the mask head will be trained")

# * Backbone
args.backbone = 'resnet50'  # type=str, help="Name of the convolutional backbone to use")
args.dilation = False  # action='store_true', help="If true, we replace stride with dilation in the last convolutional block (DC5)")
args.position_embedding = 'sine'  # type=str, choices=('sine', 'learned'), help="Type of positional embedding to use on top of the image features")

# * Transformer
args.enc_layers = 6  # type=int, help="Number of encoding layers in the transformer")
args.dec_layers = 6  # type=int, help="Number of decoding layers in the transformer")
args.dim_feedforward = 2048  # ===> type=int, help="Intermediate size of the feedforward layers in the transformer blocks")
args.hidden_dim = 256  # ===> type=int, help="Size of the embeddings (dimension of the transformer)")
args.dropout = 0.1  # type=float, help="Dropout applied in the transformer")
args.nheads = 8  # type=int, help="Number of attention heads inside the transformer's attentions")
args.num_queries = 40  # type=int, help="Number of query slots")
args.pre_norm = True  # action='store_true')

# * Segmentation
args.masks = False  # action='store_true', help="Train segmentation head if the flag is provided")


"""
LLMEyeCap Transformer class.

A DETR (Facebook) copy-paste from torch.nn.Transformer with modifications:
    * positional encodings are passed in MHattention
    * extra LN at the end of encoder is removed
    * decoder returns a stack of activations from all decoding layers
"""
import copy
from typing import Optional, List


class Transformer(nn.Module):

    def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False,
                 return_intermediate_dec=False):
        super().__init__()

        encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward,
                                                dropout, activation, normalize_before)
        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)

        decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward,
                                                dropout, activation, normalize_before)
        decoder_norm = nn.LayerNorm(d_model)
        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
                                          return_intermediate=return_intermediate_dec)

        self._reset_parameters()

        self.d_model = d_model
        self.nhead = nhead

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, src, mask, query_embed, pos_embed):
        # flatten NxCxHxW to HWxNxC
        bs, c, h, w = src.shape
        src = src.flatten(2).permute(2, 0, 1)
        pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
        query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)
        mask = mask.flatten(1)

        tgt = torch.zeros_like(query_embed)
        memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)
        hs = self.decoder(tgt, memory, memory_key_padding_mask=mask,
                          pos=pos_embed, query_pos=query_embed)
        return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w)


class TransformerEncoder(nn.Module):

    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src,
                mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None):
        output = src

        for layer in self.layers:
            output = layer(output, src_mask=mask,
                           src_key_padding_mask=src_key_padding_mask, pos=pos)

        if self.norm is not None:
            output = self.norm(output)

        return output


class TransformerDecoder(nn.Module):

    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        output = tgt

        intermediate = []

        for layer in self.layers:
            output = layer(output, memory, tgt_mask=tgt_mask,
                           memory_mask=memory_mask,
                           tgt_key_padding_mask=tgt_key_padding_mask,
                           memory_key_padding_mask=memory_key_padding_mask,
                           pos=pos, query_pos=query_pos)
            if self.return_intermediate:
                intermediate.append(self.norm(output))

        if self.norm is not None:
            output = self.norm(output)
            if self.return_intermediate:
                intermediate.pop()
                intermediate.append(output)

        if self.return_intermediate:
            return torch.stack(intermediate)

        return output.unsqueeze(0)


class TransformerEncoderLayer(nn.Module):

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(self,
                     src,
                     src_mask: Optional[Tensor] = None,
                     src_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None):
        q = k = self.with_pos_embed(src, pos)
        src2 = self.self_attn(q, k, value=src, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

    def forward_pre(self, src,
                    src_mask: Optional[Tensor] = None,
                    src_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None):
        src2 = self.norm1(src)
        q = k = self.with_pos_embed(src2, pos)
        src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src2 = self.norm2(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
        src = src + self.dropout2(src2)
        return src

    def forward(self, src,
                src_mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None):
        if self.normalize_before:
            return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
        return self.forward_post(src, src_mask, src_key_padding_mask, pos)


class TransformerDecoderLayer(nn.Module):

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(self, tgt, memory,
                     tgt_mask: Optional[Tensor] = None,
                     memory_mask: Optional[Tensor] = None,
                     tgt_key_padding_mask: Optional[Tensor] = None,
                     memory_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None,
                     query_pos: Optional[Tensor] = None):
        q = k = self.with_pos_embed(tgt, query_pos)
        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)
        tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos),
                                   key=self.with_pos_embed(memory, pos),
                                   value=memory, attn_mask=memory_mask,
                                   key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt

    def forward_pre(self, tgt, memory,
                    tgt_mask: Optional[Tensor] = None,
                    memory_mask: Optional[Tensor] = None,
                    tgt_key_padding_mask: Optional[Tensor] = None,
                    memory_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None,
                    query_pos: Optional[Tensor] = None):
        tgt2 = self.norm1(tgt)
        q = k = self.with_pos_embed(tgt2, query_pos)
        tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt2 = self.norm2(tgt)
        tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos),
                                   key=self.with_pos_embed(memory, pos),
                                   value=memory, attn_mask=memory_mask,
                                   key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt2 = self.norm3(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
        tgt = tgt + self.dropout3(tgt2)
        return tgt

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        if self.normalize_before:
            return self.forward_pre(tgt, memory, tgt_mask, memory_mask,
                                    tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)
        return self.forward_post(tgt, memory, tgt_mask, memory_mask,
                                 tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos)


def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


def build_transformer(args):
    return Transformer(
        d_model=args.hidden_dim,
        dropout=args.dropout,
        nhead=args.nheads,
        dim_feedforward=args.dim_feedforward,
        num_encoder_layers=args.enc_layers,
        num_decoder_layers=args.dec_layers,
        normalize_before=args.pre_norm,
        return_intermediate_dec=True,
    )


def _get_activation_fn(activation):
    """Return an activation function given a string"""
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    if activation == "glu":
        return F.glu
    raise RuntimeError(F"activation should be relu/gelu, not {activation}.")


class LLMEyeCap(nn.Module):  # Novel Object Captioning V 0.1

    def __init__(self, backbone, transformer, num_queries, vocab_size, pad_token):

        super().__init__()
        self.num_queries = num_queries
        self.transformer = transformer
        self.hidden_dim = transformer.d_model

        self.caption_embed = nn.Linear(self.hidden_dim, vocab_size)
        self.bbox_embed = MLP(self.hidden_dim, self.hidden_dim, 4, 3)

        self.query_embed = nn.Embedding(num_queries, self.hidden_dim)
        self.input_proj = nn.Conv2d(backbone.num_channels, self.hidden_dim, kernel_size=1)
        self.backbone = backbone
        '''
        self.capdecoder = CaptioningDecoder(detr_decoder_dim=transformer.d_model, token_embedding_dim=transformer.d_model,
                                            vocab_size=vocab_size, num_queries=num_queries, num_layers=6)
        '''
        self.capdecoder = CaptionDecoder(feature_size, token_size, vocab_size, num_queries, pad_token).to(device)


    def forward(self, samples: NestedTensor, captions):

        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)

        features, pos = self.backbone(samples)  # features + position embedding
        src, mask = features[-1].decompose()
        assert mask is not None
        hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0]
        outputs_coord = self.bbox_embed(hs).sigmoid()

        outputs_captions = self.capdecoder(hs, captions)
        # predicted_sequences = torch.argmax(outputs_captions, dim=-1)

        out = {'pred_logits': outputs_captions, 'pred_boxes': outputs_coord[-1]}
        return out

    def generate_caption(self, image_path, tokenizer, max_length, pad_sos):

        image = Image.open(image_path).convert('RGB')
        transform = transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

        image = transform(image).unsqueeze(0).to(device)

        if isinstance(image, (list, torch.Tensor)):
            image = nested_tensor_from_tensor_list(image)

        with torch.no_grad():
            features, pos = self.backbone(image)  # features + position embedding
            src, mask = features[-1].decompose()
            assert mask is not None

            hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0]
            outputs_coord = self.bbox_embed(hs).sigmoid()

            input_ids = torch.ones((1, 40, 1), dtype=torch.long, device=device)
            input_ids.fill_(pad_sos)

            for i in range(max_length):
                outputs_captions = self.capdecoder(hs, input_ids)
                predicted_sequences = torch.argmax(outputs_captions, dim=-1)
                next_token = predicted_sequences[:, :, -1:]  # take the last token from the sequence
                input_ids = torch.cat((input_ids, next_token), dim=-1)

            # caption = tokenizer.detokenize(input_ids[0].tolist())  #, skip_special_tokens=True)

        return outputs_coord[-1], input_ids  # caption[-1]

class LLMEyeCapModel(nn.Module):
    def __init__(self, num_queries, vocab_size, pad_token):
        super(LLMEyeCapModel, self).__init__()
        self.num_queries = num_queries
        self.vocab_size = vocab_size
        self.backbone = build_backbone(args)
        self.transformer = build_transformer(args)

        self.model = LLMEyeCap(
            self.backbone,
            self.transformer,
            num_queries=self.num_queries,
            vocab_size=self.vocab_size,
            pad_token=pad_token
        )

        # self.in_features = self.caption_embed.in_features

        # self.model.class_embed = nn.Linear(in_features=self.in_features, out_features=self.num_classes)

        self.model.num_queries = self.num_queries

    def forward(self, images, captions):
        return self.model(images, captions)

    def generate_caption(self, image_path, tokenizer, max_length=20, pad_sos=0):
        return self.model.generate_caption(image_path, tokenizer, max_length, pad_sos)

class CaptionDecoder(nn.Module):
    def __init__(self, detr_decoder_dim, token_embedding_dim, vocab_size, num_queries, pad_token, num_layers=6):
        super(CaptionDecoder, self).__init__()

        self.detr_decoder_dim = detr_decoder_dim
        self.token_embedding_dim = token_embedding_dim
        self.vocab_size = vocab_size
        self.num_queries = num_queries
        self.pad_token = pad_token

        # Token embedding layer
        self.token_embedding = nn.Embedding(vocab_size, token_embedding_dim)

        # Initialize GPT-2
        config = GPT2Config(vocab_size=vocab_size, n_embd=detr_decoder_dim + token_embedding_dim, n_head=8)
        self.gpt2 = GPT2LMHeadModel(config)

        self.target_projection = nn.Linear(token_embedding_dim, detr_decoder_dim + token_embedding_dim)

    def forward(self, detr_output, captions):

        # Create an attention mask with shape [batch_size, num_queries, sequence_length]
        attention_mask = (captions != self.pad_token).float().to(captions.device)  # [batch_size, num_queries, sequence_length]

        seq_length = captions.size(2)
        pos_encoding = get_sinusoid_encoding_table(seq_length, self.token_embedding_dim).to(captions.device)
        pos_encoding = pos_encoding.unsqueeze(0).repeat(captions.size(0) * self.num_queries, 1, 1)

        # Get the last layer's output from the DETR decoder
        spatial_embedding = detr_output[-1]  # [batch_size, num_queries, detr_decoder_dim]

        # Get token embeddings
        token_embeddings = self.token_embedding(captions)  # [batch_size, num_queries, seq_length, token_embedding_dim]

        # Repeat the spatial embedding for each token in the sequence and concatenate
        spatial_embedding = spatial_embedding.unsqueeze(2)  # Add seq_length dimension: [batch_size, num_queries, 1, detr_decoder_dim]
        combined_embedding = torch.cat([spatial_embedding.repeat(1, 1, token_embeddings.size(2), 1), token_embeddings], dim=-1)
        # combined_embedding shape: [batch_size, num_queries, seq_length, detr_decoder_dim + token_embedding_dim]

        # Prepare the memory for the transformer decoder
        memory = combined_embedding.permute(2, 0, 1, 3).reshape(captions.size(2), -1, self.detr_decoder_dim + self.token_embedding_dim)
        # memory shape: [seq_length, batch_size*num_queries, detr_decoder_dim + token_embedding_dim]

        # Prepare the target for the transformer decoder (using token embeddings)
        target = token_embeddings.permute(2, 0, 1, 3).reshape(captions.size(2), -1, self.token_embedding_dim)
        # target shape: [seq_length, batch_size*num_queries, token_embedding_dim]

        pos_encoding = pos_encoding.permute(1, 0, 2)
        target += pos_encoding

        # Project target to the required dimension
        target = self.target_projection(target)

        attention_mask = attention_mask.permute(2, 0, 1).reshape(captions.size(2), -1)
        tgt_key_padding_mask = (attention_mask == 0).permute(1, 0)

        # Prepare the inputs for GPT-2
        inputs_embeds = combined_embedding.permute(2, 0, 1, 3).reshape(captions.size(2), -1, self.detr_decoder_dim + self.token_embedding_dim)

        # Reshape attention_mask for GPT-2. Flatten the batch_size and num_queries dimensions.
        attention_mask = attention_mask.reshape(-1, captions.size(2))  # New shape: [batch_size * num_queries, sequence_length]

        # Pass through GPT-2
        outputs = self.gpt2(inputs_embeds=inputs_embeds, attention_mask=attention_mask)
        logits = outputs.logits

        # Reshape logits to match the original shape
        logits = logits.view(captions.size(2), captions.size(0), self.num_queries, self.vocab_size).permute(1, 2, 0, 3)

        return logits
train.py
ADDED
@@ -0,0 +1,642 @@
import torch
from torch import nn
import torch.nn.functional as F  # needed for F.pad in train_fn below
from torch.utils.data import Dataset, DataLoader
from torchvision.models import resnet50
from torchvision import transforms
from PIL import Image
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertModel
import os
import json
import numpy as np
from collections import defaultdict
import random
from tqdm.notebook import tqdm
from torchvision import models
from torch.nn.utils.rnn import pad_sequence
import matplotlib.patches as patches

import math
import time
import os
from PIL import Image
import requests
import nltk

import os
import cv2
import colorsys
from numpy import asarray
import math


from transformers import GPT2LMHeadModel, GPT2Config

from transformers import BertTokenizer


from scipy.optimize import linear_sum_assignment




class CocoDataset(Dataset):
    def __init__(self, root_dir, annotation_file, instance_file, max_objects=40, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.max_objects = max_objects
        self.img_cache = dict()  # Cache for images

        # Load instance file only once
        with open(instance_file, 'r') as file:
            data = json.load(file)
            instances = data['annotations']
            categories = data['categories']

        with open(annotation_file, 'r') as file:
            annotations = json.load(file)['annotations']

        self.image_captions = defaultdict(list)
        for annotation in annotations:
            img_id = annotation['image_id']
            self.image_captions[img_id].append(annotation['caption'])

        self.image_instances = defaultdict(list)
        self.category_id_to_name = {category['id']: category['name'] for category in categories}

        for instance in instances:
            img_id = instance['image_id']
            bbox = instance['bbox']
            category_id = instance['category_id']
            self.image_instances[img_id].append((bbox, category_id))

        self.img_ids = list(self.image_captions.keys())

    def __len__(self):
        return len(self.img_ids)

    def __getitem__(self, index):
        img_id = self.img_ids[index]
        img_path = os.path.join(self.root_dir, f'{str(img_id).zfill(12)}.jpg')

        # Use cached image if available
        if img_id in self.img_cache:
            img = self.img_cache[img_id]
        else:
            img = Image.open(img_path).convert("RGB")
            self.img_cache[img_id] = img

        captions = self.image_captions[img_id]
        caption = random.choice(captions)

        annotations = self.image_instances[img_id]
        bboxes = []
        labels = []
        for obbox, label_id in annotations:
            bbox = torch.tensor(obbox)  # Convert to PyTorch tensor immediately
            bbox[0] = bbox[0] / img.width + (bbox[2] / img.width) / 2
            bbox[1] = bbox[1] / img.height + (bbox[3] / img.height) / 2
            bbox[2] = bbox[2] / img.width
            bbox[3] = bbox[3] / img.height
            label = self.category_id_to_name[label_id]
            bboxes.append(bbox)
            labels.append(label)

        bboxes.append(torch.tensor([0.5, 0.5, 1, 1]))
        labels.append(caption)

        total_boxes = len(bboxes)

        if total_boxes < 40:
            for _ in range(40 - total_boxes):
                bboxes.append(torch.tensor([0, 0, 0, 0]))
                labels.append("na")
        else:
            bboxes = bboxes[:40]
            labels = labels[:40]

        if self.transform:
            img = self.transform(img)

        return img, bboxes, labels

# Define the image transforms
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


def custom_collate(batch):
    images, boxes_list, labels_list = zip(*batch)

    # Convert list of PIL images to a single PyTorch tensor
    stacked_images = torch.stack(images)

    # Convert list of list of boxes to a list of PyTorch tensors
    stacked_boxes = [torch.stack([box.clone().detach() for box in boxes]) for boxes in boxes_list]

    # Since labels are strings, we can keep them as a list of lists
    # labels_list is already in the desired format

    return stacked_images, stacked_boxes, labels_list

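# Editor's sketch (not part of the original file): wiring CocoDataset and
# custom_collate above into a DataLoader. The COCO paths and the batch size
# are placeholder assumptions.
def _build_train_loader(batch_size=32):
    dataset = CocoDataset(root_dir="coco/train2017",
                          annotation_file="coco/annotations/captions_train2017.json",
                          instance_file="coco/annotations/instances_train2017.json",
                          transform=transform)
    # Each batch yields: images [B, 3, 256, 256], a list of B box tensors of shape
    # [40, 4], and a tuple of B label lists (40 strings each: names, one caption, "na" padding).
    return DataLoader(dataset, batch_size=batch_size, shuffle=True,
                      collate_fn=custom_collate)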
def train_fn(data_loader, model, criterion, optimizer, device, scheduler, epoch):
    model.train()
    criterion.train()
    summary_loss = AverageMeter()

    tk0 = tqdm(data_loader, total=len(data_loader) - 1)

    for step, (images, bboxes, captions) in enumerate(tk0):

        try:
            flattened_captions = [caption for sublist in captions for caption in sublist]
            captions = tokenizer(flattened_captions, padding=True, return_tensors="pt", truncation=True)
            captions = captions["input_ids"]
            input_ids = captions.reshape(batch_size, num_queries, -1).to(device)
            min_length = 2
        except RuntimeError as e:
            print("Reshape failed:", e)
            continue

        '''
        min_length = 2
        if input_ids.size(-1) < min_length:
            padding_needed = min_length - input_ids.size(-1)
            input_ids = F.pad(input_ids, (0, padding_needed), 'constant', PAD_TOKEN)

        targets = build_targets(bboxes, input_ids[:, :, 1:])
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        images = list(image.to(device) for image in images)

        output = model(images, input_ids[:, :, :-1])
        '''

        min_length = 2
        if input_ids.size(-1) < min_length:
            padding_needed = min_length - input_ids.size(-1)
            input_ids = F.pad(input_ids, (0, padding_needed), 'constant', PAD_TOKEN)

        # input_ids = captions["input_ids"]
        # input_ids = input_ids.reshape(batch_size, num_queries, -1).to(device)

        targets = build_targets(bboxes, input_ids[:, :, 1:])

        # targets = build_targets(bboxes, captions[:, :, 1:])

        images = list(image.to(device) for image in images)

        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        output = model(images, input_ids[:, :, :-1])

        loss_dict = criterion(output, targets)
        weight_dict = criterion.weight_dict

        losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if scheduler is not None:
            scheduler.step()

        # Detach and delete tensors
        loss_dict = {k: v.detach() for k, v in loss_dict.items()}

        del images, bboxes, captions, output, targets, loss_dict
        torch.cuda.empty_cache()  # Clear cache

        summary_loss.update(losses.item(), BATCH_SIZE)
        tk0.set_postfix(loss=summary_loss.avg)

    return summary_loss

class HungarianMatcher(nn.Module):
    """This class computes an assignment between the targets and the predictions of the network

    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
    while the others are un-matched (and thus treated as non-objects).
    """

    def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1):
        """Creates the matcher

        Params:
            cost_class: This is the relative weight of the classification error in the matching cost
            cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
            cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
        """
        super().__init__()
        self.cost_class = cost_class
        self.cost_bbox = cost_bbox
        self.cost_giou = cost_giou
        assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs can't be 0"

    @torch.no_grad()
    def forward(self, outputs, targets):
        bs, num_queries = outputs["pred_logits"].shape[:2]

        # We flatten to compute the cost matrices in a batch
        # out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1)  # [batch_size * num_queries, num_classes]

        out_prob = outputs["pred_logits"].flatten(0, 2).softmax(-1)  # [batch_size * num_queries * seq_length, vocab_size]

        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]

        # Also concat the target labels and boxes
        tgt_ids = torch.cat([v["labels"] for v in targets])
        tgt_bbox = torch.cat([v["boxes"] for v in targets])

        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
        # but approximate it in 1 - proba[target class].
        # The 1 is a constant that doesn't change the matching, it can be omitted.
        cost_class = -out_prob[:, tgt_ids]

        # Compute the L1 cost between boxes
        cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)

        # Compute the giou cost between boxes
        cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))

        # Final cost matrix
        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class.mean() + self.cost_giou * cost_giou
        # C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
        C = C.view(bs, num_queries, -1).cpu()

        sizes = [len(v["boxes"]) for v in targets]
        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]



def build_matcher(args):
    return HungarianMatcher(cost_class=args.set_cost_class, cost_bbox=args.set_cost_bbox, cost_giou=args.set_cost_giou)

296 |
+
class SetCriterion(nn.Module):
    """This class computes the loss for DETR.
    The process happens in two steps:
        1) we compute hungarian assignment between ground truth boxes and the outputs of the model
        2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
    """

    def __init__(self, vocab_size, matcher, weight_dict, eos_coef, losses, pad_token):
        """Create the criterion.
        Parameters:
            vocab_size: size of the caption vocabulary (it plays the role of DETR's number of
                object categories, omitting the special no-object category)
            matcher: module able to compute a matching between targets and proposals
            weight_dict: dict containing as key the names of the losses and as values their relative weight.
            eos_coef: relative classification weight applied to the no-object category
            losses: list of all the losses to be applied. See get_loss for list of available losses.
            pad_token: token id ignored by the classification cross-entropy (caption padding)
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.matcher = matcher
        self.weight_dict = weight_dict
        self.eos_coef = eos_coef
        self.losses = losses
        self.pad_token = pad_token
        empty_weight = torch.ones(self.vocab_size)
        # empty_weight[-1] = self.eos_coef
        self.register_buffer('empty_weight', empty_weight)
        self.criterion = nn.CrossEntropyLoss(ignore_index=pad_token)

    def loss_labels(self, outputs, targets, indices, num_boxes, log=False):
        """Classification loss (NLL) for sequences
        targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes, seq_length]
        """
        assert 'pred_logits' in outputs
        src_logits = outputs['pred_logits']
        batch_size, num_boxes, sequence_length, _ = src_logits.size()

        # Get the indices for the permutation
        batch_idx, src_idx = self._get_src_permutation_idx(indices)

        target_classes = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])

        # Ensure the target classes are valid
        assert (target_classes >= 0).all() and (target_classes < self.vocab_size).all(), "Invalid token index in target!"

        # loss_ce = criterion(outputs.reshape(-1, vocab_size), captions.view(-1))
        loss_ce = self.criterion(src_logits.reshape(batch_size * num_boxes * sequence_length, -1), target_classes.reshape(-1))

        # loss_ce = torchmetrics.functional.smooth_cross_entropy(src_logits[batch_idx], target_classes, ignore_index=PAD_TOKEN)
        losses = {'loss_ce': loss_ce}

        return losses

        '''
        criterion = nn.CrossEntropyLoss(ignore_index=self.PAD_TOKEN)
        loss_ce = criterion(src_logits, target_classes_for_loss)
        losses = {'loss_ce': loss_ce}
        '''

    @torch.no_grad()
    def loss_cardinality(self, outputs, targets, indices, num_boxes):
        """ Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes
        This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients
        """
        pred_logits = outputs['pred_logits']
        device = pred_logits.device
        tgt_lengths = torch.as_tensor([len(v["labels"]) for v in targets], device=device)
        # Count the number of predictions that are NOT "no-object" (which is the last class)
        card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1)

        card_pred = card_pred.sum(dim=1)

        card_err = F.l1_loss(card_pred.float(), tgt_lengths.float())
        losses = {'cardinality_error': card_err}
        return losses

    def loss_boxes(self, outputs, targets, indices, num_boxes):
        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
        targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
        The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
        """
        assert 'pred_boxes' in outputs
        idx = self._get_src_permutation_idx(indices)

        src_boxes = outputs['pred_boxes'][idx]
        target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)

        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')

        losses = {}
        losses['loss_bbox'] = loss_bbox.sum() / num_boxes

        loss_giou = 1 - torch.diag(generalized_box_iou(
            box_cxcywh_to_xyxy(src_boxes),
            box_cxcywh_to_xyxy(target_boxes)))
        losses['loss_giou'] = loss_giou.sum() / num_boxes
        return losses

    def loss_masks(self, outputs, targets, indices, num_boxes):
        """Compute the losses related to the masks: the focal loss and the dice loss.
        targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
        """
        assert "pred_masks" in outputs

        src_idx = self._get_src_permutation_idx(indices)
        tgt_idx = self._get_tgt_permutation_idx(indices)
        src_masks = outputs["pred_masks"]
        src_masks = src_masks[src_idx]
        masks = [t["masks"] for t in targets]
        # TODO use valid to mask invalid areas due to padding in loss
        target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
        target_masks = target_masks.to(src_masks)
        target_masks = target_masks[tgt_idx]

        # upsample predictions to the target size
        src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:],
                                mode="bilinear", align_corners=False)
        src_masks = src_masks[:, 0].flatten(1)

        target_masks = target_masks.flatten(1)
        target_masks = target_masks.view(src_masks.shape)
        losses = {
            "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes),
            "loss_dice": dice_loss(src_masks, target_masks, num_boxes),
        }
        return losses

    def _get_src_permutation_idx(self, indices):
        # permute predictions following indices
        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
        src_idx = torch.cat([src for (src, _) in indices])
        return batch_idx, src_idx

    def _get_tgt_permutation_idx(self, indices):
        # permute targets following indices
        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
        return batch_idx, tgt_idx

    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
        loss_map = {
            'labels': self.loss_labels,
            'cardinality': self.loss_cardinality,
            'boxes': self.loss_boxes,
            'masks': self.loss_masks
        }
        assert loss in loss_map, f'do you really want to compute {loss} loss?'
        return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)

    def forward(self, outputs, targets):
        """This performs the loss computation.
        Parameters:
            outputs: dict of tensors, see the output specification of the model for the format
            targets: list of dicts, such that len(targets) == batch_size.
                     The expected keys in each dict depend on the losses applied, see each loss' doc
        """
        outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'}

        # Retrieve the matching between the outputs of the last layer and the targets
        indices = self.matcher(outputs_without_aux, targets)

        # print("indice len", len(indices), "len(indices[0])", len(indices[0]))
        # print("shape indices 0 0", indices[0][0].shape, "shape indices 0 1", indices[0][1].shape)

        # Compute the average number of target boxes across all nodes, for normalization purposes
        num_boxes = sum(len(t["labels"]) for t in targets)
        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
        if is_dist_avail_and_initialized():
            torch.distributed.all_reduce(num_boxes)
        num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()
        # print("num_boxes", num_boxes)

        # Compute all the requested losses
        losses = {}
        for loss in self.losses:
            losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
        '''
        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
        if 'aux_outputs' in outputs:
            for i, aux_outputs in enumerate(outputs['aux_outputs']):
                indices = self.matcher(aux_outputs, targets)
                for loss in self.losses:
                    if loss == 'masks':
                        # Intermediate masks losses are too costly to compute, we ignore them.
                        continue
                    kwargs = {}
                    if loss == 'labels':
                        # Logging is enabled only for the last layer
                        kwargs = {'log': False}
                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs)
                    l_dict = {k + f'_{i}': v for k, v in l_dict.items()}
                    losses.update(l_dict)
        '''
        return losses

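# Usage sketch (illustrative only): how the per-loss dictionary returned by
# SetCriterion.forward is combined into a single scalar, mirroring the eval loop below.
# The weights, eos_coef and loss list here are placeholder assumptions, not the
# training configuration.
def _set_criterion_example(outputs, targets, vocab_size, pad_token_id):
    matcher = HungarianMatcher()
    weights = {'loss_ce': 1, 'loss_bbox': 1, 'loss_giou': 1}
    criterion = SetCriterion(vocab_size, matcher=matcher, weight_dict=weights,
                             eos_coef=0.1, losses=['labels', 'boxes', 'cardinality'],
                             pad_token=pad_token_id)
    loss_dict = criterion(outputs, targets)
    total_loss = sum(loss_dict[k] * weights[k] for k in loss_dict if k in weights)
    return total_loss, loss_dict
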
def eval_fn(data_loader, model, criterion, device):
    model.eval()
    criterion.eval()
    summary_loss = AverageMeter()

    with torch.no_grad():

        # tk0 = tqdm(data_loader, total=len(data_loader))
        # for step, (images, bboxes, captions) in enumerate(tk0):
        # pbar = tqdm(range(len(data_loader)))

        tk0 = tqdm(data_loader, total=len(data_loader) - 1)
        for step, (images, bboxes, captions) in enumerate(tk0):

            try:
                flattened_captions = [caption for sublist in captions for caption in sublist]
                captions = tokenizer(flattened_captions, padding=True, return_tensors="pt", truncation=True)
                captions = captions["input_ids"]
                input_ids = captions.reshape(batch_size, num_queries, -1).to(device)
                min_length = 2
            except RuntimeError as e:
                print("Reshape failed:", e)
                continue

            if input_ids.size(-1) < min_length:
                padding_needed = min_length - input_ids.size(-1)
                input_ids = F.pad(input_ids, (0, padding_needed), 'constant', PAD_TOKEN)

            # input_ids = captions["input_ids"]
            # input_ids = input_ids.reshape(batch_size, num_queries, -1).to(device)

            targets = build_targets(bboxes, input_ids[:, :, 1:])
            # targets = build_targets(bboxes, captions[:, :, 1:])

            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            output = model(images, input_ids[:, :, :-1])

            loss_dict = criterion(output, targets)
            weight_dict = criterion.weight_dict

            losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)

            summary_loss.update(losses.item(), BATCH_SIZE)

            # Detach and delete tensors
            loss_dict = {k: v.detach() for k, v in loss_dict.items()}
            del images, bboxes, captions, output, targets, loss_dict
            torch.cuda.empty_cache()  # Clear cache

            tk0.set_postfix(loss=summary_loss.avg)
        # data_loader.on_epoch_end()

    return summary_loss

def build_targets(bboxes, captions):
    targets = []
    for i, (bbox, caption) in enumerate(zip(bboxes, captions)):
        target = {
            "boxes": bbox,
            "labels": caption,
        }
        targets.append(target)
    return targets

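# Illustrative example (shapes are assumptions): build_targets simply pairs each image's
# boxes with the token ids of its per-box captions.
def _build_targets_example():
    bboxes = [torch.rand(5, 4), torch.rand(5, 4)]   # one [num_queries, 4] tensor per image
    captions = torch.randint(0, 30522, (2, 5, 6))   # [batch, num_queries, seq_len] token ids
    targets = build_targets(bboxes, captions)
    # targets[0]["boxes"].shape == (5, 4); targets[0]["labels"].shape == (5, 6)
    return targets
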
if __name__ == "__main__":

    # Create the datasets
    train_dataset = CocoDataset(root_dir="../data/coco91/train2017",
                                annotation_file="../data/coco91/annotations/captions_train2017.json",
                                instance_file="../data/coco91/annotations/instances_train2017.json",
                                transform=transform)
    val_dataset = CocoDataset(root_dir="../data/coco91/val2017",
                              annotation_file="../data/coco91/annotations/captions_val2017.json",
                              instance_file="../data/coco91/annotations/instances_val2017.json",
                              transform=transform)

    batch_size = 4
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=custom_collate)

    # Initialize the BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Get the padding token ID
    # PAD_TOKEN = tokenizer.pad_token
    PAD_TOKEN = tokenizer.pad_token_id

    # Get the start-of-sequence token ID
    # For BERT, the start-of-sequence token is usually the [CLS] token
    # start_of_sequence_token = tokenizer.cls_token
    PAD_SOS = tokenizer.cls_token_id

    # Get the vocabulary size
    vocab_size = tokenizer.vocab_size

    print(f"Pad token ID: {PAD_TOKEN}")
    print(f"Start of Sequence token ID: {PAD_SOS}")
    print(f"Vocab size: {vocab_size}")

    matcher = HungarianMatcher()

    weight_dict = {'loss_ce': 1, 'loss_bbox': 1, 'loss_giou': 1}

    losses = ['labels', 'boxes', 'cardinality']

    criterion = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)

    model = LLMEyaCapModel(num_queries=NUM_QUERIES, vocab_size=vocab_size)
    model = model.to(device)

    criterion = SetCriterion(vocab_size, matcher=matcher, weight_dict=weight_dict,
                             eos_coef=NULL_CLASS_COEF, losses=losses, pad_token=PAD_TOKEN)
    criterion = criterion.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

    best_loss = 10**5

    LR = 2e-6
    # LR = 2e-4
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)  # , weight_decay=0.0001)
    EPOCHS = 1
    num_queries = NUM_QUERIES
    batch_size = 4

    for epoch in range(EPOCHS):
        time_start = time.time()
        train_loss = train_fn(train_loader, model, criterion, optimizer, device, scheduler=None, epoch=epoch)
        valid_loss = eval_fn(val_loader, model, criterion, device)

        elapsed = time.time() - time_start
        chk_name = f'LLMEyeCap_01_e{epoch}.bin'
        torch.save(model.state_dict(), chk_name)
        print(f"[Epoch {epoch+1:2d} / {EPOCHS:2d}] Train loss: {train_loss.avg:.3f}. Val loss: {valid_loss.avg:.3f} --> {chk_name} [{elapsed/60:.0f} mins]")

        if valid_loss.avg < best_loss:
            best_loss = valid_loss.avg
            print(f'Best model found in epoch {epoch+1}........Saving Model')
            torch.save(model.state_dict(), 'LLMEyeCap_01_model.bin')
tuto.ipynb ADDED
The diff for this file is too large to render. See raw diff.
utils.py ADDED
@@ -0,0 +1,569 @@
from torchvision.ops.boxes import box_area


def box_cxcywh_to_xyxy(x):
    x_c, y_c, w, h = x.unbind(-1)
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
         (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=-1)


def box_xyxy_to_cxcywh(x):
    x0, y0, x1, y1 = x.unbind(-1)
    b = [(x0 + x1) / 2, (y0 + y1) / 2,
         (x1 - x0), (y1 - y0)]
    return torch.stack(b, dim=-1)


# modified from torchvision to also return the union
def box_iou_2(boxes1, boxes2):
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    wh = (rb - lt).clamp(min=0)  # [N,M,2]
    inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]

    union = area1[:, None] + area2 - inter

    iou = inter / union
    return iou, union

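# Usage sketch (illustrative): the two conversions above are inverses of each other for
# well-formed boxes. The values are arbitrary.
def _box_format_example():
    boxes_xyxy = torch.tensor([[10.0, 10.0, 50.0, 30.0]])
    boxes_cxcywh = box_xyxy_to_cxcywh(boxes_xyxy)   # -> [[30., 20., 40., 20.]]
    assert torch.allclose(box_cxcywh_to_xyxy(boxes_cxcywh), boxes_xyxy)
    return boxes_cxcywh
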
def generalized_box_iou(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/

    The boxes should be in [x0, y0, x1, y1] format

    Returns a [N, M] pairwise matrix, where N = len(boxes1)
    and M = len(boxes2)
    """
    # degenerate boxes gives inf / nan results
    # so do an early check
    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
    iou, union = box_iou_2(boxes1, boxes2)

    lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

    wh = (rb - lt).clamp(min=0)  # [N,M,2]
    area = wh[:, :, 0] * wh[:, :, 1]

    return iou - (area - union) / area

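# Usage sketch (illustrative): GIoU is bounded above by IoU and becomes increasingly
# negative as boxes move apart; boxes are in [x0, y0, x1, y1] format.
def _generalized_box_iou_example():
    a = torch.tensor([[0.0, 0.0, 2.0, 2.0]])
    b = torch.tensor([[1.0, 1.0, 3.0, 3.0],   # overlaps a
                      [4.0, 4.0, 5.0, 5.0]])  # disjoint from a
    giou = generalized_box_iou(a, b)          # shape [1, 2]; giou[0, 0] > giou[0, 1]
    return giou
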
def masks_to_boxes(masks):
    """Compute the bounding boxes around the provided masks

    The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.

    Returns a [N, 4] tensors, with the boxes in xyxy format
    """
    if masks.numel() == 0:
        return torch.zeros((0, 4), device=masks.device)

    h, w = masks.shape[-2:]

    y = torch.arange(0, h, dtype=torch.float)
    x = torch.arange(0, w, dtype=torch.float)
    y, x = torch.meshgrid(y, x)

    x_mask = (masks * x.unsqueeze(0))
    x_max = x_mask.flatten(1).max(-1)[0]
    x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]

    y_mask = (masks * y.unsqueeze(0))
    y_max = y_mask.flatten(1).max(-1)[0]
    y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]

    return torch.stack([x_min, y_min, x_max, y_max], 1)

"""
|
85 |
+
Misc functions, including distributed helpers.
|
86 |
+
|
87 |
+
Mostly copy-paste from torchvision references.
|
88 |
+
"""
|
89 |
+
import os
|
90 |
+
import subprocess
|
91 |
+
import time
|
92 |
+
from collections import defaultdict, deque
|
93 |
+
import datetime
|
94 |
+
import pickle
|
95 |
+
from packaging import version
|
96 |
+
from typing import Optional, List
|
97 |
+
|
98 |
+
import torch
|
99 |
+
import torch.distributed as dist
|
100 |
+
from torch import Tensor
|
101 |
+
|
102 |
+
# needed due to empty tensor bug in pytorch and torchvision 0.5
|
103 |
+
import torchvision
|
104 |
+
if version.parse(torchvision.__version__) < version.parse('0.7'):
|
105 |
+
from torchvision.ops import _new_empty_tensor
|
106 |
+
from torchvision.ops.misc import _output_size
|
107 |
+
|
108 |
+
|
109 |
+
class SmoothedValue(object):
    """Track a series of values and provide access to smoothed values over a
    window or the global series average.
    """

    def __init__(self, window_size=20, fmt=None):
        if fmt is None:
            fmt = "{median:.4f} ({global_avg:.4f})"
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0
        self.fmt = fmt

    def update(self, value, n=1):
        self.deque.append(value)
        self.count += n
        self.total += value * n

    def synchronize_between_processes(self):
        """
        Warning: does not synchronize the deque!
        """
        if not is_dist_avail_and_initialized():
            return
        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
        dist.barrier()
        dist.all_reduce(t)
        t = t.tolist()
        self.count = int(t[0])
        self.total = t[1]

    @property
    def median(self):
        d = torch.tensor(list(self.deque))
        return d.median().item()

    @property
    def avg(self):
        d = torch.tensor(list(self.deque), dtype=torch.float32)
        return d.mean().item()

    @property
    def global_avg(self):
        return self.total / self.count

    @property
    def max(self):
        return max(self.deque)

    @property
    def value(self):
        return self.deque[-1]

    def __str__(self):
        return self.fmt.format(
            median=self.median,
            avg=self.avg,
            global_avg=self.global_avg,
            max=self.max,
            value=self.value)

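# Usage sketch (illustrative): SmoothedValue keeps a sliding window of recent values
# plus running totals for the global average.
def _smoothed_value_example():
    sv = SmoothedValue(window_size=3, fmt="{median:.2f} ({global_avg:.2f})")
    for v in [1.0, 2.0, 3.0, 4.0]:
        sv.update(v)
    # window holds [2.0, 3.0, 4.0] -> median 3.0; global_avg over all four values is 2.5
    return str(sv)
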
def all_gather(data):
    """
    Run all_gather on arbitrary picklable data (not necessarily tensors)
    Args:
        data: any picklable object
    Returns:
        list[data]: list of data gathered from each rank
    """
    world_size = get_world_size()
    if world_size == 1:
        return [data]

    # serialized to a Tensor
    buffer = pickle.dumps(data)
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to("cuda")

    # obtain Tensor size of each rank
    local_size = torch.tensor([tensor.numel()], device="cuda")
    size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
    dist.all_gather(size_list, local_size)
    size_list = [int(size.item()) for size in size_list]
    max_size = max(size_list)

    # receiving Tensor from all ranks
    # we pad the tensor because torch all_gather does not support
    # gathering tensors of different shapes
    tensor_list = []
    for _ in size_list:
        tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
    if local_size != max_size:
        padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
        tensor = torch.cat((tensor, padding), dim=0)
    dist.all_gather(tensor_list, tensor)

    data_list = []
    for size, tensor in zip(size_list, tensor_list):
        buffer = tensor.cpu().numpy().tobytes()[:size]
        data_list.append(pickle.loads(buffer))

    return data_list

def reduce_dict(input_dict, average=True):
    """
    Args:
        input_dict (dict): all the values will be reduced
        average (bool): whether to do average or sum
    Reduce the values in the dictionary from all processes so that all processes
    have the averaged results. Returns a dict with the same fields as
    input_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        names = []
        values = []
        # sort the keys so that they are consistent across processes
        for k in sorted(input_dict.keys()):
            names.append(k)
            values.append(input_dict[k])
        values = torch.stack(values, dim=0)
        dist.all_reduce(values)
        if average:
            values /= world_size
        reduced_dict = {k: v for k, v in zip(names, values)}
    return reduced_dict

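# Usage sketch (illustrative): in a single-process run reduce_dict is a no-op and returns
# the input unchanged; under torch.distributed it averages (or sums) each value across ranks.
def _reduce_dict_example(loss_dict):
    # e.g. loss_dict = {"loss_ce": torch.tensor(0.7), "loss_bbox": torch.tensor(0.2)}
    return reduce_dict(loss_dict, average=True)
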
class MetricLogger(object):
    def __init__(self, delimiter="\t"):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter

    def update(self, **kwargs):
        for k, v in kwargs.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            assert isinstance(v, (float, int))
            self.meters[k].update(v)

    def __getattr__(self, attr):
        if attr in self.meters:
            return self.meters[attr]
        if attr in self.__dict__:
            return self.__dict__[attr]
        raise AttributeError("'{}' object has no attribute '{}'".format(
            type(self).__name__, attr))

    def __str__(self):
        loss_str = []
        for name, meter in self.meters.items():
            loss_str.append(
                "{}: {}".format(name, str(meter))
            )
        return self.delimiter.join(loss_str)

    def synchronize_between_processes(self):
        for meter in self.meters.values():
            meter.synchronize_between_processes()

    def add_meter(self, name, meter):
        self.meters[name] = meter

    def log_every(self, iterable, print_freq, header=None):
        i = 0
        if not header:
            header = ''
        start_time = time.time()
        end = time.time()
        iter_time = SmoothedValue(fmt='{avg:.4f}')
        data_time = SmoothedValue(fmt='{avg:.4f}')
        space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
        if torch.cuda.is_available():
            log_msg = self.delimiter.join([
                header,
                '[{0' + space_fmt + '}/{1}]',
                'eta: {eta}',
                '{meters}',
                'time: {time}',
                'data: {data}',
                'max mem: {memory:.0f}'
            ])
        else:
            log_msg = self.delimiter.join([
                header,
                '[{0' + space_fmt + '}/{1}]',
                'eta: {eta}',
                '{meters}',
                'time: {time}',
                'data: {data}'
            ])
        MB = 1024.0 * 1024.0
        for obj in iterable:
            data_time.update(time.time() - end)
            yield obj
            iter_time.update(time.time() - end)
            if i % print_freq == 0 or i == len(iterable) - 1:
                eta_seconds = iter_time.global_avg * (len(iterable) - i)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                if torch.cuda.is_available():
                    print(log_msg.format(
                        i, len(iterable), eta=eta_string,
                        meters=str(self),
                        time=str(iter_time), data=str(data_time),
                        memory=torch.cuda.max_memory_allocated() / MB))
                else:
                    print(log_msg.format(
                        i, len(iterable), eta=eta_string,
                        meters=str(self),
                        time=str(iter_time), data=str(data_time)))
            i += 1
            end = time.time()
        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        print('{} Total time: {} ({:.4f} s / it)'.format(
            header, total_time_str, total_time / len(iterable)))

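# Usage sketch (illustrative): MetricLogger wraps SmoothedValue meters and can iterate
# over a loader while printing progress every `print_freq` steps. The loss value here is
# a placeholder.
def _metric_logger_example(data_loader):
    logger = MetricLogger(delimiter="  ")
    for batch in logger.log_every(data_loader, print_freq=10, header="Train:"):
        logger.update(loss=0.5)  # replace with a real scalar loss
    return logger
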
def get_sha():
    cwd = os.path.dirname(os.path.abspath(__file__))

    def _run(command):
        return subprocess.check_output(command, cwd=cwd).decode('ascii').strip()
    sha = 'N/A'
    diff = "clean"
    branch = 'N/A'
    try:
        sha = _run(['git', 'rev-parse', 'HEAD'])
        subprocess.check_output(['git', 'diff'], cwd=cwd)
        diff = _run(['git', 'diff-index', 'HEAD'])
        diff = "has uncommited changes" if diff else "clean"
        branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD'])
    except Exception:
        pass
    message = f"sha: {sha}, status: {diff}, branch: {branch}"
    return message

def collate_fn(batch):
    batch = list(zip(*batch))
    batch[0] = nested_tensor_from_tensor_list(batch[0])
    return tuple(batch)


def _max_by_axis(the_list):
    # type: (List[List[int]]) -> List[int]
    maxes = the_list[0]
    for sublist in the_list[1:]:
        for index, item in enumerate(sublist):
            maxes[index] = max(maxes[index], item)
    return maxes

class NestedTensor(object):
    def __init__(self, tensors, mask: Optional[Tensor]):
        self.tensors = tensors
        self.mask = mask

    def to(self, device):
        # type: (Device) -> NestedTensor # noqa
        cast_tensor = self.tensors.to(device)
        mask = self.mask
        if mask is not None:
            assert mask is not None
            cast_mask = mask.to(device)
        else:
            cast_mask = None
        return NestedTensor(cast_tensor, cast_mask)

    def decompose(self):
        return self.tensors, self.mask

    def __repr__(self):
        return str(self.tensors)

def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
    # TODO make this more general
    if tensor_list[0].ndim == 3:
        if torchvision._is_tracing():
            # nested_tensor_from_tensor_list() does not export well to ONNX
            # call _onnx_nested_tensor_from_tensor_list() instead
            return _onnx_nested_tensor_from_tensor_list(tensor_list)

        # TODO make it support different-sized images
        max_size = _max_by_axis([list(img.shape) for img in tensor_list])
        # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
        batch_shape = [len(tensor_list)] + max_size
        b, c, h, w = batch_shape
        dtype = tensor_list[0].dtype
        device = tensor_list[0].device
        tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
        mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
        for img, pad_img, m in zip(tensor_list, tensor, mask):
            pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
            m[: img.shape[1], :img.shape[2]] = False
    else:
        raise ValueError('not supported')
    return NestedTensor(tensor, mask)

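# Usage sketch (illustrative): images of different sizes are zero-padded to a common
# shape and a boolean mask marks the padded area (True = padding).
def _nested_tensor_example():
    imgs = [torch.rand(3, 200, 300), torch.rand(3, 250, 280)]
    nested = nested_tensor_from_tensor_list(imgs)
    padded, mask = nested.decompose()
    # padded.shape == (2, 3, 250, 300); mask.shape == (2, 250, 300)
    return padded, mask
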
# _onnx_nested_tensor_from_tensor_list() is an implementation of
# nested_tensor_from_tensor_list() that is supported by ONNX tracing.
@torch.jit.unused
def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
    max_size = []
    for i in range(tensor_list[0].dim()):
        max_size_i = torch.max(torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)).to(torch.int64)
        max_size.append(max_size_i)
    max_size = tuple(max_size)

    # work around for
    # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
    # m[: img.shape[1], :img.shape[2]] = False
    # which is not yet supported in onnx
    padded_imgs = []
    padded_masks = []
    for img in tensor_list:
        padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
        padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
        padded_imgs.append(padded_img)

        m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
        padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
        padded_masks.append(padded_mask.to(torch.bool))

    tensor = torch.stack(padded_imgs)
    mask = torch.stack(padded_masks)

    return NestedTensor(tensor, mask=mask)

def setup_for_distributed(is_master):
    """
    This function disables printing when not in master process
    """
    import builtins as __builtin__
    builtin_print = __builtin__.print

    def print(*args, **kwargs):
        force = kwargs.pop('force', False)
        if is_master or force:
            builtin_print(*args, **kwargs)

    __builtin__.print = print


def is_dist_avail_and_initialized():
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True


def get_world_size():
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()


def get_rank():
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()


def is_main_process():
    return get_rank() == 0


def save_on_master(*args, **kwargs):
    if is_main_process():
        torch.save(*args, **kwargs)

def init_distributed_mode(args):
    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.gpu = int(os.environ['LOCAL_RANK'])
    elif 'SLURM_PROCID' in os.environ:
        args.rank = int(os.environ['SLURM_PROCID'])
        args.gpu = args.rank % torch.cuda.device_count()
    else:
        print('Not using distributed mode')
        args.distributed = False
        return

    args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    print('| distributed init (rank {}): {}'.format(
        args.rank, args.dist_url), flush=True)
    torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                         world_size=args.world_size, rank=args.rank)
    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)

@torch.no_grad()
def accuracy(output, target, topk=(1,)):
    if output.dim() == 1:
        output = output.unsqueeze(0)

    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


'''
def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    if target.numel() == 0:
        return [torch.zeros([], device=output.device)]
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].view(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
'''

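# Usage sketch (illustrative): top-k accuracy on a tiny batch of logits.
def _accuracy_example():
    logits = torch.tensor([[0.1, 0.7, 0.2],
                           [0.6, 0.3, 0.1]])
    target = torch.tensor([1, 0])
    top1, top2 = accuracy(logits, target, topk=(1, 2))
    # both targets are the arg-max of their row -> top1 == top2 == 100.0
    return top1, top2
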
def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
    # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor
    """
    Equivalent to nn.functional.interpolate, but with support for empty batch sizes.
    This will eventually be supported natively by PyTorch, and this
    class can go away.
    """
    if version.parse(torchvision.__version__) < version.parse('0.7'):
        if input.numel() > 0:
            return torch.nn.functional.interpolate(
                input, size, scale_factor, mode, align_corners
            )

        output_shape = _output_size(2, input, size, scale_factor)
        output_shape = list(input.shape[:-2]) + list(output_shape)
        return _new_empty_tensor(input, output_shape)
    else:
        return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners)