upload sailvit-large

Browse files

Files changed (4) hide show

config.json +24 -0
configuration_sailvit.py +62 -0
model.safetensors +3 -0
modeling_sailvit.py +198 -0

config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "architectures": [
+    "SAILViTModel"
+  ],
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_sailvit.SAILViTConfig",
+    "AutoModel": "modeling_sailvit.SAILViTModel"
+  },
+  "hidden_size": 1024,
+  "image_size": 448,
+  "intermediate_size": 2816,
+  "model_type": "sailvit",
+  "num_attention_heads": 8,
+  "num_channels": 3,
+  "num_hidden_layers": 24,
+  "patch_size": 14,
+  "projection_dropout": 0.0,
+  "qkv_bias": false,
+  "rms_norm_eps": 1e-05,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.45.1",
+  "use_bias": false
+}

configuration_sailvit.py ADDED Viewed

	@@ -0,0 +1,62 @@

+from typing import Any
+from transformers.configuration_utils import PretrainedConfig
+__all__ = ["SAILViTConfig"]
+class SAILViTConfig(PretrainedConfig):
+    """This is the configuration class to store the configuration of an [`SAILViTModel`].
+    Instantiating a configuration with the defaults will yield a similar configuration
+    to that of [BytedanceDouyinContent/SAILViT-Huge-600M-448px](https://huggingface.co/BytedanceDouyinContent/SAILViT-Huge-600M-448px).
+    Args:
+        hidden_size: Dimension of the hidden representations.
+        intermediate_size: Dimension of the SwiGLU representations.
+        num_hidden_layers: Number of hidden layers in the Transformer.
+        num_attention_heads: Number of attention heads for each attention layer
+            in the Transformer.
+        num_channels: Number of input channels.
+        image_size: Image size.
+        patch_size: Patch size.
+        rms_norm_eps: Epsilon value used for the RMS normalization layer.
+        attention_dropout: Dropout ratio for attention probabilities.
+        projection_dropout: Dropout ratio for the projection layer after the attention.
+        qkv_bias: Whether to add a bias to the queries, keys and values.
+        use_bias: Whether to add a bias in the feed-forward and projection layers.
+        kwargs: Keyword arguments for the [`PretrainedConfig`].
+    """
+    # NOTE(review): the default image_size here is 224, whereas the config.json
+    # shipped alongside this file sets image_size to 448 - confirm which
+    # checkpoint the "defaults" sentence in the docstring is meant to describe.
+    model_type: str = "sailvit"
+    def __init__(
+        self,
+        hidden_size: int = 1024,
+        intermediate_size: int = 2816,
+        num_hidden_layers: int = 24,
+        num_attention_heads: int = 8,
+        num_channels: int = 3,
+        image_size: int = 224,
+        patch_size: int = 14,
+        rms_norm_eps: float = 1e-5,
+        attention_dropout: float = 0.0,
+        projection_dropout: float = 0.0,
+        qkv_bias: bool = False,
+        use_bias: bool = False,
+        **kwargs: Any,
+    ):
+        # Remaining keyword arguments are forwarded to PretrainedConfig.
+        super().__init__(**kwargs)
+        # Every architecture hyper-parameter is stored verbatim as an attribute.
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.attention_dropout = attention_dropout
+        self.rms_norm_eps = rms_norm_eps
+        self.projection_dropout = projection_dropout
+        self.qkv_bias = qkv_bias
+        self.use_bias = use_bias

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d131eccf6df91ea2586e139e457a6ed991a60ac082334011dc2ac86b1a608e97
+size 619986688

modeling_sailvit.py ADDED Viewed

	@@ -0,0 +1,198 @@

+# Adapted from https://huggingface.co/apple/aimv2-huge-patch14-448 (modification: added gradient checkpointing support).
+from typing import Optional, Tuple, Union
+import torch
+from .configuration_sailvit import SAILViTConfig
+from torch import nn
+from torch.nn import functional as F
+from transformers.modeling_outputs import BaseModelOutputWithNoAttention
+from transformers.modeling_utils import PreTrainedModel
+__all__ = ["SAILViTModel"]
+class RMSNorm(nn.Module):
+    """Root-mean-square normalization over the last dimension with a learned scale.
+    Args:
+        dim: Length of the learnable ``weight`` vector (initialised to ones).
+        eps: Constant added to the mean square before the reciprocal square root.
+    """
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(dim))
+        self.eps = eps
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Normalize ``x`` in float32, cast back to the dtype of ``x``, then scale."""
+        normalized = self._norm(x.float())
+        return normalized.type_as(x) * self.weight
+    def extra_repr(self) -> str:
+        """Return the weight shape and epsilon shown in the module repr."""
+        return f"{tuple(self.weight.shape)}, eps={self.eps}"
+    def _norm(self, x: torch.Tensor) -> torch.Tensor:
+        """Divide ``x`` by the root mean square of its last dimension."""
+        mean_square = x.pow(2).mean(-1, keepdim=True)
+        return x * torch.rsqrt(mean_square + self.eps)
+class SAILViTSwiGLUFFN(nn.Module):
+    """Gated feed-forward layer: ``fc2(silu(fc1(x)) * fc3(x))``.
+    Args:
+        config: Supplies ``hidden_size`` (input/output width),
+            ``intermediate_size`` (gated width) and ``use_bias``.
+    """
+    def __init__(self, config: SAILViTConfig):
+        super().__init__()
+        model_dim, gated_dim = config.hidden_size, config.intermediate_size
+        with_bias = config.use_bias
+        # Layers are created in the order fc1, fc2, fc3, as in the original.
+        self.fc1 = nn.Linear(model_dim, gated_dim, bias=with_bias)
+        self.fc2 = nn.Linear(gated_dim, model_dim, bias=with_bias)
+        self.fc3 = nn.Linear(model_dim, gated_dim, bias=with_bias)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply the SiLU-gated projection and map back to the input width."""
+        gate = F.silu(self.fc1(x))
+        value = self.fc3(x)
+        return self.fc2(gate * value)
+class SAILViTPatchEmbed(nn.Module):
+    """Turn an image into a sequence of normalized patch embeddings.
+    Args:
+        config: Supplies ``num_channels``, ``hidden_size``, ``patch_size`` and
+            ``rms_norm_eps``.
+    """
+    def __init__(self, config: SAILViTConfig):
+        super().__init__()
+        # Kernel and stride are both one square patch, so patches do not overlap.
+        patch_hw = (config.patch_size, config.patch_size)
+        self.proj = nn.Conv2d(
+            config.num_channels,
+            config.hidden_size,
+            kernel_size=patch_hw,
+            stride=patch_hw,
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Project ``x`` with the convolution, flatten the grid and normalize.
+        For a batched (B, C, H, W) input the spatial grid is flattened from
+        dimension 2 onward and swapped with the channel axis before the norm.
+        """
+        feature_map = self.proj(x)
+        patch_tokens = feature_map.flatten(2).transpose(1, 2)
+        return self.norm(patch_tokens)
+class SAILViTPreprocessor(nn.Module):
+    """Patchify the input and add a learned positional embedding.
+    Args:
+        config: Supplies ``image_size``, ``patch_size`` and ``hidden_size``; the
+            positional table holds ``(image_size // patch_size) ** 2`` entries.
+    """
+    def __init__(self, config: SAILViTConfig):
+        super().__init__()
+        patches_per_side = config.image_size // config.patch_size
+        self.patchifier = SAILViTPatchEmbed(config)
+        self.pos_embed = nn.Parameter(
+            torch.zeros((1, patches_per_side ** 2, config.hidden_size))
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return patch tokens plus the first ``N`` positional embeddings.
+        ``N`` is the token count produced by the patchifier; the positional
+        table is sliced to that length before the addition.
+        """
+        patch_tokens = self.patchifier(x)
+        _, num_tokens, _ = patch_tokens.shape
+        positions = self.pos_embed.to(patch_tokens.device)
+        return patch_tokens + positions[:, :num_tokens]
+class SAILViTAttention(nn.Module):
+    """Multi-head self-attention using a fused QKV projection and SDPA.
+    Args:
+        config: Supplies ``hidden_size``, ``num_attention_heads``, ``qkv_bias``,
+            ``use_bias``, ``attention_dropout`` and ``projection_dropout``.
+    """
+    def __init__(self, config: SAILViTConfig):
+        super().__init__()
+        dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.qkv = nn.Linear(dim, dim * 3, bias=config.qkv_bias)
+        # Kept as a module so the attribute stays available; forward() reads its
+        # probability and hands it to the fused attention call.
+        self.attn_drop = nn.Dropout(config.attention_dropout)
+        self.proj = nn.Linear(dim, dim, bias=config.use_bias)
+        self.proj_drop = nn.Dropout(config.projection_dropout)
+    def forward(
+        self, x: torch.Tensor, mask: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        """Attend over the token axis of ``x`` and project the result.
+        Args:
+            x: Input of shape (B, N, C), as unpacked on the first line.
+            mask: Optional attention mask passed through as ``attn_mask``.
+        Returns:
+            Tensor of shape (B, N, C).
+        """
+        B, N, C = x.shape
+        # (B, N, 3C) -> (3, B, heads, N, C // heads), then split into q, k, v.
+        qkv = (
+            self.qkv(x)
+            .reshape(B, N, 3, self.num_heads, C // self.num_heads)
+            .permute(2, 0, 3, 1, 4)
+        )
+        q, k, v = qkv.unbind(0)
+        # scaled_dot_product_attention applies dropout whenever dropout_p > 0,
+        # regardless of module mode, so it is enabled only while training.
+        # Previously config.attention_dropout was never applied at all.
+        dropout_p = self.attn_drop.p if self.training else 0.0
+        x = F.scaled_dot_product_attention(
+            q, k, v, attn_mask=mask, dropout_p=dropout_p
+        )
+        x = x.transpose(1, 2).contiguous().reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class SAILViTBlock(nn.Module):
+    """Pre-norm Transformer block: attention then SwiGLU MLP, each with a residual.
+    Args:
+        config: Forwarded to the attention and MLP sub-modules; also supplies
+            ``hidden_size`` and ``rms_norm_eps`` for the two norms.
+    """
+    def __init__(self, config: SAILViTConfig):
+        super().__init__()
+        norm_dim, norm_eps = config.hidden_size, config.rms_norm_eps
+        self.attn = SAILViTAttention(config)
+        self.norm_1 = RMSNorm(norm_dim, eps=norm_eps)
+        self.mlp = SAILViTSwiGLUFFN(config)
+        self.norm_2 = RMSNorm(norm_dim, eps=norm_eps)
+    def forward(
+        self, x: torch.Tensor, mask: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        """Apply both residual branches; ``mask`` is used only by attention."""
+        attn_out = self.attn(self.norm_1(x), mask)
+        x = x + attn_out
+        mlp_out = self.mlp(self.norm_2(x))
+        return x + mlp_out
+class SAILViTTransformer(nn.Module):
+    """Stack of ``num_hidden_layers`` blocks followed by a final RMSNorm.
+    Args:
+        config: Supplies ``num_hidden_layers``, ``hidden_size`` and
+            ``rms_norm_eps``; also forwarded to every block.
+    """
+    def __init__(self, config: SAILViTConfig):
+        super().__init__()
+        layers = [SAILViTBlock(config) for _ in range(config.num_hidden_layers)]
+        self.blocks = nn.ModuleList(layers)
+        self.post_trunk_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.gradient_checkpointing = False
+    def forward(
+        self,
+        tokens: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        output_hidden_states: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, ...]]]:
+        """Run every block in order and normalize the final output.
+        Returns:
+            ``(normalized_tokens, hidden_states)``. ``hidden_states`` is a tuple
+            of each block's output, taken before the final norm, when
+            ``output_hidden_states`` is truthy; otherwise it is ``None``.
+        """
+        # _gradient_checkpointing_func is not defined in this class; presumably
+        # it is attached by the Transformers checkpointing machinery - verify.
+        checkpointed = self.gradient_checkpointing and self.training
+        per_block = []
+        for block in self.blocks:
+            if checkpointed:
+                tokens = self._gradient_checkpointing_func(block.__call__, tokens, mask)
+            else:
+                tokens = block(tokens, mask)
+            if output_hidden_states:
+                per_block.append(tokens)
+        hidden_states = tuple(per_block) if output_hidden_states else None
+        return self.post_trunk_norm(tokens), hidden_states
+class SAILViTPretrainedModel(PreTrainedModel):
+    """Base class holding the class-level settings for SAILViT models.
+    It defines no methods; it only sets attributes on top of ``PreTrainedModel``.
+    """
+    # Configuration class associated with this model family.
+    config_class = SAILViTConfig
+    base_model_prefix = "sailvit"
+    # Presumably paired with the `gradient_checkpointing` flag that
+    # SAILViTTransformer sets in its __init__ - confirm against Transformers.
+    supports_gradient_checkpointing = True
+    # Matches the first parameter name of SAILViTModel.forward.
+    main_input_name = "pixel_values"
+    # NOTE(review): presumably the modules Transformers must keep on a single
+    # device when sharding - confirm against the PreTrainedModel documentation.
+    _no_split_modules = ["SAILViTPreprocessor", "SAILViTBlock"]
+    _supports_sdpa = True
+class SAILViTModel(SAILViTPretrainedModel):
+    """Vision encoder: preprocessor (patches plus positions) followed by the trunk.
+    Args:
+        config: Model configuration, forwarded to both sub-modules.
+    """
+    def __init__(self, config: SAILViTConfig):
+        super().__init__(config)
+        self.preprocessor = SAILViTPreprocessor(config)
+        self.trunk = SAILViTTransformer(config)
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[
+        Tuple[torch.Tensor],
+        Tuple[torch.Tensor, Tuple[torch.Tensor, ...]],
+        BaseModelOutputWithNoAttention,
+    ]:
+        """Encode ``pixel_values`` into token features.
+        Args:
+            pixel_values: Image tensor handed to the preprocessor.
+            mask: Optional attention mask forwarded to the trunk.
+            output_hidden_states: Falls back to ``config.output_hidden_states``
+                when ``None``.
+            return_dict: Falls back to ``config.use_return_dict`` when ``None``.
+        Returns:
+            A ``BaseModelOutputWithNoAttention`` when ``return_dict`` is truthy;
+            otherwise a tuple of the final features, followed by the per-block
+            hidden states if they were requested.
+        """
+        want_hidden = (
+            self.config.output_hidden_states
+            if output_hidden_states is None
+            else output_hidden_states
+        )
+        as_dict = self.config.use_return_dict if return_dict is None else return_dict
+        tokens = self.preprocessor(pixel_values)
+        features, hidden_states = self.trunk(
+            tokens, mask, output_hidden_states=want_hidden
+        )
+        if as_dict:
+            return BaseModelOutputWithNoAttention(
+                last_hidden_state=features,
+                hidden_states=hidden_states,
+            )
+        if want_hidden:
+            return (features, hidden_states)
+        return (features,)