Blue-skyyy committed
Commit 23860fe · verified · 1 Parent(s): 5ea02fd

upload sailvit-large

Files changed (4)
  1. config.json +24 -0
  2. configuration_sailvit.py +62 -0
  3. model.safetensors +3 -0
  4. modeling_sailvit.py +198 -0
config.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "architectures": [
+     "SAILViTModel"
+   ],
+   "attention_dropout": 0.0,
+   "auto_map": {
+     "AutoConfig": "configuration_sailvit.SAILViTConfig",
+     "AutoModel": "modeling_sailvit.SAILViTModel"
+   },
+   "hidden_size": 1024,
+   "image_size": 448,
+   "intermediate_size": 2816,
+   "model_type": "sailvit",
+   "num_attention_heads": 8,
+   "num_channels": 3,
+   "num_hidden_layers": 24,
+   "patch_size": 14,
+   "projection_dropout": 0.0,
+   "qkv_bias": false,
+   "rms_norm_eps": 1e-05,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.45.1",
+   "use_bias": false
+ }
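The `auto_map` entries above register the custom classes with the `transformers` Auto classes, so the checkpoint loads with `trust_remote_code`. A minimal loading sketch; the path is a placeholder for wherever these files live:

```python
from transformers import AutoConfig, AutoModel

repo = "path/to/sailvit-large"  # placeholder: hub repo id or local directory

# trust_remote_code lets transformers import configuration_sailvit.py and
# modeling_sailvit.py through the auto_map entries in config.json.
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
model = AutoModel.from_pretrained(repo, trust_remote_code=True)
```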
configuration_sailvit.py ADDED
@@ -0,0 +1,62 @@
+ from typing import Any
+
+ from transformers.configuration_utils import PretrainedConfig
+
+ __all__ = ["SAILViTConfig"]
+
+
+ class SAILViTConfig(PretrainedConfig):
+     """This is the configuration class to store the configuration of a [`SAILViTModel`].
+
+     Instantiating a configuration with the defaults will yield a similar configuration
+     to that of the [BytedanceDouyinContent/SAILViT-Huge-600M-448px](https://huggingface.co/BytedanceDouyinContent/SAILViT-Huge-600M-448px).
+
+     Args:
+         hidden_size: Dimension of the hidden representations.
+         intermediate_size: Dimension of the SwiGLU representations.
+         num_hidden_layers: Number of hidden layers in the Transformer.
+         num_attention_heads: Number of attention heads for each attention layer
+             in the Transformer.
+         num_channels: Number of input channels.
+         image_size: Image size.
+         patch_size: Patch size.
+         rms_norm_eps: Epsilon value used for the RMS normalization layer.
+         attention_dropout: Dropout ratio for attention probabilities.
+         projection_dropout: Dropout ratio for the projection layer after the attention.
+         qkv_bias: Whether to add a bias to the queries, keys and values.
+         use_bias: Whether to add a bias in the feed-forward and projection layers.
+         kwargs: Keyword arguments for the [`PretrainedConfig`].
+     """
+
+     model_type: str = "sailvit"
+
+     def __init__(
+         self,
+         hidden_size: int = 1024,
+         intermediate_size: int = 2816,
+         num_hidden_layers: int = 24,
+         num_attention_heads: int = 8,
+         num_channels: int = 3,
+         image_size: int = 224,
+         patch_size: int = 14,
+         rms_norm_eps: float = 1e-5,
+         attention_dropout: float = 0.0,
+         projection_dropout: float = 0.0,
+         qkv_bias: bool = False,
+         use_bias: bool = False,
+         **kwargs: Any,
+     ):
+         super().__init__(**kwargs)
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.num_channels = num_channels
+         self.patch_size = patch_size
+         self.image_size = image_size
+         self.attention_dropout = attention_dropout
+         self.rms_norm_eps = rms_norm_eps
+
+         self.projection_dropout = projection_dropout
+         self.qkv_bias = qkv_bias
+         self.use_bias = use_bias
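Note that the constructor defaults describe a 224px model, while the config.json shipped with this checkpoint overrides `image_size` to 448. A small sketch of building a config that matches the checkpoint, assuming `configuration_sailvit.py` is importable:

```python
from configuration_sailvit import SAILViTConfig

# Match the uploaded config.json: 448px input, everything else at the defaults.
config = SAILViTConfig(image_size=448)

# 448 / 14 = 32 patches per side, i.e. 1024 tokens per image.
assert (config.image_size // config.patch_size) ** 2 == 1024
```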
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d131eccf6df91ea2586e139e457a6ed991a60ac082334011dc2ac86b1a608e97
+ size 619986688
modeling_sailvit.py ADDED
@@ -0,0 +1,198 @@
+ # adapted from https://huggingface.co/apple/aimv2-huge-patch14-448 (modification: adds gradient checkpointing support)
+ from typing import Optional, Tuple, Union
+
+ import torch
+ from .configuration_sailvit import SAILViTConfig
+ from torch import nn
+ from torch.nn import functional as F
+ from transformers.modeling_outputs import BaseModelOutputWithNoAttention
+ from transformers.modeling_utils import PreTrainedModel
+
+ __all__ = ["SAILViTModel"]
+
+
+ class RMSNorm(nn.Module):
+     def __init__(self, dim: int, eps: float = 1e-6):
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(dim))
+         self.eps = eps
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         output = self._norm(x.float()).type_as(x)
+         return output * self.weight
+
+     def extra_repr(self) -> str:
+         return f"{tuple(self.weight.shape)}, eps={self.eps}"
+
+     def _norm(self, x: torch.Tensor) -> torch.Tensor:
+         return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+
+ class SAILViTSwiGLUFFN(nn.Module):
+     def __init__(self, config: SAILViTConfig):
+         super().__init__()
+         hidden_features = config.intermediate_size
+         in_features = config.hidden_size
+         bias = config.use_bias
+
+         self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
+         self.fc2 = nn.Linear(hidden_features, in_features, bias=bias)
+         self.fc3 = nn.Linear(in_features, hidden_features, bias=bias)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = F.silu(self.fc1(x)) * self.fc3(x)
+         x = self.fc2(x)
+         return x
+
+
+ class SAILViTPatchEmbed(nn.Module):
+     def __init__(self, config: SAILViTConfig):
+         super().__init__()
+         self.proj = nn.Conv2d(
+             config.num_channels,
+             config.hidden_size,
+             kernel_size=(config.patch_size, config.patch_size),
+             stride=(config.patch_size, config.patch_size),
+         )
+         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.proj(x).flatten(2).transpose(1, 2)
+         x = self.norm(x)
+         return x
+
+
+ class SAILViTPreprocessor(nn.Module):
+     def __init__(self, config: SAILViTConfig):
+         super().__init__()
+         num_patches = (config.image_size // config.patch_size) ** 2
+
+         self.patchifier = SAILViTPatchEmbed(config)
+         self.pos_embed = nn.Parameter(torch.zeros((1, num_patches, config.hidden_size)))
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         tokens = self.patchifier(x)
+         _, N, _ = tokens.shape
+         pos_embed = self.pos_embed.to(tokens.device)
+         tokens = tokens + pos_embed[:, :N]
+         return tokens
+
+
+ class SAILViTAttention(nn.Module):
+     def __init__(self, config: SAILViTConfig):
+         super().__init__()
+         dim = config.hidden_size
+
+         self.num_heads = config.num_attention_heads
+         self.qkv = nn.Linear(dim, dim * 3, bias=config.qkv_bias)
+         self.attn_drop = nn.Dropout(config.attention_dropout)
+         self.proj = nn.Linear(dim, dim, bias=config.use_bias)
+         self.proj_drop = nn.Dropout(config.projection_dropout)
+
+     def forward(
+         self, x: torch.Tensor, mask: Optional[torch.Tensor] = None
+     ) -> torch.Tensor:
+         B, N, C = x.shape
+         qkv = (
+             self.qkv(x)
+             .reshape(B, N, 3, self.num_heads, C // self.num_heads)
+             .permute(2, 0, 3, 1, 4)
+         )
+         q, k, v = qkv.unbind(0)
+
+         x = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
+         x = x.transpose(1, 2).contiguous().reshape(B, N, C)
+         x = self.proj(x)
+         x = self.proj_drop(x)
+         return x
+
+
+ class SAILViTBlock(nn.Module):
+     def __init__(self, config: SAILViTConfig):
+         super().__init__()
+         self.attn = SAILViTAttention(config)
+         self.norm_1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.mlp = SAILViTSwiGLUFFN(config)
+         self.norm_2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+     def forward(
+         self, x: torch.Tensor, mask: Optional[torch.Tensor] = None
+     ) -> torch.Tensor:
+         x = x + self.attn(self.norm_1(x), mask)
+         x = x + self.mlp(self.norm_2(x))
+         return x
+
+
+ class SAILViTTransformer(nn.Module):
+     def __init__(self, config: SAILViTConfig):
+         super().__init__()
+         self.blocks = nn.ModuleList(
+             [SAILViTBlock(config) for _ in range(config.num_hidden_layers)]
+         )
+         self.post_trunk_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.gradient_checkpointing = False
+
+     def forward(
+         self,
+         tokens: torch.Tensor,
+         mask: Optional[torch.Tensor] = None,
+         output_hidden_states: bool = False,
+     ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, ...]]]:
+         hidden_states = () if output_hidden_states else None
+         for block in self.blocks:
+             if self.gradient_checkpointing and self.training:
+                 tokens = self._gradient_checkpointing_func(block.__call__, tokens, mask)
+             else:
+                 tokens = block(tokens, mask)
+             if output_hidden_states:
+                 hidden_states += (tokens,)
+         tokens = self.post_trunk_norm(tokens)
+         return tokens, hidden_states
+
+
+ class SAILViTPretrainedModel(PreTrainedModel):
+     config_class = SAILViTConfig
+     base_model_prefix = "sailvit"
+     supports_gradient_checkpointing = True
+     main_input_name = "pixel_values"
+     _no_split_modules = ["SAILViTPreprocessor", "SAILViTBlock"]
+     _supports_sdpa = True
+
+
+ class SAILViTModel(SAILViTPretrainedModel):
+     def __init__(self, config: SAILViTConfig):
+         super().__init__(config)
+         self.preprocessor = SAILViTPreprocessor(config)
+         self.trunk = SAILViTTransformer(config)
+
+     def forward(
+         self,
+         pixel_values: torch.Tensor,
+         mask: Optional[torch.Tensor] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[
+         Tuple[torch.Tensor],
+         Tuple[torch.Tensor, Tuple[torch.Tensor, ...]],
+         BaseModelOutputWithNoAttention,
+     ]:
+         if output_hidden_states is None:
+             output_hidden_states = self.config.output_hidden_states
+         if return_dict is None:
+             return_dict = self.config.use_return_dict
+
+         x = self.preprocessor(pixel_values)
+         x, hidden_states = self.trunk(
+             x, mask, output_hidden_states=output_hidden_states
+         )
+
+         if not return_dict:
+             res = (x,)
+             res += (hidden_states,) if output_hidden_states else ()
+             return res
+
+         return BaseModelOutputWithNoAttention(
+             last_hidden_state=x,
+             hidden_states=hidden_states,
+         )
+
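Since the stated modification is gradient-checkpointing support, here is a hedged smoke-test sketch with random weights; it assumes the two Python files are arranged in a package (the `sailvit` package name is hypothetical) so the relative import in `modeling_sailvit.py` resolves:

```python
import torch

from sailvit.configuration_sailvit import SAILViTConfig  # hypothetical package name
from sailvit.modeling_sailvit import SAILViTModel

config = SAILViTConfig(image_size=448)
model = SAILViTModel(config)
model.gradient_checkpointing_enable()  # takes the checkpointing branch while training

pixel_values = torch.randn(1, 3, 448, 448)
out = model(pixel_values)
# 448 / 14 = 32 patches per side -> 1024 tokens, each of width hidden_size=1024.
print(out.last_hidden_state.shape)  # torch.Size([1, 1024, 1024])
```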