SVECTOR-OFFICIAL committed
Commit b76ef48 · verified · 1 parent: ba26855

Upload configuration_spec_vision.py with huggingface_hub

Files changed (1)
  1. configuration_spec_vision.py +157 -0
configuration_spec_vision.py ADDED
@@ -0,0 +1,157 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Spec-Vision model configuration"""

from typing import Dict, Optional, Union

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)


class SpecVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SpecVisionModel`]. It is used to instantiate a
    Spec-Vision model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32064):
            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`SpecVisionModel`].
        hidden_size (`int`, *optional*, defaults to 3072):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            Number of key/value heads for implementing Grouped Query Attention. Defaults to
            `num_attention_heads` (i.e. standard multi-head attention) when not set.
        resid_pdrop (`float`, *optional*, defaults to 0.0):
            Dropout probability for MLP outputs.
        embd_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout ratio for embeddings.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio after computing attention scores.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            The non-linear activation function in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon value used for RMSNorm.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether to use the past key/values attentions for faster inference.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`dict`, *optional*):
            Configuration for the RoPE scaling strategy. Must contain the fields `type` (one of
            `"su"` or `"yarn"`), `short_factor`, and `long_factor`; see `_rope_scaling_validation`.
        embd_layer (`dict`, *optional*):
            Configuration for the embedding layer, including image embedding settings.
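
    Example (illustrative sketch; assumes this module is importable as
    `configuration_spec_vision`, as laid out in this repository):

    ```python
    >>> from configuration_spec_vision import SpecVisionConfig

    >>> # Instantiate a configuration with the defaults defined above
    >>> configuration = SpecVisionConfig()

    >>> # num_key_value_heads falls back to num_attention_heads when unset
    >>> configuration.num_key_value_heads
    32
    ```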
    """

    model_type = "spec_vision"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size: int = 32064,
        hidden_size: int = 3072,
        intermediate_size: int = 8192,
        num_hidden_layers: int = 32,
        num_attention_heads: int = 32,
        num_key_value_heads: Optional[int] = None,
        resid_pdrop: float = 0.0,
        embd_pdrop: float = 0.0,
        attention_dropout: float = 0.0,
        hidden_act: str = "silu",
        max_position_embeddings: int = 4096,
        initializer_range: float = 0.02,
        rms_norm_eps: float = 1e-5,
        use_cache: bool = True,
        rope_theta: float = 10000.0,
        rope_scaling: Optional[Dict] = None,
        embd_layer: Optional[Dict[str, Union[str, bool]]] = None,
        bos_token_id: int = 1,
        eos_token_id: int = 32000,
        pad_token_id: int = 32000,
        tie_word_embeddings: bool = False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # Grouped Query Attention: without an explicit value, fall back to one
        # key/value head per attention head (standard multi-head attention).
        self.num_key_value_heads = num_key_value_heads or num_attention_heads
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attention_dropout = attention_dropout
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # Build the default per instance rather than as a mutable default
        # argument, so one config's embedding settings cannot leak into another's.
        self.embd_layer = embd_layer if embd_layer is not None else {
            "embedding_cls": "image",
            "hd_transform_order": "sub_glb",
            "projection_cls": "mlp",
            "use_hd_transform": True,
            "with_learnable_separator": True,
        }
        self._rope_scaling_validation()

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

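    # Minimal usage sketch: as a `PretrainedConfig` subclass, this config
    # round-trips through `config.json` via the standard save/load API.
    # The directory path below is hypothetical.
    #
    #   config = SpecVisionConfig(num_key_value_heads=8)
    #   config.save_pretrained("./spec-vision-checkpoint")
    #   reloaded = SpecVisionConfig.from_pretrained("./spec-vision-checkpoint")
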
    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
            raise ValueError(
                "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
                f"got {self.rope_scaling}"
            )

        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
        rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)

        if rope_scaling_type is None or rope_scaling_type not in ["su", "yarn"]:
            raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}")

        # Each scaling-factor list carries one entry per rotary dimension,
        # i.e. half the per-head dimension.
        head_dim = self.hidden_size // self.num_attention_heads // 2

        for factor, name in [(rope_scaling_short_factor, "short_factor"), (rope_scaling_long_factor, "long_factor")]:
            if not (isinstance(factor, list) and all(isinstance(x, (int, float)) for x in factor)):
                raise ValueError(f"`rope_scaling`'s {name} field must be a list of numbers, got {factor}")
            if len(factor) != head_dim:
                raise ValueError(f"`rope_scaling`'s {name} field must have length {head_dim}, got {len(factor)}")
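

# Illustrative sketch (assumed values, not from the commit): with the default
# hidden_size=3072 and num_attention_heads=32, head_dim // 2 == 48, so a
# `rope_scaling` dict that passes validation looks like:
#
#   config = SpecVisionConfig(
#       rope_scaling={
#           "type": "su",
#           "short_factor": [1.0] * 48,
#           "long_factor": [1.5] * 48,
#       },
#   )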