rippertnt committed on
Commit
5996162
·
verified ·
1 Parent(s): bb7410e

Upload 14 files

Browse files
added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 151643,
3
+ "<|im_end|>": 151645,
4
+ "<|im_start|>": 151644
5
+ }
config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "nanoLLaVA",
3
+ "architectures": [
4
+ "LlavaQwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_llava_qwen2.LlavaQwen2Config",
9
+ "AutoModelForCausalLM": "modeling_llava_qwen2.LlavaQwen2ForCausalLM"
10
+ },
11
+ "bos_token_id": 151645,
12
+ "eos_token_id": 151645,
13
+ "freeze_mm_mlp_adapter": false,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 1024,
16
+ "image_aspect_ratio": "pad",
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 2816,
19
+ "language_model": "vilm/Quyen-SE-v0.1",
20
+ "max_position_embeddings": 32768,
21
+ "max_window_layers": 21,
22
+ "mm_hidden_size": 1152,
23
+ "mm_projector_lr": null,
24
+ "mm_projector_type": "mlp2x_gelu",
25
+ "mm_vision_tower": "google/siglip-so400m-patch14-384",
26
+ "model_type": "llava-qwen2",
27
+ "num_attention_heads": 16,
28
+ "num_hidden_layers": 24,
29
+ "num_key_value_heads": 16,
30
+ "rms_norm_eps": 1e-06,
31
+ "rope_theta": 1000000.0,
32
+ "sliding_window": 4096,
33
+ "tie_word_embeddings": false,
34
+ "tokenizer_model_max_length": 4096,
35
+ "tokenizer_padding_side": "right",
36
+ "torch_dtype": "bfloat16",
37
+ "transformers_version": "4.42.3",
38
+ "tune_mm_mlp_adapter": false,
39
+ "use_cache": false,
40
+ "use_mm_proj": true,
41
+ "use_sliding_window": false,
42
+ "vocab_size": 151936
43
+ }
configuration_llava_qwen2.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Qwen2 model configuration"""
16
+
17
+ from transformers.configuration_utils import PretrainedConfig
18
+ from transformers.utils import logging
19
+
20
+
21
# Module-level logger; used by SigLipVisionConfig.from_pretrained to warn about
# a model_type mismatch.
logger = logging.get_logger(__name__)

# Maps a checkpoint identifier to the URL of its hosted config.json.
# NOTE(review): nothing else in this file reads this map — presumably kept for
# parity with the upstream Qwen2 configuration module; confirm before removing.
QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json",
}
26
+
27
+
28
class Qwen2Config(PretrainedConfig):
    r"""
    Configuration container for a [`Qwen2Model`]. The arguments below define the model architecture; instantiating
    the class with its defaults yields a configuration similar to that of
    Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. See the
    [`PretrainedConfig`] documentation for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size of the Qwen2 model, i.e. the number of different tokens that can be represented by the
            `input_ids` passed when calling [`Qwen2Model`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 22016):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 32):
            Number of key/value heads used to implement Grouped Query Attention (GQA). With
            `num_key_value_heads=num_attention_heads` the model uses Multi Head Attention (MHA); with
            `num_key_value_heads=1` it uses Multi Query Attention (MQA); any other value gives GQA. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be
            constructed by meanpooling all the original heads within that group. For more details see [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). Passing `None` falls back to `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size.
        max_window_layers (`int`, *optional*, defaults to 28):
            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top
            use full attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    ```python
    >>> from transformers import Qwen2Model, Qwen2Config

    >>> # Initializing a Qwen2 style configuration
    >>> configuration = Qwen2Config()

    >>> # Initializing a model from the Qwen2-7B style configuration
    >>> model = Qwen2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen2"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=4096,
        intermediate_size=22016,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        attention_dropout=0.0,
        **kwargs,
    ):
        # Vocabulary, context length and core transformer dimensions.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Sliding-window attention settings.
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        # Backward compatibility: an explicit `None` means one key/value head
        # per attention head.
        self.num_key_value_heads = (
            num_attention_heads if num_key_value_heads is None else num_key_value_heads
        )

        # Activation, initialisation, normalisation and remaining options.
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout

        # The base class receives `tie_word_embeddings` plus any extra keyword
        # arguments supplied by the caller.
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
145
+
146
+ from typing import Union
147
+ from transformers import PretrainedConfig
148
+ import os
149
+
150
+
151
class SigLipVisionConfig(PretrainedConfig):
    """Configuration for the SigLIP vision model (`model_type` is `"siglip_vision_model"`).

    Every named constructor argument is stored unchanged as an attribute of
    the same name; any remaining keyword arguments are forwarded to
    [`PretrainedConfig`].
    """

    model_type = "siglip_vision_model"

    def __init__(
        self,
        hidden_size=1152,
        image_mean=(0.5, 0.5, 0.5),
        intermediate_size=4304,
        num_hidden_layers=27,
        num_attention_heads=16,
        num_channels=3,
        image_size=384,
        patch_size=14,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        **kwargs,
    ):
        # Base-class initialisation runs first, then the vision-specific
        # fields are recorded on the instance.
        super().__init__(**kwargs)

        # Encoder dimensions.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Input image geometry.
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size

        # Regularisation, normalisation, activation and image mean.
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.image_mean = image_mean

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """Build a vision config from a model name or path.

        The configuration dictionary is fetched with `cls.get_config_dict`.
        When that dictionary's `model_type` is `"siglip"`, its nested
        `"vision_config"` entry is used instead. A warning is logged if the
        resulting `model_type` differs from `cls.model_type`, and the instance
        is then created through `cls.from_dict`.
        """
        # Private PretrainedConfig helper; presumably normalises token-related
        # kwargs — confirm against the installed transformers version.
        cls._set_token_in_kwargs(kwargs)

        raw_config, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # get the vision config dict if we are loading from SigLipConfig
        if raw_config.get("model_type") == "siglip":
            raw_config = raw_config["vision_config"]

        type_mismatch = (
            "model_type" in raw_config
            and hasattr(cls, "model_type")
            and raw_config["model_type"] != cls.model_type
        )
        if type_mismatch:
            logger.warning(
                f"You are using a model of type {raw_config['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(raw_config, **kwargs)
200
+
201
class LlavaQwen2Config(Qwen2Config):
    """Qwen2 configuration registered under the `"llava-qwen2"` model type.

    Inherits every field and default from `Qwen2Config`; only `model_type`
    is overridden.
    """

    model_type = "llava-qwen2"
image_encoder.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1ae1818e572e70aa451c148a6b99bcbf29d9aa47904265a7caaa70fa42d92be
3
+ size 404443876
image_encoder.xml ADDED
The diff for this file is too large to render. See raw diff
 
llava_with_past.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:baeeb098293702441ac0e2322ce69df0cf6cbc7be2206599fd98e6c1b1aea3b7
3
+ size 363505728
llava_with_past.xml ADDED
The diff for this file is too large to render. See raw diff
 
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "bos_token": {
7
+ "content": "<|im_end|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "eos_token": {
14
+ "content": "<|im_end|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "pad_token": {
21
+ "content": "<|endoftext|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ }
27
+ }
token_embed.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00aafd04f0edaf83ac1393de0611fe3a29135f47d3db793026a66e82a38f0027
3
+ size 311164932
token_embed.xml ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0"?>
2
+ <net name="Model3" version="11">
3
+ <layers>
4
+ <layer id="0" name="input" type="Parameter" version="opset1">
5
+ <data shape="?,?" element_type="i64" />
6
+ <output>
7
+ <port id="0" precision="I64" names="input">
8
+ <dim>-1</dim>
9
+ <dim>-1</dim>
10
+ </port>
11
+ </output>
12
+ </layer>
13
+ <layer id="1" name="self.weight_compressed" type="Const" version="opset1">
14
+ <data element_type="f16" shape="151936, 1024" offset="0" size="311164928" />
15
+ <output>
16
+ <port id="0" precision="FP16" names="self.weight">
17
+ <dim>151936</dim>
18
+ <dim>1024</dim>
19
+ </port>
20
+ </output>
21
+ </layer>
22
+ <layer id="2" name="self.weight" type="Convert" version="opset1">
23
+ <data destination_type="f32" />
24
+ <rt_info>
25
+ <attribute name="decompression" version="0" />
26
+ </rt_info>
27
+ <input>
28
+ <port id="0" precision="FP16">
29
+ <dim>151936</dim>
30
+ <dim>1024</dim>
31
+ </port>
32
+ </input>
33
+ <output>
34
+ <port id="1" precision="FP32">
35
+ <dim>151936</dim>
36
+ <dim>1024</dim>
37
+ </port>
38
+ </output>
39
+ </layer>
40
+ <layer id="3" name="aten::embedding/Convert" type="Convert" version="opset1">
41
+ <data destination_type="i32" />
42
+ <input>
43
+ <port id="0" precision="I64">
44
+ <dim>-1</dim>
45
+ <dim>-1</dim>
46
+ </port>
47
+ </input>
48
+ <output>
49
+ <port id="1" precision="I32">
50
+ <dim>-1</dim>
51
+ <dim>-1</dim>
52
+ </port>
53
+ </output>
54
+ </layer>
55
+ <layer id="4" name="aten::embedding/Constant" type="Const" version="opset1">
56
+ <data element_type="i32" shape="" offset="311164928" size="4" />
57
+ <output>
58
+ <port id="0" precision="I32" />
59
+ </output>
60
+ </layer>
61
+ <layer id="5" name="aten::embedding/Gather" type="Gather" version="opset8">
62
+ <data batch_dims="0" />
63
+ <input>
64
+ <port id="0" precision="FP32">
65
+ <dim>151936</dim>
66
+ <dim>1024</dim>
67
+ </port>
68
+ <port id="1" precision="I32">
69
+ <dim>-1</dim>
70
+ <dim>-1</dim>
71
+ </port>
72
+ <port id="2" precision="I32" />
73
+ </input>
74
+ <output>
75
+ <port id="3" precision="FP32">
76
+ <dim>-1</dim>
77
+ <dim>-1</dim>
78
+ <dim>1024</dim>
79
+ </port>
80
+ </output>
81
+ </layer>
82
+ <layer id="6" name="Result_26477" type="Result" version="opset1">
83
+ <input>
84
+ <port id="0" precision="FP32">
85
+ <dim>-1</dim>
86
+ <dim>-1</dim>
87
+ <dim>1024</dim>
88
+ </port>
89
+ </input>
90
+ </layer>
91
+ </layers>
92
+ <edges>
93
+ <edge from-layer="0" from-port="0" to-layer="3" to-port="0" />
94
+ <edge from-layer="1" from-port="0" to-layer="2" to-port="0" />
95
+ <edge from-layer="2" from-port="1" to-layer="5" to-port="0" />
96
+ <edge from-layer="3" from-port="1" to-layer="5" to-port="1" />
97
+ <edge from-layer="4" from-port="0" to-layer="5" to-port="2" />
98
+ <edge from-layer="5" from-port="3" to-layer="6" to-port="0" />
99
+ </edges>
100
+ <rt_info>
101
+ <Runtime_version value="2024.3.0-15945-a349dc82f9a" />
102
+ <conversion_parameters>
103
+ <framework value="pytorch" />
104
+ <is_python_object value="True" />
105
+ </conversion_parameters>
106
+ </rt_info>
107
+ </net>
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": [
30
+ "<|im_start|>",
31
+ "<|im_end|>"
32
+ ],
33
+ "bos_token": "<|im_end|>",
34
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nAnswer the questions.<|im_end|>' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
35
+ "clean_up_tokenization_spaces": false,
36
+ "eos_token": "<|im_end|>",
37
+ "errors": "replace",
38
+ "model_max_length": 4096,
39
+ "pad_token": "<|endoftext|>",
40
+ "padding_side": "right",
41
+ "split_special_tokens": false,
42
+ "tokenizer_class": "Qwen2Tokenizer",
43
+ "unk_token": null
44
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff