update weights from bf16 to fp32
- config.json +1 -1
- model-00001-of-00004.safetensors → model-00001-of-00007.safetensors +2 -2
- model-00002-of-00004.safetensors → model-00002-of-00007.safetensors +2 -2
- model-00003-of-00004.safetensors → model-00003-of-00007.safetensors +2 -2
- model-00004-of-00004.safetensors → model-00004-of-00007.safetensors +2 -2
- model-00005-of-00007.safetensors +3 -0
- model-00006-of-00007.safetensors +3 -0
- model-00007-of-00007.safetensors +3 -0
- model.safetensors.index.json +0 -0
- model.yaml +253 -0
config.json
CHANGED
@@ -183,7 +183,7 @@
     }
   },
   "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
+  "torch_dtype": "float32",
   "transformers_version": "4.52.3",
   "use_cache": true,
   "vit_config": {
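This one-line change matters for downstream loading: `transformers` reads `torch_dtype` from `config.json` whenever a caller passes `torch_dtype="auto"`. A minimal sketch of the difference, where `REPO_ID` is a placeholder for this model's hub id and the `trust_remote_code` flag is an assumption (Molmo-style repos typically ship custom modeling code):

```python
# Minimal sketch of how the config's torch_dtype drives loading.
# "REPO_ID" is a placeholder; trust_remote_code=True is assumed here.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("REPO_ID", trust_remote_code=True)
print(config.torch_dtype)  # torch.float32 after this commit

# torch_dtype="auto" defers to config.torch_dtype, so loading now
# materializes fp32 weights (~2x the memory of the previous bf16).
model = AutoModelForCausalLM.from_pretrained(
    "REPO_ID", torch_dtype="auto", trust_remote_code=True
)

# To keep the old footprint, request bf16 explicitly at load time:
model = AutoModelForCausalLM.from_pretrained(
    "REPO_ID", torch_dtype=torch.bfloat16, trust_remote_code=True
)
```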
model-00001-of-00004.safetensors → model-00001-of-00007.safetensors
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:6d091a7fcf85d7e5e5b9cf7a2bcfd33dd2a5165594d40cbc4b577556e2efbe41
+size 4978520816
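Each of these three-line entries is a Git LFS pointer (`version`, `oid`, `size`), not the weights themselves; LFS fetches the real shard, which can be checked against the pointer. A minimal verification sketch, with hypothetical local paths for a pointer file and its downloaded blob:

```python
# Minimal sketch: verify a downloaded shard against its Git LFS pointer.
# Both paths are illustrative; point them at a real pointer and blob.
import hashlib

def parse_pointer(path):
    """Parse a git-lfs v1 pointer file into a dict of its key/value lines."""
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

def verify(pointer_path, blob_path):
    fields = parse_pointer(pointer_path)
    expected_oid = fields["oid"].removeprefix("sha256:")
    expected_size = int(fields["size"])
    h = hashlib.sha256()
    size = 0
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
            size += len(chunk)
    return h.hexdigest() == expected_oid and size == expected_size

print(verify("model-00001-of-00007.safetensors.pointer",
             "model-00001-of-00007.safetensors"))
```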
model-00002-of-00004.safetensors → model-00002-of-00007.safetensors
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:2469da3db18f67a7f2a0c8c32115b825ea1276568463e4311af31449451dfeb9
+size 4778633920
model-00003-of-00004.safetensors → model-00003-of-00007.safetensors
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:f09e1217e65a6a54749ec5af4b681a321038f3dfc23a95d78824f72a2ee2e20b
+size 4661160168
model-00004-of-00004.safetensors → model-00004-of-00007.safetensors
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:01b29df00b9972d5a548527774daaaae540c44cddf380bd2d3f7c922f633d6a5
+size 4661160192
model-00005-of-00007.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f027c8de3a20c6bd6e13b2d2485aef7a114cc8d2a2878df6361ba6477645d40
+size 4661160192
model-00006-of-00007.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a40ccc85e6fa499f8af46ca0b1d7bdfe241df001d29df351280fc8f41013c395
+size 4997750712
model-00007-of-00007.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a8db94d0998ccf9a252d84bc12ea065cc8f66280c7177a7c9e7e69c505efe8c
+size 3739371680
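The shard count follows directly from the dtype change: fp32 stores 4 bytes per parameter versus bf16's 2, so the checkpoint roughly doubles and no longer fits the previous four ~5 GB shards. A worked check against the pointer sizes above:

```python
# Worked check: sum the new shard sizes from the LFS pointers above.
sizes = [
    4978520816, 4778633920, 4661160168, 4661160192,
    4661160192, 4997750712, 3739371680,
]
total = sum(sizes)
print(f"{total / 1e9:.2f} GB")           # ~32.48 GB of fp32 weights
print(f"{total / 4 / 1e9:.2f}B params")  # ~8.12B parameters at 4 bytes each
# In bf16 (2 bytes/param) the same weights were ~16.2 GB, which fit in
# four ~5 GB shards; doubling to fp32 pushes the count to seven.
```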
model.safetensors.index.json
CHANGED
The diff for this file is too large to render.
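`model.safetensors.index.json` maps every tensor name to the shard file that holds it, so regrouping 4 shards into 7 rewrites nearly every entry, which is why the diff cannot be rendered. A minimal sketch for inspecting the new index, assuming the repo has been downloaded locally:

```python
# Minimal sketch: inspect the shard index that changed in this commit.
# The path is illustrative; the index format (metadata + weight_map)
# is the standard Hugging Face sharded-checkpoint layout.
import json
from collections import Counter

with open("model.safetensors.index.json") as f:
    index = json.load(f)

print(index["metadata"]["total_size"])  # total checkpoint size in bytes

# weight_map: tensor name -> shard file; count tensors per shard.
per_shard = Counter(index["weight_map"].values())
for shard, n_tensors in sorted(per_shard.items()):
    print(shard, n_tensors)
```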
model.yaml
ADDED
@@ -0,0 +1,253 @@
+model_name: molmo
+llm:
+  d_model: 3584
+  n_heads: 28
+  n_kv_heads: 4
+  head_dim: null
+  qkv_bias: true
+  clip_qkv: null
+  n_layers: 28
+  mlp_ratio: 4
+  mlp_hidden_size: 37888
+  activation_type: swiglu
+  block_type: sequential
+  rope: true
+  rope_full_precision: true
+  rope_theta: 1000000.0
+  rope_type: default
+  rope_factor: null
+  rope_high_freq_factor: null
+  rope_low_freq_factor: null
+  rope_original_max_position_embeddings: null
+  attention_type: sdpa
+  float32_attention: true
+  attention_dropout: 0.0
+  attention_layer_norm: false
+  attention_layer_norm_type: olmo
+  residual_dropout: 0.1
+  response_residual_dropout: 0.0
+  layer_norm_type: rms
+  layer_norm_with_affine: true
+  layer_norm_eps: 1.0e-06
+  attention_layer_norm_with_affine: true
+  max_sequence_length: 4096
+  max_position_embeddings: null
+  include_bias: false
+  bias_for_layer_norm: null
+  norm_after: false
+  moe_num_experts: 8
+  moe_top_k: 2
+  moe_mlp_impl: sparse
+  moe_log_expert_assignment: false
+  moe_shared_expert: false
+  moe_lbl_in_fp32: false
+  moe_interleave: false
+  moe_loss_weight: 0.1
+  moe_zloss_weight: null
+  moe_dropless: true
+  moe_capacity_factor: 1.25
+  embedding_dropout: 0.0
+  scale_logits: false
+  vocab_size: 152064
+  additional_vocab_size: 128
+  weight_tying: false
+  embedding_size: 152064
+  use_position_ids: true
+  tokenizer:
+    identifier: Qwen/Qwen2.5-7B
+    tokenizer_dir: null
+    depth_tokens: true
+  init_path: gs://mm-olmo/pretrained_llms/qwen2.5-7b.pt
+  init_incremental: null
+  new_embedding_init_range: 0.02
+  initializer_range: 0.02
+  normalize_input_embeds: false
+  activation_checkpoint: whole_layer
+  compile: blocks
+  fix_pad_tokenizer: false
+  resize_vocab: false
+  init_std: 0.02
+  init_fn: normal
+  init_cutoff_factor: null
+vision_backbone:
+  vit:
+    image_model_type: siglip
+    image_default_input_size:
+    - 378
+    - 378
+    image_patch_size: 14
+    image_pos_patch_size: 14
+    image_emb_dim: 1152
+    image_num_heads: 16
+    image_num_key_value_heads: 16
+    image_num_layers: 27
+    image_head_dim: 72
+    image_mlp_dim: 4304
+    image_mlp_activations: gelu_pytorch_tanh
+    image_dropout_rate: 0.0
+    image_num_pos: 729
+    image_norm_eps: 1.0e-06
+    attention_dropout: 0.0
+    residual_dropout: 0.0
+    initializer_range: 0.02
+    float32_attention: true
+    attention_type: sdpa
+    activation_checkpointing: true
+    init_path: gs://mm-olmo/pretrained_image_encoders/siglip2-so400m-14-384.pt
+    resize_mode: siglip
+    pad_value: 0.0
+    normalize: siglip
+  image_pooling_2d: attention_meanq
+  pooling_attention_mask: false
+  image_projector: mlp
+  image_padding_embed: null
+  vit_layers:
+  - -3
+  - -9
+  skip_unused_layers: true
+  image_feature_dropout: 0.0
+  connector_activation_checkpointing: true
+  compile_vit: blocks
+data_formatter:
+  prompt_templates: uber_model
+  message_format: role
+  system_prompt: demo_or_style
+  always_start_with_space: false
+  default_inference_len: 65
+  select_answer: best
+  debug: false
+  image_last: false
+  format_message_list: null
+  p_one_message: 0.0
+mm_preprocessor:
+  crop_mode: overlap-and-resize-c2
+  max_crops: 8
+  max_images: 2
+  max_multi_image_crops: 8
+  pooling_w: 2
+  pooling_h: 2
+  overlap_margins:
+  - 4
+  - 4
+  use_col_tokens: true
+  loss_token_weighting: root_subsegments
+  legacy_image_mask: false
+  max_answer_len: null
+  img_aug: false
+  bi_directional_attn: null
+lora_enable: false
+lora_rank: 64
+lora_alpha: 16
+lora_dropout: 0.05
+lora_bias: none
+n_action_bins: 256
+norm_stats:
+  fractal20220817_data:
+    action:
+      mean:
+      - 0.006987582892179489
+      - 0.006265917327255011
+      - -0.01262515690177679
+      - 0.04333311319351196
+      - -0.005756212864071131
+      - 0.0009130256366916001
+      - 0.5354204773902893
+      std:
+      - 0.0692116990685463
+      - 0.05970962345600128
+      - 0.07353084534406662
+      - 0.15610496699810028
+      - 0.13164450228214264
+      - 0.14593800902366638
+      - 0.497110515832901
+      max:
+      - 2.9984593391418457
+      - 22.09052848815918
+      - 2.7507524490356445
+      - 1.570636510848999
+      - 1.5321086645126343
+      - 1.5691522359848022
+      - 1.0
+      min:
+      - -2.0204520225524902
+      - -5.497899532318115
+      - -2.031663417816162
+      - -1.569917917251587
+      - -1.569892168045044
+      - -1.570419430732727
+      - 0.0
+      q01:
+      - -0.22453527510166169
+      - -0.14820013284683228
+      - -0.231589707583189
+      - -0.3517994859814644
+      - -0.4193011274933815
+      - -0.43643461108207704
+      - 0.0
+      q99:
+      - 0.17824687153100965
+      - 0.14938379630446405
+      - 0.21842354819178575
+      - 0.5892666035890578
+      - 0.35272657424211445
+      - 0.44796681255102094
+      - 1.0
+      mask:
+      - true
+      - true
+      - true
+      - true
+      - true
+      - true
+      - false
+    proprio:
+      mean:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      std:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      max:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      min:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      q01:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      q99:
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+      - 0.0
+    num_transitions: 3786400
+    num_trajectories: 87212
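The new model.yaml gathers the LLM, vision backbone, data-formatting, and action-normalization settings in one file. A minimal sketch for reading a few of its fields with PyYAML, accessing only keys present in the file above:

```python
# Minimal sketch: read a few fields out of the model.yaml added here.
import yaml  # pip install pyyaml

with open("model.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["model_name"])                              # molmo
print(cfg["llm"]["d_model"], cfg["llm"]["n_layers"])  # 3584 28
print(cfg["vision_backbone"]["vit"]["image_model_type"])  # siglip

# Per-dimension action stats for the fractal20220817_data mix:
stats = cfg["norm_stats"]["fractal20220817_data"]["action"]
print(len(stats["q01"]), stats["mask"])  # 7 entries, last one unmasked
```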