prasadsachin commited on
Commit
239441b
·
verified ·
1 Parent(s): 338a441

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/tokenizer/vocabulary.spm filter=lfs diff=lfs merge=lfs -text
assets/tokenizer/vocabulary.spm ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "module": "keras_hub.src.models.mixtral.mixtral_backbone",
3
+ "class_name": "MixtralBackbone",
4
+ "config": {
5
+ "name": "mixtral_backbone",
6
+ "trainable": true,
7
+ "vocabulary_size": 32000,
8
+ "num_layers": 32,
9
+ "num_query_heads": 32,
10
+ "hidden_dim": 4096,
11
+ "intermediate_dim": 14336,
12
+ "num_experts": 8,
13
+ "top_k": 2,
14
+ "router_jitter_noise": 0.0,
15
+ "rope_max_wavelength": 1000000.0,
16
+ "rope_scaling_factor": 1.0,
17
+ "num_key_value_heads": 8,
18
+ "router_aux_loss_coef": 0.02,
19
+ "sliding_window": null,
20
+ "layer_norm_epsilon": 1e-05,
21
+ "dropout": 0
22
+ },
23
+ "registered_name": "keras_hub>MixtralBackbone"
24
+ }
metadata.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "keras_version": "3.10.0.dev2025061603",
3
+ "keras_hub_version": "0.22.0.dev202506160417",
4
+ "parameter_count": 46702792704,
5
+ "date_saved": "2025-06-16@22:25:46",
6
+ "tasks": [
7
+ "CausalLM"
8
+ ]
9
+ }
model.weights.json ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 93405585408.0
4
+ },
5
+ "weight_map": {
6
+ "/layers/reversible_embedding/vars": [
7
+ "model_00000.weights.h5"
8
+ ],
9
+ "/layers/mixtral_transformer_decoder/_feedforward_layernorm/vars": "model_00000.weights.h5",
10
+ "/layers/mixtral_transformer_decoder/_self_attention_layer/key_dense/vars": "model_00000.weights.h5",
11
+ "/layers/mixtral_transformer_decoder/_self_attention_layer/output_dense/vars": "model_00000.weights.h5",
12
+ "/layers/mixtral_transformer_decoder/_self_attention_layer/query_dense/vars": "model_00000.weights.h5",
13
+ "/layers/mixtral_transformer_decoder/_self_attention_layer/value_dense/vars": "model_00000.weights.h5",
14
+ "/layers/mixtral_transformer_decoder/_self_attention_layernorm/vars": "model_00000.weights.h5",
15
+ "/layers/mixtral_transformer_decoder/_sparse_moe_block/expert_bank/vars": [
16
+ "model_00000.weights.h5"
17
+ ],
18
+ "/layers/mixtral_transformer_decoder/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00000.weights.h5",
19
+ "/layers/mixtral_transformer_decoder_1/_feedforward_layernorm/vars": "model_00000.weights.h5",
20
+ "/layers/mixtral_transformer_decoder_1/_self_attention_layer/key_dense/vars": "model_00000.weights.h5",
21
+ "/layers/mixtral_transformer_decoder_1/_self_attention_layer/output_dense/vars": "model_00000.weights.h5",
22
+ "/layers/mixtral_transformer_decoder_1/_self_attention_layer/query_dense/vars": "model_00000.weights.h5",
23
+ "/layers/mixtral_transformer_decoder_1/_self_attention_layer/value_dense/vars": "model_00000.weights.h5",
24
+ "/layers/mixtral_transformer_decoder_1/_self_attention_layernorm/vars": "model_00000.weights.h5",
25
+ "/layers/mixtral_transformer_decoder_1/_sparse_moe_block/expert_bank/vars": [
26
+ "model_00000.weights.h5"
27
+ ],
28
+ "/layers/mixtral_transformer_decoder_1/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00000.weights.h5",
29
+ "/layers/mixtral_transformer_decoder_2/_feedforward_layernorm/vars": "model_00000.weights.h5",
30
+ "/layers/mixtral_transformer_decoder_2/_self_attention_layer/key_dense/vars": "model_00000.weights.h5",
31
+ "/layers/mixtral_transformer_decoder_2/_self_attention_layer/output_dense/vars": "model_00000.weights.h5",
32
+ "/layers/mixtral_transformer_decoder_2/_self_attention_layer/query_dense/vars": "model_00000.weights.h5",
33
+ "/layers/mixtral_transformer_decoder_2/_self_attention_layer/value_dense/vars": "model_00000.weights.h5",
34
+ "/layers/mixtral_transformer_decoder_2/_self_attention_layernorm/vars": "model_00000.weights.h5",
35
+ "/layers/mixtral_transformer_decoder_2/_sparse_moe_block/expert_bank/vars": [
36
+ "model_00000.weights.h5"
37
+ ],
38
+ "/layers/mixtral_transformer_decoder_2/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00000.weights.h5",
39
+ "/layers/mixtral_transformer_decoder_3/_feedforward_layernorm/vars": "model_00000.weights.h5",
40
+ "/layers/mixtral_transformer_decoder_3/_self_attention_layer/key_dense/vars": "model_00000.weights.h5",
41
+ "/layers/mixtral_transformer_decoder_3/_self_attention_layer/output_dense/vars": "model_00000.weights.h5",
42
+ "/layers/mixtral_transformer_decoder_3/_self_attention_layer/query_dense/vars": "model_00000.weights.h5",
43
+ "/layers/mixtral_transformer_decoder_3/_self_attention_layer/value_dense/vars": "model_00000.weights.h5",
44
+ "/layers/mixtral_transformer_decoder_3/_self_attention_layernorm/vars": "model_00000.weights.h5",
45
+ "/layers/mixtral_transformer_decoder_3/_sparse_moe_block/expert_bank/vars": [
46
+ "model_00000.weights.h5",
47
+ "model_00001.weights.h5"
48
+ ],
49
+ "/layers/mixtral_transformer_decoder_3/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00001.weights.h5",
50
+ "/layers/mixtral_transformer_decoder_4/_feedforward_layernorm/vars": "model_00001.weights.h5",
51
+ "/layers/mixtral_transformer_decoder_4/_self_attention_layer/key_dense/vars": "model_00001.weights.h5",
52
+ "/layers/mixtral_transformer_decoder_4/_self_attention_layer/output_dense/vars": "model_00001.weights.h5",
53
+ "/layers/mixtral_transformer_decoder_4/_self_attention_layer/query_dense/vars": "model_00001.weights.h5",
54
+ "/layers/mixtral_transformer_decoder_4/_self_attention_layer/value_dense/vars": "model_00001.weights.h5",
55
+ "/layers/mixtral_transformer_decoder_4/_self_attention_layernorm/vars": "model_00001.weights.h5",
56
+ "/layers/mixtral_transformer_decoder_4/_sparse_moe_block/expert_bank/vars": [
57
+ "model_00001.weights.h5"
58
+ ],
59
+ "/layers/mixtral_transformer_decoder_4/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00001.weights.h5",
60
+ "/layers/mixtral_transformer_decoder_5/_feedforward_layernorm/vars": "model_00001.weights.h5",
61
+ "/layers/mixtral_transformer_decoder_5/_self_attention_layer/key_dense/vars": "model_00001.weights.h5",
62
+ "/layers/mixtral_transformer_decoder_5/_self_attention_layer/output_dense/vars": "model_00001.weights.h5",
63
+ "/layers/mixtral_transformer_decoder_5/_self_attention_layer/query_dense/vars": "model_00001.weights.h5",
64
+ "/layers/mixtral_transformer_decoder_5/_self_attention_layer/value_dense/vars": "model_00001.weights.h5",
65
+ "/layers/mixtral_transformer_decoder_5/_self_attention_layernorm/vars": "model_00001.weights.h5",
66
+ "/layers/mixtral_transformer_decoder_5/_sparse_moe_block/expert_bank/vars": [
67
+ "model_00001.weights.h5"
68
+ ],
69
+ "/layers/mixtral_transformer_decoder_5/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00001.weights.h5",
70
+ "/layers/mixtral_transformer_decoder_6/_feedforward_layernorm/vars": "model_00001.weights.h5",
71
+ "/layers/mixtral_transformer_decoder_6/_self_attention_layer/key_dense/vars": "model_00001.weights.h5",
72
+ "/layers/mixtral_transformer_decoder_6/_self_attention_layer/output_dense/vars": "model_00001.weights.h5",
73
+ "/layers/mixtral_transformer_decoder_6/_self_attention_layer/query_dense/vars": "model_00001.weights.h5",
74
+ "/layers/mixtral_transformer_decoder_6/_self_attention_layer/value_dense/vars": "model_00001.weights.h5",
75
+ "/layers/mixtral_transformer_decoder_6/_self_attention_layernorm/vars": "model_00001.weights.h5",
76
+ "/layers/mixtral_transformer_decoder_6/_sparse_moe_block/expert_bank/vars": [
77
+ "model_00001.weights.h5"
78
+ ],
79
+ "/layers/mixtral_transformer_decoder_6/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00001.weights.h5",
80
+ "/layers/mixtral_transformer_decoder_7/_feedforward_layernorm/vars": "model_00001.weights.h5",
81
+ "/layers/mixtral_transformer_decoder_7/_self_attention_layer/key_dense/vars": "model_00001.weights.h5",
82
+ "/layers/mixtral_transformer_decoder_7/_self_attention_layer/output_dense/vars": "model_00001.weights.h5",
83
+ "/layers/mixtral_transformer_decoder_7/_self_attention_layer/query_dense/vars": "model_00001.weights.h5",
84
+ "/layers/mixtral_transformer_decoder_7/_self_attention_layer/value_dense/vars": "model_00001.weights.h5",
85
+ "/layers/mixtral_transformer_decoder_7/_self_attention_layernorm/vars": "model_00001.weights.h5",
86
+ "/layers/mixtral_transformer_decoder_7/_sparse_moe_block/expert_bank/vars": [
87
+ "model_00002.weights.h5"
88
+ ],
89
+ "/layers/mixtral_transformer_decoder_7/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00002.weights.h5",
90
+ "/layers/mixtral_transformer_decoder_8/_feedforward_layernorm/vars": "model_00002.weights.h5",
91
+ "/layers/mixtral_transformer_decoder_8/_self_attention_layer/key_dense/vars": "model_00002.weights.h5",
92
+ "/layers/mixtral_transformer_decoder_8/_self_attention_layer/output_dense/vars": "model_00002.weights.h5",
93
+ "/layers/mixtral_transformer_decoder_8/_self_attention_layer/query_dense/vars": "model_00002.weights.h5",
94
+ "/layers/mixtral_transformer_decoder_8/_self_attention_layer/value_dense/vars": "model_00002.weights.h5",
95
+ "/layers/mixtral_transformer_decoder_8/_self_attention_layernorm/vars": "model_00002.weights.h5",
96
+ "/layers/mixtral_transformer_decoder_8/_sparse_moe_block/expert_bank/vars": [
97
+ "model_00002.weights.h5"
98
+ ],
99
+ "/layers/mixtral_transformer_decoder_8/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00002.weights.h5",
100
+ "/layers/mixtral_transformer_decoder_9/_feedforward_layernorm/vars": "model_00002.weights.h5",
101
+ "/layers/mixtral_transformer_decoder_9/_self_attention_layer/key_dense/vars": "model_00002.weights.h5",
102
+ "/layers/mixtral_transformer_decoder_9/_self_attention_layer/output_dense/vars": "model_00002.weights.h5",
103
+ "/layers/mixtral_transformer_decoder_9/_self_attention_layer/query_dense/vars": "model_00002.weights.h5",
104
+ "/layers/mixtral_transformer_decoder_9/_self_attention_layer/value_dense/vars": "model_00002.weights.h5",
105
+ "/layers/mixtral_transformer_decoder_9/_self_attention_layernorm/vars": "model_00002.weights.h5",
106
+ "/layers/mixtral_transformer_decoder_9/_sparse_moe_block/expert_bank/vars": [
107
+ "model_00002.weights.h5"
108
+ ],
109
+ "/layers/mixtral_transformer_decoder_9/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00002.weights.h5",
110
+ "/layers/mixtral_transformer_decoder_10/_feedforward_layernorm/vars": "model_00002.weights.h5",
111
+ "/layers/mixtral_transformer_decoder_10/_self_attention_layer/key_dense/vars": "model_00002.weights.h5",
112
+ "/layers/mixtral_transformer_decoder_10/_self_attention_layer/output_dense/vars": "model_00002.weights.h5",
113
+ "/layers/mixtral_transformer_decoder_10/_self_attention_layer/query_dense/vars": "model_00002.weights.h5",
114
+ "/layers/mixtral_transformer_decoder_10/_self_attention_layer/value_dense/vars": "model_00002.weights.h5",
115
+ "/layers/mixtral_transformer_decoder_10/_self_attention_layernorm/vars": "model_00002.weights.h5",
116
+ "/layers/mixtral_transformer_decoder_10/_sparse_moe_block/expert_bank/vars": [
117
+ "model_00002.weights.h5",
118
+ "model_00003.weights.h5"
119
+ ],
120
+ "/layers/mixtral_transformer_decoder_10/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00003.weights.h5",
121
+ "/layers/mixtral_transformer_decoder_11/_feedforward_layernorm/vars": "model_00003.weights.h5",
122
+ "/layers/mixtral_transformer_decoder_11/_self_attention_layer/key_dense/vars": "model_00003.weights.h5",
123
+ "/layers/mixtral_transformer_decoder_11/_self_attention_layer/output_dense/vars": "model_00003.weights.h5",
124
+ "/layers/mixtral_transformer_decoder_11/_self_attention_layer/query_dense/vars": "model_00003.weights.h5",
125
+ "/layers/mixtral_transformer_decoder_11/_self_attention_layer/value_dense/vars": "model_00003.weights.h5",
126
+ "/layers/mixtral_transformer_decoder_11/_self_attention_layernorm/vars": "model_00003.weights.h5",
127
+ "/layers/mixtral_transformer_decoder_11/_sparse_moe_block/expert_bank/vars": [
128
+ "model_00003.weights.h5"
129
+ ],
130
+ "/layers/mixtral_transformer_decoder_11/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00003.weights.h5",
131
+ "/layers/mixtral_transformer_decoder_12/_feedforward_layernorm/vars": "model_00003.weights.h5",
132
+ "/layers/mixtral_transformer_decoder_12/_self_attention_layer/key_dense/vars": "model_00003.weights.h5",
133
+ "/layers/mixtral_transformer_decoder_12/_self_attention_layer/output_dense/vars": "model_00003.weights.h5",
134
+ "/layers/mixtral_transformer_decoder_12/_self_attention_layer/query_dense/vars": "model_00003.weights.h5",
135
+ "/layers/mixtral_transformer_decoder_12/_self_attention_layer/value_dense/vars": "model_00003.weights.h5",
136
+ "/layers/mixtral_transformer_decoder_12/_self_attention_layernorm/vars": "model_00003.weights.h5",
137
+ "/layers/mixtral_transformer_decoder_12/_sparse_moe_block/expert_bank/vars": [
138
+ "model_00003.weights.h5"
139
+ ],
140
+ "/layers/mixtral_transformer_decoder_12/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00003.weights.h5",
141
+ "/layers/mixtral_transformer_decoder_13/_feedforward_layernorm/vars": "model_00003.weights.h5",
142
+ "/layers/mixtral_transformer_decoder_13/_self_attention_layer/key_dense/vars": "model_00003.weights.h5",
143
+ "/layers/mixtral_transformer_decoder_13/_self_attention_layer/output_dense/vars": "model_00003.weights.h5",
144
+ "/layers/mixtral_transformer_decoder_13/_self_attention_layer/query_dense/vars": "model_00003.weights.h5",
145
+ "/layers/mixtral_transformer_decoder_13/_self_attention_layer/value_dense/vars": "model_00003.weights.h5",
146
+ "/layers/mixtral_transformer_decoder_13/_self_attention_layernorm/vars": "model_00003.weights.h5",
147
+ "/layers/mixtral_transformer_decoder_13/_sparse_moe_block/expert_bank/vars": [
148
+ "model_00003.weights.h5"
149
+ ],
150
+ "/layers/mixtral_transformer_decoder_13/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00003.weights.h5",
151
+ "/layers/mixtral_transformer_decoder_14/_feedforward_layernorm/vars": "model_00003.weights.h5",
152
+ "/layers/mixtral_transformer_decoder_14/_self_attention_layer/key_dense/vars": "model_00003.weights.h5",
153
+ "/layers/mixtral_transformer_decoder_14/_self_attention_layer/output_dense/vars": "model_00003.weights.h5",
154
+ "/layers/mixtral_transformer_decoder_14/_self_attention_layer/query_dense/vars": "model_00003.weights.h5",
155
+ "/layers/mixtral_transformer_decoder_14/_self_attention_layer/value_dense/vars": "model_00003.weights.h5",
156
+ "/layers/mixtral_transformer_decoder_14/_self_attention_layernorm/vars": "model_00003.weights.h5",
157
+ "/layers/mixtral_transformer_decoder_14/_sparse_moe_block/expert_bank/vars": [
158
+ "model_00003.weights.h5",
159
+ "model_00004.weights.h5"
160
+ ],
161
+ "/layers/mixtral_transformer_decoder_14/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00004.weights.h5",
162
+ "/layers/mixtral_transformer_decoder_15/_feedforward_layernorm/vars": "model_00004.weights.h5",
163
+ "/layers/mixtral_transformer_decoder_15/_self_attention_layer/key_dense/vars": "model_00004.weights.h5",
164
+ "/layers/mixtral_transformer_decoder_15/_self_attention_layer/output_dense/vars": "model_00004.weights.h5",
165
+ "/layers/mixtral_transformer_decoder_15/_self_attention_layer/query_dense/vars": "model_00004.weights.h5",
166
+ "/layers/mixtral_transformer_decoder_15/_self_attention_layer/value_dense/vars": "model_00004.weights.h5",
167
+ "/layers/mixtral_transformer_decoder_15/_self_attention_layernorm/vars": "model_00004.weights.h5",
168
+ "/layers/mixtral_transformer_decoder_15/_sparse_moe_block/expert_bank/vars": [
169
+ "model_00004.weights.h5"
170
+ ],
171
+ "/layers/mixtral_transformer_decoder_15/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00004.weights.h5",
172
+ "/layers/mixtral_transformer_decoder_16/_feedforward_layernorm/vars": "model_00004.weights.h5",
173
+ "/layers/mixtral_transformer_decoder_16/_self_attention_layer/key_dense/vars": "model_00004.weights.h5",
174
+ "/layers/mixtral_transformer_decoder_16/_self_attention_layer/output_dense/vars": "model_00004.weights.h5",
175
+ "/layers/mixtral_transformer_decoder_16/_self_attention_layer/query_dense/vars": "model_00004.weights.h5",
176
+ "/layers/mixtral_transformer_decoder_16/_self_attention_layer/value_dense/vars": "model_00004.weights.h5",
177
+ "/layers/mixtral_transformer_decoder_16/_self_attention_layernorm/vars": "model_00004.weights.h5",
178
+ "/layers/mixtral_transformer_decoder_16/_sparse_moe_block/expert_bank/vars": [
179
+ "model_00004.weights.h5"
180
+ ],
181
+ "/layers/mixtral_transformer_decoder_16/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00004.weights.h5",
182
+ "/layers/mixtral_transformer_decoder_17/_feedforward_layernorm/vars": "model_00004.weights.h5",
183
+ "/layers/mixtral_transformer_decoder_17/_self_attention_layer/key_dense/vars": "model_00004.weights.h5",
184
+ "/layers/mixtral_transformer_decoder_17/_self_attention_layer/output_dense/vars": "model_00004.weights.h5",
185
+ "/layers/mixtral_transformer_decoder_17/_self_attention_layer/query_dense/vars": "model_00004.weights.h5",
186
+ "/layers/mixtral_transformer_decoder_17/_self_attention_layer/value_dense/vars": "model_00004.weights.h5",
187
+ "/layers/mixtral_transformer_decoder_17/_self_attention_layernorm/vars": "model_00004.weights.h5",
188
+ "/layers/mixtral_transformer_decoder_17/_sparse_moe_block/expert_bank/vars": [
189
+ "model_00004.weights.h5"
190
+ ],
191
+ "/layers/mixtral_transformer_decoder_17/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00004.weights.h5",
192
+ "/layers/mixtral_transformer_decoder_18/_feedforward_layernorm/vars": "model_00004.weights.h5",
193
+ "/layers/mixtral_transformer_decoder_18/_self_attention_layer/key_dense/vars": "model_00004.weights.h5",
194
+ "/layers/mixtral_transformer_decoder_18/_self_attention_layer/output_dense/vars": "model_00004.weights.h5",
195
+ "/layers/mixtral_transformer_decoder_18/_self_attention_layer/query_dense/vars": "model_00004.weights.h5",
196
+ "/layers/mixtral_transformer_decoder_18/_self_attention_layer/value_dense/vars": "model_00004.weights.h5",
197
+ "/layers/mixtral_transformer_decoder_18/_self_attention_layernorm/vars": "model_00004.weights.h5",
198
+ "/layers/mixtral_transformer_decoder_18/_sparse_moe_block/expert_bank/vars": [
199
+ "model_00005.weights.h5"
200
+ ],
201
+ "/layers/mixtral_transformer_decoder_18/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00005.weights.h5",
202
+ "/layers/mixtral_transformer_decoder_19/_feedforward_layernorm/vars": "model_00005.weights.h5",
203
+ "/layers/mixtral_transformer_decoder_19/_self_attention_layer/key_dense/vars": "model_00005.weights.h5",
204
+ "/layers/mixtral_transformer_decoder_19/_self_attention_layer/output_dense/vars": "model_00005.weights.h5",
205
+ "/layers/mixtral_transformer_decoder_19/_self_attention_layer/query_dense/vars": "model_00005.weights.h5",
206
+ "/layers/mixtral_transformer_decoder_19/_self_attention_layer/value_dense/vars": "model_00005.weights.h5",
207
+ "/layers/mixtral_transformer_decoder_19/_self_attention_layernorm/vars": "model_00005.weights.h5",
208
+ "/layers/mixtral_transformer_decoder_19/_sparse_moe_block/expert_bank/vars": [
209
+ "model_00005.weights.h5"
210
+ ],
211
+ "/layers/mixtral_transformer_decoder_19/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00005.weights.h5",
212
+ "/layers/mixtral_transformer_decoder_20/_feedforward_layernorm/vars": "model_00005.weights.h5",
213
+ "/layers/mixtral_transformer_decoder_20/_self_attention_layer/key_dense/vars": "model_00005.weights.h5",
214
+ "/layers/mixtral_transformer_decoder_20/_self_attention_layer/output_dense/vars": "model_00005.weights.h5",
215
+ "/layers/mixtral_transformer_decoder_20/_self_attention_layer/query_dense/vars": "model_00005.weights.h5",
216
+ "/layers/mixtral_transformer_decoder_20/_self_attention_layer/value_dense/vars": "model_00005.weights.h5",
217
+ "/layers/mixtral_transformer_decoder_20/_self_attention_layernorm/vars": "model_00005.weights.h5",
218
+ "/layers/mixtral_transformer_decoder_20/_sparse_moe_block/expert_bank/vars": [
219
+ "model_00005.weights.h5"
220
+ ],
221
+ "/layers/mixtral_transformer_decoder_20/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00005.weights.h5",
222
+ "/layers/mixtral_transformer_decoder_21/_feedforward_layernorm/vars": "model_00005.weights.h5",
223
+ "/layers/mixtral_transformer_decoder_21/_self_attention_layer/key_dense/vars": "model_00005.weights.h5",
224
+ "/layers/mixtral_transformer_decoder_21/_self_attention_layer/output_dense/vars": "model_00005.weights.h5",
225
+ "/layers/mixtral_transformer_decoder_21/_self_attention_layer/query_dense/vars": "model_00005.weights.h5",
226
+ "/layers/mixtral_transformer_decoder_21/_self_attention_layer/value_dense/vars": "model_00005.weights.h5",
227
+ "/layers/mixtral_transformer_decoder_21/_self_attention_layernorm/vars": "model_00005.weights.h5",
228
+ "/layers/mixtral_transformer_decoder_21/_sparse_moe_block/expert_bank/vars": [
229
+ "model_00005.weights.h5",
230
+ "model_00006.weights.h5"
231
+ ],
232
+ "/layers/mixtral_transformer_decoder_21/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00006.weights.h5",
233
+ "/layers/mixtral_transformer_decoder_22/_feedforward_layernorm/vars": "model_00006.weights.h5",
234
+ "/layers/mixtral_transformer_decoder_22/_self_attention_layer/key_dense/vars": "model_00006.weights.h5",
235
+ "/layers/mixtral_transformer_decoder_22/_self_attention_layer/output_dense/vars": "model_00006.weights.h5",
236
+ "/layers/mixtral_transformer_decoder_22/_self_attention_layer/query_dense/vars": "model_00006.weights.h5",
237
+ "/layers/mixtral_transformer_decoder_22/_self_attention_layer/value_dense/vars": "model_00006.weights.h5",
238
+ "/layers/mixtral_transformer_decoder_22/_self_attention_layernorm/vars": "model_00006.weights.h5",
239
+ "/layers/mixtral_transformer_decoder_22/_sparse_moe_block/expert_bank/vars": [
240
+ "model_00006.weights.h5"
241
+ ],
242
+ "/layers/mixtral_transformer_decoder_22/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00006.weights.h5",
243
+ "/layers/mixtral_transformer_decoder_23/_feedforward_layernorm/vars": "model_00006.weights.h5",
244
+ "/layers/mixtral_transformer_decoder_23/_self_attention_layer/key_dense/vars": "model_00006.weights.h5",
245
+ "/layers/mixtral_transformer_decoder_23/_self_attention_layer/output_dense/vars": "model_00006.weights.h5",
246
+ "/layers/mixtral_transformer_decoder_23/_self_attention_layer/query_dense/vars": "model_00006.weights.h5",
247
+ "/layers/mixtral_transformer_decoder_23/_self_attention_layer/value_dense/vars": "model_00006.weights.h5",
248
+ "/layers/mixtral_transformer_decoder_23/_self_attention_layernorm/vars": "model_00006.weights.h5",
249
+ "/layers/mixtral_transformer_decoder_23/_sparse_moe_block/expert_bank/vars": [
250
+ "model_00006.weights.h5"
251
+ ],
252
+ "/layers/mixtral_transformer_decoder_23/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00006.weights.h5",
253
+ "/layers/mixtral_transformer_decoder_24/_feedforward_layernorm/vars": "model_00006.weights.h5",
254
+ "/layers/mixtral_transformer_decoder_24/_self_attention_layer/key_dense/vars": "model_00006.weights.h5",
255
+ "/layers/mixtral_transformer_decoder_24/_self_attention_layer/output_dense/vars": "model_00006.weights.h5",
256
+ "/layers/mixtral_transformer_decoder_24/_self_attention_layer/query_dense/vars": "model_00006.weights.h5",
257
+ "/layers/mixtral_transformer_decoder_24/_self_attention_layer/value_dense/vars": "model_00006.weights.h5",
258
+ "/layers/mixtral_transformer_decoder_24/_self_attention_layernorm/vars": "model_00006.weights.h5",
259
+ "/layers/mixtral_transformer_decoder_24/_sparse_moe_block/expert_bank/vars": [
260
+ "model_00006.weights.h5"
261
+ ],
262
+ "/layers/mixtral_transformer_decoder_24/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00006.weights.h5",
263
+ "/layers/mixtral_transformer_decoder_25/_feedforward_layernorm/vars": "model_00006.weights.h5",
264
+ "/layers/mixtral_transformer_decoder_25/_self_attention_layer/key_dense/vars": "model_00006.weights.h5",
265
+ "/layers/mixtral_transformer_decoder_25/_self_attention_layer/output_dense/vars": "model_00006.weights.h5",
266
+ "/layers/mixtral_transformer_decoder_25/_self_attention_layer/query_dense/vars": "model_00006.weights.h5",
267
+ "/layers/mixtral_transformer_decoder_25/_self_attention_layer/value_dense/vars": "model_00006.weights.h5",
268
+ "/layers/mixtral_transformer_decoder_25/_self_attention_layernorm/vars": "model_00006.weights.h5",
269
+ "/layers/mixtral_transformer_decoder_25/_sparse_moe_block/expert_bank/vars": [
270
+ "model_00006.weights.h5",
271
+ "model_00007.weights.h5"
272
+ ],
273
+ "/layers/mixtral_transformer_decoder_25/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00007.weights.h5",
274
+ "/layers/mixtral_transformer_decoder_26/_feedforward_layernorm/vars": "model_00007.weights.h5",
275
+ "/layers/mixtral_transformer_decoder_26/_self_attention_layer/key_dense/vars": "model_00007.weights.h5",
276
+ "/layers/mixtral_transformer_decoder_26/_self_attention_layer/output_dense/vars": "model_00007.weights.h5",
277
+ "/layers/mixtral_transformer_decoder_26/_self_attention_layer/query_dense/vars": "model_00007.weights.h5",
278
+ "/layers/mixtral_transformer_decoder_26/_self_attention_layer/value_dense/vars": "model_00007.weights.h5",
279
+ "/layers/mixtral_transformer_decoder_26/_self_attention_layernorm/vars": "model_00007.weights.h5",
280
+ "/layers/mixtral_transformer_decoder_26/_sparse_moe_block/expert_bank/vars": [
281
+ "model_00007.weights.h5"
282
+ ],
283
+ "/layers/mixtral_transformer_decoder_26/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00007.weights.h5",
284
+ "/layers/mixtral_transformer_decoder_27/_feedforward_layernorm/vars": "model_00007.weights.h5",
285
+ "/layers/mixtral_transformer_decoder_27/_self_attention_layer/key_dense/vars": "model_00007.weights.h5",
286
+ "/layers/mixtral_transformer_decoder_27/_self_attention_layer/output_dense/vars": "model_00007.weights.h5",
287
+ "/layers/mixtral_transformer_decoder_27/_self_attention_layer/query_dense/vars": "model_00007.weights.h5",
288
+ "/layers/mixtral_transformer_decoder_27/_self_attention_layer/value_dense/vars": "model_00007.weights.h5",
289
+ "/layers/mixtral_transformer_decoder_27/_self_attention_layernorm/vars": "model_00007.weights.h5",
290
+ "/layers/mixtral_transformer_decoder_27/_sparse_moe_block/expert_bank/vars": [
291
+ "model_00007.weights.h5"
292
+ ],
293
+ "/layers/mixtral_transformer_decoder_27/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00007.weights.h5",
294
+ "/layers/mixtral_transformer_decoder_28/_feedforward_layernorm/vars": "model_00007.weights.h5",
295
+ "/layers/mixtral_transformer_decoder_28/_self_attention_layer/key_dense/vars": "model_00007.weights.h5",
296
+ "/layers/mixtral_transformer_decoder_28/_self_attention_layer/output_dense/vars": "model_00007.weights.h5",
297
+ "/layers/mixtral_transformer_decoder_28/_self_attention_layer/query_dense/vars": "model_00007.weights.h5",
298
+ "/layers/mixtral_transformer_decoder_28/_self_attention_layer/value_dense/vars": "model_00007.weights.h5",
299
+ "/layers/mixtral_transformer_decoder_28/_self_attention_layernorm/vars": "model_00007.weights.h5",
300
+ "/layers/mixtral_transformer_decoder_28/_sparse_moe_block/expert_bank/vars": [
301
+ "model_00007.weights.h5"
302
+ ],
303
+ "/layers/mixtral_transformer_decoder_28/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00007.weights.h5",
304
+ "/layers/mixtral_transformer_decoder_29/_feedforward_layernorm/vars": "model_00007.weights.h5",
305
+ "/layers/mixtral_transformer_decoder_29/_self_attention_layer/key_dense/vars": "model_00007.weights.h5",
306
+ "/layers/mixtral_transformer_decoder_29/_self_attention_layer/output_dense/vars": "model_00007.weights.h5",
307
+ "/layers/mixtral_transformer_decoder_29/_self_attention_layer/query_dense/vars": "model_00007.weights.h5",
308
+ "/layers/mixtral_transformer_decoder_29/_self_attention_layer/value_dense/vars": "model_00007.weights.h5",
309
+ "/layers/mixtral_transformer_decoder_29/_self_attention_layernorm/vars": "model_00007.weights.h5",
310
+ "/layers/mixtral_transformer_decoder_29/_sparse_moe_block/expert_bank/vars": [
311
+ "model_00008.weights.h5"
312
+ ],
313
+ "/layers/mixtral_transformer_decoder_29/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00008.weights.h5",
314
+ "/layers/mixtral_transformer_decoder_30/_feedforward_layernorm/vars": "model_00008.weights.h5",
315
+ "/layers/mixtral_transformer_decoder_30/_self_attention_layer/key_dense/vars": "model_00008.weights.h5",
316
+ "/layers/mixtral_transformer_decoder_30/_self_attention_layer/output_dense/vars": "model_00008.weights.h5",
317
+ "/layers/mixtral_transformer_decoder_30/_self_attention_layer/query_dense/vars": "model_00008.weights.h5",
318
+ "/layers/mixtral_transformer_decoder_30/_self_attention_layer/value_dense/vars": "model_00008.weights.h5",
319
+ "/layers/mixtral_transformer_decoder_30/_self_attention_layernorm/vars": "model_00008.weights.h5",
320
+ "/layers/mixtral_transformer_decoder_30/_sparse_moe_block/expert_bank/vars": [
321
+ "model_00008.weights.h5"
322
+ ],
323
+ "/layers/mixtral_transformer_decoder_30/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00008.weights.h5",
324
+ "/layers/mixtral_transformer_decoder_31/_feedforward_layernorm/vars": "model_00008.weights.h5",
325
+ "/layers/mixtral_transformer_decoder_31/_self_attention_layer/key_dense/vars": "model_00008.weights.h5",
326
+ "/layers/mixtral_transformer_decoder_31/_self_attention_layer/output_dense/vars": "model_00008.weights.h5",
327
+ "/layers/mixtral_transformer_decoder_31/_self_attention_layer/query_dense/vars": "model_00008.weights.h5",
328
+ "/layers/mixtral_transformer_decoder_31/_self_attention_layer/value_dense/vars": "model_00008.weights.h5",
329
+ "/layers/mixtral_transformer_decoder_31/_self_attention_layernorm/vars": "model_00008.weights.h5",
330
+ "/layers/mixtral_transformer_decoder_31/_sparse_moe_block/expert_bank/vars": [
331
+ "model_00008.weights.h5"
332
+ ],
333
+ "/layers/mixtral_transformer_decoder_31/_sparse_moe_block/_sparse_feedforward_gate_dense/vars": "model_00008.weights.h5",
334
+ "/layers/mixtral_layer_normalization/vars": "model_00008.weights.h5"
335
+ }
336
+ }
model_00000.weights.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bd781f94920ae078bd6762fbb620d70f12c3e8dbde27fe4bdc6a7ac113fad15
3
+ size 10255478248
model_00001.weights.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:914fe74f45ddc092a46e2d09d069966e2952c2d66d6c8a56d39d9585d4c676a5
3
+ size 10670779248
model_00002.weights.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61cc422ed9127b90f20b4b6008f5e8d19890b022835d277427e6aaa453d989f3
3
+ size 10586780928
model_00003.weights.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95cfc1aaf94aae7dc74c978f28466cdc036126857de4fda26aa0b551a9613af4
3
+ size 10670779568
model_00004.weights.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96940759d70b5d16d6023611ec96c339c0d77be50da001ac776509f0b21d2157
3
+ size 10670779248
model_00005.weights.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fd03008bbae40afbdb55332ebac6bd569cd715fe3e872c7e7e49564bd44e072
3
+ size 10586780928
model_00006.weights.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:021da30ae6c89a9b9eeb433ccae95490609f4056834ada895519abb1a226fd26
3
+ size 10670779568
model_00007.weights.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:220b921c709fbe8ec8c68c5379e3c18ec0b7e2ba0129567478fd1fa466848ace
3
+ size 10670779248
model_00008.weights.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5704809aa3aeb934ffe99f73833a03de47c0952b7b5098dca7f6cbedbbd19c1
3
+ size 8623809544
preprocessor.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "module": "keras_hub.src.models.mixtral.mixtral_causal_lm_preprocessor",
3
+ "class_name": "MixtralCausalLMPreprocessor",
4
+ "config": {
5
+ "name": "mixtral_causal_lm_preprocessor_2",
6
+ "trainable": true,
7
+ "dtype": {
8
+ "module": "keras",
9
+ "class_name": "DTypePolicy",
10
+ "config": {
11
+ "name": "float32"
12
+ },
13
+ "registered_name": null
14
+ },
15
+ "tokenizer": {
16
+ "module": "keras_hub.src.models.mixtral.mixtral_tokenizer",
17
+ "class_name": "MixtralTokenizer",
18
+ "config": {
19
+ "name": "mixtral_tokenizer",
20
+ "trainable": true,
21
+ "dtype": {
22
+ "module": "keras",
23
+ "class_name": "DTypePolicy",
24
+ "config": {
25
+ "name": "int32"
26
+ },
27
+ "registered_name": null
28
+ },
29
+ "config_file": "tokenizer.json",
30
+ "proto": null,
31
+ "sequence_length": null,
32
+ "add_bos": false,
33
+ "add_eos": false
34
+ },
35
+ "registered_name": "keras_hub>MixtralTokenizer"
36
+ },
37
+ "config_file": "preprocessor.json",
38
+ "sequence_length": 1024,
39
+ "add_start_token": true,
40
+ "add_end_token": true
41
+ },
42
+ "registered_name": "keras_hub>MixtralCausalLMPreprocessor"
43
+ }
task.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "module": "keras_hub.src.models.mixtral.mixtral_causal_lm",
3
+ "class_name": "MixtralCausalLM",
4
+ "config": {
5
+ "backbone": {
6
+ "module": "keras_hub.src.models.mixtral.mixtral_backbone",
7
+ "class_name": "MixtralBackbone",
8
+ "config": {
9
+ "name": "mixtral_backbone",
10
+ "trainable": true,
11
+ "vocabulary_size": 32000,
12
+ "num_layers": 32,
13
+ "num_query_heads": 32,
14
+ "hidden_dim": 4096,
15
+ "intermediate_dim": 14336,
16
+ "num_experts": 8,
17
+ "top_k": 2,
18
+ "router_jitter_noise": 0.0,
19
+ "rope_max_wavelength": 1000000.0,
20
+ "rope_scaling_factor": 1.0,
21
+ "num_key_value_heads": 8,
22
+ "router_aux_loss_coef": 0.02,
23
+ "sliding_window": null,
24
+ "layer_norm_epsilon": 1e-05,
25
+ "dropout": 0
26
+ },
27
+ "registered_name": "keras_hub>MixtralBackbone"
28
+ },
29
+ "preprocessor": {
30
+ "module": "keras_hub.src.models.mixtral.mixtral_causal_lm_preprocessor",
31
+ "class_name": "MixtralCausalLMPreprocessor",
32
+ "config": {
33
+ "name": "mixtral_causal_lm_preprocessor_2",
34
+ "trainable": true,
35
+ "dtype": {
36
+ "module": "keras",
37
+ "class_name": "DTypePolicy",
38
+ "config": {
39
+ "name": "float32"
40
+ },
41
+ "registered_name": null
42
+ },
43
+ "tokenizer": {
44
+ "module": "keras_hub.src.models.mixtral.mixtral_tokenizer",
45
+ "class_name": "MixtralTokenizer",
46
+ "config": {
47
+ "name": "mixtral_tokenizer",
48
+ "trainable": true,
49
+ "dtype": {
50
+ "module": "keras",
51
+ "class_name": "DTypePolicy",
52
+ "config": {
53
+ "name": "int32"
54
+ },
55
+ "registered_name": null
56
+ },
57
+ "config_file": "tokenizer.json",
58
+ "proto": null,
59
+ "sequence_length": null,
60
+ "add_bos": false,
61
+ "add_eos": false
62
+ },
63
+ "registered_name": "keras_hub>MixtralTokenizer"
64
+ },
65
+ "config_file": "preprocessor.json",
66
+ "sequence_length": 1024,
67
+ "add_start_token": true,
68
+ "add_end_token": true
69
+ },
70
+ "registered_name": "keras_hub>MixtralCausalLMPreprocessor"
71
+ },
72
+ "name": "mixtral_causal_lm"
73
+ },
74
+ "registered_name": "keras_hub>MixtralCausalLM"
75
+ }
tokenizer.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "module": "keras_hub.src.models.mixtral.mixtral_tokenizer",
3
+ "class_name": "MixtralTokenizer",
4
+ "config": {
5
+ "name": "mixtral_tokenizer",
6
+ "trainable": true,
7
+ "dtype": {
8
+ "module": "keras",
9
+ "class_name": "DTypePolicy",
10
+ "config": {
11
+ "name": "int32"
12
+ },
13
+ "registered_name": null
14
+ },
15
+ "config_file": "tokenizer.json",
16
+ "proto": null,
17
+ "sequence_length": null,
18
+ "add_bos": false,
19
+ "add_eos": false
20
+ },
21
+ "registered_name": "keras_hub>MixtralTokenizer"
22
+ }