Upload folder using huggingface_hub

Changed files:
- README.md (+16 -13)
- config.json (+2 -2)
- model.safetensors (+2 -2)
README.md CHANGED:

````diff
@@ -17,7 +17,11 @@ This tiny model is for debugging. It is randomly initialized with the config ada
 - vLLM
 
 ```bash
-
+python -m vllm.entrypoints.openai.api_server \
+  --tensor-parallel-size 2 \
+  --model yujiepan/deepseek-v3.1-tiny-random \
+  --trust-remote-code \
+  --speculative-config='{"method": "deepseek_mtp", "num_speculative_tokens": 1}'
 ```
 
 - Transformers
````
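This commit fills in the previously empty vLLM snippet. Once that server is up, any OpenAI-compatible client can hit it; a minimal sketch, assuming vLLM's default port 8000 and the `openai` Python package (both assumptions, not part of this commit):

```python
from openai import OpenAI

# Point the OpenAI client at the local vLLM server started above.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.completions.create(
    model="yujiepan/deepseek-v3.1-tiny-random",
    prompt="Hello",
    max_tokens=16,
)
# The checkpoint is randomly initialized, so the output is gibberish; the point
# is only to exercise the serving path, including the MTP speculative config.
print(resp.choices[0].text)
```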
```diff
@@ -57,7 +61,6 @@ from transformers import (
     GenerationConfig,
     set_seed,
 )
-
 from transformers.models.glm4_moe.modeling_glm4_moe import Glm4MoeRMSNorm
 source_model_id = "deepseek-ai/DeepSeek-V3.1-Base"
 save_folder = "/tmp/yujiepan/deepseek-v3.1-tiny-random"
```
```diff
@@ -79,9 +82,9 @@ config_json.update({
     'moe_intermediate_size': 64,
     'n_routed_experts': 32,
     'n_shared_experts': 1,
-    'num_attention_heads':
+    'num_attention_heads': 4,
     'num_experts_per_tok': 8,
-    'num_key_value_heads':
+    'num_key_value_heads': 4,
     'q_lora_rank': 32,
     'qk_nope_head_dim': 64,
     'qk_rope_head_dim': 192,  # vllm mla kernel supports 576 only, FA supports head dim <= 256
```
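The head counts set here determine the MLA projection shapes in the model printouts below. A sketch of the arithmetic, using the config values above plus `v_head_dim = 64`, which is an assumption inferred from the printout rather than a value visible in this diff:

```python
# MLA projection shapes implied by the tiny config above.
num_attention_heads = 4
qk_nope_head_dim = 64
qk_rope_head_dim = 192
v_head_dim = 64  # assumption: not in the diff, inferred from kv_b_proj below
# kv_a_proj_with_mqa packs the compressed KV plus the rope dims into 576.
kv_lora_rank = 576 - qk_rope_head_dim

assert kv_lora_rank == 384                                                   # kv_b_proj in_features
assert num_attention_heads * (qk_nope_head_dim + qk_rope_head_dim) == 1024   # q_b_proj out_features
assert num_attention_heads * (qk_nope_head_dim + v_head_dim) == 512          # kv_b_proj out_features
assert num_attention_heads * v_head_dim == 256                               # o_proj in_features
```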
```diff
@@ -169,11 +172,11 @@ DeepseekV3ForCausalLM(
       (self_attn): DeepseekV3Attention(
         (q_a_proj): Linear(in_features=8, out_features=32, bias=False)
         (q_a_layernorm): DeepseekV3RMSNorm()
-        (q_b_proj): Linear(in_features=32, out_features=
+        (q_b_proj): Linear(in_features=32, out_features=1024, bias=False)
         (kv_a_proj_with_mqa): Linear(in_features=8, out_features=576, bias=False)
         (kv_a_layernorm): DeepseekV3RMSNorm()
-        (kv_b_proj): Linear(in_features=384, out_features=
-        (o_proj): Linear(in_features=
+        (kv_b_proj): Linear(in_features=384, out_features=512, bias=False)
+        (o_proj): Linear(in_features=256, out_features=8, bias=False)
         (rotary_emb): DeepseekV3YarnRotaryEmbedding()
       )
       (mlp): DeepseekV3MLP(
```
```diff
@@ -189,11 +192,11 @@ DeepseekV3ForCausalLM(
       (self_attn): DeepseekV3Attention(
         (q_a_proj): Linear(in_features=8, out_features=32, bias=False)
         (q_a_layernorm): DeepseekV3RMSNorm()
-        (q_b_proj): Linear(in_features=32, out_features=
+        (q_b_proj): Linear(in_features=32, out_features=1024, bias=False)
         (kv_a_proj_with_mqa): Linear(in_features=8, out_features=576, bias=False)
         (kv_a_layernorm): DeepseekV3RMSNorm()
-        (kv_b_proj): Linear(in_features=384, out_features=
-        (o_proj): Linear(in_features=
+        (kv_b_proj): Linear(in_features=384, out_features=512, bias=False)
+        (o_proj): Linear(in_features=256, out_features=8, bias=False)
         (rotary_emb): DeepseekV3YarnRotaryEmbedding()
       )
       (mlp): DeepseekV3MoE(
```
```diff
@@ -220,11 +223,11 @@ DeepseekV3ForCausalLM(
       (self_attn): DeepseekV3Attention(
         (q_a_proj): Linear(in_features=8, out_features=32, bias=False)
         (q_a_layernorm): DeepseekV3RMSNorm()
-        (q_b_proj): Linear(in_features=32, out_features=
+        (q_b_proj): Linear(in_features=32, out_features=1024, bias=False)
         (kv_a_proj_with_mqa): Linear(in_features=8, out_features=576, bias=False)
         (kv_a_layernorm): DeepseekV3RMSNorm()
-        (kv_b_proj): Linear(in_features=384, out_features=
-        (o_proj): Linear(in_features=
+        (kv_b_proj): Linear(in_features=384, out_features=512, bias=False)
+        (o_proj): Linear(in_features=256, out_features=8, bias=False)
         (rotary_emb): DeepseekV3YarnRotaryEmbedding()
       )
       (mlp): DeepseekV3MoE(
```
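The printouts patched above can be reproduced by instantiating the checkpoint and printing the module tree; a minimal sketch, assuming the updated files are already pushed to the Hub under this repo id:

```python
import torch
from transformers import AutoModelForCausalLM

# Loading with trust_remote_code mirrors the vLLM invocation in the README.
model = AutoModelForCausalLM.from_pretrained(
    "yujiepan/deepseek-v3.1-tiny-random",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
print(model)  # should match the DeepseekV3ForCausalLM structure shown above
```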
config.json CHANGED:

```diff
@@ -26,10 +26,10 @@
   "n_routed_experts": 32,
   "n_shared_experts": 1,
   "norm_topk_prob": true,
-  "num_attention_heads":
+  "num_attention_heads": 4,
   "num_experts_per_tok": 8,
   "num_hidden_layers": 2,
-  "num_key_value_heads":
+  "num_key_value_heads": 4,
   "num_nextn_predict_layers": 1,
   "q_lora_rank": 32,
   "qk_nope_head_dim": 64,
```
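A quick check that the shipped config carries the head counts changed in this commit; a sketch assuming the repo is reachable on the Hub:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "yujiepan/deepseek-v3.1-tiny-random", trust_remote_code=True
)
# Both head counts now read 4, matching the diff above.
assert config.num_attention_heads == 4
assert config.num_key_value_heads == 4
```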
model.safetensors CHANGED:

```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:5d0260eb708d0e3cd43e4b374eb0df160a0032948f72bbec48782bd7aae59e1e
+size 9928552
```
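Since model.safetensors is stored through Git LFS, the pointer's oid and size can be checked against the downloaded blob. A minimal sketch with `huggingface_hub`:

```python
import hashlib
import os

from huggingface_hub import hf_hub_download

# Fetch the resolved weights file (not the LFS pointer) and verify it
# against the sha256 and byte size recorded in the pointer above.
path = hf_hub_download("yujiepan/deepseek-v3.1-tiny-random", "model.safetensors")
assert os.path.getsize(path) == 9928552

digest = hashlib.sha256(open(path, "rb").read()).hexdigest()
assert digest == "5d0260eb708d0e3cd43e4b374eb0df160a0032948f72bbec48782bd7aae59e1e"
```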