Upload folder using huggingface_hub

Changed files:
- README.md (+16 -13)
- config.json (+2 -2)
- model.safetensors (+2 -2)
README.md CHANGED:

````diff
@@ -17,7 +17,11 @@ This tiny model is for debugging. It is randomly initialized with the config ada
 - vLLM
 
 ```bash
-
+python -m vllm.entrypoints.openai.api_server \
+  --tensor-parallel-size 2 \
+  --model yujiepan/deepseek-v3.1-tiny-random \
+  --trust-remote-code \
+  --speculative-config='{"method": "deepseek_mtp", "num_speculative_tokens": 1}'
 ```
 
 - Transformers
````
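This commit fills in the previously empty vLLM snippet. Once that server is up, any OpenAI-compatible client can hit it; a minimal sketch, assuming vLLM's default port 8000 and the `openai` Python package (both assumptions, not part of this commit):

```python
from openai import OpenAI

# Point the OpenAI client at the local vLLM server started above.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.completions.create(
    model="yujiepan/deepseek-v3.1-tiny-random",
    prompt="Hello",
    max_tokens=16,
)
# The checkpoint is randomly initialized, so the output is gibberish; the point
# is only to exercise the serving path, including the MTP speculative config.
print(resp.choices[0].text)
```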
```diff
@@ -57,7 +61,6 @@ from transformers import (
     GenerationConfig,
     set_seed,
 )
-
 from transformers.models.glm4_moe.modeling_glm4_moe import Glm4MoeRMSNorm
 source_model_id = "deepseek-ai/DeepSeek-V3.1-Base"
 save_folder = "/tmp/yujiepan/deepseek-v3.1-tiny-random"
```
```diff
@@ -79,9 +82,9 @@ config_json.update({
     'moe_intermediate_size': 64,
     'n_routed_experts': 32,
     'n_shared_experts': 1,
-    'num_attention_heads':
+    'num_attention_heads': 4,
     'num_experts_per_tok': 8,
-    'num_key_value_heads':
+    'num_key_value_heads': 4,
     'q_lora_rank': 32,
     'qk_nope_head_dim': 64,
     'qk_rope_head_dim': 192,  # vllm mla kernel supports 576 only, FA supports head dim <= 256
```
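The head counts set here determine the MLA projection shapes in the model printouts below. A sketch of the arithmetic, using the config values above plus `v_head_dim = 64`, which is an assumption inferred from the printout rather than a value visible in this diff:

```python
# MLA projection shapes implied by the tiny config above.
num_attention_heads = 4
qk_nope_head_dim = 64
qk_rope_head_dim = 192
v_head_dim = 64  # assumption: not in the diff, inferred from kv_b_proj below
# kv_a_proj_with_mqa packs the compressed KV plus the rope dims into 576.
kv_lora_rank = 576 - qk_rope_head_dim

assert kv_lora_rank == 384                                                   # kv_b_proj in_features
assert num_attention_heads * (qk_nope_head_dim + qk_rope_head_dim) == 1024   # q_b_proj out_features
assert num_attention_heads * (qk_nope_head_dim + v_head_dim) == 512          # kv_b_proj out_features
assert num_attention_heads * v_head_dim == 256                               # o_proj in_features
```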
```diff
@@ -169,11 +172,11 @@ DeepseekV3ForCausalLM(
       (self_attn): DeepseekV3Attention(
         (q_a_proj): Linear(in_features=8, out_features=32, bias=False)
         (q_a_layernorm): DeepseekV3RMSNorm()
-        (q_b_proj): Linear(in_features=32, out_features=
+        (q_b_proj): Linear(in_features=32, out_features=1024, bias=False)
         (kv_a_proj_with_mqa): Linear(in_features=8, out_features=576, bias=False)
         (kv_a_layernorm): DeepseekV3RMSNorm()
-        (kv_b_proj): Linear(in_features=384, out_features=
-        (o_proj): Linear(in_features=
+        (kv_b_proj): Linear(in_features=384, out_features=512, bias=False)
+        (o_proj): Linear(in_features=256, out_features=8, bias=False)
         (rotary_emb): DeepseekV3YarnRotaryEmbedding()
       )
       (mlp): DeepseekV3MLP(
```
```diff
@@ -189,11 +192,11 @@ DeepseekV3ForCausalLM(
       (self_attn): DeepseekV3Attention(
         (q_a_proj): Linear(in_features=8, out_features=32, bias=False)
         (q_a_layernorm): DeepseekV3RMSNorm()
-        (q_b_proj): Linear(in_features=32, out_features=
+        (q_b_proj): Linear(in_features=32, out_features=1024, bias=False)
         (kv_a_proj_with_mqa): Linear(in_features=8, out_features=576, bias=False)
         (kv_a_layernorm): DeepseekV3RMSNorm()
-        (kv_b_proj): Linear(in_features=384, out_features=
-        (o_proj): Linear(in_features=
+        (kv_b_proj): Linear(in_features=384, out_features=512, bias=False)
+        (o_proj): Linear(in_features=256, out_features=8, bias=False)
         (rotary_emb): DeepseekV3YarnRotaryEmbedding()
       )
       (mlp): DeepseekV3MoE(
```
```diff
@@ -220,11 +223,11 @@ DeepseekV3ForCausalLM(
       (self_attn): DeepseekV3Attention(
         (q_a_proj): Linear(in_features=8, out_features=32, bias=False)
         (q_a_layernorm): DeepseekV3RMSNorm()
-        (q_b_proj): Linear(in_features=32, out_features=
+        (q_b_proj): Linear(in_features=32, out_features=1024, bias=False)
         (kv_a_proj_with_mqa): Linear(in_features=8, out_features=576, bias=False)
         (kv_a_layernorm): DeepseekV3RMSNorm()
-        (kv_b_proj): Linear(in_features=384, out_features=
-        (o_proj): Linear(in_features=
+        (kv_b_proj): Linear(in_features=384, out_features=512, bias=False)
+        (o_proj): Linear(in_features=256, out_features=8, bias=False)
         (rotary_emb): DeepseekV3YarnRotaryEmbedding()
       )
       (mlp): DeepseekV3MoE(
```
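The printouts patched above can be reproduced by instantiating the checkpoint and printing the module tree; a minimal sketch, assuming the updated files are already pushed to the Hub under this repo id:

```python
import torch
from transformers import AutoModelForCausalLM

# Loading with trust_remote_code mirrors the vLLM invocation in the README.
model = AutoModelForCausalLM.from_pretrained(
    "yujiepan/deepseek-v3.1-tiny-random",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
print(model)  # should match the DeepseekV3ForCausalLM structure shown above
```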
config.json CHANGED:

```diff
@@ -26,10 +26,10 @@
   "n_routed_experts": 32,
   "n_shared_experts": 1,
   "norm_topk_prob": true,
-  "num_attention_heads":
+  "num_attention_heads": 4,
   "num_experts_per_tok": 8,
   "num_hidden_layers": 2,
-  "num_key_value_heads":
+  "num_key_value_heads": 4,
   "num_nextn_predict_layers": 1,
   "q_lora_rank": 32,
   "qk_nope_head_dim": 64,
```
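A quick check that the shipped config carries the head counts changed in this commit; a sketch assuming the repo is reachable on the Hub:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "yujiepan/deepseek-v3.1-tiny-random", trust_remote_code=True
)
# Both head counts now read 4, matching the diff above.
assert config.num_attention_heads == 4
assert config.num_key_value_heads == 4
```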
model.safetensors CHANGED:

```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:5d0260eb708d0e3cd43e4b374eb0df160a0032948f72bbec48782bd7aae59e1e
+size 9928552
```
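Since model.safetensors is stored through Git LFS, the pointer's oid and size can be checked against the downloaded blob. A minimal sketch with `huggingface_hub`:

```python
import hashlib
import os

from huggingface_hub import hf_hub_download

# Fetch the resolved weights file (not the LFS pointer) and verify it
# against the sha256 and byte size recorded in the pointer above.
path = hf_hub_download("yujiepan/deepseek-v3.1-tiny-random", "model.safetensors")
assert os.path.getsize(path) == 9928552

digest = hashlib.sha256(open(path, "rb").read()).hexdigest()
assert digest == "5d0260eb708d0e3cd43e4b374eb0df160a0032948f72bbec48782bd7aae59e1e"
```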