The model only contains mamba and no attention

#11
by AscendingGrass

Am I doing something wrong? When I download the model and print its named parameters, the layers only contain mamba modules, with no attention anywhere.

from transformers import AutoModelForCausalLM

repo_name = "nvidia/Hymba-1.5B-Instruct"
# trust_remote_code is needed because Hymba ships its own modeling code on the Hub
model = AutoModelForCausalLM.from_pretrained(repo_name, trust_remote_code=True)

# Print every learnable parameter with its shape
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")

Here's the output I got:

model.memory_tokens: torch.Size([128, 1600])
model.embed_tokens.weight: torch.Size([32001, 1600])
model.layers.0.mamba.pre_avg_layernorm1.weight: torch.Size([3200])
model.layers.0.mamba.pre_avg_layernorm2.weight: torch.Size([3200])
model.layers.0.mamba.in_proj.weight: torch.Size([8960, 1600])
model.layers.0.mamba.out_proj.weight: torch.Size([1600, 3200])
model.layers.0.mamba.conv1d.weight: torch.Size([3200, 1, 4])
model.layers.0.mamba.conv1d.bias: torch.Size([3200])
model.layers.0.mamba.x_proj.0.weight: torch.Size([132, 3200])
model.layers.0.mamba.dt_proj.0.weight: torch.Size([3200, 100])
model.layers.0.mamba.dt_proj.0.bias: torch.Size([3200])
model.layers.0.mamba.A_log.0: torch.Size([3200, 16])
model.layers.0.mamba.D.0: torch.Size([3200])
model.layers.0.mamba.dt_layernorm.weight: torch.Size([100])
model.layers.0.mamba.B_layernorm.weight: torch.Size([16])
model.layers.0.mamba.C_layernorm.weight: torch.Size([16])
model.layers.0.input_layernorm.weight: torch.Size([1600])
model.layers.0.moe.experts.0.gate_proj.weight: torch.Size([5504, 1600])
model.layers.0.moe.experts.0.down_proj.weight: torch.Size([1600, 5504])
model.layers.0.moe.experts.0.up_proj.weight: torch.Size([5504, 1600])
model.layers.0.pre_moe_layernorm.weight: torch.Size([1600])
model.layers.1.mamba.pre_avg_layernorm1.weight: torch.Size([3200])
model.layers.1.mamba.pre_avg_layernorm2.weight: torch.Size([3200])
model.layers.1.mamba.in_proj.weight: torch.Size([8960, 1600])
model.layers.1.mamba.out_proj.weight: torch.Size([1600, 3200])
model.layers.1.mamba.conv1d.weight: torch.Size([3200, 1, 4])
model.layers.1.mamba.conv1d.bias: torch.Size([3200])
model.layers.1.mamba.x_proj.0.weight: torch.Size([132, 3200])
model.layers.1.mamba.dt_proj.0.weight: torch.Size([3200, 100])
model.layers.1.mamba.dt_proj.0.bias: torch.Size([3200])
model.layers.1.mamba.A_log.0: torch.Size([3200, 16])
model.layers.1.mamba.D.0: torch.Size([3200])
model.layers.1.mamba.dt_layernorm.weight: torch.Size([100])
model.layers.1.mamba.B_layernorm.weight: torch.Size([16])
model.layers.1.mamba.C_layernorm.weight: torch.Size([16])
model.layers.1.input_layernorm.weight: torch.Size([1600])
model.layers.1.moe.experts.0.gate_proj.weight: torch.Size([5504, 1600])
model.layers.1.moe.experts.0.down_proj.weight: torch.Size([1600, 5504])
model.layers.1.moe.experts.0.up_proj.weight: torch.Size([5504, 1600])
model.layers.1.pre_moe_layernorm.weight: torch.Size([1600])
model.layers.2.mamba.pre_avg_layernorm1.weight: torch.Size([3200])
model.layers.2.mamba.pre_avg_layernorm2.weight: torch.Size([3200])
model.layers.2.mamba.in_proj.weight: torch.Size([8000, 1600])
model.layers.2.mamba.out_proj.weight: torch.Size([1600, 3200])
model.layers.2.mamba.conv1d.weight: torch.Size([3200, 1, 4])
model.layers.2.mamba.conv1d.bias: torch.Size([3200])
model.layers.2.mamba.x_proj.0.weight: torch.Size([132, 3200])
model.layers.2.mamba.dt_proj.0.weight: torch.Size([3200, 100])
model.layers.2.mamba.dt_proj.0.bias: torch.Size([3200])
model.layers.2.mamba.A_log.0: torch.Size([3200, 16])
model.layers.2.mamba.D.0: torch.Size([3200])
model.layers.2.mamba.dt_layernorm.weight: torch.Size([100])
model.layers.2.mamba.B_layernorm.weight: torch.Size([16])
model.layers.2.mamba.C_layernorm.weight: torch.Size([16])
model.layers.2.input_layernorm.weight: torch.Size([1600])
model.layers.2.moe.experts.0.gate_proj.weight: torch.Size([5504, 1600])
model.layers.2.moe.experts.0.down_proj.weight: torch.Size([1600, 5504])
model.layers.2.moe.experts.0.up_proj.weight: torch.Size([5504, 1600])
model.layers.2.pre_moe_layernorm.weight: torch.Size([1600])
...
model.layers.31.mamba.pre_avg_layernorm1.weight: torch.Size([3200])
model.layers.31.mamba.pre_avg_layernorm2.weight: torch.Size([3200])
model.layers.31.mamba.in_proj.weight: torch.Size([8960, 1600])
model.layers.31.mamba.out_proj.weight: torch.Size([1600, 3200])
model.layers.31.mamba.conv1d.weight: torch.Size([3200, 1, 4])
model.layers.31.mamba.conv1d.bias: torch.Size([3200])
model.layers.31.mamba.x_proj.0.weight: torch.Size([132, 3200])
model.layers.31.mamba.dt_proj.0.weight: torch.Size([3200, 100])
model.layers.31.mamba.dt_proj.0.bias: torch.Size([3200])
model.layers.31.mamba.A_log.0: torch.Size([3200, 16])
model.layers.31.mamba.D.0: torch.Size([3200])
model.layers.31.mamba.dt_layernorm.weight: torch.Size([100])
model.layers.31.mamba.B_layernorm.weight: torch.Size([16])
model.layers.31.mamba.C_layernorm.weight: torch.Size([16])
model.layers.31.input_layernorm.weight: torch.Size([1600])
model.layers.31.moe.experts.0.gate_proj.weight: torch.Size([5504, 1600])
model.layers.31.moe.experts.0.down_proj.weight: torch.Size([1600, 5504])
model.layers.31.moe.experts.0.up_proj.weight: torch.Size([5504, 1600])
model.layers.31.pre_moe_layernorm.weight: torch.Size([1600])
model.final_layernorm.weight: torch.Size([1600])
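
In case it helps: parameter names alone don't necessarily show how a block is organized, since attention could in principle be implemented inside a submodule that just happens to be called mamba. A generic way to double-check (plain transformers/PyTorch, nothing Hymba-specific, so treat this as a sketch) is to dump the module classes and the config as well:

from collections import Counter

from transformers import AutoModelForCausalLM

repo_name = "nvidia/Hymba-1.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(repo_name, trust_remote_code=True)

# Count how many modules of each class the model contains;
# this surfaces attention-like classes even if their parent module is named "mamba"
class_counts = Counter(type(module).__name__ for _, module in model.named_modules())
for cls_name, count in sorted(class_counts.items()):
    print(f"{cls_name}: {count}")

# The config usually spells out the layer layout and head setup explicitly
print(model.config)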
