Hybrid-Phi-Mamba / config.json
AvivBick's picture
Update config.json
5ec5524 verified
{
"LanguageModel":{
"type":"LayeredMambaLM",
"input":{
"vocab_size":51200,
"pad_vocab_size_multiple":8
}
},
"MixerModel":{
"type":"LayerMixerModel",
"input":{
"d_model":2048,
"n_layer":24,
"lm_head_prenorm":"layer"
}
},
"Block1":{
"n_layers":4,
"BlockType":"modules.phi_block",
"block_input":{
"resid_dropout":0.0
},
"CoreType":"modules.mixers.discrete_mamba2",
"core_input":{
"d_state":64,
"n_v_heads":32,
"n_qk_heads":32,
"d_conv":4,
"conv_bias":true,
"expand":1,
"chunk_size":128,
"activation":"identity",
"bias":false
}
},
"Block2":{
"n_layers":1,
"BlockType":"modules.phi_block",
"block_input":{
"resid_dropout":0.0
},
"CoreType":"modules.mixers.phi_attention",
"core_input":{
}
},
"Block3":{
"n_layers":1,
"BlockType":"modules.phi_block",
"block_input":{
"resid_dropout":0.0
},
"CoreType":"modules.mixers.discrete_mamba2",
"core_input":{
"d_state":64,
"n_v_heads":32,
"n_qk_heads":32,
"d_conv":4,
"conv_bias":true,
"expand":1,
"chunk_size":128,
"activation":"identity",
"bias":false
}
},
"Block4":{
"n_layers":4,
"BlockType":"modules.phi_block",
"block_input":{
"resid_dropout":0.0
},
"CoreType":"modules.mixers.discrete_mamba2",
"core_input":{
"d_state":64,
"n_v_heads":32,
"n_qk_heads":32,
"d_conv":4,
"conv_bias":true,
"expand":1,
"chunk_size":128,
"activation":"identity",
"bias":false,
"use_ref_impl":false,
"norm_cls":"none"
}
},
"Block5":{
"n_layers":1,
"BlockType":"modules.phi_block",
"block_input":{
"resid_dropout":0.0
},
"CoreType":"modules.mixers.phi_attention",
"core_input":{
}
},
"Block6":{
"n_layers":1,
"BlockType":"modules.phi_block",
"block_input":{
"resid_dropout":0.0
},
"CoreType":"modules.mixers.discrete_mamba2",
"core_input":{
"d_state":64,
"n_v_heads":32,
"n_qk_heads":32,
"d_conv":4,
"conv_bias":true,
"expand":1,
"chunk_size":128,
"activation":"identity",
"bias":false
}
},
"Block7":{
"n_layers":4,
"BlockType":"modules.phi_block",
"block_input":{
"resid_dropout":0.0
},
"CoreType":"modules.mixers.discrete_mamba2",
"core_input":{
"d_state":64,
"n_v_heads":32,
"n_qk_heads":32,
"d_conv":4,
"conv_bias":true,
"expand":1,
"chunk_size":128,
"activation":"identity",
"bias":false
}
},
"Block8":{
"n_layers":1,
"BlockType":"modules.phi_block",
"block_input":{
"resid_dropout":0.0
},
"CoreType":"modules.mixers.phi_attention",
"core_input":{
}
},
"Block9":{
"n_layers":1,
"BlockType":"modules.phi_block",
"block_input":{
"resid_dropout":0.0
},
"CoreType":"modules.mixers.discrete_mamba2",
"core_input":{
"d_state":64,
"n_v_heads":32,
"n_qk_heads":32,
"d_conv":4,
"conv_bias":true,
"expand":1,
"chunk_size":128,
"activation":"identity",
"bias":false
}
},
"Block10":{
"n_layers":4,
"BlockType":"modules.phi_block",
"block_input":{
"resid_dropout":0.0
},
"CoreType":"modules.mixers.discrete_mamba2",
"core_input":{
"d_state":64,
"n_v_heads":32,
"n_qk_heads":32,
"d_conv":4,
"conv_bias":true,
"expand":1,
"chunk_size":128,
"activation":"identity",
"bias":false
}
},
"Block11":{
"n_layers":1,
"BlockType":"modules.phi_block",
"block_input":{
"resid_dropout":0.0
},
"CoreType":"modules.mixers.phi_attention",
"core_input":{
}
},
"Block12":{
"n_layers":1,
"BlockType":"modules.phi_block",
"block_input":{
"resid_dropout":0.0
},
"CoreType":"modules.mixers.discrete_mamba2",
"core_input":{
"d_state":64,
"n_v_heads":32,
"n_qk_heads":32,
"d_conv":4,
"conv_bias":true,
"expand":1,
"chunk_size":128,
"activation":"identity",
"bias":false
}
}
}