{
  "LanguageModel": {
    "type": "LayeredMambaLM",
    "input": {
      "vocab_size": 51200,
      "pad_vocab_size_multiple": 8
    }
  },
  "MixerModel": {
    "type": "LayerMixerModel",
    "input": {
      "d_model": 2048,
      "n_layer": 24,
      "lm_head_prenorm": "layer"
    }
  },
  "Block1": {
    "n_layers": 4,
    "BlockType": "modules.phi_block",
    "block_input": {
      "resid_dropout": 0.0
    },
    "CoreType": "modules.mixers.discrete_mamba2",
    "core_input": {
      "d_state": 64,
      "n_v_heads": 32,
      "n_qk_heads": 32,
      "d_conv": 4,
      "conv_bias": true,
      "expand": 1,
      "chunk_size": 128,
      "activation": "identity",
      "bias": false
    }
  },
  "Block2": {
    "n_layers": 1,
    "BlockType": "modules.phi_block",
    "block_input": {
      "resid_dropout": 0.0
    },
    "CoreType": "modules.mixers.phi_attention",
    "core_input": {}
  },
  "Block3": {
    "n_layers": 1,
    "BlockType": "modules.phi_block",
    "block_input": {
      "resid_dropout": 0.0
    },
    "CoreType": "modules.mixers.discrete_mamba2",
    "core_input": {
      "d_state": 64,
      "n_v_heads": 32,
      "n_qk_heads": 32,
      "d_conv": 4,
      "conv_bias": true,
      "expand": 1,
      "chunk_size": 128,
      "activation": "identity",
      "bias": false
    }
  },
  "Block4": {
    "n_layers": 4,
    "BlockType": "modules.phi_block",
    "block_input": {
      "resid_dropout": 0.0
    },
    "CoreType": "modules.mixers.discrete_mamba2",
    "core_input": {
      "d_state": 64,
      "nheads": 32,
      "expand": 1,
      "activation": "identity",
      "use_ref_impl": false,
      "bias": false,
      "norm_cls": "none"
    }
  },
  "Block5": {
    "n_layers": 1,
    "BlockType": "modules.phi_block",
    "block_input": {
      "resid_dropout": 0.0
    },
    "CoreType": "modules.mixers.phi_attention",
    "core_input": {}
  },
  "Block6": {
    "n_layers": 1,
    "BlockType": "modules.phi_block",
    "block_input": {
      "resid_dropout": 0.0
    },
    "CoreType": "modules.mixers.discrete_mamba2",
    "core_input": {
      "d_state": 64,
      "n_v_heads": 32,
      "n_qk_heads": 32,
      "d_conv": 4,
      "conv_bias": true,
      "expand": 1,
      "chunk_size": 128,
      "activation": "identity",
      "bias": false
    }
  },
  "Block7": {
    "n_layers": 4,
    "BlockType": "modules.phi_block",
    "block_input": {
      "resid_dropout": 0.0
    },
    "CoreType": "modules.mixers.discrete_mamba2",
    "core_input": {
      "d_state": 64,
      "n_v_heads": 32,
      "n_qk_heads": 32,
      "d_conv": 4,
      "conv_bias": true,
      "expand": 1,
      "chunk_size": 128,
      "activation": "identity",
      "bias": false
    }
  },
  "Block8": {
    "n_layers": 1,
    "BlockType": "modules.phi_block",
    "block_input": {
      "resid_dropout": 0.0
    },
    "CoreType": "modules.mixers.phi_attention",
    "core_input": {}
  },
  "Block9": {
    "n_layers": 1,
    "BlockType": "modules.phi_block",
    "block_input": {
      "resid_dropout": 0.0
    },
    "CoreType": "modules.mixers.discrete_mamba2",
    "core_input": {
      "d_state": 64,
      "n_v_heads": 32,
      "n_qk_heads": 32,
      "d_conv": 4,
      "conv_bias": true,
      "expand": 1,
      "chunk_size": 128,
      "activation": "identity",
      "bias": false
    }
  },
  "Block10": {
    "n_layers": 4,
    "BlockType": "modules.phi_block",
    "block_input": {
      "resid_dropout": 0.0
    },
    "CoreType": "modules.mixers.discrete_mamba2",
    "core_input": {
      "d_state": 64,
      "n_v_heads": 32,
      "n_qk_heads": 32,
      "d_conv": 4,
      "conv_bias": true,
      "expand": 1,
      "chunk_size": 128,
      "activation": "identity",
      "bias": false
    }
  },
  "Block11": {
    "n_layers": 1,
    "BlockType": "modules.phi_block",
    "block_input": {
      "resid_dropout": 0.0
    },
    "CoreType": "modules.mixers.phi_attention",
    "core_input": {}
  },
  "Block12": {
    "n_layers": 1,
    "BlockType": "modules.phi_block",
    "block_input": {
      "resid_dropout": 0.0
    },
    "CoreType": "modules.mixers.discrete_mamba2",
    "core_input": {
      "d_state": 64,
      "n_v_heads": 32,
      "n_qk_heads": 32,
      "d_conv": 4,
      "conv_bias": true,
      "expand": 1,
      "chunk_size": 128,
      "activation": "identity",
      "bias": false
    }
  }
}
|