---
# Model architecture specification.
#
# Every component is described by a `type` (the component name) plus an
# `args` mapping. This file was reconstructed from a form in which all
# line breaks and indentation had been lost; key order and scalar values are
# unchanged, and the nesting below was inferred from the `type`/`args`
# pattern together with YAML's rule that a mapping may not repeat a key.
# NOTE(review): confirm the nesting against the consuming code's schema.
arch:
  type: TransformerLMHeadModel
  args:
    transformer_config:
      type: TransformerDecoderOnlyModel
      args:
        # Embedding block: only the token embedding is configured; the
        # positional, type and layer-norm sub-configs are explicitly null.
        embed_config:
          type: TransformerEmbeddingBlock
          args:
            token_embed_config:
              type: TokenEmbedding
              args:
                n_embed: 1024
                n_vocab: 151936
            pos_embed_config: null
            type_embed_config: null
            ln_config: null
            p_drop_embed: 0.0
            concat_strategy: id_first

        # Configuration of a single decoder block (attention, MLP, norm).
        decoder_config:
          type: TransformerDecoderBlock
          args:
            attn_config:
              type: LlamaAttention
              args:
                n_embed: 1024
                n_pos: 32768
                n_head: 16
                n_key_value_head: 16
                # n_head (16) * head_size (64) equals n_embed (1024).
                head_size: 64
                p_drop_attn: 0.0
                p_drop_resid: 0.0
                bias_attn: true
                bias_proj: false
                cross_attn: false
                scale_dot_product: true
                scale_layer_wise: false
                layer_idx: null
                rope_config:
                  type: MistralRotaryEmbedding
                  args:
                    rotary_head_size: 64
                    n_pos: 32768
                    # Exponent sign is written explicitly: YAML 1.1
                    # resolvers (e.g. PyYAML) load `1.0e06` as a string,
                    # whereas `1.0e+06` is a float in both 1.1 and 1.2.
                    base: 1.0e+06
                    scaling_type: null
                    scaling_factor: null
            mlp_config:
              type: LlamaMLP
              args:
                n_embed: 1024
                n_inner: 2816
                act_fn_config:
                  type: SiLUActivation
                  args: {}
            # NOTE(review): placed at decoder-block level as a sibling of
            # attn_config/mlp_config; the flattened source does not rule out
            # it belonging inside mlp_config.args instead -- confirm.
            ln_config:
              type: LlamaRMSNorm
              args:
                n_embed: 1024
                ln_eps: 1.0e-06
            n_embed: 1024
            post_norm: false
            add_cross_attn: false

        # Model-level settings of the decoder-only transformer.
        n_embed: 1024
        n_layer: 24
        n_head: 16
        ln_config:
          type: LlamaRMSNorm
          args:
            n_embed: 1024
            ln_eps: 1.0e-06
        perform_linear_bias: false
        attn_window_size_loop_unit: null

    # Language-model head; n_vocab and n_embed match the token embedding.
    lm_head_config:
      type: TransformerLMHead
      args:
        n_vocab: 151936
        n_embed: 1024
        perform_transform: false
        act_fn_config: null
        ln_config: null