|
from __future__ import annotations |
|
|
|
from typing import Sequence |
|
|
|
from .constants import MODEL_ARCH, MODEL_TENSOR, MODEL_TENSORS, TENSOR_NAMES |
|
|
|
|
|
class TensorNameMap:
    """Lookup table from known tensor-name aliases to canonical tensor names.

    Three class-level tables drive the lookup:

    * ``mappings_cfg`` - aliases of tensors that exist once per model.
    * ``block_mappings_cfg`` - aliases of per-block tensors; each alias is a
      template that is expanded with ``str.format(bid=<block index>)``.
    * ``arch_block_mappings_cfg`` - per-architecture replacements for entries
      of ``block_mappings_cfg``.

    An instance flattens these tables, for one architecture and block count,
    into ``self.mapping``: ``alias -> (MODEL_TENSOR, canonical_name)``.
    Entries are inserted in table order, so when the same alias is listed
    under several tensors the last one supported by the architecture wins.
    The order of the keys below is therefore significant.
    """

    # Tensors that appear once per model (no block index).
    mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
        # Token embeddings
        MODEL_TENSOR.TOKEN_EMBD: (
            "gpt_neox.embed_in",
            "transformer.wte",
            "transformer.word_embeddings",
            "word_embeddings",
            "model.embed_tokens",
            "tok_embeddings",
            "embeddings.word_embeddings",
            "language_model.embedding.word_embeddings",
            "wte",
            "transformer.embd.wte",
            "model.tok_embeddings",
            "model.embedding",
            "backbone.embedding",
            "backbone.embeddings",
            "transformer.in_out_embed",
            "embedding.word_embeddings",
            "transformer.token_embeddings",
            "shared",
            "rwkv.embeddings",
        ),

        # Token type embeddings
        MODEL_TENSOR.TOKEN_TYPES: (
            "embeddings.token_type_embeddings",
        ),

        # Normalization of token embeddings
        MODEL_TENSOR.TOKEN_EMBD_NORM: (
            "word_embeddings_layernorm",
            "embeddings.LayerNorm",
            "emb_ln",
            # NOTE: "transformer.norm" is also listed under OUTPUT_NORM; if an
            # architecture supports both tensors, the OUTPUT_NORM entry wins.
            "transformer.norm",
            "rwkv.blocks.0.pre_ln",
        ),

        # Position embeddings
        MODEL_TENSOR.POS_EMBD: (
            "transformer.wpe",
            "embeddings.position_embeddings",
            "wpe",
        ),

        # Output
        MODEL_TENSOR.OUTPUT: (
            "embed_out",
            "lm_head",
            "output",
            "word_embeddings_for_head",
            "lm_head.linear",
            "output_layer",
            "head",
        ),

        # Output norm
        MODEL_TENSOR.OUTPUT_NORM: (
            "gpt_neox.final_layer_norm",
            "transformer.ln_f",
            "model.norm",
            "norm",
            "transformer.norm_f",
            "ln_f",
            "language_model.encoder.final_layernorm",
            "model.final_layernorm",
            "lm_head.ln",
            "model.norm_f",
            "backbone.norm_f",
            "transformer.rms_norm",
            "encoder.final_layernorm",
            "transformer.norm",
            "rwkv.ln_out",
        ),

        # Rope frequencies
        MODEL_TENSOR.ROPE_FREQS: (
            "rope.freqs",
            "rotary_pos_emb.inv_freq",
        ),

        # No aliases: only the canonical names are registered for these.
        MODEL_TENSOR.ROPE_FACTORS_LONG: (),
        MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
    }

    # Per-block tensors; "{bid}" is replaced by the block index.
    block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
        # Attention norm
        MODEL_TENSOR.ATTN_NORM: (
            "gpt_neox.layers.{bid}.input_layernorm",
            "transformer.h.{bid}.ln_1",
            "transformer.blocks.{bid}.norm_1",
            "transformer.h.{bid}.input_layernorm",
            "h.{bid}.input_layernorm",
            "transformer.h.{bid}.ln_mlp",
            "model.layers.{bid}.input_layernorm",
            "layers.{bid}.attention_norm",
            "language_model.encoder.layers.{bid}.input_layernorm",
            "model.layers.{bid}.ln1",
            "h.{bid}.ln_1",
            "transformer.h.{bid}.ln",
            "model.layers.layers.{bid}.norm",
            "model.layers.{bid}.attention_norm",
            "model.layers.{bid}.norm",
            "backbone.layers.{bid}.norm",
            "transformer.decoder_layer.{bid}.rms_norm",
            "transformer.blocks.{bid}.norm_attn_norm.norm_1",
            "encoder.layers.{bid}.input_layernorm",
            "transformer.layers.{bid}.attn_norm",
            "rwkv.blocks.{bid}.ln1",
        ),

        # Attention norm 2
        MODEL_TENSOR.ATTN_NORM_2: (
            "transformer.h.{bid}.ln_attn",
            "encoder.layer.{bid}.layer_norm_1",
            "rwkv.blocks.{bid}.ln2",
        ),

        # Attention query-key-value (fused)
        MODEL_TENSOR.ATTN_QKV: (
            "gpt_neox.layers.{bid}.attention.query_key_value",
            "transformer.h.{bid}.attn.c_attn",
            "transformer.blocks.{bid}.attn.Wqkv",
            "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv",
            "transformer.h.{bid}.self_attention.query_key_value",
            "h.{bid}.self_attention.query_key_value",
            "language_model.encoder.layers.{bid}.self_attention.query_key_value",
            "model.layers.{bid}.self_attn.query_key_value",
            "h.{bid}.attn.c_attn",
            "transformer.h.{bid}.mixer.Wqkv",
            "encoder.layers.{bid}.attn.Wqkv",
            "model.layers.{bid}.self_attn.qkv_proj",
            "encoder.layers.{bid}.self_attention.query_key_value",
            "transformer.layers.{bid}.attn.qkv_proj",
        ),

        # Attention query
        MODEL_TENSOR.ATTN_Q: (
            "model.layers.{bid}.self_attn.q_proj",
            "layers.{bid}.attention.wq",
            "encoder.layer.{bid}.attention.self.query",
            "transformer.h.{bid}.attn.q_proj",
            "model.layers.layers.{bid}.self_attn.q_proj",
            "model.layers.{bid}.attention.wq",
            "transformer.decoder_layer.{bid}.multi_head_attention.query",
            "transformer.h.{bid}.attn.attention.q_proj",
        ),

        # Attention key
        MODEL_TENSOR.ATTN_K: (
            "model.layers.{bid}.self_attn.k_proj",
            "layers.{bid}.attention.wk",
            "encoder.layer.{bid}.attention.self.key",
            "transformer.h.{bid}.attn.k_proj",
            "transformer.h.{bid}.attn.k",
            "model.layers.layers.{bid}.self_attn.k_proj",
            "model.layers.{bid}.attention.wk",
            "transformer.decoder_layer.{bid}.multi_head_attention.key",
            "transformer.h.{bid}.attn.attention.k_proj",
        ),

        # Attention value
        MODEL_TENSOR.ATTN_V: (
            "model.layers.{bid}.self_attn.v_proj",
            "layers.{bid}.attention.wv",
            "encoder.layer.{bid}.attention.self.value",
            "transformer.h.{bid}.attn.v_proj",
            "transformer.h.{bid}.attn.v",
            "model.layers.layers.{bid}.self_attn.v_proj",
            "model.layers.{bid}.attention.wv",
            "transformer.decoder_layer.{bid}.multi_head_attention.value",
            "transformer.h.{bid}.attn.attention.v_proj",
        ),

        # Attention output
        MODEL_TENSOR.ATTN_OUT: (
            "gpt_neox.layers.{bid}.attention.dense",
            "transformer.h.{bid}.attn.c_proj",
            "transformer.blocks.{bid}.attn.out_proj",
            "transformer.h.{bid}.self_attention.dense",
            "h.{bid}.self_attention.dense",
            "model.layers.{bid}.self_attn.o_proj",
            "layers.{bid}.attention.wo",
            "encoder.layer.{bid}.attention.output.dense",
            "transformer.h.{bid}.attn.out_proj",
            "language_model.encoder.layers.{bid}.self_attention.dense",
            "model.layers.{bid}.self_attn.dense",
            "h.{bid}.attn.c_proj",
            "transformer.h.{bid}.mixer.out_proj",
            "model.layers.layers.{bid}.self_attn.o_proj",
            "model.layers.{bid}.attention.wo",
            "encoder.layers.{bid}.attn.out_proj",
            "transformer.decoder_layer.{bid}.multi_head_attention.linear",
            "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj",
            "encoder.layers.{bid}.self_attention.dense",
            "transformer.layers.{bid}.attn.out_proj",
            "transformer.h.{bid}.attn.attention.out_proj",
        ),

        # Attention output norm
        MODEL_TENSOR.ATTN_OUT_NORM: (
            "encoder.layer.{bid}.attention.output.LayerNorm",
            "encoder.layers.{bid}.norm1",
            "transformer.decoder_layer.{bid}.rms_norm_1",
            "transformer.blocks.{bid}.norm_attn_norm.norm_2",
        ),

        # NOTE: this alias is also listed under FFN_NORM further down; if an
        # architecture supports both tensors, the FFN_NORM entry wins.
        MODEL_TENSOR.ATTN_POST_NORM: (
            "model.layers.{bid}.post_attention_layernorm",
        ),

        # Rotary embeddings
        MODEL_TENSOR.ATTN_ROT_EMBD: (
            "model.layers.{bid}.self_attn.rotary_emb.inv_freq",
            "layers.{bid}.attention.inner_attention.rope.freqs",
            "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq",
            "transformer.h.{bid}.attn.rotary_emb.inv_freq",
        ),

        # Feed-forward norm
        MODEL_TENSOR.FFN_NORM: (
            "gpt_neox.layers.{bid}.post_attention_layernorm",
            "transformer.h.{bid}.ln_2",
            "h.{bid}.post_attention_layernorm",
            "transformer.blocks.{bid}.norm_2",
            "model.layers.{bid}.post_attention_layernorm",
            "layers.{bid}.ffn_norm",
            "language_model.encoder.layers.{bid}.post_attention_layernorm",
            "model.layers.{bid}.ln2",
            "h.{bid}.ln_2",
            "model.layers.{bid}.ffn_norm",
            "transformer.decoder_layer.{bid}.rms_norm_2",
            "encoder.layers.{bid}.post_attention_layernorm",
            "transformer.layers.{bid}.ffn_norm",
        ),

        # Pre feed-forward norm
        MODEL_TENSOR.FFN_PRE_NORM: (
            "model.layers.{bid}.pre_feedforward_layernorm",
        ),

        # Post feed-forward norm
        MODEL_TENSOR.FFN_POST_NORM: (
            "model.layers.{bid}.post_feedforward_layernorm",
        ),

        MODEL_TENSOR.FFN_GATE_INP: (
            "layers.{bid}.feed_forward.gate",
            "model.layers.{bid}.block_sparse_moe.gate",
            "model.layers.{bid}.mlp.gate",
            "transformer.decoder_layer.{bid}.router",
            "transformer.blocks.{bid}.ffn.router.layer",
            "model.layers.{bid}.block_sparse_moe.router.layer",
        ),

        MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
            "model.layers.{bid}.mlp.shared_expert_gate",
        ),

        # Feed-forward up
        MODEL_TENSOR.FFN_UP: (
            "gpt_neox.layers.{bid}.mlp.dense_h_to_4h",
            "transformer.h.{bid}.mlp.c_fc",
            "transformer.blocks.{bid}.ffn.up_proj",
            "transformer.h.{bid}.mlp.dense_h_to_4h",
            "h.{bid}.mlp.dense_h_to_4h",
            "model.layers.{bid}.mlp.up_proj",
            "layers.{bid}.feed_forward.w3",
            "encoder.layer.{bid}.intermediate.dense",
            "transformer.h.{bid}.mlp.fc_in",
            "transformer.h.{bid}.mlp.linear_3",
            "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h",
            "model.layers.{bid}.mlp.dense_h_to_4h",
            "transformer.h.{bid}.mlp.w1",
            "h.{bid}.mlp.c_fc",
            "transformer.h.{bid}.mlp.fc1",
            "model.layers.{bid}.mlp.fc1",
            "model.layers.{bid}.mlp.gate_up_proj",
            "model.layers.layers.{bid}.mlp.up_proj",
            "model.layers.{bid}.feed_forward.w3",
            "encoder.layers.{bid}.mlp.fc11",
            "model.layers.{bid}.mlp.c_fc",
            "encoder.layer.{bid}.mlp.gated_layers_v",
            "model.layers.{bid}.residual_mlp.w3",
            "encoder.layers.{bid}.mlp.dense_h_to_4h",
            "transformer.h.{bid}.mlp.c_fc_1",
        ),

        MODEL_TENSOR.FFN_UP_EXP: (
            "layers.{bid}.feed_forward.experts.w3",
            "transformer.decoder_layer.{bid}.moe.linear_v",
            "transformer.blocks.{bid}.ffn.experts.mlp.v1",
            "model.layers.{bid}.mlp.experts.up_proj",
        ),

        MODEL_TENSOR.FFN_UP_SHEXP: (
            "model.layers.{bid}.mlp.shared_expert.up_proj",
            "model.layers.{bid}.mlp.shared_experts.up_proj",
        ),

        # Feed-forward activation
        MODEL_TENSOR.FFN_ACT: (
            "transformer.blocks.{bid}.ffn.act",
        ),

        # Feed-forward gate
        MODEL_TENSOR.FFN_GATE: (
            "model.layers.{bid}.mlp.gate_proj",
            "layers.{bid}.feed_forward.w1",
            "transformer.h.{bid}.mlp.w2",
            "transformer.h.{bid}.mlp.c_fc2",
            "model.layers.layers.{bid}.mlp.gate_proj",
            "model.layers.{bid}.feed_forward.w1",
            "encoder.layers.{bid}.mlp.fc12",
            "encoder.layer.{bid}.mlp.gated_layers_w",
            "transformer.h.{bid}.mlp.linear_1",
            "model.layers.{bid}.residual_mlp.w1",
            "transformer.h.{bid}.mlp.c_fc_0",
        ),

        MODEL_TENSOR.FFN_GATE_EXP: (
            "layers.{bid}.feed_forward.experts.w1",
            "transformer.decoder_layer.{bid}.moe.linear",
            "transformer.blocks.{bid}.ffn.experts.mlp.w1",
            "model.layers.{bid}.mlp.experts.gate_proj",
        ),

        MODEL_TENSOR.FFN_GATE_SHEXP: (
            "model.layers.{bid}.mlp.shared_expert.gate_proj",
            "model.layers.{bid}.mlp.shared_experts.gate_proj",
        ),

        # Feed-forward down
        MODEL_TENSOR.FFN_DOWN: (
            "gpt_neox.layers.{bid}.mlp.dense_4h_to_h",
            "transformer.h.{bid}.mlp.c_proj",
            "transformer.blocks.{bid}.ffn.down_proj",
            "transformer.h.{bid}.mlp.dense_4h_to_h",
            "h.{bid}.mlp.dense_4h_to_h",
            "model.layers.{bid}.mlp.down_proj",
            "layers.{bid}.feed_forward.w2",
            "encoder.layer.{bid}.output.dense",
            "transformer.h.{bid}.mlp.fc_out",
            "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",
            "model.layers.{bid}.mlp.dense_4h_to_h",
            "h.{bid}.mlp.c_proj",
            "transformer.h.{bid}.mlp.fc2",
            "model.layers.{bid}.mlp.fc2",
            "model.layers.layers.{bid}.mlp.down_proj",
            "model.layers.{bid}.feed_forward.w2",
            "encoder.layers.{bid}.mlp.fc2",
            "model.layers.{bid}.mlp.c_proj",
            "encoder.layer.{bid}.mlp.wo",
            "transformer.layers.{bid}.ffn.proj_2",
            "model.layers.{bid}.residual_mlp.w2",
            "encoder.layer.{bid}.mlp.down_layer",
            "encoder.layers.{bid}.mlp.dense_4h_to_h",
            "model.layers.h.{bid}.mlp.c_proj",
        ),

        MODEL_TENSOR.FFN_DOWN_EXP: (
            "layers.{bid}.feed_forward.experts.w2",
            "transformer.decoder_layer.{bid}.moe.linear_1",
            "transformer.blocks.{bid}.ffn.experts.mlp.w2",
            "model.layers.{bid}.mlp.experts.down_proj",
            "model.layers.{bid}.block_sparse_moe.output_linear",
        ),

        MODEL_TENSOR.FFN_DOWN_SHEXP: (
            "model.layers.{bid}.mlp.shared_expert.down_proj",
            "model.layers.{bid}.mlp.shared_experts.down_proj",
        ),

        MODEL_TENSOR.ATTN_Q_NORM: (
            "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
            "model.layers.{bid}.self_attn.q_layernorm",
            "model.layers.{bid}.self_attn.q_norm",
            "transformer.blocks.{bid}.attn.q_ln",
            "encoder.layer.{bid}.attention.self.layer_norm_q",
            "transformer.layers.{bid}.attn.q_norm",
        ),

        MODEL_TENSOR.ATTN_K_NORM: (
            "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
            "model.layers.{bid}.self_attn.k_layernorm",
            "model.layers.{bid}.self_attn.k_norm",
            "transformer.blocks.{bid}.attn.k_ln",
            "encoder.layer.{bid}.attention.self.layer_norm_k",
            "transformer.layers.{bid}.attn.k_norm",
        ),

        MODEL_TENSOR.ROPE_FREQS: (
            "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq",
        ),

        MODEL_TENSOR.LAYER_OUT_NORM: (
            "encoder.layer.{bid}.output.LayerNorm",
            "encoder.layers.{bid}.norm2",
            "transformer.decoder_layer.{bid}.rms_norm_3",
            "encoder.layer.{bid}.mlp.layernorm",
            "encoder.layer.{bid}.layer_norm_2",
        ),

        # SSM_* tensors
        MODEL_TENSOR.SSM_IN: (
            "model.layers.{bid}.in_proj",
            "backbone.layers.{bid}.mixer.in_proj",
        ),

        MODEL_TENSOR.SSM_CONV1D: (
            "model.layers.{bid}.conv1d",
            "backbone.layers.{bid}.mixer.conv1d",
        ),

        MODEL_TENSOR.SSM_X: (
            "model.layers.{bid}.x_proj",
            "backbone.layers.{bid}.mixer.x_proj",
        ),

        MODEL_TENSOR.SSM_DT: (
            "model.layers.{bid}.dt_proj",
            "backbone.layers.{bid}.mixer.dt_proj",
        ),

        MODEL_TENSOR.SSM_A: (
            "model.layers.{bid}.A_log",
            "backbone.layers.{bid}.mixer.A_log",
        ),

        MODEL_TENSOR.SSM_D: (
            "model.layers.{bid}.D",
            "backbone.layers.{bid}.mixer.D",
        ),

        MODEL_TENSOR.SSM_OUT: (
            "model.layers.{bid}.out_proj",
            "backbone.layers.{bid}.mixer.out_proj",
        ),

        # TIME_MIX_* / CHANNEL_MIX_* tensors (rwkv.blocks.*)
        MODEL_TENSOR.TIME_MIX_W1: (
            "rwkv.blocks.{bid}.attention.time_maa_w1",
        ),

        MODEL_TENSOR.TIME_MIX_W2: (
            "rwkv.blocks.{bid}.attention.time_maa_w2",
        ),

        MODEL_TENSOR.TIME_MIX_LERP_X: (
            "rwkv.blocks.{bid}.attention.time_maa_x",
        ),

        MODEL_TENSOR.TIME_MIX_LERP_K: (
            "rwkv.blocks.{bid}.attention.time_maa_k",
        ),

        MODEL_TENSOR.TIME_MIX_LERP_V: (
            "rwkv.blocks.{bid}.attention.time_maa_v",
        ),

        MODEL_TENSOR.TIME_MIX_LERP_R: (
            "rwkv.blocks.{bid}.attention.time_maa_r",
        ),

        MODEL_TENSOR.TIME_MIX_LERP_G: (
            "rwkv.blocks.{bid}.attention.time_maa_g",
        ),

        MODEL_TENSOR.TIME_MIX_LERP_W: (
            "rwkv.blocks.{bid}.attention.time_maa_w",
        ),

        MODEL_TENSOR.TIME_MIX_FIRST: (
            "rwkv.blocks.{bid}.attention.time_faaaa",
        ),

        MODEL_TENSOR.TIME_MIX_DECAY: (
            "rwkv.blocks.{bid}.attention.time_decay",
        ),

        MODEL_TENSOR.TIME_MIX_DECAY_W1: (
            "rwkv.blocks.{bid}.attention.time_decay_w1",
        ),

        MODEL_TENSOR.TIME_MIX_DECAY_W2: (
            "rwkv.blocks.{bid}.attention.time_decay_w2",
        ),

        MODEL_TENSOR.TIME_MIX_KEY: (
            "rwkv.blocks.{bid}.attention.key",
        ),

        MODEL_TENSOR.TIME_MIX_VALUE: (
            "rwkv.blocks.{bid}.attention.value",
        ),

        MODEL_TENSOR.TIME_MIX_RECEPTANCE: (
            "rwkv.blocks.{bid}.attention.receptance",
        ),

        MODEL_TENSOR.TIME_MIX_GATE: (
            "rwkv.blocks.{bid}.attention.gate",
        ),

        MODEL_TENSOR.TIME_MIX_LN: (
            "rwkv.blocks.{bid}.attention.ln_x",
        ),

        MODEL_TENSOR.TIME_MIX_OUTPUT: (
            "rwkv.blocks.{bid}.attention.output",
        ),

        MODEL_TENSOR.CHANNEL_MIX_LERP_K: (
            "rwkv.blocks.{bid}.feed_forward.time_maa_k",
        ),

        MODEL_TENSOR.CHANNEL_MIX_LERP_R: (
            "rwkv.blocks.{bid}.feed_forward.time_maa_r",
        ),

        MODEL_TENSOR.CHANNEL_MIX_KEY: (
            "rwkv.blocks.{bid}.feed_forward.key",
        ),

        MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
            "rwkv.blocks.{bid}.feed_forward.receptance",
        ),

        MODEL_TENSOR.CHANNEL_MIX_VALUE: (
            "rwkv.blocks.{bid}.feed_forward.value",
        ),

        MODEL_TENSOR.ATTN_Q_A: (
            "model.layers.{bid}.self_attn.q_a_proj",
        ),

        MODEL_TENSOR.ATTN_Q_B: (
            "model.layers.{bid}.self_attn.q_b_proj",
        ),

        MODEL_TENSOR.ATTN_KV_A_MQA: (
            "model.layers.{bid}.self_attn.kv_a_proj_with_mqa",
        ),

        MODEL_TENSOR.ATTN_KV_B: (
            "model.layers.{bid}.self_attn.kv_b_proj",
        ),

        MODEL_TENSOR.ATTN_Q_A_NORM: (
            "model.layers.{bid}.self_attn.q_a_layernorm",
        ),

        MODEL_TENSOR.ATTN_KV_A_NORM: (
            "model.layers.{bid}.self_attn.kv_a_layernorm",
        ),

        MODEL_TENSOR.ATTN_SUB_NORM: (
            "model.layers.{bid}.self_attn.inner_attn_ln",
        ),

        MODEL_TENSOR.FFN_SUB_NORM: (
            "model.layers.{bid}.mlp.ffn_layernorm",
        ),

        # DEC_* tensors (decoder.*)
        MODEL_TENSOR.DEC_ATTN_NORM: (
            "decoder.block.{bid}.layer.0.layer_norm",
        ),

        MODEL_TENSOR.DEC_ATTN_Q: (
            "decoder.block.{bid}.layer.0.SelfAttention.q",
        ),

        MODEL_TENSOR.DEC_ATTN_K: (
            "decoder.block.{bid}.layer.0.SelfAttention.k",
        ),

        MODEL_TENSOR.DEC_ATTN_V: (
            "decoder.block.{bid}.layer.0.SelfAttention.v",
        ),

        MODEL_TENSOR.DEC_ATTN_OUT: (
            "decoder.block.{bid}.layer.0.SelfAttention.o",
        ),

        MODEL_TENSOR.DEC_ATTN_REL_B: (
            "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias",
        ),

        MODEL_TENSOR.DEC_CROSS_ATTN_NORM: (
            "decoder.block.{bid}.layer.1.layer_norm",
        ),

        MODEL_TENSOR.DEC_CROSS_ATTN_Q: (
            "decoder.block.{bid}.layer.1.EncDecAttention.q",
        ),

        MODEL_TENSOR.DEC_CROSS_ATTN_K: (
            "decoder.block.{bid}.layer.1.EncDecAttention.k",
        ),

        MODEL_TENSOR.DEC_CROSS_ATTN_V: (
            "decoder.block.{bid}.layer.1.EncDecAttention.v",
        ),

        MODEL_TENSOR.DEC_CROSS_ATTN_OUT: (
            "decoder.block.{bid}.layer.1.EncDecAttention.o",
        ),

        MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: (
            "decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias",
        ),

        MODEL_TENSOR.DEC_FFN_NORM: (
            "decoder.block.{bid}.layer.2.layer_norm",
        ),

        MODEL_TENSOR.DEC_FFN_GATE: (
            "decoder.block.{bid}.layer.2.DenseReluDense.wi_0",
        ),

        MODEL_TENSOR.DEC_FFN_UP: (
            "decoder.block.{bid}.layer.2.DenseReluDense.wi",
            "decoder.block.{bid}.layer.2.DenseReluDense.wi_1",
        ),

        MODEL_TENSOR.DEC_FFN_DOWN: (
            "decoder.block.{bid}.layer.2.DenseReluDense.wo",
        ),

        MODEL_TENSOR.DEC_OUTPUT_NORM: (
            "decoder.final_layer_norm",
        ),

        # ENC_* tensors (encoder.*)
        MODEL_TENSOR.ENC_ATTN_NORM: (
            "encoder.block.{bid}.layer.0.layer_norm",
        ),

        MODEL_TENSOR.ENC_ATTN_Q: (
            "encoder.block.{bid}.layer.0.SelfAttention.q",
        ),

        MODEL_TENSOR.ENC_ATTN_K: (
            "encoder.block.{bid}.layer.0.SelfAttention.k",
        ),

        MODEL_TENSOR.ENC_ATTN_V: (
            "encoder.block.{bid}.layer.0.SelfAttention.v",
        ),

        MODEL_TENSOR.ENC_ATTN_OUT: (
            "encoder.block.{bid}.layer.0.SelfAttention.o",
        ),

        MODEL_TENSOR.ENC_ATTN_REL_B: (
            "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias",
        ),

        MODEL_TENSOR.ENC_FFN_NORM: (
            "encoder.block.{bid}.layer.1.layer_norm",
        ),

        MODEL_TENSOR.ENC_FFN_GATE: (
            "encoder.block.{bid}.layer.1.DenseReluDense.wi_0",
        ),

        MODEL_TENSOR.ENC_FFN_UP: (
            "encoder.block.{bid}.layer.1.DenseReluDense.wi",
            "encoder.block.{bid}.layer.1.DenseReluDense.wi_1",
        ),

        MODEL_TENSOR.ENC_FFN_DOWN: (
            "encoder.block.{bid}.layer.1.DenseReluDense.wo",
        ),

        MODEL_TENSOR.ENC_OUTPUT_NORM: (
            "encoder.final_layer_norm",
        ),

        MODEL_TENSOR.CLS: (
            "classifier",
            "classifier.dense",
        ),

        MODEL_TENSOR.CLS_OUT: (
            "classifier.out_proj",
        ),
    }

    # Architecture-specific block mappings. For the matching architecture
    # these entries replace (not extend) the same keys of block_mappings_cfg.
    arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = {
        MODEL_ARCH.ARCTIC: {
            MODEL_TENSOR.FFN_NORM: (
                "model.layers.{bid}.residual_layernorm",
            ),
            MODEL_TENSOR.FFN_NORM_EXP: (
                "model.layers.{bid}.post_attention_layernorm",
            ),
        },
    }

    # Per-instance lookup: alias or canonical name -> (tensor, canonical name).
    mapping: dict[str, tuple[MODEL_TENSOR, str]]

    def __init__(self, arch: MODEL_ARCH, n_blocks: int) -> None:
        """Build the lookup table for ``arch`` with ``n_blocks`` blocks.

        Only tensors listed in ``MODEL_TENSORS[arch]`` are registered. For
        each of them both the canonical name (from ``TENSOR_NAMES``) and all
        configured aliases map to ``(tensor, canonical_name)``. Per-block
        names are expanded once for every block index in ``range(n_blocks)``.
        """
        self.mapping = {}
        supported = MODEL_TENSORS[arch]

        for tensor, keys in self.mappings_cfg.items():
            if tensor not in supported:
                continue
            tensor_name = TENSOR_NAMES[tensor]
            self.mapping[tensor_name] = (tensor, tensor_name)
            for key in keys:
                self.mapping[key] = (tensor, tensor_name)

        # Apply the architecture overrides to a per-instance copy. Updating
        # the class-level ``block_mappings_cfg`` in place would leak one
        # architecture's overrides into every map created afterwards. The
        # unpacking keeps the ordering ``dict.update`` would give: replaced
        # keys keep their position, new keys are appended.
        block_mappings = {
            **self.block_mappings_cfg,
            **self.arch_block_mappings_cfg.get(arch, {}),
        }

        for bid in range(n_blocks):
            for tensor, keys in block_mappings.items():
                if tensor not in supported:
                    continue
                tensor_name = TENSOR_NAMES[tensor].format(bid=bid)
                self.mapping[tensor_name] = (tensor, tensor_name)
                for key in keys:
                    self.mapping[key.format(bid=bid)] = (tensor, tensor_name)

    def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
        """Return ``(tensor, canonical_name)`` for ``key``, or ``None``.

        ``key`` is first looked up verbatim. If that fails, each suffix in
        ``try_suffixes`` is tried in order: when ``key`` ends with the
        suffix, the remainder is looked up and, on a hit, the suffix is
        appended to the canonical name that is returned.
        """
        result = self.mapping.get(key)
        if result is not None:
            return result
        for suffix in try_suffixes:
            if key.endswith(suffix):
                result = self.mapping.get(key[:-len(suffix)])
                if result is not None:
                    return result[0], result[1] + suffix
        return None

    def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None:
        """Return only the canonical name for ``key``, or ``None`` if unknown."""
        result = self.get_type_and_name(key, try_suffixes=try_suffixes)
        if result is None:
            return None
        return result[1]

    def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
        """Return only the tensor type for ``key``, or ``None`` if unknown."""
        result = self.get_type_and_name(key, try_suffixes=try_suffixes)
        if result is None:
            return None
        return result[0]

    def __getitem__(self, key: str) -> str:
        """Return the canonical name for ``key``; raise ``KeyError(key)`` if unknown.

        No suffix handling is applied here; ``key`` must match exactly.
        """
        return self.mapping[key][1]

    def __contains__(self, key: str) -> bool:
        """Return whether ``key`` is a registered alias or canonical name."""
        return key in self.mapping

    def __repr__(self) -> str:
        """Return the ``repr`` of the underlying mapping dict."""
        return repr(self.mapping)
|
|
|
|
|
def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap:
    """Create a ``TensorNameMap`` for ``arch`` covering ``n_blocks`` blocks.

    Thin factory wrapper around the ``TensorNameMap`` constructor; both
    arguments are forwarded unchanged.
    """
    name_map = TensorNameMap(arch, n_blocks)
    return name_map
|
|