diff --git "a/Llama-2-7b-hf_chunk6.mlmodelc/model.mil" "b/Llama-2-7b-hf_chunk6.mlmodelc/model.mil"
--- "a/Llama-2-7b-hf_chunk6.mlmodelc/model.mil"
+++ "b/Llama-2-7b-hf_chunk6.mlmodelc/model.mil"
@@ -1,7 +1,7 @@
 program(1.0)
-[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "5.33.5"}, {"coremlc-version", "1877.40.3"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "7.2"}})]
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
 {
-    func main<ios16>(tensor<fp16, [128, 64]> cos, tensor<fp16, [1, 32, 128, 448]> k_cache_0, tensor<fp16, [1, 32, 128, 448]> k_cache_1, tensor<fp16, [1, 32, 128, 448]> k_cache_2, tensor<fp16, [1, 1, 64, 512]> mask, tensor<fp16, [128, 64]> sin, tensor<fp16, [1, 32, 128, 448]> v_cache_0, tensor<fp16, [1, 32, 128, 448]> v_cache_1, tensor<fp16, [1, 32, 128, 448]> v_cache_2, tensor<fp16, [1, 4096, 1, 64]> x) [CoreML_InputDefaultValues = dict<tensor<string, []>, tensor<fp32, []>>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
+    func main<ios16>(tensor<fp16, [128, 64]> cos, tensor<fp16, [1, 448, 1, 4096]> k_cache_0, tensor<fp16, [1, 448, 1, 4096]> k_cache_1, tensor<fp16, [1, 448, 1, 4096]> k_cache_2, tensor<fp16, [1, 512, 1, 64]> mask, tensor<fp16, [128, 64]> sin, tensor<fp16, [1, 4096, 1, 448]> v_cache_0, tensor<fp16, [1, 4096, 1, 448]> v_cache_1, tensor<fp16, [1, 4096, 1, 448]> v_cache_2, tensor<fp16, [1, 4096, 8, 8]> x) [CoreML_InputDefaultValues = dict<tensor<string, []>, tensor<fp32, []>>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"k_cache_2", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}, {"v_cache_2", 0}})] {
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_q_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense()[indices = tensor<uint8, [8388608]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8388736))), name = tensor<string, []>("blocks_0_attn_q_proj_weight_palettized_cast_fp16"), shape = tensor<uint32, [4]>([4096, 4096, 1, 1])];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_k_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense()[indices = tensor<uint8, [8388608]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8388864))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16777536))), name = tensor<string, []>("blocks_0_attn_k_proj_weight_palettized_cast_fp16"), shape = tensor<uint32, [4]>([4096, 4096, 1, 1])];
             tensor<fp16, [4096, 4096, 1, 1]> blocks_0_attn_v_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense()[indices = tensor<uint8, [8388608]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16777664))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25166336))), name = tensor<string, []>("blocks_0_attn_v_proj_weight_palettized_cast_fp16"), shape = tensor<uint32, [4]>([4096, 4096, 1, 1])];
@@ -23,407 +23,2315 @@ program(1.0)
             tensor<fp16, [11008, 4096, 1, 1]> blocks_2_mlp_fc_1_weight_palettized_cast_fp16 = constexpr_lut_to_dense()[indices = tensor<uint8, [22544384]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(235933120))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(258477568))), name = tensor<string, []>("blocks_2_mlp_fc_1_weight_palettized_cast_fp16"), shape = tensor<uint32, [4]>([11008, 4096, 1, 1])];
             tensor<fp16, [11008, 4096, 1, 1]> blocks_2_mlp_fc_2_weight_palettized_cast_fp16 = constexpr_lut_to_dense()[indices = tensor<uint8, [22544384]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(258477696))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(281022144))), name = tensor<string, []>("blocks_2_mlp_fc_2_weight_palettized_cast_fp16"), shape = tensor<uint32, [4]>([11008, 4096, 1, 1])];
             tensor<fp16, [4096, 11008, 1, 1]> blocks_2_mlp_proj_weight_palettized_cast_fp16 = constexpr_lut_to_dense()[indices = tensor<uint8, [22544384]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(281022272))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303566720))), name = tensor<string, []>("blocks_2_mlp_proj_weight_palettized_cast_fp16"), shape = tensor<uint32, [4]>([4096, 11008, 1, 1])];
-            tensor<int32, []> var_18 = const()[name = tensor<string, []>("op_18"), val = tensor<int32, []>(3)];
-            tensor<int32, []> var_23 = const()[name = tensor<string, []>("op_23"), val = tensor<int32, []>(-2)];
-            tensor<int32, []> var_25 = const()[name = tensor<string, []>("op_25"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_32 = const()[name = tensor<string, []>("op_32"), val = tensor<int32, []>(1)];
-            tensor<bool, []> var_33 = const()[name = tensor<string, []>("op_33"), val = tensor<bool, []>(true)];
-            tensor<fp16, [1, 4096, 1, 64]> var_41_cast_fp16 = mul(x = x, y = x)[name = tensor<string, []>("op_41_cast_fp16")];
-            tensor<int32, [1]> var_42 = const()[name = tensor<string, []>("op_42"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 64]> norm_x_1_cast_fp16 = reduce_mean(axes = var_42, keep_dims = var_33, x = var_41_cast_fp16)[name = tensor<string, []>("norm_x_1_cast_fp16")];
-            tensor<fp16, []> var_44_to_fp16 = const()[name = tensor<string, []>("op_44_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
-            tensor<fp16, [1, 1, 1, 64]> var_45_cast_fp16 = add(x = norm_x_1_cast_fp16, y = var_44_to_fp16)[name = tensor<string, []>("op_45_cast_fp16")];
-            tensor<fp16, []> var_46_epsilon_0_to_fp16 = const()[name = tensor<string, []>("op_46_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1p-24)];
-            tensor<fp16, [1, 1, 1, 64]> var_46_cast_fp16 = rsqrt(epsilon = var_46_epsilon_0_to_fp16, x = var_45_cast_fp16)[name = tensor<string, []>("op_46_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 64]> x_normed_1_cast_fp16 = mul(x = x, y = var_46_cast_fp16)[name = tensor<string, []>("x_normed_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303566848)))];
-            tensor<fp16, [1, 4096, 1, 64]> x_5_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = tensor<string, []>("x_5_cast_fp16")];
-            tensor<int32, [2]> var_58 = const()[name = tensor<string, []>("op_58"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_60 = const()[name = tensor<string, []>("op_60"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_62_pad_type_0 = const()[name = tensor<string, []>("op_62_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_62_pad_0 = const()[name = tensor<string, []>("op_62_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 64]> var_62_cast_fp16 = conv(dilations = var_60, groups = var_32, pad = var_62_pad_0, pad_type = var_62_pad_type_0, strides = var_58, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = tensor<string, []>("op_62_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303575104)))];
-            tensor<fp16, [1, 4096, 1, 64]> q_1_cast_fp16 = mul(x = var_62_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = tensor<string, []>("q_1_cast_fp16")];
-            tensor<int32, [2]> var_66 = const()[name = tensor<string, []>("op_66"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_68 = const()[name = tensor<string, []>("op_68"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_70_pad_type_0 = const()[name = tensor<string, []>("op_70_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_70_pad_0 = const()[name = tensor<string, []>("op_70_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 64]> var_70_cast_fp16 = conv(dilations = var_68, groups = var_32, pad = var_70_pad_0, pad_type = var_70_pad_type_0, strides = var_66, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = tensor<string, []>("op_70_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303583360)))];
-            tensor<fp16, [1, 4096, 1, 64]> k_1_cast_fp16 = mul(x = var_70_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = tensor<string, []>("k_1_cast_fp16")];
-            tensor<int32, [2]> var_74 = const()[name = tensor<string, []>("op_74"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_76 = const()[name = tensor<string, []>("op_76"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_78_pad_type_0 = const()[name = tensor<string, []>("op_78_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_78_pad_0 = const()[name = tensor<string, []>("op_78_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 64]> var_78_cast_fp16 = conv(dilations = var_76, groups = var_32, pad = var_78_pad_0, pad_type = var_78_pad_type_0, strides = var_74, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = x_5_cast_fp16)[name = tensor<string, []>("op_78_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303591616)))];
-            tensor<fp16, [1, 4096, 1, 64]> v_1_cast_fp16 = mul(x = var_78_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = tensor<string, []>("v_1_cast_fp16")];
-            tensor<int32, [4]> var_80 = const()[name = tensor<string, []>("op_80"), val = tensor<int32, [4]>([1, 32, 128, 64])];
-            tensor<fp16, [1, 32, 128, 64]> q_3_cast_fp16 = reshape(shape = var_80, x = q_1_cast_fp16)[name = tensor<string, []>("q_3_cast_fp16")];
-            tensor<int32, [4]> var_82 = const()[name = tensor<string, []>("op_82"), val = tensor<int32, [4]>([1, 32, 128, 64])];
-            tensor<fp16, [1, 32, 128, 64]> k_3_cast_fp16 = reshape(shape = var_82, x = k_1_cast_fp16)[name = tensor<string, []>("k_3_cast_fp16")];
-            tensor<int32, [4]> var_84 = const()[name = tensor<string, []>("op_84"), val = tensor<int32, [4]>([1, 32, 128, 64])];
-            tensor<fp16, [1, 32, 128, 64]> new_v_cache_0 = reshape(shape = var_84, x = v_1_cast_fp16)[name = tensor<string, []>("v_3_cast_fp16")];
-            tensor<int32, [4]> var_96_begin_0 = const()[name = tensor<string, []>("op_96_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_96_end_0 = const()[name = tensor<string, []>("op_96_end_0"), val = tensor<int32, [4]>([1, 32, 64, 64])];
-            tensor<bool, [4]> var_96_end_mask_0 = const()[name = tensor<string, []>("op_96_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 64]> var_96_cast_fp16 = slice_by_index(begin = var_96_begin_0, end = var_96_end_0, end_mask = var_96_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_96_cast_fp16")];
-            tensor<int32, [4]> var_102_begin_0 = const()[name = tensor<string, []>("op_102_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_102_end_0 = const()[name = tensor<string, []>("op_102_end_0"), val = tensor<int32, [4]>([1, 32, 128, 64])];
-            tensor<bool, [4]> var_102_end_mask_0 = const()[name = tensor<string, []>("op_102_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 64]> var_102_cast_fp16 = slice_by_index(begin = var_102_begin_0, end = var_102_end_0, end_mask = var_102_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_102_cast_fp16")];
-            tensor<fp16, []> const_3_promoted_to_fp16 = const()[name = tensor<string, []>("const_3_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 64]> var_104_cast_fp16 = mul(x = var_102_cast_fp16, y = const_3_promoted_to_fp16)[name = tensor<string, []>("op_104_cast_fp16")];
+            tensor<int32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_21 = const()[name = tensor<string, []>("op_21"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_23 = const()[name = tensor<string, []>("op_23"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_64 = const()[name = tensor<string, []>("op_64"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_67 = const()[name = tensor<string, []>("op_67"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_1_interleave_0 = const()[name = tensor<string, []>("x_eps_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_1_to_fp16 = const()[name = tensor<string, []>("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303566848)))];
+            tensor<fp16, [1, 4097, 8, 8]> x_eps_1_cast_fp16 = concat(axis = var_64, interleave = x_eps_1_interleave_0, values = (x, eps_chan_1_to_fp16))[name = tensor<string, []>("x_eps_1_cast_fp16")];
+            tensor<int32, [1]> norm_x_1_axes_0 = const()[name = tensor<string, []>("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_67, x = x_eps_1_cast_fp16)[name = tensor<string, []>("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 8, 8]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = tensor<string, []>("x_normed_1_cast_fp16")];
+            tensor<fp16, []> var_91_to_fp16 = const()[name = tensor<string, []>("op_91_to_fp16"), val = tensor<fp16, []>(0x1p+6)];
+            tensor<fp16, [1, 4096, 8, 8]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_91_to_fp16)[name = tensor<string, []>("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303567040)))];
+            tensor<fp16, [1, 4096, 8, 8]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = tensor<string, []>("x_5_cast_fp16")];
+            tensor<int32, [4]> var_113 = const()[name = tensor<string, []>("op_113"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 4096, 1, 64]> input_1_cast_fp16 = reshape(shape = var_113, x = x_5_cast_fp16)[name = tensor<string, []>("input_1_cast_fp16")];
+            tensor<int32, [2]> var_117 = const()[name = tensor<string, []>("op_117"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_119 = const()[name = tensor<string, []>("op_119"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_121_pad_type_0 = const()[name = tensor<string, []>("op_121_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_121_pad_0 = const()[name = tensor<string, []>("op_121_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 64]> var_121_cast_fp16 = conv(dilations = var_119, groups = var_64, pad = var_121_pad_0, pad_type = var_121_pad_type_0, strides = var_117, weight = blocks_0_attn_q_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("op_121_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_q_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303575296)))];
+            tensor<fp16, [1, 4096, 1, 64]> q_1_cast_fp16 = mul(x = var_121_cast_fp16, y = blocks_0_attn_q_proj_output_scales_to_fp16)[name = tensor<string, []>("q_1_cast_fp16")];
+            tensor<int32, [2]> var_125 = const()[name = tensor<string, []>("op_125"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_127 = const()[name = tensor<string, []>("op_127"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_129_pad_type_0 = const()[name = tensor<string, []>("op_129_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_129_pad_0 = const()[name = tensor<string, []>("op_129_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 64]> var_129_cast_fp16 = conv(dilations = var_127, groups = var_64, pad = var_129_pad_0, pad_type = var_129_pad_type_0, strides = var_125, weight = blocks_0_attn_k_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("op_129_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_k_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303583552)))];
+            tensor<fp16, [1, 4096, 1, 64]> k_1_cast_fp16 = mul(x = var_129_cast_fp16, y = blocks_0_attn_k_proj_output_scales_to_fp16)[name = tensor<string, []>("k_1_cast_fp16")];
+            tensor<int32, [2]> var_133 = const()[name = tensor<string, []>("op_133"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_135 = const()[name = tensor<string, []>("op_135"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_137_pad_type_0 = const()[name = tensor<string, []>("op_137_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_137_pad_0 = const()[name = tensor<string, []>("op_137_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 64]> var_137_cast_fp16 = conv(dilations = var_135, groups = var_64, pad = var_137_pad_0, pad_type = var_137_pad_type_0, strides = var_133, weight = blocks_0_attn_v_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("op_137_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_v_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303591808)))];
+            tensor<fp16, [1, 4096, 1, 64]> v_1_cast_fp16 = mul(x = var_137_cast_fp16, y = blocks_0_attn_v_proj_output_scales_to_fp16)[name = tensor<string, []>("v_1_cast_fp16")];
+            tensor<int32, [4]> var_139 = const()[name = tensor<string, []>("op_139"), val = tensor<int32, [4]>([1, 32, 128, 64])];
+            tensor<fp16, [1, 32, 128, 64]> q_3_cast_fp16 = reshape(shape = var_139, x = q_1_cast_fp16)[name = tensor<string, []>("q_3_cast_fp16")];
+            tensor<int32, [4]> var_141 = const()[name = tensor<string, []>("op_141"), val = tensor<int32, [4]>([1, 32, 128, 64])];
+            tensor<fp16, [1, 32, 128, 64]> k_3_cast_fp16 = reshape(shape = var_141, x = k_1_cast_fp16)[name = tensor<string, []>("k_3_cast_fp16")];
+            tensor<int32, [4]> var_155_begin_0 = const()[name = tensor<string, []>("op_155_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_155_end_0 = const()[name = tensor<string, []>("op_155_end_0"), val = tensor<int32, [4]>([1, 32, 64, 64])];
+            tensor<bool, [4]> var_155_end_mask_0 = const()[name = tensor<string, []>("op_155_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 64]> var_155_cast_fp16 = slice_by_index(begin = var_155_begin_0, end = var_155_end_0, end_mask = var_155_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_155_cast_fp16")];
+            tensor<int32, [4]> var_161_begin_0 = const()[name = tensor<string, []>("op_161_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_161_end_0 = const()[name = tensor<string, []>("op_161_end_0"), val = tensor<int32, [4]>([1, 32, 128, 64])];
+            tensor<bool, [4]> var_161_end_mask_0 = const()[name = tensor<string, []>("op_161_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 64]> var_161_cast_fp16 = slice_by_index(begin = var_161_begin_0, end = var_161_end_0, end_mask = var_161_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_161_cast_fp16")];
+            tensor<fp16, []> const_11_promoted_to_fp16 = const()[name = tensor<string, []>("const_11_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 32, 64, 64]> var_163_cast_fp16 = mul(x = var_161_cast_fp16, y = const_11_promoted_to_fp16)[name = tensor<string, []>("op_163_cast_fp16")];
             tensor<bool, []> rotated_1_interleave_0 = const()[name = tensor<string, []>("rotated_1_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 128, 64]> rotated_1_cast_fp16 = concat(axis = var_23, interleave = rotated_1_interleave_0, values = (var_104_cast_fp16, var_96_cast_fp16))[name = tensor<string, []>("rotated_1_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 64]> var_107_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = tensor<string, []>("op_107_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 64]> var_108_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = tensor<string, []>("op_108_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 64]> roped_1_cast_fp16 = add(x = var_107_cast_fp16, y = var_108_cast_fp16)[name = tensor<string, []>("roped_1_cast_fp16")];
-            tensor<int32, [4]> var_121_begin_0 = const()[name = tensor<string, []>("op_121_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_121_end_0 = const()[name = tensor<string, []>("op_121_end_0"), val = tensor<int32, [4]>([1, 32, 64, 64])];
-            tensor<bool, [4]> var_121_end_mask_0 = const()[name = tensor<string, []>("op_121_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 64]> var_121_cast_fp16 = slice_by_index(begin = var_121_begin_0, end = var_121_end_0, end_mask = var_121_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_121_cast_fp16")];
-            tensor<int32, [4]> var_127_begin_0 = const()[name = tensor<string, []>("op_127_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_127_end_0 = const()[name = tensor<string, []>("op_127_end_0"), val = tensor<int32, [4]>([1, 32, 128, 64])];
-            tensor<bool, [4]> var_127_end_mask_0 = const()[name = tensor<string, []>("op_127_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 64]> var_127_cast_fp16 = slice_by_index(begin = var_127_begin_0, end = var_127_end_0, end_mask = var_127_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_127_cast_fp16")];
-            tensor<fp16, []> const_5_promoted_to_fp16 = const()[name = tensor<string, []>("const_5_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 64]> var_129_cast_fp16 = mul(x = var_127_cast_fp16, y = const_5_promoted_to_fp16)[name = tensor<string, []>("op_129_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> rotated_1_cast_fp16 = concat(axis = var_21, interleave = rotated_1_interleave_0, values = (var_163_cast_fp16, var_155_cast_fp16))[name = tensor<string, []>("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> var_166_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = tensor<string, []>("op_166_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> var_167_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = tensor<string, []>("op_167_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> roped_1_cast_fp16 = add(x = var_166_cast_fp16, y = var_167_cast_fp16)[name = tensor<string, []>("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_180_begin_0 = const()[name = tensor<string, []>("op_180_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_180_end_0 = const()[name = tensor<string, []>("op_180_end_0"), val = tensor<int32, [4]>([1, 32, 64, 64])];
+            tensor<bool, [4]> var_180_end_mask_0 = const()[name = tensor<string, []>("op_180_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 64]> var_180_cast_fp16 = slice_by_index(begin = var_180_begin_0, end = var_180_end_0, end_mask = var_180_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_180_cast_fp16")];
+            tensor<int32, [4]> var_186_begin_0 = const()[name = tensor<string, []>("op_186_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_186_end_0 = const()[name = tensor<string, []>("op_186_end_0"), val = tensor<int32, [4]>([1, 32, 128, 64])];
+            tensor<bool, [4]> var_186_end_mask_0 = const()[name = tensor<string, []>("op_186_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 64]> var_186_cast_fp16 = slice_by_index(begin = var_186_begin_0, end = var_186_end_0, end_mask = var_186_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_186_cast_fp16")];
+            tensor<fp16, []> const_13_promoted_to_fp16 = const()[name = tensor<string, []>("const_13_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 32, 64, 64]> var_188_cast_fp16 = mul(x = var_186_cast_fp16, y = const_13_promoted_to_fp16)[name = tensor<string, []>("op_188_cast_fp16")];
             tensor<bool, []> rotated_3_interleave_0 = const()[name = tensor<string, []>("rotated_3_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 128, 64]> rotated_3_cast_fp16 = concat(axis = var_23, interleave = rotated_3_interleave_0, values = (var_129_cast_fp16, var_121_cast_fp16))[name = tensor<string, []>("rotated_3_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 64]> var_132_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = tensor<string, []>("op_132_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 64]> var_133_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = tensor<string, []>("op_133_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 64]> roped_3_cast_fp16 = add(x = var_132_cast_fp16, y = var_133_cast_fp16)[name = tensor<string, []>("roped_3_cast_fp16")];
-            tensor<bool, []> q_5_interleave_0 = const()[name = tensor<string, []>("q_5_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 128, 64]> q_5_cast_fp16 = concat(axis = var_23, interleave = q_5_interleave_0, values = roped_1_cast_fp16)[name = tensor<string, []>("q_5_cast_fp16")];
-            tensor<bool, []> k_5_interleave_0 = const()[name = tensor<string, []>("k_5_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 128, 64]> new_k_cache_0 = concat(axis = var_23, interleave = k_5_interleave_0, values = roped_3_cast_fp16)[name = tensor<string, []>("k_5_cast_fp16")];
-            tensor<bool, []> k_7_interleave_0 = const()[name = tensor<string, []>("k_7_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_7_cast_fp16 = concat(axis = var_25, interleave = k_7_interleave_0, values = (k_cache_0, new_k_cache_0))[name = tensor<string, []>("k_7_cast_fp16")];
-            tensor<bool, []> v_5_interleave_0 = const()[name = tensor<string, []>("v_5_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_5_cast_fp16 = concat(axis = var_25, interleave = v_5_interleave_0, values = (v_cache_0, new_v_cache_0))[name = tensor<string, []>("v_5_cast_fp16")];
-            tensor<fp16, []> var_155_to_fp16 = const()[name = tensor<string, []>("op_155_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 64]> var_156_cast_fp16 = mul(x = q_5_cast_fp16, y = var_155_to_fp16)[name = tensor<string, []>("op_156_cast_fp16")];
-            tensor<bool, []> attn_weights_1_transpose_x_0 = const()[name = tensor<string, []>("attn_weights_1_transpose_x_0"), val = tensor<bool, []>(true)];
-            tensor<bool, []> attn_weights_1_transpose_y_0 = const()[name = tensor<string, []>("attn_weights_1_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 64, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = var_156_cast_fp16, y = k_7_cast_fp16)[name = tensor<string, []>("attn_weights_1_cast_fp16")];
-            tensor<fp16, [1, 32, 64, 512]> attn_weights_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = mask)[name = tensor<string, []>("attn_weights_3_cast_fp16")];
-            tensor<fp16, [1, 32, 64, 512]> var_164_cast_fp16 = softmax(axis = var_18, x = attn_weights_3_cast_fp16)[name = tensor<string, []>("op_164_cast_fp16")];
-            tensor<bool, []> attn_1_transpose_x_0 = const()[name = tensor<string, []>("attn_1_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> attn_1_transpose_y_0 = const()[name = tensor<string, []>("attn_1_transpose_y_0"), val = tensor<bool, []>(true)];
-            tensor<fp16, [1, 32, 128, 64]> attn_1_cast_fp16 = matmul(transpose_x = attn_1_transpose_x_0, transpose_y = attn_1_transpose_y_0, x = v_5_cast_fp16, y = var_164_cast_fp16)[name = tensor<string, []>("attn_1_cast_fp16")];
-            tensor<int32, [4]> var_168 = const()[name = tensor<string, []>("op_168"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 64]> input_1_cast_fp16 = reshape(shape = var_168, x = attn_1_cast_fp16)[name = tensor<string, []>("input_1_cast_fp16")];
-            tensor<int32, [2]> var_172 = const()[name = tensor<string, []>("op_172"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_174 = const()[name = tensor<string, []>("op_174"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_176_pad_type_0 = const()[name = tensor<string, []>("op_176_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_176_pad_0 = const()[name = tensor<string, []>("op_176_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 64]> var_176_cast_fp16 = conv(dilations = var_174, groups = var_32, pad = var_176_pad_0, pad_type = var_176_pad_type_0, strides = var_172, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("op_176_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303599872)))];
-            tensor<fp16, [1, 4096, 1, 64]> attention_output_1_cast_fp16 = mul(x = var_176_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = tensor<string, []>("attention_output_1_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 64]> x_11_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = tensor<string, []>("x_11_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 64]> var_185_cast_fp16 = mul(x = x_11_cast_fp16, y = x_11_cast_fp16)[name = tensor<string, []>("op_185_cast_fp16")];
-            tensor<int32, [1]> var_186 = const()[name = tensor<string, []>("op_186"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 64]> norm_x_3_cast_fp16 = reduce_mean(axes = var_186, keep_dims = var_33, x = var_185_cast_fp16)[name = tensor<string, []>("norm_x_3_cast_fp16")];
-            tensor<fp16, []> var_188_to_fp16 = const()[name = tensor<string, []>("op_188_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
-            tensor<fp16, [1, 1, 1, 64]> var_189_cast_fp16 = add(x = norm_x_3_cast_fp16, y = var_188_to_fp16)[name = tensor<string, []>("op_189_cast_fp16")];
-            tensor<fp16, []> var_190_epsilon_0_to_fp16 = const()[name = tensor<string, []>("op_190_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1p-24)];
-            tensor<fp16, [1, 1, 1, 64]> var_190_cast_fp16 = rsqrt(epsilon = var_190_epsilon_0_to_fp16, x = var_189_cast_fp16)[name = tensor<string, []>("op_190_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 64]> x_normed_5_cast_fp16 = mul(x = x_11_cast_fp16, y = var_190_cast_fp16)[name = tensor<string, []>("x_normed_5_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303608128)))];
-            tensor<fp16, [1, 4096, 1, 64]> input_3_cast_fp16 = mul(x = x_normed_5_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
-            tensor<int32, [2]> var_202 = const()[name = tensor<string, []>("op_202"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_204 = const()[name = tensor<string, []>("op_204"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_206_pad_type_0 = const()[name = tensor<string, []>("op_206_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_206_pad_0 = const()[name = tensor<string, []>("op_206_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 64]> var_206_cast_fp16 = conv(dilations = var_204, groups = var_32, pad = var_206_pad_0, pad_type = var_206_pad_type_0, strides = var_202, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("op_206_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303616384)))];
-            tensor<fp16, [1, 11008, 1, 64]> input_5_cast_fp16 = mul(x = var_206_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
-            tensor<int32, [2]> var_210 = const()[name = tensor<string, []>("op_210"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_212 = const()[name = tensor<string, []>("op_212"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_214_pad_type_0 = const()[name = tensor<string, []>("op_214_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_214_pad_0 = const()[name = tensor<string, []>("op_214_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 64]> var_214_cast_fp16 = conv(dilations = var_212, groups = var_32, pad = var_214_pad_0, pad_type = var_214_pad_type_0, strides = var_210, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("op_214_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_2_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303638464)))];
-            tensor<fp16, [1, 11008, 1, 64]> x_fc_2_1_cast_fp16 = mul(x = var_214_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = tensor<string, []>("x_fc_2_1_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 64]> var_216_cast_fp16 = silu(x = input_5_cast_fp16)[name = tensor<string, []>("op_216_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 64]> input_7_cast_fp16 = mul(x = var_216_cast_fp16, y = x_fc_2_1_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
-            tensor<int32, [2]> var_220 = const()[name = tensor<string, []>("op_220"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_222 = const()[name = tensor<string, []>("op_222"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_224_pad_type_0 = const()[name = tensor<string, []>("op_224_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_224_pad_0 = const()[name = tensor<string, []>("op_224_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 64]> var_224_cast_fp16 = conv(dilations = var_222, groups = var_32, pad = var_224_pad_0, pad_type = var_224_pad_type_0, strides = var_220, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_7_cast_fp16)[name = tensor<string, []>("op_224_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_0_mlp_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303660544)))];
-            tensor<fp16, [1, 4096, 1, 64]> var_225_cast_fp16 = mul(x = var_224_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = tensor<string, []>("op_225_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 64]> x_15_cast_fp16 = add(x = var_225_cast_fp16, y = x_11_cast_fp16)[name = tensor<string, []>("x_15_cast_fp16")];
-            tensor<int32, []> var_232 = const()[name = tensor<string, []>("op_232"), val = tensor<int32, []>(3)];
-            tensor<int32, []> var_237 = const()[name = tensor<string, []>("op_237"), val = tensor<int32, []>(-2)];
-            tensor<int32, []> var_239 = const()[name = tensor<string, []>("op_239"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_246 = const()[name = tensor<string, []>("op_246"), val = tensor<int32, []>(1)];
-            tensor<bool, []> var_247 = const()[name = tensor<string, []>("op_247"), val = tensor<bool, []>(true)];
-            tensor<fp16, [1, 4096, 1, 64]> var_254_cast_fp16 = mul(x = x_15_cast_fp16, y = x_15_cast_fp16)[name = tensor<string, []>("op_254_cast_fp16")];
-            tensor<int32, [1]> var_255 = const()[name = tensor<string, []>("op_255"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 64]> norm_x_5_cast_fp16 = reduce_mean(axes = var_255, keep_dims = var_247, x = var_254_cast_fp16)[name = tensor<string, []>("norm_x_5_cast_fp16")];
-            tensor<fp16, []> var_257_to_fp16 = const()[name = tensor<string, []>("op_257_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
-            tensor<fp16, [1, 1, 1, 64]> var_258_cast_fp16 = add(x = norm_x_5_cast_fp16, y = var_257_to_fp16)[name = tensor<string, []>("op_258_cast_fp16")];
-            tensor<fp16, []> var_259_epsilon_0_to_fp16 = const()[name = tensor<string, []>("op_259_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1p-24)];
-            tensor<fp16, [1, 1, 1, 64]> var_259_cast_fp16 = rsqrt(epsilon = var_259_epsilon_0_to_fp16, x = var_258_cast_fp16)[name = tensor<string, []>("op_259_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 64]> x_normed_9_cast_fp16 = mul(x = x_15_cast_fp16, y = var_259_cast_fp16)[name = tensor<string, []>("x_normed_9_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303668800)))];
-            tensor<fp16, [1, 4096, 1, 64]> x_19_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = tensor<string, []>("x_19_cast_fp16")];
-            tensor<int32, [2]> var_274 = const()[name = tensor<string, []>("op_274"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_276 = const()[name = tensor<string, []>("op_276"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_278_pad_type_0 = const()[name = tensor<string, []>("op_278_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_278_pad_0 = const()[name = tensor<string, []>("op_278_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 64]> var_278_cast_fp16 = conv(dilations = var_276, groups = var_246, pad = var_278_pad_0, pad_type = var_278_pad_type_0, strides = var_274, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = tensor<string, []>("op_278_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303677056)))];
-            tensor<fp16, [1, 4096, 1, 64]> q_7_cast_fp16 = mul(x = var_278_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = tensor<string, []>("q_7_cast_fp16")];
-            tensor<int32, [2]> var_282 = const()[name = tensor<string, []>("op_282"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_284 = const()[name = tensor<string, []>("op_284"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_286_pad_type_0 = const()[name = tensor<string, []>("op_286_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_286_pad_0 = const()[name = tensor<string, []>("op_286_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 64]> var_286_cast_fp16 = conv(dilations = var_284, groups = var_246, pad = var_286_pad_0, pad_type = var_286_pad_type_0, strides = var_282, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = tensor<string, []>("op_286_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303685312)))];
-            tensor<fp16, [1, 4096, 1, 64]> k_9_cast_fp16 = mul(x = var_286_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = tensor<string, []>("k_9_cast_fp16")];
-            tensor<int32, [2]> var_290 = const()[name = tensor<string, []>("op_290"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_292 = const()[name = tensor<string, []>("op_292"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_294_pad_type_0 = const()[name = tensor<string, []>("op_294_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_294_pad_0 = const()[name = tensor<string, []>("op_294_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 64]> var_294_cast_fp16 = conv(dilations = var_292, groups = var_246, pad = var_294_pad_0, pad_type = var_294_pad_type_0, strides = var_290, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = x_19_cast_fp16)[name = tensor<string, []>("op_294_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303693568)))];
-            tensor<fp16, [1, 4096, 1, 64]> v_7_cast_fp16 = mul(x = var_294_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = tensor<string, []>("v_7_cast_fp16")];
-            tensor<int32, [4]> var_296 = const()[name = tensor<string, []>("op_296"), val = tensor<int32, [4]>([1, 32, 128, 64])];
-            tensor<fp16, [1, 32, 128, 64]> q_9_cast_fp16 = reshape(shape = var_296, x = q_7_cast_fp16)[name = tensor<string, []>("q_9_cast_fp16")];
-            tensor<int32, [4]> var_298 = const()[name = tensor<string, []>("op_298"), val = tensor<int32, [4]>([1, 32, 128, 64])];
-            tensor<fp16, [1, 32, 128, 64]> k_11_cast_fp16 = reshape(shape = var_298, x = k_9_cast_fp16)[name = tensor<string, []>("k_11_cast_fp16")];
-            tensor<int32, [4]> var_300 = const()[name = tensor<string, []>("op_300"), val = tensor<int32, [4]>([1, 32, 128, 64])];
-            tensor<fp16, [1, 32, 128, 64]> new_v_cache_1 = reshape(shape = var_300, x = v_7_cast_fp16)[name = tensor<string, []>("v_9_cast_fp16")];
-            tensor<int32, [4]> var_312_begin_0 = const()[name = tensor<string, []>("op_312_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_312_end_0 = const()[name = tensor<string, []>("op_312_end_0"), val = tensor<int32, [4]>([1, 32, 64, 64])];
-            tensor<bool, [4]> var_312_end_mask_0 = const()[name = tensor<string, []>("op_312_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 64]> var_312_cast_fp16 = slice_by_index(begin = var_312_begin_0, end = var_312_end_0, end_mask = var_312_end_mask_0, x = q_9_cast_fp16)[name = tensor<string, []>("op_312_cast_fp16")];
-            tensor<int32, [4]> var_318_begin_0 = const()[name = tensor<string, []>("op_318_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_318_end_0 = const()[name = tensor<string, []>("op_318_end_0"), val = tensor<int32, [4]>([1, 32, 128, 64])];
-            tensor<bool, [4]> var_318_end_mask_0 = const()[name = tensor<string, []>("op_318_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 64]> var_318_cast_fp16 = slice_by_index(begin = var_318_begin_0, end = var_318_end_0, end_mask = var_318_end_mask_0, x = q_9_cast_fp16)[name = tensor<string, []>("op_318_cast_fp16")];
-            tensor<fp16, []> const_10_promoted_to_fp16 = const()[name = tensor<string, []>("const_10_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 64]> var_320_cast_fp16 = mul(x = var_318_cast_fp16, y = const_10_promoted_to_fp16)[name = tensor<string, []>("op_320_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> rotated_3_cast_fp16 = concat(axis = var_21, interleave = rotated_3_interleave_0, values = (var_188_cast_fp16, var_180_cast_fp16))[name = tensor<string, []>("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> var_191_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = tensor<string, []>("op_191_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> var_192_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = tensor<string, []>("op_192_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> roped_3_cast_fp16 = add(x = var_191_cast_fp16, y = var_192_cast_fp16)[name = tensor<string, []>("roped_3_cast_fp16")];
+            tensor<int32, [4]> var_195 = const()[name = tensor<string, []>("op_195"), val = tensor<int32, [4]>([1, 4096, 1, 64])];
+            tensor<fp16, [1, 4096, 1, 64]> var_196_cast_fp16 = reshape(shape = var_195, x = roped_3_cast_fp16)[name = tensor<string, []>("op_196_cast_fp16")];
+            tensor<int32, [4]> k_7_perm_0 = const()[name = tensor<string, []>("k_7_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<int32, [4]> var_198 = const()[name = tensor<string, []>("op_198"), val = tensor<int32, [4]>([1, 4096, 1, 64])];
+            tensor<fp16, [1, 4096, 1, 64]> new_v_cache_0 = reshape(shape = var_198, x = v_1_cast_fp16)[name = tensor<string, []>("new_v_cache_0_type_fp32_cast_fp16")];
+            tensor<bool, []> k_9_interleave_0 = const()[name = tensor<string, []>("k_9_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 4096]> new_k_cache_0 = transpose(perm = k_7_perm_0, x = var_196_cast_fp16)[name = tensor<string, []>("transpose_2")];
+            tensor<fp16, [1, 512, 1, 4096]> k_9_cast_fp16 = concat(axis = var_23, interleave = k_9_interleave_0, values = (k_cache_0, new_k_cache_0))[name = tensor<string, []>("k_9_cast_fp16")];
+            tensor<bool, []> v_7_interleave_0 = const()[name = tensor<string, []>("v_7_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 4096, 1, 512]> v_7_cast_fp16 = concat(axis = var_17, interleave = v_7_interleave_0, values = (v_cache_0, new_v_cache_0))[name = tensor<string, []>("v_7_cast_fp16")];
+            tensor<int32, [4]> var_205 = const()[name = tensor<string, []>("op_205"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 4096, 1, 64]> q_7_cast_fp16 = reshape(shape = var_205, x = roped_1_cast_fp16)[name = tensor<string, []>("q_7_cast_fp16")];
+            tensor<int32, [4]> var_210_begin_0 = const()[name = tensor<string, []>("op_210_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_210_end_0 = const()[name = tensor<string, []>("op_210_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_210_end_mask_0 = const()[name = tensor<string, []>("op_210_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_210_cast_fp16 = slice_by_index(begin = var_210_begin_0, end = var_210_end_0, end_mask = var_210_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_210_cast_fp16")];
+            tensor<int32, [4]> var_214_begin_0 = const()[name = tensor<string, []>("op_214_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_214_end_0 = const()[name = tensor<string, []>("op_214_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_214_end_mask_0 = const()[name = tensor<string, []>("op_214_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_214_cast_fp16 = slice_by_index(begin = var_214_begin_0, end = var_214_end_0, end_mask = var_214_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_214_cast_fp16")];
+            tensor<int32, [4]> var_218_begin_0 = const()[name = tensor<string, []>("op_218_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_218_end_0 = const()[name = tensor<string, []>("op_218_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_218_end_mask_0 = const()[name = tensor<string, []>("op_218_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_218_cast_fp16 = slice_by_index(begin = var_218_begin_0, end = var_218_end_0, end_mask = var_218_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_218_cast_fp16")];
+            tensor<int32, [4]> var_222_begin_0 = const()[name = tensor<string, []>("op_222_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_222_end_0 = const()[name = tensor<string, []>("op_222_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_222_end_mask_0 = const()[name = tensor<string, []>("op_222_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_222_cast_fp16 = slice_by_index(begin = var_222_begin_0, end = var_222_end_0, end_mask = var_222_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_222_cast_fp16")];
+            tensor<int32, [4]> var_226_begin_0 = const()[name = tensor<string, []>("op_226_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_226_end_0 = const()[name = tensor<string, []>("op_226_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_226_end_mask_0 = const()[name = tensor<string, []>("op_226_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_226_cast_fp16 = slice_by_index(begin = var_226_begin_0, end = var_226_end_0, end_mask = var_226_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_226_cast_fp16")];
+            tensor<int32, [4]> var_230_begin_0 = const()[name = tensor<string, []>("op_230_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_230_end_0 = const()[name = tensor<string, []>("op_230_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_230_end_mask_0 = const()[name = tensor<string, []>("op_230_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_230_cast_fp16 = slice_by_index(begin = var_230_begin_0, end = var_230_end_0, end_mask = var_230_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_230_cast_fp16")];
+            tensor<int32, [4]> var_234_begin_0 = const()[name = tensor<string, []>("op_234_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_234_end_0 = const()[name = tensor<string, []>("op_234_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_234_end_mask_0 = const()[name = tensor<string, []>("op_234_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_234_cast_fp16 = slice_by_index(begin = var_234_begin_0, end = var_234_end_0, end_mask = var_234_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_234_cast_fp16")];
+            tensor<int32, [4]> var_238_begin_0 = const()[name = tensor<string, []>("op_238_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_238_end_0 = const()[name = tensor<string, []>("op_238_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_238_end_mask_0 = const()[name = tensor<string, []>("op_238_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_238_cast_fp16 = slice_by_index(begin = var_238_begin_0, end = var_238_end_0, end_mask = var_238_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_238_cast_fp16")];
+            tensor<int32, [4]> var_242_begin_0 = const()[name = tensor<string, []>("op_242_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_242_end_0 = const()[name = tensor<string, []>("op_242_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_242_end_mask_0 = const()[name = tensor<string, []>("op_242_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_242_cast_fp16 = slice_by_index(begin = var_242_begin_0, end = var_242_end_0, end_mask = var_242_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_242_cast_fp16")];
+            tensor<int32, [4]> var_246_begin_0 = const()[name = tensor<string, []>("op_246_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_246_end_0 = const()[name = tensor<string, []>("op_246_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_246_end_mask_0 = const()[name = tensor<string, []>("op_246_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_246_cast_fp16 = slice_by_index(begin = var_246_begin_0, end = var_246_end_0, end_mask = var_246_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_246_cast_fp16")];
+            tensor<int32, [4]> var_250_begin_0 = const()[name = tensor<string, []>("op_250_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_250_end_0 = const()[name = tensor<string, []>("op_250_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_250_end_mask_0 = const()[name = tensor<string, []>("op_250_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_250_cast_fp16 = slice_by_index(begin = var_250_begin_0, end = var_250_end_0, end_mask = var_250_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_250_cast_fp16")];
+            tensor<int32, [4]> var_254_begin_0 = const()[name = tensor<string, []>("op_254_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_254_end_0 = const()[name = tensor<string, []>("op_254_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_254_end_mask_0 = const()[name = tensor<string, []>("op_254_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_254_cast_fp16 = slice_by_index(begin = var_254_begin_0, end = var_254_end_0, end_mask = var_254_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_254_cast_fp16")];
+            tensor<int32, [4]> var_258_begin_0 = const()[name = tensor<string, []>("op_258_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_258_end_0 = const()[name = tensor<string, []>("op_258_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_258_end_mask_0 = const()[name = tensor<string, []>("op_258_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_258_cast_fp16 = slice_by_index(begin = var_258_begin_0, end = var_258_end_0, end_mask = var_258_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_258_cast_fp16")];
+            tensor<int32, [4]> var_262_begin_0 = const()[name = tensor<string, []>("op_262_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_262_end_0 = const()[name = tensor<string, []>("op_262_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_262_end_mask_0 = const()[name = tensor<string, []>("op_262_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_262_cast_fp16 = slice_by_index(begin = var_262_begin_0, end = var_262_end_0, end_mask = var_262_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_262_cast_fp16")];
+            tensor<int32, [4]> var_266_begin_0 = const()[name = tensor<string, []>("op_266_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_266_end_0 = const()[name = tensor<string, []>("op_266_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_266_end_mask_0 = const()[name = tensor<string, []>("op_266_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_266_cast_fp16 = slice_by_index(begin = var_266_begin_0, end = var_266_end_0, end_mask = var_266_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_266_cast_fp16")];
+            tensor<int32, [4]> var_270_begin_0 = const()[name = tensor<string, []>("op_270_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_270_end_0 = const()[name = tensor<string, []>("op_270_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_270_end_mask_0 = const()[name = tensor<string, []>("op_270_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_270_cast_fp16 = slice_by_index(begin = var_270_begin_0, end = var_270_end_0, end_mask = var_270_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_270_cast_fp16")];
+            tensor<int32, [4]> var_274_begin_0 = const()[name = tensor<string, []>("op_274_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_274_end_0 = const()[name = tensor<string, []>("op_274_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_274_end_mask_0 = const()[name = tensor<string, []>("op_274_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_274_cast_fp16 = slice_by_index(begin = var_274_begin_0, end = var_274_end_0, end_mask = var_274_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_274_cast_fp16")];
+            tensor<int32, [4]> var_278_begin_0 = const()[name = tensor<string, []>("op_278_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_278_end_0 = const()[name = tensor<string, []>("op_278_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_278_end_mask_0 = const()[name = tensor<string, []>("op_278_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_278_cast_fp16 = slice_by_index(begin = var_278_begin_0, end = var_278_end_0, end_mask = var_278_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_278_cast_fp16")];
+            tensor<int32, [4]> var_282_begin_0 = const()[name = tensor<string, []>("op_282_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_282_end_0 = const()[name = tensor<string, []>("op_282_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_282_end_mask_0 = const()[name = tensor<string, []>("op_282_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_282_cast_fp16 = slice_by_index(begin = var_282_begin_0, end = var_282_end_0, end_mask = var_282_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_282_cast_fp16")];
+            tensor<int32, [4]> var_286_begin_0 = const()[name = tensor<string, []>("op_286_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_286_end_0 = const()[name = tensor<string, []>("op_286_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_286_end_mask_0 = const()[name = tensor<string, []>("op_286_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_286_cast_fp16 = slice_by_index(begin = var_286_begin_0, end = var_286_end_0, end_mask = var_286_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_286_cast_fp16")];
+            tensor<int32, [4]> var_290_begin_0 = const()[name = tensor<string, []>("op_290_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_290_end_0 = const()[name = tensor<string, []>("op_290_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_290_end_mask_0 = const()[name = tensor<string, []>("op_290_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_290_cast_fp16 = slice_by_index(begin = var_290_begin_0, end = var_290_end_0, end_mask = var_290_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_290_cast_fp16")];
+            tensor<int32, [4]> var_294_begin_0 = const()[name = tensor<string, []>("op_294_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_294_end_0 = const()[name = tensor<string, []>("op_294_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_294_end_mask_0 = const()[name = tensor<string, []>("op_294_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_294_cast_fp16 = slice_by_index(begin = var_294_begin_0, end = var_294_end_0, end_mask = var_294_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_294_cast_fp16")];
+            tensor<int32, [4]> var_298_begin_0 = const()[name = tensor<string, []>("op_298_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_298_end_0 = const()[name = tensor<string, []>("op_298_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_298_end_mask_0 = const()[name = tensor<string, []>("op_298_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_298_cast_fp16 = slice_by_index(begin = var_298_begin_0, end = var_298_end_0, end_mask = var_298_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_298_cast_fp16")];
+            tensor<int32, [4]> var_302_begin_0 = const()[name = tensor<string, []>("op_302_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_302_end_0 = const()[name = tensor<string, []>("op_302_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_302_end_mask_0 = const()[name = tensor<string, []>("op_302_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_302_cast_fp16 = slice_by_index(begin = var_302_begin_0, end = var_302_end_0, end_mask = var_302_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_302_cast_fp16")];
+            tensor<int32, [4]> var_306_begin_0 = const()[name = tensor<string, []>("op_306_begin_0"), val = tensor<int32, [4]>([0, 3072, 0, 0])];
+            tensor<int32, [4]> var_306_end_0 = const()[name = tensor<string, []>("op_306_end_0"), val = tensor<int32, [4]>([1, 3200, 1, 64])];
+            tensor<bool, [4]> var_306_end_mask_0 = const()[name = tensor<string, []>("op_306_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_306_cast_fp16 = slice_by_index(begin = var_306_begin_0, end = var_306_end_0, end_mask = var_306_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_306_cast_fp16")];
+            tensor<int32, [4]> var_310_begin_0 = const()[name = tensor<string, []>("op_310_begin_0"), val = tensor<int32, [4]>([0, 3200, 0, 0])];
+            tensor<int32, [4]> var_310_end_0 = const()[name = tensor<string, []>("op_310_end_0"), val = tensor<int32, [4]>([1, 3328, 1, 64])];
+            tensor<bool, [4]> var_310_end_mask_0 = const()[name = tensor<string, []>("op_310_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_310_cast_fp16 = slice_by_index(begin = var_310_begin_0, end = var_310_end_0, end_mask = var_310_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_310_cast_fp16")];
+            tensor<int32, [4]> var_314_begin_0 = const()[name = tensor<string, []>("op_314_begin_0"), val = tensor<int32, [4]>([0, 3328, 0, 0])];
+            tensor<int32, [4]> var_314_end_0 = const()[name = tensor<string, []>("op_314_end_0"), val = tensor<int32, [4]>([1, 3456, 1, 64])];
+            tensor<bool, [4]> var_314_end_mask_0 = const()[name = tensor<string, []>("op_314_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_314_cast_fp16 = slice_by_index(begin = var_314_begin_0, end = var_314_end_0, end_mask = var_314_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_314_cast_fp16")];
+            tensor<int32, [4]> var_318_begin_0 = const()[name = tensor<string, []>("op_318_begin_0"), val = tensor<int32, [4]>([0, 3456, 0, 0])];
+            tensor<int32, [4]> var_318_end_0 = const()[name = tensor<string, []>("op_318_end_0"), val = tensor<int32, [4]>([1, 3584, 1, 64])];
+            tensor<bool, [4]> var_318_end_mask_0 = const()[name = tensor<string, []>("op_318_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_318_cast_fp16 = slice_by_index(begin = var_318_begin_0, end = var_318_end_0, end_mask = var_318_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_318_cast_fp16")];
+            tensor<int32, [4]> var_322_begin_0 = const()[name = tensor<string, []>("op_322_begin_0"), val = tensor<int32, [4]>([0, 3584, 0, 0])];
+            tensor<int32, [4]> var_322_end_0 = const()[name = tensor<string, []>("op_322_end_0"), val = tensor<int32, [4]>([1, 3712, 1, 64])];
+            tensor<bool, [4]> var_322_end_mask_0 = const()[name = tensor<string, []>("op_322_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_322_cast_fp16 = slice_by_index(begin = var_322_begin_0, end = var_322_end_0, end_mask = var_322_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_322_cast_fp16")];
+            tensor<int32, [4]> var_326_begin_0 = const()[name = tensor<string, []>("op_326_begin_0"), val = tensor<int32, [4]>([0, 3712, 0, 0])];
+            tensor<int32, [4]> var_326_end_0 = const()[name = tensor<string, []>("op_326_end_0"), val = tensor<int32, [4]>([1, 3840, 1, 64])];
+            tensor<bool, [4]> var_326_end_mask_0 = const()[name = tensor<string, []>("op_326_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_326_cast_fp16 = slice_by_index(begin = var_326_begin_0, end = var_326_end_0, end_mask = var_326_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_326_cast_fp16")];
+            tensor<int32, [4]> var_330_begin_0 = const()[name = tensor<string, []>("op_330_begin_0"), val = tensor<int32, [4]>([0, 3840, 0, 0])];
+            tensor<int32, [4]> var_330_end_0 = const()[name = tensor<string, []>("op_330_end_0"), val = tensor<int32, [4]>([1, 3968, 1, 64])];
+            tensor<bool, [4]> var_330_end_mask_0 = const()[name = tensor<string, []>("op_330_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_330_cast_fp16 = slice_by_index(begin = var_330_begin_0, end = var_330_end_0, end_mask = var_330_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_330_cast_fp16")];
+            tensor<int32, [4]> var_334_begin_0 = const()[name = tensor<string, []>("op_334_begin_0"), val = tensor<int32, [4]>([0, 3968, 0, 0])];
+            tensor<int32, [4]> var_334_end_0 = const()[name = tensor<string, []>("op_334_end_0"), val = tensor<int32, [4]>([1, 4096, 1, 64])];
+            tensor<bool, [4]> var_334_end_mask_0 = const()[name = tensor<string, []>("op_334_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_334_cast_fp16 = slice_by_index(begin = var_334_begin_0, end = var_334_end_0, end_mask = var_334_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_334_cast_fp16")];
+            tensor<int32, [4]> var_340_begin_0 = const()[name = tensor<string, []>("op_340_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_340_end_0 = const()[name = tensor<string, []>("op_340_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_340_end_mask_0 = const()[name = tensor<string, []>("op_340_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_340_cast_fp16 = slice_by_index(begin = var_340_begin_0, end = var_340_end_0, end_mask = var_340_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_340_cast_fp16")];
+            tensor<int32, [4]> var_344_begin_0 = const()[name = tensor<string, []>("op_344_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_344_end_0 = const()[name = tensor<string, []>("op_344_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_344_end_mask_0 = const()[name = tensor<string, []>("op_344_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_344_cast_fp16 = slice_by_index(begin = var_344_begin_0, end = var_344_end_0, end_mask = var_344_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_344_cast_fp16")];
+            tensor<int32, [4]> var_348_begin_0 = const()[name = tensor<string, []>("op_348_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_348_end_0 = const()[name = tensor<string, []>("op_348_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_348_end_mask_0 = const()[name = tensor<string, []>("op_348_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_348_cast_fp16 = slice_by_index(begin = var_348_begin_0, end = var_348_end_0, end_mask = var_348_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_348_cast_fp16")];
+            tensor<int32, [4]> var_352_begin_0 = const()[name = tensor<string, []>("op_352_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_352_end_0 = const()[name = tensor<string, []>("op_352_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_352_end_mask_0 = const()[name = tensor<string, []>("op_352_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_352_cast_fp16 = slice_by_index(begin = var_352_begin_0, end = var_352_end_0, end_mask = var_352_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_352_cast_fp16")];
+            tensor<int32, [4]> var_356_begin_0 = const()[name = tensor<string, []>("op_356_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_356_end_0 = const()[name = tensor<string, []>("op_356_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_356_end_mask_0 = const()[name = tensor<string, []>("op_356_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_356_cast_fp16 = slice_by_index(begin = var_356_begin_0, end = var_356_end_0, end_mask = var_356_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_356_cast_fp16")];
+            tensor<int32, [4]> var_360_begin_0 = const()[name = tensor<string, []>("op_360_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_360_end_0 = const()[name = tensor<string, []>("op_360_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_360_end_mask_0 = const()[name = tensor<string, []>("op_360_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_360_cast_fp16 = slice_by_index(begin = var_360_begin_0, end = var_360_end_0, end_mask = var_360_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_360_cast_fp16")];
+            tensor<int32, [4]> var_364_begin_0 = const()[name = tensor<string, []>("op_364_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_364_end_0 = const()[name = tensor<string, []>("op_364_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_364_end_mask_0 = const()[name = tensor<string, []>("op_364_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_364_cast_fp16 = slice_by_index(begin = var_364_begin_0, end = var_364_end_0, end_mask = var_364_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_364_cast_fp16")];
+            tensor<int32, [4]> var_368_begin_0 = const()[name = tensor<string, []>("op_368_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_368_end_0 = const()[name = tensor<string, []>("op_368_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_368_end_mask_0 = const()[name = tensor<string, []>("op_368_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_368_cast_fp16 = slice_by_index(begin = var_368_begin_0, end = var_368_end_0, end_mask = var_368_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_368_cast_fp16")];
+            tensor<int32, [4]> var_372_begin_0 = const()[name = tensor<string, []>("op_372_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1024])];
+            tensor<int32, [4]> var_372_end_0 = const()[name = tensor<string, []>("op_372_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1152])];
+            tensor<bool, [4]> var_372_end_mask_0 = const()[name = tensor<string, []>("op_372_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_372_cast_fp16 = slice_by_index(begin = var_372_begin_0, end = var_372_end_0, end_mask = var_372_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_372_cast_fp16")];
+            tensor<int32, [4]> var_376_begin_0 = const()[name = tensor<string, []>("op_376_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1152])];
+            tensor<int32, [4]> var_376_end_0 = const()[name = tensor<string, []>("op_376_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1280])];
+            tensor<bool, [4]> var_376_end_mask_0 = const()[name = tensor<string, []>("op_376_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_376_cast_fp16 = slice_by_index(begin = var_376_begin_0, end = var_376_end_0, end_mask = var_376_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_376_cast_fp16")];
+            tensor<int32, [4]> var_380_begin_0 = const()[name = tensor<string, []>("op_380_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1280])];
+            tensor<int32, [4]> var_380_end_0 = const()[name = tensor<string, []>("op_380_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1408])];
+            tensor<bool, [4]> var_380_end_mask_0 = const()[name = tensor<string, []>("op_380_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_380_cast_fp16 = slice_by_index(begin = var_380_begin_0, end = var_380_end_0, end_mask = var_380_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_380_cast_fp16")];
+            tensor<int32, [4]> var_384_begin_0 = const()[name = tensor<string, []>("op_384_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1408])];
+            tensor<int32, [4]> var_384_end_0 = const()[name = tensor<string, []>("op_384_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1536])];
+            tensor<bool, [4]> var_384_end_mask_0 = const()[name = tensor<string, []>("op_384_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_384_cast_fp16 = slice_by_index(begin = var_384_begin_0, end = var_384_end_0, end_mask = var_384_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_384_cast_fp16")];
+            tensor<int32, [4]> var_388_begin_0 = const()[name = tensor<string, []>("op_388_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1536])];
+            tensor<int32, [4]> var_388_end_0 = const()[name = tensor<string, []>("op_388_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1664])];
+            tensor<bool, [4]> var_388_end_mask_0 = const()[name = tensor<string, []>("op_388_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_388_cast_fp16 = slice_by_index(begin = var_388_begin_0, end = var_388_end_0, end_mask = var_388_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_388_cast_fp16")];
+            tensor<int32, [4]> var_392_begin_0 = const()[name = tensor<string, []>("op_392_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1664])];
+            tensor<int32, [4]> var_392_end_0 = const()[name = tensor<string, []>("op_392_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1792])];
+            tensor<bool, [4]> var_392_end_mask_0 = const()[name = tensor<string, []>("op_392_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_392_cast_fp16 = slice_by_index(begin = var_392_begin_0, end = var_392_end_0, end_mask = var_392_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_392_cast_fp16")];
+            tensor<int32, [4]> var_396_begin_0 = const()[name = tensor<string, []>("op_396_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1792])];
+            tensor<int32, [4]> var_396_end_0 = const()[name = tensor<string, []>("op_396_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1920])];
+            tensor<bool, [4]> var_396_end_mask_0 = const()[name = tensor<string, []>("op_396_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_396_cast_fp16 = slice_by_index(begin = var_396_begin_0, end = var_396_end_0, end_mask = var_396_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_396_cast_fp16")];
+            tensor<int32, [4]> var_400_begin_0 = const()[name = tensor<string, []>("op_400_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1920])];
+            tensor<int32, [4]> var_400_end_0 = const()[name = tensor<string, []>("op_400_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2048])];
+            tensor<bool, [4]> var_400_end_mask_0 = const()[name = tensor<string, []>("op_400_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_400_cast_fp16 = slice_by_index(begin = var_400_begin_0, end = var_400_end_0, end_mask = var_400_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_400_cast_fp16")];
+            tensor<int32, [4]> var_404_begin_0 = const()[name = tensor<string, []>("op_404_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2048])];
+            tensor<int32, [4]> var_404_end_0 = const()[name = tensor<string, []>("op_404_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2176])];
+            tensor<bool, [4]> var_404_end_mask_0 = const()[name = tensor<string, []>("op_404_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_404_cast_fp16 = slice_by_index(begin = var_404_begin_0, end = var_404_end_0, end_mask = var_404_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_404_cast_fp16")];
+            tensor<int32, [4]> var_408_begin_0 = const()[name = tensor<string, []>("op_408_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2176])];
+            tensor<int32, [4]> var_408_end_0 = const()[name = tensor<string, []>("op_408_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2304])];
+            tensor<bool, [4]> var_408_end_mask_0 = const()[name = tensor<string, []>("op_408_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_408_cast_fp16 = slice_by_index(begin = var_408_begin_0, end = var_408_end_0, end_mask = var_408_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_408_cast_fp16")];
+            tensor<int32, [4]> var_412_begin_0 = const()[name = tensor<string, []>("op_412_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2304])];
+            tensor<int32, [4]> var_412_end_0 = const()[name = tensor<string, []>("op_412_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2432])];
+            tensor<bool, [4]> var_412_end_mask_0 = const()[name = tensor<string, []>("op_412_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_412_cast_fp16 = slice_by_index(begin = var_412_begin_0, end = var_412_end_0, end_mask = var_412_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_412_cast_fp16")];
+            tensor<int32, [4]> var_416_begin_0 = const()[name = tensor<string, []>("op_416_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2432])];
+            tensor<int32, [4]> var_416_end_0 = const()[name = tensor<string, []>("op_416_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2560])];
+            tensor<bool, [4]> var_416_end_mask_0 = const()[name = tensor<string, []>("op_416_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_416_cast_fp16 = slice_by_index(begin = var_416_begin_0, end = var_416_end_0, end_mask = var_416_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_416_cast_fp16")];
+            tensor<int32, [4]> var_420_begin_0 = const()[name = tensor<string, []>("op_420_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2560])];
+            tensor<int32, [4]> var_420_end_0 = const()[name = tensor<string, []>("op_420_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2688])];
+            tensor<bool, [4]> var_420_end_mask_0 = const()[name = tensor<string, []>("op_420_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_420_cast_fp16 = slice_by_index(begin = var_420_begin_0, end = var_420_end_0, end_mask = var_420_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_420_cast_fp16")];
+            tensor<int32, [4]> var_424_begin_0 = const()[name = tensor<string, []>("op_424_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2688])];
+            tensor<int32, [4]> var_424_end_0 = const()[name = tensor<string, []>("op_424_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2816])];
+            tensor<bool, [4]> var_424_end_mask_0 = const()[name = tensor<string, []>("op_424_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_424_cast_fp16 = slice_by_index(begin = var_424_begin_0, end = var_424_end_0, end_mask = var_424_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_424_cast_fp16")];
+            tensor<int32, [4]> var_428_begin_0 = const()[name = tensor<string, []>("op_428_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2816])];
+            tensor<int32, [4]> var_428_end_0 = const()[name = tensor<string, []>("op_428_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2944])];
+            tensor<bool, [4]> var_428_end_mask_0 = const()[name = tensor<string, []>("op_428_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_428_cast_fp16 = slice_by_index(begin = var_428_begin_0, end = var_428_end_0, end_mask = var_428_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_428_cast_fp16")];
+            tensor<int32, [4]> var_432_begin_0 = const()[name = tensor<string, []>("op_432_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2944])];
+            tensor<int32, [4]> var_432_end_0 = const()[name = tensor<string, []>("op_432_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3072])];
+            tensor<bool, [4]> var_432_end_mask_0 = const()[name = tensor<string, []>("op_432_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_432_cast_fp16 = slice_by_index(begin = var_432_begin_0, end = var_432_end_0, end_mask = var_432_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_432_cast_fp16")];
+            tensor<int32, [4]> var_436_begin_0 = const()[name = tensor<string, []>("op_436_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3072])];
+            tensor<int32, [4]> var_436_end_0 = const()[name = tensor<string, []>("op_436_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3200])];
+            tensor<bool, [4]> var_436_end_mask_0 = const()[name = tensor<string, []>("op_436_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_436_cast_fp16 = slice_by_index(begin = var_436_begin_0, end = var_436_end_0, end_mask = var_436_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_436_cast_fp16")];
+            tensor<int32, [4]> var_440_begin_0 = const()[name = tensor<string, []>("op_440_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3200])];
+            tensor<int32, [4]> var_440_end_0 = const()[name = tensor<string, []>("op_440_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3328])];
+            tensor<bool, [4]> var_440_end_mask_0 = const()[name = tensor<string, []>("op_440_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_440_cast_fp16 = slice_by_index(begin = var_440_begin_0, end = var_440_end_0, end_mask = var_440_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_440_cast_fp16")];
+            tensor<int32, [4]> var_444_begin_0 = const()[name = tensor<string, []>("op_444_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3328])];
+            tensor<int32, [4]> var_444_end_0 = const()[name = tensor<string, []>("op_444_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3456])];
+            tensor<bool, [4]> var_444_end_mask_0 = const()[name = tensor<string, []>("op_444_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_444_cast_fp16 = slice_by_index(begin = var_444_begin_0, end = var_444_end_0, end_mask = var_444_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_444_cast_fp16")];
+            tensor<int32, [4]> var_448_begin_0 = const()[name = tensor<string, []>("op_448_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3456])];
+            tensor<int32, [4]> var_448_end_0 = const()[name = tensor<string, []>("op_448_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3584])];
+            tensor<bool, [4]> var_448_end_mask_0 = const()[name = tensor<string, []>("op_448_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_448_cast_fp16 = slice_by_index(begin = var_448_begin_0, end = var_448_end_0, end_mask = var_448_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_448_cast_fp16")];
+            tensor<int32, [4]> var_452_begin_0 = const()[name = tensor<string, []>("op_452_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3584])];
+            tensor<int32, [4]> var_452_end_0 = const()[name = tensor<string, []>("op_452_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3712])];
+            tensor<bool, [4]> var_452_end_mask_0 = const()[name = tensor<string, []>("op_452_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_452_cast_fp16 = slice_by_index(begin = var_452_begin_0, end = var_452_end_0, end_mask = var_452_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_452_cast_fp16")];
+            tensor<int32, [4]> var_456_begin_0 = const()[name = tensor<string, []>("op_456_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3712])];
+            tensor<int32, [4]> var_456_end_0 = const()[name = tensor<string, []>("op_456_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3840])];
+            tensor<bool, [4]> var_456_end_mask_0 = const()[name = tensor<string, []>("op_456_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_456_cast_fp16 = slice_by_index(begin = var_456_begin_0, end = var_456_end_0, end_mask = var_456_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_456_cast_fp16")];
+            tensor<int32, [4]> var_460_begin_0 = const()[name = tensor<string, []>("op_460_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3840])];
+            tensor<int32, [4]> var_460_end_0 = const()[name = tensor<string, []>("op_460_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3968])];
+            tensor<bool, [4]> var_460_end_mask_0 = const()[name = tensor<string, []>("op_460_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_460_cast_fp16 = slice_by_index(begin = var_460_begin_0, end = var_460_end_0, end_mask = var_460_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_460_cast_fp16")];
+            tensor<int32, [4]> var_464_begin_0 = const()[name = tensor<string, []>("op_464_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3968])];
+            tensor<int32, [4]> var_464_end_0 = const()[name = tensor<string, []>("op_464_end_0"), val = tensor<int32, [4]>([1, 512, 1, 4096])];
+            tensor<bool, [4]> var_464_end_mask_0 = const()[name = tensor<string, []>("op_464_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_464_cast_fp16 = slice_by_index(begin = var_464_begin_0, end = var_464_end_0, end_mask = var_464_end_mask_0, x = k_9_cast_fp16)[name = tensor<string, []>("op_464_cast_fp16")];
+            tensor<int32, [4]> var_466_begin_0 = const()[name = tensor<string, []>("op_466_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_466_end_0 = const()[name = tensor<string, []>("op_466_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_466_end_mask_0 = const()[name = tensor<string, []>("op_466_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_466_cast_fp16 = slice_by_index(begin = var_466_begin_0, end = var_466_end_0, end_mask = var_466_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_466_cast_fp16")];
+            tensor<int32, [4]> var_470_begin_0 = const()[name = tensor<string, []>("op_470_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_470_end_0 = const()[name = tensor<string, []>("op_470_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_470_end_mask_0 = const()[name = tensor<string, []>("op_470_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_470_cast_fp16 = slice_by_index(begin = var_470_begin_0, end = var_470_end_0, end_mask = var_470_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_470_cast_fp16")];
+            tensor<int32, [4]> var_474_begin_0 = const()[name = tensor<string, []>("op_474_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_474_end_0 = const()[name = tensor<string, []>("op_474_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_474_end_mask_0 = const()[name = tensor<string, []>("op_474_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_474_cast_fp16 = slice_by_index(begin = var_474_begin_0, end = var_474_end_0, end_mask = var_474_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_474_cast_fp16")];
+            tensor<int32, [4]> var_478_begin_0 = const()[name = tensor<string, []>("op_478_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_478_end_0 = const()[name = tensor<string, []>("op_478_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_478_end_mask_0 = const()[name = tensor<string, []>("op_478_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_478_cast_fp16 = slice_by_index(begin = var_478_begin_0, end = var_478_end_0, end_mask = var_478_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_478_cast_fp16")];
+            tensor<int32, [4]> var_482_begin_0 = const()[name = tensor<string, []>("op_482_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_482_end_0 = const()[name = tensor<string, []>("op_482_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_482_end_mask_0 = const()[name = tensor<string, []>("op_482_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_482_cast_fp16 = slice_by_index(begin = var_482_begin_0, end = var_482_end_0, end_mask = var_482_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_482_cast_fp16")];
+            tensor<int32, [4]> var_486_begin_0 = const()[name = tensor<string, []>("op_486_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_486_end_0 = const()[name = tensor<string, []>("op_486_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_486_end_mask_0 = const()[name = tensor<string, []>("op_486_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_486_cast_fp16 = slice_by_index(begin = var_486_begin_0, end = var_486_end_0, end_mask = var_486_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_486_cast_fp16")];
+            tensor<int32, [4]> var_490_begin_0 = const()[name = tensor<string, []>("op_490_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_490_end_0 = const()[name = tensor<string, []>("op_490_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_490_end_mask_0 = const()[name = tensor<string, []>("op_490_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_490_cast_fp16 = slice_by_index(begin = var_490_begin_0, end = var_490_end_0, end_mask = var_490_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_490_cast_fp16")];
+            tensor<int32, [4]> var_494_begin_0 = const()[name = tensor<string, []>("op_494_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_494_end_0 = const()[name = tensor<string, []>("op_494_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_494_end_mask_0 = const()[name = tensor<string, []>("op_494_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_494_cast_fp16 = slice_by_index(begin = var_494_begin_0, end = var_494_end_0, end_mask = var_494_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_494_cast_fp16")];
+            tensor<int32, [4]> var_498_begin_0 = const()[name = tensor<string, []>("op_498_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_498_end_0 = const()[name = tensor<string, []>("op_498_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 512])];
+            tensor<bool, [4]> var_498_end_mask_0 = const()[name = tensor<string, []>("op_498_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_498_cast_fp16 = slice_by_index(begin = var_498_begin_0, end = var_498_end_0, end_mask = var_498_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_498_cast_fp16")];
+            tensor<int32, [4]> var_502_begin_0 = const()[name = tensor<string, []>("op_502_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_502_end_0 = const()[name = tensor<string, []>("op_502_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 512])];
+            tensor<bool, [4]> var_502_end_mask_0 = const()[name = tensor<string, []>("op_502_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_502_cast_fp16 = slice_by_index(begin = var_502_begin_0, end = var_502_end_0, end_mask = var_502_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_502_cast_fp16")];
+            tensor<int32, [4]> var_506_begin_0 = const()[name = tensor<string, []>("op_506_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_506_end_0 = const()[name = tensor<string, []>("op_506_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 512])];
+            tensor<bool, [4]> var_506_end_mask_0 = const()[name = tensor<string, []>("op_506_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_506_cast_fp16 = slice_by_index(begin = var_506_begin_0, end = var_506_end_0, end_mask = var_506_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_506_cast_fp16")];
+            tensor<int32, [4]> var_510_begin_0 = const()[name = tensor<string, []>("op_510_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_510_end_0 = const()[name = tensor<string, []>("op_510_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 512])];
+            tensor<bool, [4]> var_510_end_mask_0 = const()[name = tensor<string, []>("op_510_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_510_cast_fp16 = slice_by_index(begin = var_510_begin_0, end = var_510_end_0, end_mask = var_510_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_510_cast_fp16")];
+            tensor<int32, [4]> var_514_begin_0 = const()[name = tensor<string, []>("op_514_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_514_end_0 = const()[name = tensor<string, []>("op_514_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 512])];
+            tensor<bool, [4]> var_514_end_mask_0 = const()[name = tensor<string, []>("op_514_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_514_cast_fp16 = slice_by_index(begin = var_514_begin_0, end = var_514_end_0, end_mask = var_514_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_514_cast_fp16")];
+            tensor<int32, [4]> var_518_begin_0 = const()[name = tensor<string, []>("op_518_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_518_end_0 = const()[name = tensor<string, []>("op_518_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 512])];
+            tensor<bool, [4]> var_518_end_mask_0 = const()[name = tensor<string, []>("op_518_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_518_cast_fp16 = slice_by_index(begin = var_518_begin_0, end = var_518_end_0, end_mask = var_518_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_518_cast_fp16")];
+            tensor<int32, [4]> var_522_begin_0 = const()[name = tensor<string, []>("op_522_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_522_end_0 = const()[name = tensor<string, []>("op_522_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 512])];
+            tensor<bool, [4]> var_522_end_mask_0 = const()[name = tensor<string, []>("op_522_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_522_cast_fp16 = slice_by_index(begin = var_522_begin_0, end = var_522_end_0, end_mask = var_522_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_522_cast_fp16")];
+            tensor<int32, [4]> var_526_begin_0 = const()[name = tensor<string, []>("op_526_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_526_end_0 = const()[name = tensor<string, []>("op_526_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 512])];
+            tensor<bool, [4]> var_526_end_mask_0 = const()[name = tensor<string, []>("op_526_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_526_cast_fp16 = slice_by_index(begin = var_526_begin_0, end = var_526_end_0, end_mask = var_526_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_526_cast_fp16")];
+            tensor<int32, [4]> var_530_begin_0 = const()[name = tensor<string, []>("op_530_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_530_end_0 = const()[name = tensor<string, []>("op_530_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 512])];
+            tensor<bool, [4]> var_530_end_mask_0 = const()[name = tensor<string, []>("op_530_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_530_cast_fp16 = slice_by_index(begin = var_530_begin_0, end = var_530_end_0, end_mask = var_530_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_530_cast_fp16")];
+            tensor<int32, [4]> var_534_begin_0 = const()[name = tensor<string, []>("op_534_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_534_end_0 = const()[name = tensor<string, []>("op_534_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 512])];
+            tensor<bool, [4]> var_534_end_mask_0 = const()[name = tensor<string, []>("op_534_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_534_cast_fp16 = slice_by_index(begin = var_534_begin_0, end = var_534_end_0, end_mask = var_534_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_534_cast_fp16")];
+            tensor<int32, [4]> var_538_begin_0 = const()[name = tensor<string, []>("op_538_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_538_end_0 = const()[name = tensor<string, []>("op_538_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 512])];
+            tensor<bool, [4]> var_538_end_mask_0 = const()[name = tensor<string, []>("op_538_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_538_cast_fp16 = slice_by_index(begin = var_538_begin_0, end = var_538_end_0, end_mask = var_538_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_538_cast_fp16")];
+            tensor<int32, [4]> var_542_begin_0 = const()[name = tensor<string, []>("op_542_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_542_end_0 = const()[name = tensor<string, []>("op_542_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 512])];
+            tensor<bool, [4]> var_542_end_mask_0 = const()[name = tensor<string, []>("op_542_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_542_cast_fp16 = slice_by_index(begin = var_542_begin_0, end = var_542_end_0, end_mask = var_542_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_542_cast_fp16")];
+            tensor<int32, [4]> var_546_begin_0 = const()[name = tensor<string, []>("op_546_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_546_end_0 = const()[name = tensor<string, []>("op_546_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 512])];
+            tensor<bool, [4]> var_546_end_mask_0 = const()[name = tensor<string, []>("op_546_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_546_cast_fp16 = slice_by_index(begin = var_546_begin_0, end = var_546_end_0, end_mask = var_546_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_546_cast_fp16")];
+            tensor<int32, [4]> var_550_begin_0 = const()[name = tensor<string, []>("op_550_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_550_end_0 = const()[name = tensor<string, []>("op_550_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 512])];
+            tensor<bool, [4]> var_550_end_mask_0 = const()[name = tensor<string, []>("op_550_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_550_cast_fp16 = slice_by_index(begin = var_550_begin_0, end = var_550_end_0, end_mask = var_550_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_550_cast_fp16")];
+            tensor<int32, [4]> var_554_begin_0 = const()[name = tensor<string, []>("op_554_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_554_end_0 = const()[name = tensor<string, []>("op_554_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 512])];
+            tensor<bool, [4]> var_554_end_mask_0 = const()[name = tensor<string, []>("op_554_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_554_cast_fp16 = slice_by_index(begin = var_554_begin_0, end = var_554_end_0, end_mask = var_554_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_554_cast_fp16")];
+            tensor<int32, [4]> var_558_begin_0 = const()[name = tensor<string, []>("op_558_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_558_end_0 = const()[name = tensor<string, []>("op_558_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 512])];
+            tensor<bool, [4]> var_558_end_mask_0 = const()[name = tensor<string, []>("op_558_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_558_cast_fp16 = slice_by_index(begin = var_558_begin_0, end = var_558_end_0, end_mask = var_558_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_558_cast_fp16")];
+            tensor<int32, [4]> var_562_begin_0 = const()[name = tensor<string, []>("op_562_begin_0"), val = tensor<int32, [4]>([0, 3072, 0, 0])];
+            tensor<int32, [4]> var_562_end_0 = const()[name = tensor<string, []>("op_562_end_0"), val = tensor<int32, [4]>([1, 3200, 1, 512])];
+            tensor<bool, [4]> var_562_end_mask_0 = const()[name = tensor<string, []>("op_562_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_562_cast_fp16 = slice_by_index(begin = var_562_begin_0, end = var_562_end_0, end_mask = var_562_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_562_cast_fp16")];
+            tensor<int32, [4]> var_566_begin_0 = const()[name = tensor<string, []>("op_566_begin_0"), val = tensor<int32, [4]>([0, 3200, 0, 0])];
+            tensor<int32, [4]> var_566_end_0 = const()[name = tensor<string, []>("op_566_end_0"), val = tensor<int32, [4]>([1, 3328, 1, 512])];
+            tensor<bool, [4]> var_566_end_mask_0 = const()[name = tensor<string, []>("op_566_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_566_cast_fp16 = slice_by_index(begin = var_566_begin_0, end = var_566_end_0, end_mask = var_566_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_566_cast_fp16")];
+            tensor<int32, [4]> var_570_begin_0 = const()[name = tensor<string, []>("op_570_begin_0"), val = tensor<int32, [4]>([0, 3328, 0, 0])];
+            tensor<int32, [4]> var_570_end_0 = const()[name = tensor<string, []>("op_570_end_0"), val = tensor<int32, [4]>([1, 3456, 1, 512])];
+            tensor<bool, [4]> var_570_end_mask_0 = const()[name = tensor<string, []>("op_570_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_570_cast_fp16 = slice_by_index(begin = var_570_begin_0, end = var_570_end_0, end_mask = var_570_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_570_cast_fp16")];
+            tensor<int32, [4]> var_574_begin_0 = const()[name = tensor<string, []>("op_574_begin_0"), val = tensor<int32, [4]>([0, 3456, 0, 0])];
+            tensor<int32, [4]> var_574_end_0 = const()[name = tensor<string, []>("op_574_end_0"), val = tensor<int32, [4]>([1, 3584, 1, 512])];
+            tensor<bool, [4]> var_574_end_mask_0 = const()[name = tensor<string, []>("op_574_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_574_cast_fp16 = slice_by_index(begin = var_574_begin_0, end = var_574_end_0, end_mask = var_574_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_574_cast_fp16")];
+            tensor<int32, [4]> var_578_begin_0 = const()[name = tensor<string, []>("op_578_begin_0"), val = tensor<int32, [4]>([0, 3584, 0, 0])];
+            tensor<int32, [4]> var_578_end_0 = const()[name = tensor<string, []>("op_578_end_0"), val = tensor<int32, [4]>([1, 3712, 1, 512])];
+            tensor<bool, [4]> var_578_end_mask_0 = const()[name = tensor<string, []>("op_578_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_578_cast_fp16 = slice_by_index(begin = var_578_begin_0, end = var_578_end_0, end_mask = var_578_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_578_cast_fp16")];
+            tensor<int32, [4]> var_582_begin_0 = const()[name = tensor<string, []>("op_582_begin_0"), val = tensor<int32, [4]>([0, 3712, 0, 0])];
+            tensor<int32, [4]> var_582_end_0 = const()[name = tensor<string, []>("op_582_end_0"), val = tensor<int32, [4]>([1, 3840, 1, 512])];
+            tensor<bool, [4]> var_582_end_mask_0 = const()[name = tensor<string, []>("op_582_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_582_cast_fp16 = slice_by_index(begin = var_582_begin_0, end = var_582_end_0, end_mask = var_582_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_582_cast_fp16")];
+            tensor<int32, [4]> var_586_begin_0 = const()[name = tensor<string, []>("op_586_begin_0"), val = tensor<int32, [4]>([0, 3840, 0, 0])];
+            tensor<int32, [4]> var_586_end_0 = const()[name = tensor<string, []>("op_586_end_0"), val = tensor<int32, [4]>([1, 3968, 1, 512])];
+            tensor<bool, [4]> var_586_end_mask_0 = const()[name = tensor<string, []>("op_586_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_586_cast_fp16 = slice_by_index(begin = var_586_begin_0, end = var_586_end_0, end_mask = var_586_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_586_cast_fp16")];
+            tensor<int32, [4]> var_590_begin_0 = const()[name = tensor<string, []>("op_590_begin_0"), val = tensor<int32, [4]>([0, 3968, 0, 0])];
+            tensor<int32, [4]> var_590_end_0 = const()[name = tensor<string, []>("op_590_end_0"), val = tensor<int32, [4]>([1, 4096, 1, 512])];
+            tensor<bool, [4]> var_590_end_mask_0 = const()[name = tensor<string, []>("op_590_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_590_cast_fp16 = slice_by_index(begin = var_590_begin_0, end = var_590_end_0, end_mask = var_590_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_590_cast_fp16")];
+            tensor<string, []> var_594_equation_0 = const()[name = tensor<string, []>("op_594_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_594_cast_fp16 = einsum(equation = var_594_equation_0, values = (var_340_cast_fp16, var_210_cast_fp16))[name = tensor<string, []>("op_594_cast_fp16")];
+            tensor<fp16, []> var_595_to_fp16 = const()[name = tensor<string, []>("op_595_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_596_cast_fp16 = mul(x = var_594_cast_fp16, y = var_595_to_fp16)[name = tensor<string, []>("op_596_cast_fp16")];
+            tensor<string, []> var_598_equation_0 = const()[name = tensor<string, []>("op_598_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_598_cast_fp16 = einsum(equation = var_598_equation_0, values = (var_344_cast_fp16, var_214_cast_fp16))[name = tensor<string, []>("op_598_cast_fp16")];
+            tensor<fp16, []> var_599_to_fp16 = const()[name = tensor<string, []>("op_599_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_600_cast_fp16 = mul(x = var_598_cast_fp16, y = var_599_to_fp16)[name = tensor<string, []>("op_600_cast_fp16")];
+            tensor<string, []> var_602_equation_0 = const()[name = tensor<string, []>("op_602_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_602_cast_fp16 = einsum(equation = var_602_equation_0, values = (var_348_cast_fp16, var_218_cast_fp16))[name = tensor<string, []>("op_602_cast_fp16")];
+            tensor<fp16, []> var_603_to_fp16 = const()[name = tensor<string, []>("op_603_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_604_cast_fp16 = mul(x = var_602_cast_fp16, y = var_603_to_fp16)[name = tensor<string, []>("op_604_cast_fp16")];
+            tensor<string, []> var_606_equation_0 = const()[name = tensor<string, []>("op_606_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_606_cast_fp16 = einsum(equation = var_606_equation_0, values = (var_352_cast_fp16, var_222_cast_fp16))[name = tensor<string, []>("op_606_cast_fp16")];
+            tensor<fp16, []> var_607_to_fp16 = const()[name = tensor<string, []>("op_607_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_608_cast_fp16 = mul(x = var_606_cast_fp16, y = var_607_to_fp16)[name = tensor<string, []>("op_608_cast_fp16")];
+            tensor<string, []> var_610_equation_0 = const()[name = tensor<string, []>("op_610_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_610_cast_fp16 = einsum(equation = var_610_equation_0, values = (var_356_cast_fp16, var_226_cast_fp16))[name = tensor<string, []>("op_610_cast_fp16")];
+            tensor<fp16, []> var_611_to_fp16 = const()[name = tensor<string, []>("op_611_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_612_cast_fp16 = mul(x = var_610_cast_fp16, y = var_611_to_fp16)[name = tensor<string, []>("op_612_cast_fp16")];
+            tensor<string, []> var_614_equation_0 = const()[name = tensor<string, []>("op_614_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_614_cast_fp16 = einsum(equation = var_614_equation_0, values = (var_360_cast_fp16, var_230_cast_fp16))[name = tensor<string, []>("op_614_cast_fp16")];
+            tensor<fp16, []> var_615_to_fp16 = const()[name = tensor<string, []>("op_615_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_616_cast_fp16 = mul(x = var_614_cast_fp16, y = var_615_to_fp16)[name = tensor<string, []>("op_616_cast_fp16")];
+            tensor<string, []> var_618_equation_0 = const()[name = tensor<string, []>("op_618_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_618_cast_fp16 = einsum(equation = var_618_equation_0, values = (var_364_cast_fp16, var_234_cast_fp16))[name = tensor<string, []>("op_618_cast_fp16")];
+            tensor<fp16, []> var_619_to_fp16 = const()[name = tensor<string, []>("op_619_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_620_cast_fp16 = mul(x = var_618_cast_fp16, y = var_619_to_fp16)[name = tensor<string, []>("op_620_cast_fp16")];
+            tensor<string, []> var_622_equation_0 = const()[name = tensor<string, []>("op_622_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_622_cast_fp16 = einsum(equation = var_622_equation_0, values = (var_368_cast_fp16, var_238_cast_fp16))[name = tensor<string, []>("op_622_cast_fp16")];
+            tensor<fp16, []> var_623_to_fp16 = const()[name = tensor<string, []>("op_623_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_624_cast_fp16 = mul(x = var_622_cast_fp16, y = var_623_to_fp16)[name = tensor<string, []>("op_624_cast_fp16")];
+            tensor<string, []> var_626_equation_0 = const()[name = tensor<string, []>("op_626_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_626_cast_fp16 = einsum(equation = var_626_equation_0, values = (var_372_cast_fp16, var_242_cast_fp16))[name = tensor<string, []>("op_626_cast_fp16")];
+            tensor<fp16, []> var_627_to_fp16 = const()[name = tensor<string, []>("op_627_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_628_cast_fp16 = mul(x = var_626_cast_fp16, y = var_627_to_fp16)[name = tensor<string, []>("op_628_cast_fp16")];
+            tensor<string, []> var_630_equation_0 = const()[name = tensor<string, []>("op_630_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_630_cast_fp16 = einsum(equation = var_630_equation_0, values = (var_376_cast_fp16, var_246_cast_fp16))[name = tensor<string, []>("op_630_cast_fp16")];
+            tensor<fp16, []> var_631_to_fp16 = const()[name = tensor<string, []>("op_631_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_632_cast_fp16 = mul(x = var_630_cast_fp16, y = var_631_to_fp16)[name = tensor<string, []>("op_632_cast_fp16")];
+            tensor<string, []> var_634_equation_0 = const()[name = tensor<string, []>("op_634_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_634_cast_fp16 = einsum(equation = var_634_equation_0, values = (var_380_cast_fp16, var_250_cast_fp16))[name = tensor<string, []>("op_634_cast_fp16")];
+            tensor<fp16, []> var_635_to_fp16 = const()[name = tensor<string, []>("op_635_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_636_cast_fp16 = mul(x = var_634_cast_fp16, y = var_635_to_fp16)[name = tensor<string, []>("op_636_cast_fp16")];
+            tensor<string, []> var_638_equation_0 = const()[name = tensor<string, []>("op_638_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_638_cast_fp16 = einsum(equation = var_638_equation_0, values = (var_384_cast_fp16, var_254_cast_fp16))[name = tensor<string, []>("op_638_cast_fp16")];
+            tensor<fp16, []> var_639_to_fp16 = const()[name = tensor<string, []>("op_639_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_640_cast_fp16 = mul(x = var_638_cast_fp16, y = var_639_to_fp16)[name = tensor<string, []>("op_640_cast_fp16")];
+            tensor<string, []> var_642_equation_0 = const()[name = tensor<string, []>("op_642_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_642_cast_fp16 = einsum(equation = var_642_equation_0, values = (var_388_cast_fp16, var_258_cast_fp16))[name = tensor<string, []>("op_642_cast_fp16")];
+            tensor<fp16, []> var_643_to_fp16 = const()[name = tensor<string, []>("op_643_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_644_cast_fp16 = mul(x = var_642_cast_fp16, y = var_643_to_fp16)[name = tensor<string, []>("op_644_cast_fp16")];
+            tensor<string, []> var_646_equation_0 = const()[name = tensor<string, []>("op_646_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_646_cast_fp16 = einsum(equation = var_646_equation_0, values = (var_392_cast_fp16, var_262_cast_fp16))[name = tensor<string, []>("op_646_cast_fp16")];
+            tensor<fp16, []> var_647_to_fp16 = const()[name = tensor<string, []>("op_647_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_648_cast_fp16 = mul(x = var_646_cast_fp16, y = var_647_to_fp16)[name = tensor<string, []>("op_648_cast_fp16")];
+            tensor<string, []> var_650_equation_0 = const()[name = tensor<string, []>("op_650_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_650_cast_fp16 = einsum(equation = var_650_equation_0, values = (var_396_cast_fp16, var_266_cast_fp16))[name = tensor<string, []>("op_650_cast_fp16")];
+            tensor<fp16, []> var_651_to_fp16 = const()[name = tensor<string, []>("op_651_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_652_cast_fp16 = mul(x = var_650_cast_fp16, y = var_651_to_fp16)[name = tensor<string, []>("op_652_cast_fp16")];
+            tensor<string, []> var_654_equation_0 = const()[name = tensor<string, []>("op_654_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_654_cast_fp16 = einsum(equation = var_654_equation_0, values = (var_400_cast_fp16, var_270_cast_fp16))[name = tensor<string, []>("op_654_cast_fp16")];
+            tensor<fp16, []> var_655_to_fp16 = const()[name = tensor<string, []>("op_655_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_656_cast_fp16 = mul(x = var_654_cast_fp16, y = var_655_to_fp16)[name = tensor<string, []>("op_656_cast_fp16")];
+            tensor<string, []> var_658_equation_0 = const()[name = tensor<string, []>("op_658_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_658_cast_fp16 = einsum(equation = var_658_equation_0, values = (var_404_cast_fp16, var_274_cast_fp16))[name = tensor<string, []>("op_658_cast_fp16")];
+            tensor<fp16, []> var_659_to_fp16 = const()[name = tensor<string, []>("op_659_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_660_cast_fp16 = mul(x = var_658_cast_fp16, y = var_659_to_fp16)[name = tensor<string, []>("op_660_cast_fp16")];
+            tensor<string, []> var_662_equation_0 = const()[name = tensor<string, []>("op_662_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_662_cast_fp16 = einsum(equation = var_662_equation_0, values = (var_408_cast_fp16, var_278_cast_fp16))[name = tensor<string, []>("op_662_cast_fp16")];
+            tensor<fp16, []> var_663_to_fp16 = const()[name = tensor<string, []>("op_663_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_664_cast_fp16 = mul(x = var_662_cast_fp16, y = var_663_to_fp16)[name = tensor<string, []>("op_664_cast_fp16")];
+            tensor<string, []> var_666_equation_0 = const()[name = tensor<string, []>("op_666_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_666_cast_fp16 = einsum(equation = var_666_equation_0, values = (var_412_cast_fp16, var_282_cast_fp16))[name = tensor<string, []>("op_666_cast_fp16")];
+            tensor<fp16, []> var_667_to_fp16 = const()[name = tensor<string, []>("op_667_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_668_cast_fp16 = mul(x = var_666_cast_fp16, y = var_667_to_fp16)[name = tensor<string, []>("op_668_cast_fp16")];
+            tensor<string, []> var_670_equation_0 = const()[name = tensor<string, []>("op_670_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_670_cast_fp16 = einsum(equation = var_670_equation_0, values = (var_416_cast_fp16, var_286_cast_fp16))[name = tensor<string, []>("op_670_cast_fp16")];
+            tensor<fp16, []> var_671_to_fp16 = const()[name = tensor<string, []>("op_671_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_672_cast_fp16 = mul(x = var_670_cast_fp16, y = var_671_to_fp16)[name = tensor<string, []>("op_672_cast_fp16")];
+            tensor<string, []> var_674_equation_0 = const()[name = tensor<string, []>("op_674_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_674_cast_fp16 = einsum(equation = var_674_equation_0, values = (var_420_cast_fp16, var_290_cast_fp16))[name = tensor<string, []>("op_674_cast_fp16")];
+            tensor<fp16, []> var_675_to_fp16 = const()[name = tensor<string, []>("op_675_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_676_cast_fp16 = mul(x = var_674_cast_fp16, y = var_675_to_fp16)[name = tensor<string, []>("op_676_cast_fp16")];
+            tensor<string, []> var_678_equation_0 = const()[name = tensor<string, []>("op_678_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_678_cast_fp16 = einsum(equation = var_678_equation_0, values = (var_424_cast_fp16, var_294_cast_fp16))[name = tensor<string, []>("op_678_cast_fp16")];
+            tensor<fp16, []> var_679_to_fp16 = const()[name = tensor<string, []>("op_679_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_680_cast_fp16 = mul(x = var_678_cast_fp16, y = var_679_to_fp16)[name = tensor<string, []>("op_680_cast_fp16")];
+            tensor<string, []> var_682_equation_0 = const()[name = tensor<string, []>("op_682_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_682_cast_fp16 = einsum(equation = var_682_equation_0, values = (var_428_cast_fp16, var_298_cast_fp16))[name = tensor<string, []>("op_682_cast_fp16")];
+            tensor<fp16, []> var_683_to_fp16 = const()[name = tensor<string, []>("op_683_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_684_cast_fp16 = mul(x = var_682_cast_fp16, y = var_683_to_fp16)[name = tensor<string, []>("op_684_cast_fp16")];
+            tensor<string, []> var_686_equation_0 = const()[name = tensor<string, []>("op_686_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_686_cast_fp16 = einsum(equation = var_686_equation_0, values = (var_432_cast_fp16, var_302_cast_fp16))[name = tensor<string, []>("op_686_cast_fp16")];
+            tensor<fp16, []> var_687_to_fp16 = const()[name = tensor<string, []>("op_687_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_688_cast_fp16 = mul(x = var_686_cast_fp16, y = var_687_to_fp16)[name = tensor<string, []>("op_688_cast_fp16")];
+            tensor<string, []> var_690_equation_0 = const()[name = tensor<string, []>("op_690_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_690_cast_fp16 = einsum(equation = var_690_equation_0, values = (var_436_cast_fp16, var_306_cast_fp16))[name = tensor<string, []>("op_690_cast_fp16")];
+            tensor<fp16, []> var_691_to_fp16 = const()[name = tensor<string, []>("op_691_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_692_cast_fp16 = mul(x = var_690_cast_fp16, y = var_691_to_fp16)[name = tensor<string, []>("op_692_cast_fp16")];
+            tensor<string, []> var_694_equation_0 = const()[name = tensor<string, []>("op_694_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_694_cast_fp16 = einsum(equation = var_694_equation_0, values = (var_440_cast_fp16, var_310_cast_fp16))[name = tensor<string, []>("op_694_cast_fp16")];
+            tensor<fp16, []> var_695_to_fp16 = const()[name = tensor<string, []>("op_695_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_696_cast_fp16 = mul(x = var_694_cast_fp16, y = var_695_to_fp16)[name = tensor<string, []>("op_696_cast_fp16")];
+            tensor<string, []> var_698_equation_0 = const()[name = tensor<string, []>("op_698_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_698_cast_fp16 = einsum(equation = var_698_equation_0, values = (var_444_cast_fp16, var_314_cast_fp16))[name = tensor<string, []>("op_698_cast_fp16")];
+            tensor<fp16, []> var_699_to_fp16 = const()[name = tensor<string, []>("op_699_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_700_cast_fp16 = mul(x = var_698_cast_fp16, y = var_699_to_fp16)[name = tensor<string, []>("op_700_cast_fp16")];
+            tensor<string, []> var_702_equation_0 = const()[name = tensor<string, []>("op_702_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_702_cast_fp16 = einsum(equation = var_702_equation_0, values = (var_448_cast_fp16, var_318_cast_fp16))[name = tensor<string, []>("op_702_cast_fp16")];
+            tensor<fp16, []> var_703_to_fp16 = const()[name = tensor<string, []>("op_703_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_704_cast_fp16 = mul(x = var_702_cast_fp16, y = var_703_to_fp16)[name = tensor<string, []>("op_704_cast_fp16")];
+            tensor<string, []> var_706_equation_0 = const()[name = tensor<string, []>("op_706_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_706_cast_fp16 = einsum(equation = var_706_equation_0, values = (var_452_cast_fp16, var_322_cast_fp16))[name = tensor<string, []>("op_706_cast_fp16")];
+            tensor<fp16, []> var_707_to_fp16 = const()[name = tensor<string, []>("op_707_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_708_cast_fp16 = mul(x = var_706_cast_fp16, y = var_707_to_fp16)[name = tensor<string, []>("op_708_cast_fp16")];
+            tensor<string, []> var_710_equation_0 = const()[name = tensor<string, []>("op_710_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_710_cast_fp16 = einsum(equation = var_710_equation_0, values = (var_456_cast_fp16, var_326_cast_fp16))[name = tensor<string, []>("op_710_cast_fp16")];
+            tensor<fp16, []> var_711_to_fp16 = const()[name = tensor<string, []>("op_711_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_712_cast_fp16 = mul(x = var_710_cast_fp16, y = var_711_to_fp16)[name = tensor<string, []>("op_712_cast_fp16")];
+            tensor<string, []> var_714_equation_0 = const()[name = tensor<string, []>("op_714_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_714_cast_fp16 = einsum(equation = var_714_equation_0, values = (var_460_cast_fp16, var_330_cast_fp16))[name = tensor<string, []>("op_714_cast_fp16")];
+            tensor<fp16, []> var_715_to_fp16 = const()[name = tensor<string, []>("op_715_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_716_cast_fp16 = mul(x = var_714_cast_fp16, y = var_715_to_fp16)[name = tensor<string, []>("op_716_cast_fp16")];
+            tensor<string, []> var_718_equation_0 = const()[name = tensor<string, []>("op_718_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_718_cast_fp16 = einsum(equation = var_718_equation_0, values = (var_464_cast_fp16, var_334_cast_fp16))[name = tensor<string, []>("op_718_cast_fp16")];
+            tensor<fp16, []> var_719_to_fp16 = const()[name = tensor<string, []>("op_719_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_720_cast_fp16 = mul(x = var_718_cast_fp16, y = var_719_to_fp16)[name = tensor<string, []>("op_720_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_1_cast_fp16 = add(x = var_596_cast_fp16, y = mask)[name = tensor<string, []>("aw_1_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_3_cast_fp16 = add(x = var_600_cast_fp16, y = mask)[name = tensor<string, []>("aw_3_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_5_cast_fp16 = add(x = var_604_cast_fp16, y = mask)[name = tensor<string, []>("aw_5_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_7_cast_fp16 = add(x = var_608_cast_fp16, y = mask)[name = tensor<string, []>("aw_7_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_9_cast_fp16 = add(x = var_612_cast_fp16, y = mask)[name = tensor<string, []>("aw_9_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_11_cast_fp16 = add(x = var_616_cast_fp16, y = mask)[name = tensor<string, []>("aw_11_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_13_cast_fp16 = add(x = var_620_cast_fp16, y = mask)[name = tensor<string, []>("aw_13_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_15_cast_fp16 = add(x = var_624_cast_fp16, y = mask)[name = tensor<string, []>("aw_15_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_17_cast_fp16 = add(x = var_628_cast_fp16, y = mask)[name = tensor<string, []>("aw_17_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_19_cast_fp16 = add(x = var_632_cast_fp16, y = mask)[name = tensor<string, []>("aw_19_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_21_cast_fp16 = add(x = var_636_cast_fp16, y = mask)[name = tensor<string, []>("aw_21_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_23_cast_fp16 = add(x = var_640_cast_fp16, y = mask)[name = tensor<string, []>("aw_23_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_25_cast_fp16 = add(x = var_644_cast_fp16, y = mask)[name = tensor<string, []>("aw_25_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_27_cast_fp16 = add(x = var_648_cast_fp16, y = mask)[name = tensor<string, []>("aw_27_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_29_cast_fp16 = add(x = var_652_cast_fp16, y = mask)[name = tensor<string, []>("aw_29_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_31_cast_fp16 = add(x = var_656_cast_fp16, y = mask)[name = tensor<string, []>("aw_31_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_33_cast_fp16 = add(x = var_660_cast_fp16, y = mask)[name = tensor<string, []>("aw_33_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_35_cast_fp16 = add(x = var_664_cast_fp16, y = mask)[name = tensor<string, []>("aw_35_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_37_cast_fp16 = add(x = var_668_cast_fp16, y = mask)[name = tensor<string, []>("aw_37_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_39_cast_fp16 = add(x = var_672_cast_fp16, y = mask)[name = tensor<string, []>("aw_39_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_41_cast_fp16 = add(x = var_676_cast_fp16, y = mask)[name = tensor<string, []>("aw_41_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_43_cast_fp16 = add(x = var_680_cast_fp16, y = mask)[name = tensor<string, []>("aw_43_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_45_cast_fp16 = add(x = var_684_cast_fp16, y = mask)[name = tensor<string, []>("aw_45_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_47_cast_fp16 = add(x = var_688_cast_fp16, y = mask)[name = tensor<string, []>("aw_47_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_49_cast_fp16 = add(x = var_692_cast_fp16, y = mask)[name = tensor<string, []>("aw_49_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_51_cast_fp16 = add(x = var_696_cast_fp16, y = mask)[name = tensor<string, []>("aw_51_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_53_cast_fp16 = add(x = var_700_cast_fp16, y = mask)[name = tensor<string, []>("aw_53_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_55_cast_fp16 = add(x = var_704_cast_fp16, y = mask)[name = tensor<string, []>("aw_55_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_57_cast_fp16 = add(x = var_708_cast_fp16, y = mask)[name = tensor<string, []>("aw_57_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_59_cast_fp16 = add(x = var_712_cast_fp16, y = mask)[name = tensor<string, []>("aw_59_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_61_cast_fp16 = add(x = var_716_cast_fp16, y = mask)[name = tensor<string, []>("aw_61_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_63_cast_fp16 = add(x = var_720_cast_fp16, y = mask)[name = tensor<string, []>("aw_63_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_753_cast_fp16 = softmax(axis = var_64, x = aw_1_cast_fp16)[name = tensor<string, []>("op_753_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_754_cast_fp16 = softmax(axis = var_64, x = aw_3_cast_fp16)[name = tensor<string, []>("op_754_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_755_cast_fp16 = softmax(axis = var_64, x = aw_5_cast_fp16)[name = tensor<string, []>("op_755_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_756_cast_fp16 = softmax(axis = var_64, x = aw_7_cast_fp16)[name = tensor<string, []>("op_756_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_757_cast_fp16 = softmax(axis = var_64, x = aw_9_cast_fp16)[name = tensor<string, []>("op_757_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_758_cast_fp16 = softmax(axis = var_64, x = aw_11_cast_fp16)[name = tensor<string, []>("op_758_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_759_cast_fp16 = softmax(axis = var_64, x = aw_13_cast_fp16)[name = tensor<string, []>("op_759_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_760_cast_fp16 = softmax(axis = var_64, x = aw_15_cast_fp16)[name = tensor<string, []>("op_760_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_761_cast_fp16 = softmax(axis = var_64, x = aw_17_cast_fp16)[name = tensor<string, []>("op_761_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_762_cast_fp16 = softmax(axis = var_64, x = aw_19_cast_fp16)[name = tensor<string, []>("op_762_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_763_cast_fp16 = softmax(axis = var_64, x = aw_21_cast_fp16)[name = tensor<string, []>("op_763_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_764_cast_fp16 = softmax(axis = var_64, x = aw_23_cast_fp16)[name = tensor<string, []>("op_764_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_765_cast_fp16 = softmax(axis = var_64, x = aw_25_cast_fp16)[name = tensor<string, []>("op_765_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_766_cast_fp16 = softmax(axis = var_64, x = aw_27_cast_fp16)[name = tensor<string, []>("op_766_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_767_cast_fp16 = softmax(axis = var_64, x = aw_29_cast_fp16)[name = tensor<string, []>("op_767_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_768_cast_fp16 = softmax(axis = var_64, x = aw_31_cast_fp16)[name = tensor<string, []>("op_768_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_769_cast_fp16 = softmax(axis = var_64, x = aw_33_cast_fp16)[name = tensor<string, []>("op_769_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_770_cast_fp16 = softmax(axis = var_64, x = aw_35_cast_fp16)[name = tensor<string, []>("op_770_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_771_cast_fp16 = softmax(axis = var_64, x = aw_37_cast_fp16)[name = tensor<string, []>("op_771_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_772_cast_fp16 = softmax(axis = var_64, x = aw_39_cast_fp16)[name = tensor<string, []>("op_772_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_773_cast_fp16 = softmax(axis = var_64, x = aw_41_cast_fp16)[name = tensor<string, []>("op_773_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_774_cast_fp16 = softmax(axis = var_64, x = aw_43_cast_fp16)[name = tensor<string, []>("op_774_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_775_cast_fp16 = softmax(axis = var_64, x = aw_45_cast_fp16)[name = tensor<string, []>("op_775_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_776_cast_fp16 = softmax(axis = var_64, x = aw_47_cast_fp16)[name = tensor<string, []>("op_776_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_777_cast_fp16 = softmax(axis = var_64, x = aw_49_cast_fp16)[name = tensor<string, []>("op_777_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_778_cast_fp16 = softmax(axis = var_64, x = aw_51_cast_fp16)[name = tensor<string, []>("op_778_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_779_cast_fp16 = softmax(axis = var_64, x = aw_53_cast_fp16)[name = tensor<string, []>("op_779_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_780_cast_fp16 = softmax(axis = var_64, x = aw_55_cast_fp16)[name = tensor<string, []>("op_780_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_781_cast_fp16 = softmax(axis = var_64, x = aw_57_cast_fp16)[name = tensor<string, []>("op_781_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_782_cast_fp16 = softmax(axis = var_64, x = aw_59_cast_fp16)[name = tensor<string, []>("op_782_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_783_cast_fp16 = softmax(axis = var_64, x = aw_61_cast_fp16)[name = tensor<string, []>("op_783_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_784_cast_fp16 = softmax(axis = var_64, x = aw_63_cast_fp16)[name = tensor<string, []>("op_784_cast_fp16")];
+            tensor<string, []> var_786_equation_0 = const()[name = tensor<string, []>("op_786_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_786_cast_fp16 = einsum(equation = var_786_equation_0, values = (var_466_cast_fp16, var_753_cast_fp16))[name = tensor<string, []>("op_786_cast_fp16")];
+            tensor<string, []> var_788_equation_0 = const()[name = tensor<string, []>("op_788_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_788_cast_fp16 = einsum(equation = var_788_equation_0, values = (var_470_cast_fp16, var_754_cast_fp16))[name = tensor<string, []>("op_788_cast_fp16")];
+            tensor<string, []> var_790_equation_0 = const()[name = tensor<string, []>("op_790_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_790_cast_fp16 = einsum(equation = var_790_equation_0, values = (var_474_cast_fp16, var_755_cast_fp16))[name = tensor<string, []>("op_790_cast_fp16")];
+            tensor<string, []> var_792_equation_0 = const()[name = tensor<string, []>("op_792_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_792_cast_fp16 = einsum(equation = var_792_equation_0, values = (var_478_cast_fp16, var_756_cast_fp16))[name = tensor<string, []>("op_792_cast_fp16")];
+            tensor<string, []> var_794_equation_0 = const()[name = tensor<string, []>("op_794_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_794_cast_fp16 = einsum(equation = var_794_equation_0, values = (var_482_cast_fp16, var_757_cast_fp16))[name = tensor<string, []>("op_794_cast_fp16")];
+            tensor<string, []> var_796_equation_0 = const()[name = tensor<string, []>("op_796_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_796_cast_fp16 = einsum(equation = var_796_equation_0, values = (var_486_cast_fp16, var_758_cast_fp16))[name = tensor<string, []>("op_796_cast_fp16")];
+            tensor<string, []> var_798_equation_0 = const()[name = tensor<string, []>("op_798_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_798_cast_fp16 = einsum(equation = var_798_equation_0, values = (var_490_cast_fp16, var_759_cast_fp16))[name = tensor<string, []>("op_798_cast_fp16")];
+            tensor<string, []> var_800_equation_0 = const()[name = tensor<string, []>("op_800_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_800_cast_fp16 = einsum(equation = var_800_equation_0, values = (var_494_cast_fp16, var_760_cast_fp16))[name = tensor<string, []>("op_800_cast_fp16")];
+            tensor<string, []> var_802_equation_0 = const()[name = tensor<string, []>("op_802_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_802_cast_fp16 = einsum(equation = var_802_equation_0, values = (var_498_cast_fp16, var_761_cast_fp16))[name = tensor<string, []>("op_802_cast_fp16")];
+            tensor<string, []> var_804_equation_0 = const()[name = tensor<string, []>("op_804_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_804_cast_fp16 = einsum(equation = var_804_equation_0, values = (var_502_cast_fp16, var_762_cast_fp16))[name = tensor<string, []>("op_804_cast_fp16")];
+            tensor<string, []> var_806_equation_0 = const()[name = tensor<string, []>("op_806_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_806_cast_fp16 = einsum(equation = var_806_equation_0, values = (var_506_cast_fp16, var_763_cast_fp16))[name = tensor<string, []>("op_806_cast_fp16")];
+            tensor<string, []> var_808_equation_0 = const()[name = tensor<string, []>("op_808_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_808_cast_fp16 = einsum(equation = var_808_equation_0, values = (var_510_cast_fp16, var_764_cast_fp16))[name = tensor<string, []>("op_808_cast_fp16")];
+            tensor<string, []> var_810_equation_0 = const()[name = tensor<string, []>("op_810_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_810_cast_fp16 = einsum(equation = var_810_equation_0, values = (var_514_cast_fp16, var_765_cast_fp16))[name = tensor<string, []>("op_810_cast_fp16")];
+            tensor<string, []> var_812_equation_0 = const()[name = tensor<string, []>("op_812_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_812_cast_fp16 = einsum(equation = var_812_equation_0, values = (var_518_cast_fp16, var_766_cast_fp16))[name = tensor<string, []>("op_812_cast_fp16")];
+            tensor<string, []> var_814_equation_0 = const()[name = tensor<string, []>("op_814_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_814_cast_fp16 = einsum(equation = var_814_equation_0, values = (var_522_cast_fp16, var_767_cast_fp16))[name = tensor<string, []>("op_814_cast_fp16")];
+            tensor<string, []> var_816_equation_0 = const()[name = tensor<string, []>("op_816_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_816_cast_fp16 = einsum(equation = var_816_equation_0, values = (var_526_cast_fp16, var_768_cast_fp16))[name = tensor<string, []>("op_816_cast_fp16")];
+            tensor<string, []> var_818_equation_0 = const()[name = tensor<string, []>("op_818_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_818_cast_fp16 = einsum(equation = var_818_equation_0, values = (var_530_cast_fp16, var_769_cast_fp16))[name = tensor<string, []>("op_818_cast_fp16")];
+            tensor<string, []> var_820_equation_0 = const()[name = tensor<string, []>("op_820_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_820_cast_fp16 = einsum(equation = var_820_equation_0, values = (var_534_cast_fp16, var_770_cast_fp16))[name = tensor<string, []>("op_820_cast_fp16")];
+            tensor<string, []> var_822_equation_0 = const()[name = tensor<string, []>("op_822_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_822_cast_fp16 = einsum(equation = var_822_equation_0, values = (var_538_cast_fp16, var_771_cast_fp16))[name = tensor<string, []>("op_822_cast_fp16")];
+            tensor<string, []> var_824_equation_0 = const()[name = tensor<string, []>("op_824_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_824_cast_fp16 = einsum(equation = var_824_equation_0, values = (var_542_cast_fp16, var_772_cast_fp16))[name = tensor<string, []>("op_824_cast_fp16")];
+            tensor<string, []> var_826_equation_0 = const()[name = tensor<string, []>("op_826_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_826_cast_fp16 = einsum(equation = var_826_equation_0, values = (var_546_cast_fp16, var_773_cast_fp16))[name = tensor<string, []>("op_826_cast_fp16")];
+            tensor<string, []> var_828_equation_0 = const()[name = tensor<string, []>("op_828_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_828_cast_fp16 = einsum(equation = var_828_equation_0, values = (var_550_cast_fp16, var_774_cast_fp16))[name = tensor<string, []>("op_828_cast_fp16")];
+            tensor<string, []> var_830_equation_0 = const()[name = tensor<string, []>("op_830_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_830_cast_fp16 = einsum(equation = var_830_equation_0, values = (var_554_cast_fp16, var_775_cast_fp16))[name = tensor<string, []>("op_830_cast_fp16")];
+            tensor<string, []> var_832_equation_0 = const()[name = tensor<string, []>("op_832_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_832_cast_fp16 = einsum(equation = var_832_equation_0, values = (var_558_cast_fp16, var_776_cast_fp16))[name = tensor<string, []>("op_832_cast_fp16")];
+            tensor<string, []> var_834_equation_0 = const()[name = tensor<string, []>("op_834_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_834_cast_fp16 = einsum(equation = var_834_equation_0, values = (var_562_cast_fp16, var_777_cast_fp16))[name = tensor<string, []>("op_834_cast_fp16")];
+            tensor<string, []> var_836_equation_0 = const()[name = tensor<string, []>("op_836_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_836_cast_fp16 = einsum(equation = var_836_equation_0, values = (var_566_cast_fp16, var_778_cast_fp16))[name = tensor<string, []>("op_836_cast_fp16")];
+            tensor<string, []> var_838_equation_0 = const()[name = tensor<string, []>("op_838_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_838_cast_fp16 = einsum(equation = var_838_equation_0, values = (var_570_cast_fp16, var_779_cast_fp16))[name = tensor<string, []>("op_838_cast_fp16")];
+            tensor<string, []> var_840_equation_0 = const()[name = tensor<string, []>("op_840_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_840_cast_fp16 = einsum(equation = var_840_equation_0, values = (var_574_cast_fp16, var_780_cast_fp16))[name = tensor<string, []>("op_840_cast_fp16")];
+            tensor<string, []> var_842_equation_0 = const()[name = tensor<string, []>("op_842_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_842_cast_fp16 = einsum(equation = var_842_equation_0, values = (var_578_cast_fp16, var_781_cast_fp16))[name = tensor<string, []>("op_842_cast_fp16")];
+            tensor<string, []> var_844_equation_0 = const()[name = tensor<string, []>("op_844_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_844_cast_fp16 = einsum(equation = var_844_equation_0, values = (var_582_cast_fp16, var_782_cast_fp16))[name = tensor<string, []>("op_844_cast_fp16")];
+            tensor<string, []> var_846_equation_0 = const()[name = tensor<string, []>("op_846_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_846_cast_fp16 = einsum(equation = var_846_equation_0, values = (var_586_cast_fp16, var_783_cast_fp16))[name = tensor<string, []>("op_846_cast_fp16")];
+            tensor<string, []> var_848_equation_0 = const()[name = tensor<string, []>("op_848_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_848_cast_fp16 = einsum(equation = var_848_equation_0, values = (var_590_cast_fp16, var_784_cast_fp16))[name = tensor<string, []>("op_848_cast_fp16")];
+            tensor<bool, []> x_11_interleave_0 = const()[name = tensor<string, []>("x_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 4096, 1, 64]> x_11_cast_fp16 = concat(axis = var_64, interleave = x_11_interleave_0, values = (var_786_cast_fp16, var_788_cast_fp16, var_790_cast_fp16, var_792_cast_fp16, var_794_cast_fp16, var_796_cast_fp16, var_798_cast_fp16, var_800_cast_fp16, var_802_cast_fp16, var_804_cast_fp16, var_806_cast_fp16, var_808_cast_fp16, var_810_cast_fp16, var_812_cast_fp16, var_814_cast_fp16, var_816_cast_fp16, var_818_cast_fp16, var_820_cast_fp16, var_822_cast_fp16, var_824_cast_fp16, var_826_cast_fp16, var_828_cast_fp16, var_830_cast_fp16, var_832_cast_fp16, var_834_cast_fp16, var_836_cast_fp16, var_838_cast_fp16, var_840_cast_fp16, var_842_cast_fp16, var_844_cast_fp16, var_846_cast_fp16, var_848_cast_fp16))[name = tensor<string, []>("x_11_cast_fp16")];
+            tensor<int32, [4]> var_853 = const()[name = tensor<string, []>("op_853"), val = tensor<int32, [4]>([1, 4096, -1, 8])];
+            tensor<fp16, [1, 4096, 8, 8]> input_3_cast_fp16 = reshape(shape = var_853, x = x_11_cast_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
+            tensor<int32, [2]> var_857 = const()[name = tensor<string, []>("op_857"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_859 = const()[name = tensor<string, []>("op_859"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_861_pad_type_0 = const()[name = tensor<string, []>("op_861_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_861_pad_0 = const()[name = tensor<string, []>("op_861_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 8, 8]> var_861_cast_fp16 = conv(dilations = var_859, groups = var_64, pad = var_861_pad_0, pad_type = var_861_pad_type_0, strides = var_857, weight = blocks_0_attn_proj_weight_palettized_cast_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("op_861_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_attn_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303600064)))];
+            tensor<fp16, [1, 4096, 8, 8]> attention_output_1_cast_fp16 = mul(x = var_861_cast_fp16, y = blocks_0_attn_proj_output_scales_to_fp16)[name = tensor<string, []>("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 4096, 8, 8]> x_13_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = tensor<string, []>("x_13_cast_fp16")];
+            tensor<bool, []> x_eps_3_interleave_0 = const()[name = tensor<string, []>("x_eps_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_3_to_fp16 = const()[name = tensor<string, []>("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303608320)))];
+            tensor<fp16, [1, 4097, 8, 8]> x_eps_3_cast_fp16 = concat(axis = var_64, interleave = x_eps_3_interleave_0, values = (x_13_cast_fp16, eps_chan_3_to_fp16))[name = tensor<string, []>("x_eps_3_cast_fp16")];
+            tensor<int32, [1]> norm_x_3_axes_0 = const()[name = tensor<string, []>("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_67, x = x_eps_3_cast_fp16)[name = tensor<string, []>("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 8, 8]> x_normed_7_cast_fp16 = real_div(x = x_13_cast_fp16, y = norm_x_3_cast_fp16)[name = tensor<string, []>("x_normed_7_cast_fp16")];
+            tensor<fp16, []> var_886_to_fp16 = const()[name = tensor<string, []>("op_886_to_fp16"), val = tensor<fp16, []>(0x1p+6)];
+            tensor<fp16, [1, 4096, 8, 8]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_886_to_fp16)[name = tensor<string, []>("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303608512)))];
+            tensor<fp16, [1, 4096, 8, 8]> input_5_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
+            tensor<int32, [2]> var_898 = const()[name = tensor<string, []>("op_898"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_900 = const()[name = tensor<string, []>("op_900"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_902_pad_type_0 = const()[name = tensor<string, []>("op_902_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_902_pad_0 = const()[name = tensor<string, []>("op_902_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 8, 8]> var_902_cast_fp16 = conv(dilations = var_900, groups = var_64, pad = var_902_pad_0, pad_type = var_902_pad_type_0, strides = var_898, weight = blocks_0_mlp_fc_1_weight_palettized_cast_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("op_902_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_1_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303616768)))];
+            tensor<fp16, [1, 11008, 8, 8]> input_7_cast_fp16 = mul(x = var_902_cast_fp16, y = blocks_0_mlp_fc_1_output_scales_to_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
+            tensor<int32, [2]> var_906 = const()[name = tensor<string, []>("op_906"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_908 = const()[name = tensor<string, []>("op_908"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_910_pad_type_0 = const()[name = tensor<string, []>("op_910_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_910_pad_0 = const()[name = tensor<string, []>("op_910_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 8, 8]> var_910_cast_fp16 = conv(dilations = var_908, groups = var_64, pad = var_910_pad_0, pad_type = var_910_pad_type_0, strides = var_906, weight = blocks_0_mlp_fc_2_weight_palettized_cast_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("op_910_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_0_mlp_fc_2_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303638848)))];
+            tensor<fp16, [1, 11008, 8, 8]> x_fc_2_1_cast_fp16 = mul(x = var_910_cast_fp16, y = blocks_0_mlp_fc_2_output_scales_to_fp16)[name = tensor<string, []>("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 11008, 8, 8]> var_912_cast_fp16 = silu(x = input_7_cast_fp16)[name = tensor<string, []>("op_912_cast_fp16")];
+            tensor<fp16, [1, 11008, 8, 8]> input_9_cast_fp16 = mul(x = var_912_cast_fp16, y = x_fc_2_1_cast_fp16)[name = tensor<string, []>("input_9_cast_fp16")];
+            tensor<int32, [2]> var_916 = const()[name = tensor<string, []>("op_916"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_918 = const()[name = tensor<string, []>("op_918"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_920_pad_type_0 = const()[name = tensor<string, []>("op_920_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_920_pad_0 = const()[name = tensor<string, []>("op_920_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 8, 8]> var_920_cast_fp16 = conv(dilations = var_918, groups = var_64, pad = var_920_pad_0, pad_type = var_920_pad_type_0, strides = var_916, weight = blocks_0_mlp_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = tensor<string, []>("op_920_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_0_mlp_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303660928)))];
+            tensor<fp16, [1, 4096, 8, 8]> var_921_cast_fp16 = mul(x = var_920_cast_fp16, y = blocks_0_mlp_proj_output_scales_to_fp16)[name = tensor<string, []>("op_921_cast_fp16")];
+            tensor<fp16, [1, 4096, 8, 8]> x_17_cast_fp16 = add(x = var_921_cast_fp16, y = x_13_cast_fp16)[name = tensor<string, []>("x_17_cast_fp16")];
+            tensor<int32, []> var_927 = const()[name = tensor<string, []>("op_927"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_931 = const()[name = tensor<string, []>("op_931"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_933 = const()[name = tensor<string, []>("op_933"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_974 = const()[name = tensor<string, []>("op_974"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_977 = const()[name = tensor<string, []>("op_977"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_5_interleave_0 = const()[name = tensor<string, []>("x_eps_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_5_to_fp16 = const()[name = tensor<string, []>("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303669184)))];
+            tensor<fp16, [1, 4097, 8, 8]> x_eps_5_cast_fp16 = concat(axis = var_974, interleave = x_eps_5_interleave_0, values = (x_17_cast_fp16, eps_chan_5_to_fp16))[name = tensor<string, []>("x_eps_5_cast_fp16")];
+            tensor<int32, [1]> norm_x_5_axes_0 = const()[name = tensor<string, []>("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_977, x = x_eps_5_cast_fp16)[name = tensor<string, []>("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 4096, 8, 8]> x_normed_13_cast_fp16 = real_div(x = x_17_cast_fp16, y = norm_x_5_cast_fp16)[name = tensor<string, []>("x_normed_13_cast_fp16")];
+            tensor<fp16, []> var_1000_to_fp16 = const()[name = tensor<string, []>("op_1000_to_fp16"), val = tensor<fp16, []>(0x1p+6)];
+            tensor<fp16, [1, 4096, 8, 8]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_1000_to_fp16)[name = tensor<string, []>("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303669376)))];
+            tensor<fp16, [1, 4096, 8, 8]> x_21_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = tensor<string, []>("x_21_cast_fp16")];
+            tensor<int32, [4]> var_1025 = const()[name = tensor<string, []>("op_1025"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 4096, 1, 64]> input_11_cast_fp16 = reshape(shape = var_1025, x = x_21_cast_fp16)[name = tensor<string, []>("input_11_cast_fp16")];
+            tensor<int32, [2]> var_1029 = const()[name = tensor<string, []>("op_1029"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1031 = const()[name = tensor<string, []>("op_1031"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1033_pad_type_0 = const()[name = tensor<string, []>("op_1033_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1033_pad_0 = const()[name = tensor<string, []>("op_1033_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 64]> var_1033_cast_fp16 = conv(dilations = var_1031, groups = var_974, pad = var_1033_pad_0, pad_type = var_1033_pad_type_0, strides = var_1029, weight = blocks_1_attn_q_proj_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("op_1033_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_q_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303677632)))];
+            tensor<fp16, [1, 4096, 1, 64]> q_9_cast_fp16 = mul(x = var_1033_cast_fp16, y = blocks_1_attn_q_proj_output_scales_to_fp16)[name = tensor<string, []>("q_9_cast_fp16")];
+            tensor<int32, [2]> var_1037 = const()[name = tensor<string, []>("op_1037"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1039 = const()[name = tensor<string, []>("op_1039"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1041_pad_type_0 = const()[name = tensor<string, []>("op_1041_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1041_pad_0 = const()[name = tensor<string, []>("op_1041_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 64]> var_1041_cast_fp16 = conv(dilations = var_1039, groups = var_974, pad = var_1041_pad_0, pad_type = var_1041_pad_type_0, strides = var_1037, weight = blocks_1_attn_k_proj_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("op_1041_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_k_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303685888)))];
+            tensor<fp16, [1, 4096, 1, 64]> k_11_cast_fp16 = mul(x = var_1041_cast_fp16, y = blocks_1_attn_k_proj_output_scales_to_fp16)[name = tensor<string, []>("k_11_cast_fp16")];
+            tensor<int32, [2]> var_1045 = const()[name = tensor<string, []>("op_1045"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1047 = const()[name = tensor<string, []>("op_1047"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1049_pad_type_0 = const()[name = tensor<string, []>("op_1049_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1049_pad_0 = const()[name = tensor<string, []>("op_1049_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 64]> var_1049_cast_fp16 = conv(dilations = var_1047, groups = var_974, pad = var_1049_pad_0, pad_type = var_1049_pad_type_0, strides = var_1045, weight = blocks_1_attn_v_proj_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("op_1049_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_v_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303694144)))];
+            tensor<fp16, [1, 4096, 1, 64]> v_11_cast_fp16 = mul(x = var_1049_cast_fp16, y = blocks_1_attn_v_proj_output_scales_to_fp16)[name = tensor<string, []>("v_11_cast_fp16")];
+            tensor<int32, [4]> var_1051 = const()[name = tensor<string, []>("op_1051"), val = tensor<int32, [4]>([1, 32, 128, 64])];
+            tensor<fp16, [1, 32, 128, 64]> q_11_cast_fp16 = reshape(shape = var_1051, x = q_9_cast_fp16)[name = tensor<string, []>("q_11_cast_fp16")];
+            tensor<int32, [4]> var_1053 = const()[name = tensor<string, []>("op_1053"), val = tensor<int32, [4]>([1, 32, 128, 64])];
+            tensor<fp16, [1, 32, 128, 64]> k_13_cast_fp16 = reshape(shape = var_1053, x = k_11_cast_fp16)[name = tensor<string, []>("k_13_cast_fp16")];
+            tensor<int32, [4]> var_1067_begin_0 = const()[name = tensor<string, []>("op_1067_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1067_end_0 = const()[name = tensor<string, []>("op_1067_end_0"), val = tensor<int32, [4]>([1, 32, 64, 64])];
+            tensor<bool, [4]> var_1067_end_mask_0 = const()[name = tensor<string, []>("op_1067_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 64]> var_1067_cast_fp16 = slice_by_index(begin = var_1067_begin_0, end = var_1067_end_0, end_mask = var_1067_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_1067_cast_fp16")];
+            tensor<int32, [4]> var_1073_begin_0 = const()[name = tensor<string, []>("op_1073_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1073_end_0 = const()[name = tensor<string, []>("op_1073_end_0"), val = tensor<int32, [4]>([1, 32, 128, 64])];
+            tensor<bool, [4]> var_1073_end_mask_0 = const()[name = tensor<string, []>("op_1073_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 64]> var_1073_cast_fp16 = slice_by_index(begin = var_1073_begin_0, end = var_1073_end_0, end_mask = var_1073_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_1073_cast_fp16")];
+            tensor<fp16, []> const_32_promoted_to_fp16 = const()[name = tensor<string, []>("const_32_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 32, 64, 64]> var_1075_cast_fp16 = mul(x = var_1073_cast_fp16, y = const_32_promoted_to_fp16)[name = tensor<string, []>("op_1075_cast_fp16")];
             tensor<bool, []> rotated_5_interleave_0 = const()[name = tensor<string, []>("rotated_5_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 128, 64]> rotated_5_cast_fp16 = concat(axis = var_237, interleave = rotated_5_interleave_0, values = (var_320_cast_fp16, var_312_cast_fp16))[name = tensor<string, []>("rotated_5_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 64]> var_323_cast_fp16 = mul(x = q_9_cast_fp16, y = cos)[name = tensor<string, []>("op_323_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 64]> var_324_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = tensor<string, []>("op_324_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 64]> roped_5_cast_fp16 = add(x = var_323_cast_fp16, y = var_324_cast_fp16)[name = tensor<string, []>("roped_5_cast_fp16")];
-            tensor<int32, [4]> var_337_begin_0 = const()[name = tensor<string, []>("op_337_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_337_end_0 = const()[name = tensor<string, []>("op_337_end_0"), val = tensor<int32, [4]>([1, 32, 64, 64])];
-            tensor<bool, [4]> var_337_end_mask_0 = const()[name = tensor<string, []>("op_337_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 64]> var_337_cast_fp16 = slice_by_index(begin = var_337_begin_0, end = var_337_end_0, end_mask = var_337_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_337_cast_fp16")];
-            tensor<int32, [4]> var_343_begin_0 = const()[name = tensor<string, []>("op_343_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_343_end_0 = const()[name = tensor<string, []>("op_343_end_0"), val = tensor<int32, [4]>([1, 32, 128, 64])];
-            tensor<bool, [4]> var_343_end_mask_0 = const()[name = tensor<string, []>("op_343_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 64]> var_343_cast_fp16 = slice_by_index(begin = var_343_begin_0, end = var_343_end_0, end_mask = var_343_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_343_cast_fp16")];
-            tensor<fp16, []> const_12_promoted_to_fp16 = const()[name = tensor<string, []>("const_12_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 64]> var_345_cast_fp16 = mul(x = var_343_cast_fp16, y = const_12_promoted_to_fp16)[name = tensor<string, []>("op_345_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> rotated_5_cast_fp16 = concat(axis = var_931, interleave = rotated_5_interleave_0, values = (var_1075_cast_fp16, var_1067_cast_fp16))[name = tensor<string, []>("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> var_1078_cast_fp16 = mul(x = q_11_cast_fp16, y = cos)[name = tensor<string, []>("op_1078_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> var_1079_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = tensor<string, []>("op_1079_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> roped_5_cast_fp16 = add(x = var_1078_cast_fp16, y = var_1079_cast_fp16)[name = tensor<string, []>("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_1092_begin_0 = const()[name = tensor<string, []>("op_1092_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1092_end_0 = const()[name = tensor<string, []>("op_1092_end_0"), val = tensor<int32, [4]>([1, 32, 64, 64])];
+            tensor<bool, [4]> var_1092_end_mask_0 = const()[name = tensor<string, []>("op_1092_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 64]> var_1092_cast_fp16 = slice_by_index(begin = var_1092_begin_0, end = var_1092_end_0, end_mask = var_1092_end_mask_0, x = k_13_cast_fp16)[name = tensor<string, []>("op_1092_cast_fp16")];
+            tensor<int32, [4]> var_1098_begin_0 = const()[name = tensor<string, []>("op_1098_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1098_end_0 = const()[name = tensor<string, []>("op_1098_end_0"), val = tensor<int32, [4]>([1, 32, 128, 64])];
+            tensor<bool, [4]> var_1098_end_mask_0 = const()[name = tensor<string, []>("op_1098_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 64]> var_1098_cast_fp16 = slice_by_index(begin = var_1098_begin_0, end = var_1098_end_0, end_mask = var_1098_end_mask_0, x = k_13_cast_fp16)[name = tensor<string, []>("op_1098_cast_fp16")];
+            tensor<fp16, []> const_34_promoted_to_fp16 = const()[name = tensor<string, []>("const_34_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 32, 64, 64]> var_1100_cast_fp16 = mul(x = var_1098_cast_fp16, y = const_34_promoted_to_fp16)[name = tensor<string, []>("op_1100_cast_fp16")];
             tensor<bool, []> rotated_7_interleave_0 = const()[name = tensor<string, []>("rotated_7_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 128, 64]> rotated_7_cast_fp16 = concat(axis = var_237, interleave = rotated_7_interleave_0, values = (var_345_cast_fp16, var_337_cast_fp16))[name = tensor<string, []>("rotated_7_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 64]> var_348_cast_fp16 = mul(x = k_11_cast_fp16, y = cos)[name = tensor<string, []>("op_348_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 64]> var_349_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = tensor<string, []>("op_349_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 64]> roped_7_cast_fp16 = add(x = var_348_cast_fp16, y = var_349_cast_fp16)[name = tensor<string, []>("roped_7_cast_fp16")];
-            tensor<bool, []> q_11_interleave_0 = const()[name = tensor<string, []>("q_11_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 128, 64]> q_11_cast_fp16 = concat(axis = var_237, interleave = q_11_interleave_0, values = roped_5_cast_fp16)[name = tensor<string, []>("q_11_cast_fp16")];
-            tensor<bool, []> k_13_interleave_0 = const()[name = tensor<string, []>("k_13_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 128, 64]> new_k_cache_1 = concat(axis = var_237, interleave = k_13_interleave_0, values = roped_7_cast_fp16)[name = tensor<string, []>("k_13_cast_fp16")];
-            tensor<bool, []> k_15_interleave_0 = const()[name = tensor<string, []>("k_15_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_15_cast_fp16 = concat(axis = var_239, interleave = k_15_interleave_0, values = (k_cache_1, new_k_cache_1))[name = tensor<string, []>("k_15_cast_fp16")];
-            tensor<bool, []> v_11_interleave_0 = const()[name = tensor<string, []>("v_11_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_11_cast_fp16 = concat(axis = var_239, interleave = v_11_interleave_0, values = (v_cache_1, new_v_cache_1))[name = tensor<string, []>("v_11_cast_fp16")];
-            tensor<fp16, []> var_371_to_fp16 = const()[name = tensor<string, []>("op_371_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 64]> var_372_cast_fp16 = mul(x = q_11_cast_fp16, y = var_371_to_fp16)[name = tensor<string, []>("op_372_cast_fp16")];
-            tensor<bool, []> attn_weights_5_transpose_x_0 = const()[name = tensor<string, []>("attn_weights_5_transpose_x_0"), val = tensor<bool, []>(true)];
-            tensor<bool, []> attn_weights_5_transpose_y_0 = const()[name = tensor<string, []>("attn_weights_5_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 64, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = var_372_cast_fp16, y = k_15_cast_fp16)[name = tensor<string, []>("attn_weights_5_cast_fp16")];
-            tensor<fp16, [1, 32, 64, 512]> attn_weights_7_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = mask)[name = tensor<string, []>("attn_weights_7_cast_fp16")];
-            tensor<fp16, [1, 32, 64, 512]> var_380_cast_fp16 = softmax(axis = var_232, x = attn_weights_7_cast_fp16)[name = tensor<string, []>("op_380_cast_fp16")];
-            tensor<bool, []> attn_3_transpose_x_0 = const()[name = tensor<string, []>("attn_3_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> attn_3_transpose_y_0 = const()[name = tensor<string, []>("attn_3_transpose_y_0"), val = tensor<bool, []>(true)];
-            tensor<fp16, [1, 32, 128, 64]> attn_3_cast_fp16 = matmul(transpose_x = attn_3_transpose_x_0, transpose_y = attn_3_transpose_y_0, x = v_11_cast_fp16, y = var_380_cast_fp16)[name = tensor<string, []>("attn_3_cast_fp16")];
-            tensor<int32, [4]> var_384 = const()[name = tensor<string, []>("op_384"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 64]> input_9_cast_fp16 = reshape(shape = var_384, x = attn_3_cast_fp16)[name = tensor<string, []>("input_9_cast_fp16")];
-            tensor<int32, [2]> var_388 = const()[name = tensor<string, []>("op_388"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_390 = const()[name = tensor<string, []>("op_390"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_392_pad_type_0 = const()[name = tensor<string, []>("op_392_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_392_pad_0 = const()[name = tensor<string, []>("op_392_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 64]> var_392_cast_fp16 = conv(dilations = var_390, groups = var_246, pad = var_392_pad_0, pad_type = var_392_pad_type_0, strides = var_388, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_9_cast_fp16)[name = tensor<string, []>("op_392_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303701824)))];
-            tensor<fp16, [1, 4096, 1, 64]> attention_output_3_cast_fp16 = mul(x = var_392_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = tensor<string, []>("attention_output_3_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 64]> x_25_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_15_cast_fp16)[name = tensor<string, []>("x_25_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 64]> var_401_cast_fp16 = mul(x = x_25_cast_fp16, y = x_25_cast_fp16)[name = tensor<string, []>("op_401_cast_fp16")];
-            tensor<int32, [1]> var_402 = const()[name = tensor<string, []>("op_402"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 64]> norm_x_7_cast_fp16 = reduce_mean(axes = var_402, keep_dims = var_247, x = var_401_cast_fp16)[name = tensor<string, []>("norm_x_7_cast_fp16")];
-            tensor<fp16, []> var_404_to_fp16 = const()[name = tensor<string, []>("op_404_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
-            tensor<fp16, [1, 1, 1, 64]> var_405_cast_fp16 = add(x = norm_x_7_cast_fp16, y = var_404_to_fp16)[name = tensor<string, []>("op_405_cast_fp16")];
-            tensor<fp16, []> var_406_epsilon_0_to_fp16 = const()[name = tensor<string, []>("op_406_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1p-24)];
-            tensor<fp16, [1, 1, 1, 64]> var_406_cast_fp16 = rsqrt(epsilon = var_406_epsilon_0_to_fp16, x = var_405_cast_fp16)[name = tensor<string, []>("op_406_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 64]> x_normed_13_cast_fp16 = mul(x = x_25_cast_fp16, y = var_406_cast_fp16)[name = tensor<string, []>("x_normed_13_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303710080)))];
-            tensor<fp16, [1, 4096, 1, 64]> input_11_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = tensor<string, []>("input_11_cast_fp16")];
-            tensor<int32, [2]> var_418 = const()[name = tensor<string, []>("op_418"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_420 = const()[name = tensor<string, []>("op_420"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_422_pad_type_0 = const()[name = tensor<string, []>("op_422_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_422_pad_0 = const()[name = tensor<string, []>("op_422_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 64]> var_422_cast_fp16 = conv(dilations = var_420, groups = var_246, pad = var_422_pad_0, pad_type = var_422_pad_type_0, strides = var_418, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("op_422_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_1_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303718336)))];
-            tensor<fp16, [1, 11008, 1, 64]> input_13_cast_fp16 = mul(x = var_422_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = tensor<string, []>("input_13_cast_fp16")];
-            tensor<int32, [2]> var_426 = const()[name = tensor<string, []>("op_426"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_428 = const()[name = tensor<string, []>("op_428"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_430_pad_type_0 = const()[name = tensor<string, []>("op_430_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_430_pad_0 = const()[name = tensor<string, []>("op_430_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 64]> var_430_cast_fp16 = conv(dilations = var_428, groups = var_246, pad = var_430_pad_0, pad_type = var_430_pad_type_0, strides = var_426, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("op_430_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303740416)))];
-            tensor<fp16, [1, 11008, 1, 64]> x_fc_2_3_cast_fp16 = mul(x = var_430_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = tensor<string, []>("x_fc_2_3_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 64]> var_432_cast_fp16 = silu(x = input_13_cast_fp16)[name = tensor<string, []>("op_432_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 64]> input_15_cast_fp16 = mul(x = var_432_cast_fp16, y = x_fc_2_3_cast_fp16)[name = tensor<string, []>("input_15_cast_fp16")];
-            tensor<int32, [2]> var_436 = const()[name = tensor<string, []>("op_436"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_438 = const()[name = tensor<string, []>("op_438"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_440_pad_type_0 = const()[name = tensor<string, []>("op_440_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_440_pad_0 = const()[name = tensor<string, []>("op_440_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 64]> var_440_cast_fp16 = conv(dilations = var_438, groups = var_246, pad = var_440_pad_0, pad_type = var_440_pad_type_0, strides = var_436, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("op_440_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_1_mlp_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303762496)))];
-            tensor<fp16, [1, 4096, 1, 64]> var_441_cast_fp16 = mul(x = var_440_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = tensor<string, []>("op_441_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 64]> x_29_cast_fp16 = add(x = var_441_cast_fp16, y = x_25_cast_fp16)[name = tensor<string, []>("x_29_cast_fp16")];
-            tensor<int32, []> var_448 = const()[name = tensor<string, []>("op_448"), val = tensor<int32, []>(3)];
-            tensor<int32, []> var_453 = const()[name = tensor<string, []>("op_453"), val = tensor<int32, []>(-2)];
-            tensor<int32, []> var_455 = const()[name = tensor<string, []>("op_455"), val = tensor<int32, []>(-1)];
-            tensor<int32, []> var_462 = const()[name = tensor<string, []>("op_462"), val = tensor<int32, []>(1)];
-            tensor<bool, []> var_463 = const()[name = tensor<string, []>("op_463"), val = tensor<bool, []>(true)];
-            tensor<fp16, [1, 4096, 1, 64]> var_470_cast_fp16 = mul(x = x_29_cast_fp16, y = x_29_cast_fp16)[name = tensor<string, []>("op_470_cast_fp16")];
-            tensor<int32, [1]> var_471 = const()[name = tensor<string, []>("op_471"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 64]> norm_x_9_cast_fp16 = reduce_mean(axes = var_471, keep_dims = var_463, x = var_470_cast_fp16)[name = tensor<string, []>("norm_x_9_cast_fp16")];
-            tensor<fp16, []> var_473_to_fp16 = const()[name = tensor<string, []>("op_473_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
-            tensor<fp16, [1, 1, 1, 64]> var_474_cast_fp16 = add(x = norm_x_9_cast_fp16, y = var_473_to_fp16)[name = tensor<string, []>("op_474_cast_fp16")];
-            tensor<fp16, []> var_475_epsilon_0_to_fp16 = const()[name = tensor<string, []>("op_475_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1p-24)];
-            tensor<fp16, [1, 1, 1, 64]> var_475_cast_fp16 = rsqrt(epsilon = var_475_epsilon_0_to_fp16, x = var_474_cast_fp16)[name = tensor<string, []>("op_475_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 64]> x_normed_17_cast_fp16 = mul(x = x_29_cast_fp16, y = var_475_cast_fp16)[name = tensor<string, []>("x_normed_17_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303770752)))];
-            tensor<fp16, [1, 4096, 1, 64]> x_33_cast_fp16 = mul(x = x_normed_17_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = tensor<string, []>("x_33_cast_fp16")];
-            tensor<int32, [2]> var_490 = const()[name = tensor<string, []>("op_490"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_492 = const()[name = tensor<string, []>("op_492"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_494_pad_type_0 = const()[name = tensor<string, []>("op_494_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_494_pad_0 = const()[name = tensor<string, []>("op_494_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 64]> var_494_cast_fp16 = conv(dilations = var_492, groups = var_462, pad = var_494_pad_0, pad_type = var_494_pad_type_0, strides = var_490, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = tensor<string, []>("op_494_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303779008)))];
-            tensor<fp16, [1, 4096, 1, 64]> q_13_cast_fp16 = mul(x = var_494_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = tensor<string, []>("q_13_cast_fp16")];
-            tensor<int32, [2]> var_498 = const()[name = tensor<string, []>("op_498"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_500 = const()[name = tensor<string, []>("op_500"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_502_pad_type_0 = const()[name = tensor<string, []>("op_502_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_502_pad_0 = const()[name = tensor<string, []>("op_502_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 64]> var_502_cast_fp16 = conv(dilations = var_500, groups = var_462, pad = var_502_pad_0, pad_type = var_502_pad_type_0, strides = var_498, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = tensor<string, []>("op_502_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303787264)))];
-            tensor<fp16, [1, 4096, 1, 64]> k_17_cast_fp16 = mul(x = var_502_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = tensor<string, []>("k_17_cast_fp16")];
-            tensor<int32, [2]> var_506 = const()[name = tensor<string, []>("op_506"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_508 = const()[name = tensor<string, []>("op_508"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_510_pad_type_0 = const()[name = tensor<string, []>("op_510_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_510_pad_0 = const()[name = tensor<string, []>("op_510_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 64]> var_510_cast_fp16 = conv(dilations = var_508, groups = var_462, pad = var_510_pad_0, pad_type = var_510_pad_type_0, strides = var_506, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = x_33_cast_fp16)[name = tensor<string, []>("op_510_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303795520)))];
-            tensor<fp16, [1, 4096, 1, 64]> v_13_cast_fp16 = mul(x = var_510_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = tensor<string, []>("v_13_cast_fp16")];
-            tensor<int32, [4]> var_512 = const()[name = tensor<string, []>("op_512"), val = tensor<int32, [4]>([1, 32, 128, 64])];
-            tensor<fp16, [1, 32, 128, 64]> q_15_cast_fp16 = reshape(shape = var_512, x = q_13_cast_fp16)[name = tensor<string, []>("q_15_cast_fp16")];
-            tensor<int32, [4]> var_514 = const()[name = tensor<string, []>("op_514"), val = tensor<int32, [4]>([1, 32, 128, 64])];
-            tensor<fp16, [1, 32, 128, 64]> k_19_cast_fp16 = reshape(shape = var_514, x = k_17_cast_fp16)[name = tensor<string, []>("k_19_cast_fp16")];
-            tensor<int32, [4]> var_516 = const()[name = tensor<string, []>("op_516"), val = tensor<int32, [4]>([1, 32, 128, 64])];
-            tensor<fp16, [1, 32, 128, 64]> new_v_cache_2 = reshape(shape = var_516, x = v_13_cast_fp16)[name = tensor<string, []>("v_15_cast_fp16")];
-            tensor<int32, [4]> var_528_begin_0 = const()[name = tensor<string, []>("op_528_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_528_end_0 = const()[name = tensor<string, []>("op_528_end_0"), val = tensor<int32, [4]>([1, 32, 64, 64])];
-            tensor<bool, [4]> var_528_end_mask_0 = const()[name = tensor<string, []>("op_528_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 64]> var_528_cast_fp16 = slice_by_index(begin = var_528_begin_0, end = var_528_end_0, end_mask = var_528_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_528_cast_fp16")];
-            tensor<int32, [4]> var_534_begin_0 = const()[name = tensor<string, []>("op_534_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_534_end_0 = const()[name = tensor<string, []>("op_534_end_0"), val = tensor<int32, [4]>([1, 32, 128, 64])];
-            tensor<bool, [4]> var_534_end_mask_0 = const()[name = tensor<string, []>("op_534_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 64]> var_534_cast_fp16 = slice_by_index(begin = var_534_begin_0, end = var_534_end_0, end_mask = var_534_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_534_cast_fp16")];
-            tensor<fp16, []> const_17_promoted_to_fp16 = const()[name = tensor<string, []>("const_17_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 64]> var_536_cast_fp16 = mul(x = var_534_cast_fp16, y = const_17_promoted_to_fp16)[name = tensor<string, []>("op_536_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> rotated_7_cast_fp16 = concat(axis = var_931, interleave = rotated_7_interleave_0, values = (var_1100_cast_fp16, var_1092_cast_fp16))[name = tensor<string, []>("rotated_7_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> var_1103_cast_fp16 = mul(x = k_13_cast_fp16, y = cos)[name = tensor<string, []>("op_1103_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> var_1104_cast_fp16 = mul(x = rotated_7_cast_fp16, y = sin)[name = tensor<string, []>("op_1104_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> roped_7_cast_fp16 = add(x = var_1103_cast_fp16, y = var_1104_cast_fp16)[name = tensor<string, []>("roped_7_cast_fp16")];
+            tensor<int32, [4]> var_1107 = const()[name = tensor<string, []>("op_1107"), val = tensor<int32, [4]>([1, 4096, 1, 64])];
+            tensor<fp16, [1, 4096, 1, 64]> var_1108_cast_fp16 = reshape(shape = var_1107, x = roped_7_cast_fp16)[name = tensor<string, []>("op_1108_cast_fp16")];
+            tensor<int32, [4]> k_17_perm_0 = const()[name = tensor<string, []>("k_17_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<int32, [4]> var_1110 = const()[name = tensor<string, []>("op_1110"), val = tensor<int32, [4]>([1, 4096, 1, 64])];
+            tensor<fp16, [1, 4096, 1, 64]> new_v_cache_1 = reshape(shape = var_1110, x = v_11_cast_fp16)[name = tensor<string, []>("new_v_cache_1_type_fp32_cast_fp16")];
+            tensor<bool, []> k_19_interleave_0 = const()[name = tensor<string, []>("k_19_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 4096]> new_k_cache_1 = transpose(perm = k_17_perm_0, x = var_1108_cast_fp16)[name = tensor<string, []>("transpose_1")];
+            tensor<fp16, [1, 512, 1, 4096]> k_19_cast_fp16 = concat(axis = var_933, interleave = k_19_interleave_0, values = (k_cache_1, new_k_cache_1))[name = tensor<string, []>("k_19_cast_fp16")];
+            tensor<bool, []> v_17_interleave_0 = const()[name = tensor<string, []>("v_17_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 4096, 1, 512]> v_17_cast_fp16 = concat(axis = var_927, interleave = v_17_interleave_0, values = (v_cache_1, new_v_cache_1))[name = tensor<string, []>("v_17_cast_fp16")];
+            tensor<int32, [4]> var_1117 = const()[name = tensor<string, []>("op_1117"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 4096, 1, 64]> q_15_cast_fp16 = reshape(shape = var_1117, x = roped_5_cast_fp16)[name = tensor<string, []>("q_15_cast_fp16")];
+            tensor<int32, [4]> var_1122_begin_0 = const()[name = tensor<string, []>("op_1122_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1122_end_0 = const()[name = tensor<string, []>("op_1122_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_1122_end_mask_0 = const()[name = tensor<string, []>("op_1122_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1122_cast_fp16 = slice_by_index(begin = var_1122_begin_0, end = var_1122_end_0, end_mask = var_1122_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1122_cast_fp16")];
+            tensor<int32, [4]> var_1126_begin_0 = const()[name = tensor<string, []>("op_1126_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_1126_end_0 = const()[name = tensor<string, []>("op_1126_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_1126_end_mask_0 = const()[name = tensor<string, []>("op_1126_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1126_cast_fp16 = slice_by_index(begin = var_1126_begin_0, end = var_1126_end_0, end_mask = var_1126_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1126_cast_fp16")];
+            tensor<int32, [4]> var_1130_begin_0 = const()[name = tensor<string, []>("op_1130_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_1130_end_0 = const()[name = tensor<string, []>("op_1130_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_1130_end_mask_0 = const()[name = tensor<string, []>("op_1130_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1130_cast_fp16 = slice_by_index(begin = var_1130_begin_0, end = var_1130_end_0, end_mask = var_1130_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1130_cast_fp16")];
+            tensor<int32, [4]> var_1134_begin_0 = const()[name = tensor<string, []>("op_1134_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_1134_end_0 = const()[name = tensor<string, []>("op_1134_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_1134_end_mask_0 = const()[name = tensor<string, []>("op_1134_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1134_cast_fp16 = slice_by_index(begin = var_1134_begin_0, end = var_1134_end_0, end_mask = var_1134_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1134_cast_fp16")];
+            tensor<int32, [4]> var_1138_begin_0 = const()[name = tensor<string, []>("op_1138_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_1138_end_0 = const()[name = tensor<string, []>("op_1138_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_1138_end_mask_0 = const()[name = tensor<string, []>("op_1138_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1138_cast_fp16 = slice_by_index(begin = var_1138_begin_0, end = var_1138_end_0, end_mask = var_1138_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1138_cast_fp16")];
+            tensor<int32, [4]> var_1142_begin_0 = const()[name = tensor<string, []>("op_1142_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_1142_end_0 = const()[name = tensor<string, []>("op_1142_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_1142_end_mask_0 = const()[name = tensor<string, []>("op_1142_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1142_cast_fp16 = slice_by_index(begin = var_1142_begin_0, end = var_1142_end_0, end_mask = var_1142_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1142_cast_fp16")];
+            tensor<int32, [4]> var_1146_begin_0 = const()[name = tensor<string, []>("op_1146_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_1146_end_0 = const()[name = tensor<string, []>("op_1146_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_1146_end_mask_0 = const()[name = tensor<string, []>("op_1146_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1146_cast_fp16 = slice_by_index(begin = var_1146_begin_0, end = var_1146_end_0, end_mask = var_1146_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1146_cast_fp16")];
+            tensor<int32, [4]> var_1150_begin_0 = const()[name = tensor<string, []>("op_1150_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_1150_end_0 = const()[name = tensor<string, []>("op_1150_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_1150_end_mask_0 = const()[name = tensor<string, []>("op_1150_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1150_cast_fp16 = slice_by_index(begin = var_1150_begin_0, end = var_1150_end_0, end_mask = var_1150_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1150_cast_fp16")];
+            tensor<int32, [4]> var_1154_begin_0 = const()[name = tensor<string, []>("op_1154_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_1154_end_0 = const()[name = tensor<string, []>("op_1154_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_1154_end_mask_0 = const()[name = tensor<string, []>("op_1154_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1154_cast_fp16 = slice_by_index(begin = var_1154_begin_0, end = var_1154_end_0, end_mask = var_1154_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1154_cast_fp16")];
+            tensor<int32, [4]> var_1158_begin_0 = const()[name = tensor<string, []>("op_1158_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_1158_end_0 = const()[name = tensor<string, []>("op_1158_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_1158_end_mask_0 = const()[name = tensor<string, []>("op_1158_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1158_cast_fp16 = slice_by_index(begin = var_1158_begin_0, end = var_1158_end_0, end_mask = var_1158_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1158_cast_fp16")];
+            tensor<int32, [4]> var_1162_begin_0 = const()[name = tensor<string, []>("op_1162_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_1162_end_0 = const()[name = tensor<string, []>("op_1162_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_1162_end_mask_0 = const()[name = tensor<string, []>("op_1162_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1162_cast_fp16 = slice_by_index(begin = var_1162_begin_0, end = var_1162_end_0, end_mask = var_1162_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1162_cast_fp16")];
+            tensor<int32, [4]> var_1166_begin_0 = const()[name = tensor<string, []>("op_1166_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_1166_end_0 = const()[name = tensor<string, []>("op_1166_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_1166_end_mask_0 = const()[name = tensor<string, []>("op_1166_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1166_cast_fp16 = slice_by_index(begin = var_1166_begin_0, end = var_1166_end_0, end_mask = var_1166_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1166_cast_fp16")];
+            tensor<int32, [4]> var_1170_begin_0 = const()[name = tensor<string, []>("op_1170_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_1170_end_0 = const()[name = tensor<string, []>("op_1170_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_1170_end_mask_0 = const()[name = tensor<string, []>("op_1170_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1170_cast_fp16 = slice_by_index(begin = var_1170_begin_0, end = var_1170_end_0, end_mask = var_1170_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1170_cast_fp16")];
+            tensor<int32, [4]> var_1174_begin_0 = const()[name = tensor<string, []>("op_1174_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_1174_end_0 = const()[name = tensor<string, []>("op_1174_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_1174_end_mask_0 = const()[name = tensor<string, []>("op_1174_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1174_cast_fp16 = slice_by_index(begin = var_1174_begin_0, end = var_1174_end_0, end_mask = var_1174_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1174_cast_fp16")];
+            tensor<int32, [4]> var_1178_begin_0 = const()[name = tensor<string, []>("op_1178_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_1178_end_0 = const()[name = tensor<string, []>("op_1178_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_1178_end_mask_0 = const()[name = tensor<string, []>("op_1178_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1178_cast_fp16 = slice_by_index(begin = var_1178_begin_0, end = var_1178_end_0, end_mask = var_1178_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1178_cast_fp16")];
+            tensor<int32, [4]> var_1182_begin_0 = const()[name = tensor<string, []>("op_1182_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_1182_end_0 = const()[name = tensor<string, []>("op_1182_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_1182_end_mask_0 = const()[name = tensor<string, []>("op_1182_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1182_cast_fp16 = slice_by_index(begin = var_1182_begin_0, end = var_1182_end_0, end_mask = var_1182_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1182_cast_fp16")];
+            tensor<int32, [4]> var_1186_begin_0 = const()[name = tensor<string, []>("op_1186_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_1186_end_0 = const()[name = tensor<string, []>("op_1186_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_1186_end_mask_0 = const()[name = tensor<string, []>("op_1186_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1186_cast_fp16 = slice_by_index(begin = var_1186_begin_0, end = var_1186_end_0, end_mask = var_1186_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1186_cast_fp16")];
+            tensor<int32, [4]> var_1190_begin_0 = const()[name = tensor<string, []>("op_1190_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_1190_end_0 = const()[name = tensor<string, []>("op_1190_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_1190_end_mask_0 = const()[name = tensor<string, []>("op_1190_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1190_cast_fp16 = slice_by_index(begin = var_1190_begin_0, end = var_1190_end_0, end_mask = var_1190_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1190_cast_fp16")];
+            tensor<int32, [4]> var_1194_begin_0 = const()[name = tensor<string, []>("op_1194_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_1194_end_0 = const()[name = tensor<string, []>("op_1194_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_1194_end_mask_0 = const()[name = tensor<string, []>("op_1194_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1194_cast_fp16 = slice_by_index(begin = var_1194_begin_0, end = var_1194_end_0, end_mask = var_1194_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1194_cast_fp16")];
+            tensor<int32, [4]> var_1198_begin_0 = const()[name = tensor<string, []>("op_1198_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_1198_end_0 = const()[name = tensor<string, []>("op_1198_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_1198_end_mask_0 = const()[name = tensor<string, []>("op_1198_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1198_cast_fp16 = slice_by_index(begin = var_1198_begin_0, end = var_1198_end_0, end_mask = var_1198_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1198_cast_fp16")];
+            tensor<int32, [4]> var_1202_begin_0 = const()[name = tensor<string, []>("op_1202_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_1202_end_0 = const()[name = tensor<string, []>("op_1202_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_1202_end_mask_0 = const()[name = tensor<string, []>("op_1202_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1202_cast_fp16 = slice_by_index(begin = var_1202_begin_0, end = var_1202_end_0, end_mask = var_1202_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1202_cast_fp16")];
+            tensor<int32, [4]> var_1206_begin_0 = const()[name = tensor<string, []>("op_1206_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_1206_end_0 = const()[name = tensor<string, []>("op_1206_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_1206_end_mask_0 = const()[name = tensor<string, []>("op_1206_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1206_cast_fp16 = slice_by_index(begin = var_1206_begin_0, end = var_1206_end_0, end_mask = var_1206_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1206_cast_fp16")];
+            tensor<int32, [4]> var_1210_begin_0 = const()[name = tensor<string, []>("op_1210_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_1210_end_0 = const()[name = tensor<string, []>("op_1210_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_1210_end_mask_0 = const()[name = tensor<string, []>("op_1210_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1210_cast_fp16 = slice_by_index(begin = var_1210_begin_0, end = var_1210_end_0, end_mask = var_1210_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1210_cast_fp16")];
+            tensor<int32, [4]> var_1214_begin_0 = const()[name = tensor<string, []>("op_1214_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_1214_end_0 = const()[name = tensor<string, []>("op_1214_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_1214_end_mask_0 = const()[name = tensor<string, []>("op_1214_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1214_cast_fp16 = slice_by_index(begin = var_1214_begin_0, end = var_1214_end_0, end_mask = var_1214_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1214_cast_fp16")];
+            tensor<int32, [4]> var_1218_begin_0 = const()[name = tensor<string, []>("op_1218_begin_0"), val = tensor<int32, [4]>([0, 3072, 0, 0])];
+            tensor<int32, [4]> var_1218_end_0 = const()[name = tensor<string, []>("op_1218_end_0"), val = tensor<int32, [4]>([1, 3200, 1, 64])];
+            tensor<bool, [4]> var_1218_end_mask_0 = const()[name = tensor<string, []>("op_1218_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1218_cast_fp16 = slice_by_index(begin = var_1218_begin_0, end = var_1218_end_0, end_mask = var_1218_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1218_cast_fp16")];
+            tensor<int32, [4]> var_1222_begin_0 = const()[name = tensor<string, []>("op_1222_begin_0"), val = tensor<int32, [4]>([0, 3200, 0, 0])];
+            tensor<int32, [4]> var_1222_end_0 = const()[name = tensor<string, []>("op_1222_end_0"), val = tensor<int32, [4]>([1, 3328, 1, 64])];
+            tensor<bool, [4]> var_1222_end_mask_0 = const()[name = tensor<string, []>("op_1222_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1222_cast_fp16 = slice_by_index(begin = var_1222_begin_0, end = var_1222_end_0, end_mask = var_1222_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1222_cast_fp16")];
+            tensor<int32, [4]> var_1226_begin_0 = const()[name = tensor<string, []>("op_1226_begin_0"), val = tensor<int32, [4]>([0, 3328, 0, 0])];
+            tensor<int32, [4]> var_1226_end_0 = const()[name = tensor<string, []>("op_1226_end_0"), val = tensor<int32, [4]>([1, 3456, 1, 64])];
+            tensor<bool, [4]> var_1226_end_mask_0 = const()[name = tensor<string, []>("op_1226_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1226_cast_fp16 = slice_by_index(begin = var_1226_begin_0, end = var_1226_end_0, end_mask = var_1226_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1226_cast_fp16")];
+            tensor<int32, [4]> var_1230_begin_0 = const()[name = tensor<string, []>("op_1230_begin_0"), val = tensor<int32, [4]>([0, 3456, 0, 0])];
+            tensor<int32, [4]> var_1230_end_0 = const()[name = tensor<string, []>("op_1230_end_0"), val = tensor<int32, [4]>([1, 3584, 1, 64])];
+            tensor<bool, [4]> var_1230_end_mask_0 = const()[name = tensor<string, []>("op_1230_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1230_cast_fp16 = slice_by_index(begin = var_1230_begin_0, end = var_1230_end_0, end_mask = var_1230_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1230_cast_fp16")];
+            tensor<int32, [4]> var_1234_begin_0 = const()[name = tensor<string, []>("op_1234_begin_0"), val = tensor<int32, [4]>([0, 3584, 0, 0])];
+            tensor<int32, [4]> var_1234_end_0 = const()[name = tensor<string, []>("op_1234_end_0"), val = tensor<int32, [4]>([1, 3712, 1, 64])];
+            tensor<bool, [4]> var_1234_end_mask_0 = const()[name = tensor<string, []>("op_1234_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1234_cast_fp16 = slice_by_index(begin = var_1234_begin_0, end = var_1234_end_0, end_mask = var_1234_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1234_cast_fp16")];
+            tensor<int32, [4]> var_1238_begin_0 = const()[name = tensor<string, []>("op_1238_begin_0"), val = tensor<int32, [4]>([0, 3712, 0, 0])];
+            tensor<int32, [4]> var_1238_end_0 = const()[name = tensor<string, []>("op_1238_end_0"), val = tensor<int32, [4]>([1, 3840, 1, 64])];
+            tensor<bool, [4]> var_1238_end_mask_0 = const()[name = tensor<string, []>("op_1238_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1238_cast_fp16 = slice_by_index(begin = var_1238_begin_0, end = var_1238_end_0, end_mask = var_1238_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1238_cast_fp16")];
+            tensor<int32, [4]> var_1242_begin_0 = const()[name = tensor<string, []>("op_1242_begin_0"), val = tensor<int32, [4]>([0, 3840, 0, 0])];
+            tensor<int32, [4]> var_1242_end_0 = const()[name = tensor<string, []>("op_1242_end_0"), val = tensor<int32, [4]>([1, 3968, 1, 64])];
+            tensor<bool, [4]> var_1242_end_mask_0 = const()[name = tensor<string, []>("op_1242_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1242_cast_fp16 = slice_by_index(begin = var_1242_begin_0, end = var_1242_end_0, end_mask = var_1242_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1242_cast_fp16")];
+            tensor<int32, [4]> var_1246_begin_0 = const()[name = tensor<string, []>("op_1246_begin_0"), val = tensor<int32, [4]>([0, 3968, 0, 0])];
+            tensor<int32, [4]> var_1246_end_0 = const()[name = tensor<string, []>("op_1246_end_0"), val = tensor<int32, [4]>([1, 4096, 1, 64])];
+            tensor<bool, [4]> var_1246_end_mask_0 = const()[name = tensor<string, []>("op_1246_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1246_cast_fp16 = slice_by_index(begin = var_1246_begin_0, end = var_1246_end_0, end_mask = var_1246_end_mask_0, x = q_15_cast_fp16)[name = tensor<string, []>("op_1246_cast_fp16")];
+            tensor<int32, [4]> var_1252_begin_0 = const()[name = tensor<string, []>("op_1252_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1252_end_0 = const()[name = tensor<string, []>("op_1252_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_1252_end_mask_0 = const()[name = tensor<string, []>("op_1252_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1252_cast_fp16 = slice_by_index(begin = var_1252_begin_0, end = var_1252_end_0, end_mask = var_1252_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1252_cast_fp16")];
+            tensor<int32, [4]> var_1256_begin_0 = const()[name = tensor<string, []>("op_1256_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_1256_end_0 = const()[name = tensor<string, []>("op_1256_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_1256_end_mask_0 = const()[name = tensor<string, []>("op_1256_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1256_cast_fp16 = slice_by_index(begin = var_1256_begin_0, end = var_1256_end_0, end_mask = var_1256_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1256_cast_fp16")];
+            tensor<int32, [4]> var_1260_begin_0 = const()[name = tensor<string, []>("op_1260_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_1260_end_0 = const()[name = tensor<string, []>("op_1260_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_1260_end_mask_0 = const()[name = tensor<string, []>("op_1260_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1260_cast_fp16 = slice_by_index(begin = var_1260_begin_0, end = var_1260_end_0, end_mask = var_1260_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1260_cast_fp16")];
+            tensor<int32, [4]> var_1264_begin_0 = const()[name = tensor<string, []>("op_1264_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_1264_end_0 = const()[name = tensor<string, []>("op_1264_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1264_end_mask_0 = const()[name = tensor<string, []>("op_1264_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1264_cast_fp16 = slice_by_index(begin = var_1264_begin_0, end = var_1264_end_0, end_mask = var_1264_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1264_cast_fp16")];
+            tensor<int32, [4]> var_1268_begin_0 = const()[name = tensor<string, []>("op_1268_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_1268_end_0 = const()[name = tensor<string, []>("op_1268_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_1268_end_mask_0 = const()[name = tensor<string, []>("op_1268_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1268_cast_fp16 = slice_by_index(begin = var_1268_begin_0, end = var_1268_end_0, end_mask = var_1268_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1268_cast_fp16")];
+            tensor<int32, [4]> var_1272_begin_0 = const()[name = tensor<string, []>("op_1272_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_1272_end_0 = const()[name = tensor<string, []>("op_1272_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_1272_end_mask_0 = const()[name = tensor<string, []>("op_1272_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1272_cast_fp16 = slice_by_index(begin = var_1272_begin_0, end = var_1272_end_0, end_mask = var_1272_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1272_cast_fp16")];
+            tensor<int32, [4]> var_1276_begin_0 = const()[name = tensor<string, []>("op_1276_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_1276_end_0 = const()[name = tensor<string, []>("op_1276_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_1276_end_mask_0 = const()[name = tensor<string, []>("op_1276_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1276_cast_fp16 = slice_by_index(begin = var_1276_begin_0, end = var_1276_end_0, end_mask = var_1276_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1276_cast_fp16")];
+            tensor<int32, [4]> var_1280_begin_0 = const()[name = tensor<string, []>("op_1280_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_1280_end_0 = const()[name = tensor<string, []>("op_1280_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_1280_end_mask_0 = const()[name = tensor<string, []>("op_1280_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1280_cast_fp16 = slice_by_index(begin = var_1280_begin_0, end = var_1280_end_0, end_mask = var_1280_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1280_cast_fp16")];
+            tensor<int32, [4]> var_1284_begin_0 = const()[name = tensor<string, []>("op_1284_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1024])];
+            tensor<int32, [4]> var_1284_end_0 = const()[name = tensor<string, []>("op_1284_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1152])];
+            tensor<bool, [4]> var_1284_end_mask_0 = const()[name = tensor<string, []>("op_1284_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1284_cast_fp16 = slice_by_index(begin = var_1284_begin_0, end = var_1284_end_0, end_mask = var_1284_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1284_cast_fp16")];
+            tensor<int32, [4]> var_1288_begin_0 = const()[name = tensor<string, []>("op_1288_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1152])];
+            tensor<int32, [4]> var_1288_end_0 = const()[name = tensor<string, []>("op_1288_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1280])];
+            tensor<bool, [4]> var_1288_end_mask_0 = const()[name = tensor<string, []>("op_1288_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1288_cast_fp16 = slice_by_index(begin = var_1288_begin_0, end = var_1288_end_0, end_mask = var_1288_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1288_cast_fp16")];
+            tensor<int32, [4]> var_1292_begin_0 = const()[name = tensor<string, []>("op_1292_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1280])];
+            tensor<int32, [4]> var_1292_end_0 = const()[name = tensor<string, []>("op_1292_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1408])];
+            tensor<bool, [4]> var_1292_end_mask_0 = const()[name = tensor<string, []>("op_1292_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1292_cast_fp16 = slice_by_index(begin = var_1292_begin_0, end = var_1292_end_0, end_mask = var_1292_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1292_cast_fp16")];
+            tensor<int32, [4]> var_1296_begin_0 = const()[name = tensor<string, []>("op_1296_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1408])];
+            tensor<int32, [4]> var_1296_end_0 = const()[name = tensor<string, []>("op_1296_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1536])];
+            tensor<bool, [4]> var_1296_end_mask_0 = const()[name = tensor<string, []>("op_1296_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1296_cast_fp16 = slice_by_index(begin = var_1296_begin_0, end = var_1296_end_0, end_mask = var_1296_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1296_cast_fp16")];
+            tensor<int32, [4]> var_1300_begin_0 = const()[name = tensor<string, []>("op_1300_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1536])];
+            tensor<int32, [4]> var_1300_end_0 = const()[name = tensor<string, []>("op_1300_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1664])];
+            tensor<bool, [4]> var_1300_end_mask_0 = const()[name = tensor<string, []>("op_1300_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1300_cast_fp16 = slice_by_index(begin = var_1300_begin_0, end = var_1300_end_0, end_mask = var_1300_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1300_cast_fp16")];
+            tensor<int32, [4]> var_1304_begin_0 = const()[name = tensor<string, []>("op_1304_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1664])];
+            tensor<int32, [4]> var_1304_end_0 = const()[name = tensor<string, []>("op_1304_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1792])];
+            tensor<bool, [4]> var_1304_end_mask_0 = const()[name = tensor<string, []>("op_1304_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1304_cast_fp16 = slice_by_index(begin = var_1304_begin_0, end = var_1304_end_0, end_mask = var_1304_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1304_cast_fp16")];
+            tensor<int32, [4]> var_1308_begin_0 = const()[name = tensor<string, []>("op_1308_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1792])];
+            tensor<int32, [4]> var_1308_end_0 = const()[name = tensor<string, []>("op_1308_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1920])];
+            tensor<bool, [4]> var_1308_end_mask_0 = const()[name = tensor<string, []>("op_1308_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1308_cast_fp16 = slice_by_index(begin = var_1308_begin_0, end = var_1308_end_0, end_mask = var_1308_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1308_cast_fp16")];
+            tensor<int32, [4]> var_1312_begin_0 = const()[name = tensor<string, []>("op_1312_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1920])];
+            tensor<int32, [4]> var_1312_end_0 = const()[name = tensor<string, []>("op_1312_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2048])];
+            tensor<bool, [4]> var_1312_end_mask_0 = const()[name = tensor<string, []>("op_1312_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1312_cast_fp16 = slice_by_index(begin = var_1312_begin_0, end = var_1312_end_0, end_mask = var_1312_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1312_cast_fp16")];
+            tensor<int32, [4]> var_1316_begin_0 = const()[name = tensor<string, []>("op_1316_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2048])];
+            tensor<int32, [4]> var_1316_end_0 = const()[name = tensor<string, []>("op_1316_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2176])];
+            tensor<bool, [4]> var_1316_end_mask_0 = const()[name = tensor<string, []>("op_1316_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1316_cast_fp16 = slice_by_index(begin = var_1316_begin_0, end = var_1316_end_0, end_mask = var_1316_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1316_cast_fp16")];
+            tensor<int32, [4]> var_1320_begin_0 = const()[name = tensor<string, []>("op_1320_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2176])];
+            tensor<int32, [4]> var_1320_end_0 = const()[name = tensor<string, []>("op_1320_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2304])];
+            tensor<bool, [4]> var_1320_end_mask_0 = const()[name = tensor<string, []>("op_1320_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1320_cast_fp16 = slice_by_index(begin = var_1320_begin_0, end = var_1320_end_0, end_mask = var_1320_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1320_cast_fp16")];
+            tensor<int32, [4]> var_1324_begin_0 = const()[name = tensor<string, []>("op_1324_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2304])];
+            tensor<int32, [4]> var_1324_end_0 = const()[name = tensor<string, []>("op_1324_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2432])];
+            tensor<bool, [4]> var_1324_end_mask_0 = const()[name = tensor<string, []>("op_1324_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1324_cast_fp16 = slice_by_index(begin = var_1324_begin_0, end = var_1324_end_0, end_mask = var_1324_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1324_cast_fp16")];
+            tensor<int32, [4]> var_1328_begin_0 = const()[name = tensor<string, []>("op_1328_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2432])];
+            tensor<int32, [4]> var_1328_end_0 = const()[name = tensor<string, []>("op_1328_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2560])];
+            tensor<bool, [4]> var_1328_end_mask_0 = const()[name = tensor<string, []>("op_1328_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1328_cast_fp16 = slice_by_index(begin = var_1328_begin_0, end = var_1328_end_0, end_mask = var_1328_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1328_cast_fp16")];
+            tensor<int32, [4]> var_1332_begin_0 = const()[name = tensor<string, []>("op_1332_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2560])];
+            tensor<int32, [4]> var_1332_end_0 = const()[name = tensor<string, []>("op_1332_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2688])];
+            tensor<bool, [4]> var_1332_end_mask_0 = const()[name = tensor<string, []>("op_1332_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1332_cast_fp16 = slice_by_index(begin = var_1332_begin_0, end = var_1332_end_0, end_mask = var_1332_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1332_cast_fp16")];
+            tensor<int32, [4]> var_1336_begin_0 = const()[name = tensor<string, []>("op_1336_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2688])];
+            tensor<int32, [4]> var_1336_end_0 = const()[name = tensor<string, []>("op_1336_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2816])];
+            tensor<bool, [4]> var_1336_end_mask_0 = const()[name = tensor<string, []>("op_1336_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1336_cast_fp16 = slice_by_index(begin = var_1336_begin_0, end = var_1336_end_0, end_mask = var_1336_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1336_cast_fp16")];
+            tensor<int32, [4]> var_1340_begin_0 = const()[name = tensor<string, []>("op_1340_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2816])];
+            tensor<int32, [4]> var_1340_end_0 = const()[name = tensor<string, []>("op_1340_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2944])];
+            tensor<bool, [4]> var_1340_end_mask_0 = const()[name = tensor<string, []>("op_1340_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1340_cast_fp16 = slice_by_index(begin = var_1340_begin_0, end = var_1340_end_0, end_mask = var_1340_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1340_cast_fp16")];
+            tensor<int32, [4]> var_1344_begin_0 = const()[name = tensor<string, []>("op_1344_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2944])];
+            tensor<int32, [4]> var_1344_end_0 = const()[name = tensor<string, []>("op_1344_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3072])];
+            tensor<bool, [4]> var_1344_end_mask_0 = const()[name = tensor<string, []>("op_1344_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1344_cast_fp16 = slice_by_index(begin = var_1344_begin_0, end = var_1344_end_0, end_mask = var_1344_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1344_cast_fp16")];
+            tensor<int32, [4]> var_1348_begin_0 = const()[name = tensor<string, []>("op_1348_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3072])];
+            tensor<int32, [4]> var_1348_end_0 = const()[name = tensor<string, []>("op_1348_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3200])];
+            tensor<bool, [4]> var_1348_end_mask_0 = const()[name = tensor<string, []>("op_1348_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1348_cast_fp16 = slice_by_index(begin = var_1348_begin_0, end = var_1348_end_0, end_mask = var_1348_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1348_cast_fp16")];
+            tensor<int32, [4]> var_1352_begin_0 = const()[name = tensor<string, []>("op_1352_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3200])];
+            tensor<int32, [4]> var_1352_end_0 = const()[name = tensor<string, []>("op_1352_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3328])];
+            tensor<bool, [4]> var_1352_end_mask_0 = const()[name = tensor<string, []>("op_1352_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1352_cast_fp16 = slice_by_index(begin = var_1352_begin_0, end = var_1352_end_0, end_mask = var_1352_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1352_cast_fp16")];
+            tensor<int32, [4]> var_1356_begin_0 = const()[name = tensor<string, []>("op_1356_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3328])];
+            tensor<int32, [4]> var_1356_end_0 = const()[name = tensor<string, []>("op_1356_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3456])];
+            tensor<bool, [4]> var_1356_end_mask_0 = const()[name = tensor<string, []>("op_1356_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1356_cast_fp16 = slice_by_index(begin = var_1356_begin_0, end = var_1356_end_0, end_mask = var_1356_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1356_cast_fp16")];
+            tensor<int32, [4]> var_1360_begin_0 = const()[name = tensor<string, []>("op_1360_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3456])];
+            tensor<int32, [4]> var_1360_end_0 = const()[name = tensor<string, []>("op_1360_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3584])];
+            tensor<bool, [4]> var_1360_end_mask_0 = const()[name = tensor<string, []>("op_1360_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1360_cast_fp16 = slice_by_index(begin = var_1360_begin_0, end = var_1360_end_0, end_mask = var_1360_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1360_cast_fp16")];
+            tensor<int32, [4]> var_1364_begin_0 = const()[name = tensor<string, []>("op_1364_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3584])];
+            tensor<int32, [4]> var_1364_end_0 = const()[name = tensor<string, []>("op_1364_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3712])];
+            tensor<bool, [4]> var_1364_end_mask_0 = const()[name = tensor<string, []>("op_1364_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1364_cast_fp16 = slice_by_index(begin = var_1364_begin_0, end = var_1364_end_0, end_mask = var_1364_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1364_cast_fp16")];
+            tensor<int32, [4]> var_1368_begin_0 = const()[name = tensor<string, []>("op_1368_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3712])];
+            tensor<int32, [4]> var_1368_end_0 = const()[name = tensor<string, []>("op_1368_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3840])];
+            tensor<bool, [4]> var_1368_end_mask_0 = const()[name = tensor<string, []>("op_1368_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1368_cast_fp16 = slice_by_index(begin = var_1368_begin_0, end = var_1368_end_0, end_mask = var_1368_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1368_cast_fp16")];
+            tensor<int32, [4]> var_1372_begin_0 = const()[name = tensor<string, []>("op_1372_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3840])];
+            tensor<int32, [4]> var_1372_end_0 = const()[name = tensor<string, []>("op_1372_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3968])];
+            tensor<bool, [4]> var_1372_end_mask_0 = const()[name = tensor<string, []>("op_1372_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1372_cast_fp16 = slice_by_index(begin = var_1372_begin_0, end = var_1372_end_0, end_mask = var_1372_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1372_cast_fp16")];
+            tensor<int32, [4]> var_1376_begin_0 = const()[name = tensor<string, []>("op_1376_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3968])];
+            tensor<int32, [4]> var_1376_end_0 = const()[name = tensor<string, []>("op_1376_end_0"), val = tensor<int32, [4]>([1, 512, 1, 4096])];
+            tensor<bool, [4]> var_1376_end_mask_0 = const()[name = tensor<string, []>("op_1376_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1376_cast_fp16 = slice_by_index(begin = var_1376_begin_0, end = var_1376_end_0, end_mask = var_1376_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_1376_cast_fp16")];
+            tensor<int32, [4]> var_1378_begin_0 = const()[name = tensor<string, []>("op_1378_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1378_end_0 = const()[name = tensor<string, []>("op_1378_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_1378_end_mask_0 = const()[name = tensor<string, []>("op_1378_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1378_cast_fp16 = slice_by_index(begin = var_1378_begin_0, end = var_1378_end_0, end_mask = var_1378_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1378_cast_fp16")];
+            tensor<int32, [4]> var_1382_begin_0 = const()[name = tensor<string, []>("op_1382_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_1382_end_0 = const()[name = tensor<string, []>("op_1382_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_1382_end_mask_0 = const()[name = tensor<string, []>("op_1382_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1382_cast_fp16 = slice_by_index(begin = var_1382_begin_0, end = var_1382_end_0, end_mask = var_1382_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1382_cast_fp16")];
+            tensor<int32, [4]> var_1386_begin_0 = const()[name = tensor<string, []>("op_1386_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_1386_end_0 = const()[name = tensor<string, []>("op_1386_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_1386_end_mask_0 = const()[name = tensor<string, []>("op_1386_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1386_cast_fp16 = slice_by_index(begin = var_1386_begin_0, end = var_1386_end_0, end_mask = var_1386_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1386_cast_fp16")];
+            tensor<int32, [4]> var_1390_begin_0 = const()[name = tensor<string, []>("op_1390_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_1390_end_0 = const()[name = tensor<string, []>("op_1390_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1390_end_mask_0 = const()[name = tensor<string, []>("op_1390_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1390_cast_fp16 = slice_by_index(begin = var_1390_begin_0, end = var_1390_end_0, end_mask = var_1390_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1390_cast_fp16")];
+            tensor<int32, [4]> var_1394_begin_0 = const()[name = tensor<string, []>("op_1394_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_1394_end_0 = const()[name = tensor<string, []>("op_1394_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_1394_end_mask_0 = const()[name = tensor<string, []>("op_1394_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1394_cast_fp16 = slice_by_index(begin = var_1394_begin_0, end = var_1394_end_0, end_mask = var_1394_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1394_cast_fp16")];
+            tensor<int32, [4]> var_1398_begin_0 = const()[name = tensor<string, []>("op_1398_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_1398_end_0 = const()[name = tensor<string, []>("op_1398_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_1398_end_mask_0 = const()[name = tensor<string, []>("op_1398_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1398_cast_fp16 = slice_by_index(begin = var_1398_begin_0, end = var_1398_end_0, end_mask = var_1398_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1398_cast_fp16")];
+            tensor<int32, [4]> var_1402_begin_0 = const()[name = tensor<string, []>("op_1402_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_1402_end_0 = const()[name = tensor<string, []>("op_1402_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_1402_end_mask_0 = const()[name = tensor<string, []>("op_1402_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1402_cast_fp16 = slice_by_index(begin = var_1402_begin_0, end = var_1402_end_0, end_mask = var_1402_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1402_cast_fp16")];
+            tensor<int32, [4]> var_1406_begin_0 = const()[name = tensor<string, []>("op_1406_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_1406_end_0 = const()[name = tensor<string, []>("op_1406_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_1406_end_mask_0 = const()[name = tensor<string, []>("op_1406_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1406_cast_fp16 = slice_by_index(begin = var_1406_begin_0, end = var_1406_end_0, end_mask = var_1406_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1406_cast_fp16")];
+            tensor<int32, [4]> var_1410_begin_0 = const()[name = tensor<string, []>("op_1410_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_1410_end_0 = const()[name = tensor<string, []>("op_1410_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 512])];
+            tensor<bool, [4]> var_1410_end_mask_0 = const()[name = tensor<string, []>("op_1410_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1410_cast_fp16 = slice_by_index(begin = var_1410_begin_0, end = var_1410_end_0, end_mask = var_1410_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1410_cast_fp16")];
+            tensor<int32, [4]> var_1414_begin_0 = const()[name = tensor<string, []>("op_1414_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_1414_end_0 = const()[name = tensor<string, []>("op_1414_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 512])];
+            tensor<bool, [4]> var_1414_end_mask_0 = const()[name = tensor<string, []>("op_1414_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1414_cast_fp16 = slice_by_index(begin = var_1414_begin_0, end = var_1414_end_0, end_mask = var_1414_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1414_cast_fp16")];
+            tensor<int32, [4]> var_1418_begin_0 = const()[name = tensor<string, []>("op_1418_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_1418_end_0 = const()[name = tensor<string, []>("op_1418_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 512])];
+            tensor<bool, [4]> var_1418_end_mask_0 = const()[name = tensor<string, []>("op_1418_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1418_cast_fp16 = slice_by_index(begin = var_1418_begin_0, end = var_1418_end_0, end_mask = var_1418_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1418_cast_fp16")];
+            tensor<int32, [4]> var_1422_begin_0 = const()[name = tensor<string, []>("op_1422_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_1422_end_0 = const()[name = tensor<string, []>("op_1422_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 512])];
+            tensor<bool, [4]> var_1422_end_mask_0 = const()[name = tensor<string, []>("op_1422_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1422_cast_fp16 = slice_by_index(begin = var_1422_begin_0, end = var_1422_end_0, end_mask = var_1422_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1422_cast_fp16")];
+            tensor<int32, [4]> var_1426_begin_0 = const()[name = tensor<string, []>("op_1426_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_1426_end_0 = const()[name = tensor<string, []>("op_1426_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 512])];
+            tensor<bool, [4]> var_1426_end_mask_0 = const()[name = tensor<string, []>("op_1426_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1426_cast_fp16 = slice_by_index(begin = var_1426_begin_0, end = var_1426_end_0, end_mask = var_1426_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1426_cast_fp16")];
+            tensor<int32, [4]> var_1430_begin_0 = const()[name = tensor<string, []>("op_1430_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_1430_end_0 = const()[name = tensor<string, []>("op_1430_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 512])];
+            tensor<bool, [4]> var_1430_end_mask_0 = const()[name = tensor<string, []>("op_1430_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1430_cast_fp16 = slice_by_index(begin = var_1430_begin_0, end = var_1430_end_0, end_mask = var_1430_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1430_cast_fp16")];
+            tensor<int32, [4]> var_1434_begin_0 = const()[name = tensor<string, []>("op_1434_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_1434_end_0 = const()[name = tensor<string, []>("op_1434_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 512])];
+            tensor<bool, [4]> var_1434_end_mask_0 = const()[name = tensor<string, []>("op_1434_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1434_cast_fp16 = slice_by_index(begin = var_1434_begin_0, end = var_1434_end_0, end_mask = var_1434_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1434_cast_fp16")];
+            tensor<int32, [4]> var_1438_begin_0 = const()[name = tensor<string, []>("op_1438_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_1438_end_0 = const()[name = tensor<string, []>("op_1438_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 512])];
+            tensor<bool, [4]> var_1438_end_mask_0 = const()[name = tensor<string, []>("op_1438_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1438_cast_fp16 = slice_by_index(begin = var_1438_begin_0, end = var_1438_end_0, end_mask = var_1438_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1438_cast_fp16")];
+            tensor<int32, [4]> var_1442_begin_0 = const()[name = tensor<string, []>("op_1442_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_1442_end_0 = const()[name = tensor<string, []>("op_1442_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 512])];
+            tensor<bool, [4]> var_1442_end_mask_0 = const()[name = tensor<string, []>("op_1442_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1442_cast_fp16 = slice_by_index(begin = var_1442_begin_0, end = var_1442_end_0, end_mask = var_1442_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1442_cast_fp16")];
+            tensor<int32, [4]> var_1446_begin_0 = const()[name = tensor<string, []>("op_1446_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_1446_end_0 = const()[name = tensor<string, []>("op_1446_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 512])];
+            tensor<bool, [4]> var_1446_end_mask_0 = const()[name = tensor<string, []>("op_1446_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1446_cast_fp16 = slice_by_index(begin = var_1446_begin_0, end = var_1446_end_0, end_mask = var_1446_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1446_cast_fp16")];
+            tensor<int32, [4]> var_1450_begin_0 = const()[name = tensor<string, []>("op_1450_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_1450_end_0 = const()[name = tensor<string, []>("op_1450_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 512])];
+            tensor<bool, [4]> var_1450_end_mask_0 = const()[name = tensor<string, []>("op_1450_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1450_cast_fp16 = slice_by_index(begin = var_1450_begin_0, end = var_1450_end_0, end_mask = var_1450_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1450_cast_fp16")];
+            tensor<int32, [4]> var_1454_begin_0 = const()[name = tensor<string, []>("op_1454_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_1454_end_0 = const()[name = tensor<string, []>("op_1454_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 512])];
+            tensor<bool, [4]> var_1454_end_mask_0 = const()[name = tensor<string, []>("op_1454_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1454_cast_fp16 = slice_by_index(begin = var_1454_begin_0, end = var_1454_end_0, end_mask = var_1454_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1454_cast_fp16")];
+            tensor<int32, [4]> var_1458_begin_0 = const()[name = tensor<string, []>("op_1458_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_1458_end_0 = const()[name = tensor<string, []>("op_1458_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 512])];
+            tensor<bool, [4]> var_1458_end_mask_0 = const()[name = tensor<string, []>("op_1458_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1458_cast_fp16 = slice_by_index(begin = var_1458_begin_0, end = var_1458_end_0, end_mask = var_1458_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1458_cast_fp16")];
+            tensor<int32, [4]> var_1462_begin_0 = const()[name = tensor<string, []>("op_1462_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_1462_end_0 = const()[name = tensor<string, []>("op_1462_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 512])];
+            tensor<bool, [4]> var_1462_end_mask_0 = const()[name = tensor<string, []>("op_1462_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1462_cast_fp16 = slice_by_index(begin = var_1462_begin_0, end = var_1462_end_0, end_mask = var_1462_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1462_cast_fp16")];
+            tensor<int32, [4]> var_1466_begin_0 = const()[name = tensor<string, []>("op_1466_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_1466_end_0 = const()[name = tensor<string, []>("op_1466_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 512])];
+            tensor<bool, [4]> var_1466_end_mask_0 = const()[name = tensor<string, []>("op_1466_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1466_cast_fp16 = slice_by_index(begin = var_1466_begin_0, end = var_1466_end_0, end_mask = var_1466_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1466_cast_fp16")];
+            tensor<int32, [4]> var_1470_begin_0 = const()[name = tensor<string, []>("op_1470_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_1470_end_0 = const()[name = tensor<string, []>("op_1470_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 512])];
+            tensor<bool, [4]> var_1470_end_mask_0 = const()[name = tensor<string, []>("op_1470_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1470_cast_fp16 = slice_by_index(begin = var_1470_begin_0, end = var_1470_end_0, end_mask = var_1470_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1470_cast_fp16")];
+            tensor<int32, [4]> var_1474_begin_0 = const()[name = tensor<string, []>("op_1474_begin_0"), val = tensor<int32, [4]>([0, 3072, 0, 0])];
+            tensor<int32, [4]> var_1474_end_0 = const()[name = tensor<string, []>("op_1474_end_0"), val = tensor<int32, [4]>([1, 3200, 1, 512])];
+            tensor<bool, [4]> var_1474_end_mask_0 = const()[name = tensor<string, []>("op_1474_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1474_cast_fp16 = slice_by_index(begin = var_1474_begin_0, end = var_1474_end_0, end_mask = var_1474_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1474_cast_fp16")];
+            tensor<int32, [4]> var_1478_begin_0 = const()[name = tensor<string, []>("op_1478_begin_0"), val = tensor<int32, [4]>([0, 3200, 0, 0])];
+            tensor<int32, [4]> var_1478_end_0 = const()[name = tensor<string, []>("op_1478_end_0"), val = tensor<int32, [4]>([1, 3328, 1, 512])];
+            tensor<bool, [4]> var_1478_end_mask_0 = const()[name = tensor<string, []>("op_1478_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1478_cast_fp16 = slice_by_index(begin = var_1478_begin_0, end = var_1478_end_0, end_mask = var_1478_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1478_cast_fp16")];
+            tensor<int32, [4]> var_1482_begin_0 = const()[name = tensor<string, []>("op_1482_begin_0"), val = tensor<int32, [4]>([0, 3328, 0, 0])];
+            tensor<int32, [4]> var_1482_end_0 = const()[name = tensor<string, []>("op_1482_end_0"), val = tensor<int32, [4]>([1, 3456, 1, 512])];
+            tensor<bool, [4]> var_1482_end_mask_0 = const()[name = tensor<string, []>("op_1482_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1482_cast_fp16 = slice_by_index(begin = var_1482_begin_0, end = var_1482_end_0, end_mask = var_1482_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1482_cast_fp16")];
+            tensor<int32, [4]> var_1486_begin_0 = const()[name = tensor<string, []>("op_1486_begin_0"), val = tensor<int32, [4]>([0, 3456, 0, 0])];
+            tensor<int32, [4]> var_1486_end_0 = const()[name = tensor<string, []>("op_1486_end_0"), val = tensor<int32, [4]>([1, 3584, 1, 512])];
+            tensor<bool, [4]> var_1486_end_mask_0 = const()[name = tensor<string, []>("op_1486_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1486_cast_fp16 = slice_by_index(begin = var_1486_begin_0, end = var_1486_end_0, end_mask = var_1486_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1486_cast_fp16")];
+            tensor<int32, [4]> var_1490_begin_0 = const()[name = tensor<string, []>("op_1490_begin_0"), val = tensor<int32, [4]>([0, 3584, 0, 0])];
+            tensor<int32, [4]> var_1490_end_0 = const()[name = tensor<string, []>("op_1490_end_0"), val = tensor<int32, [4]>([1, 3712, 1, 512])];
+            tensor<bool, [4]> var_1490_end_mask_0 = const()[name = tensor<string, []>("op_1490_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1490_cast_fp16 = slice_by_index(begin = var_1490_begin_0, end = var_1490_end_0, end_mask = var_1490_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1490_cast_fp16")];
+            tensor<int32, [4]> var_1494_begin_0 = const()[name = tensor<string, []>("op_1494_begin_0"), val = tensor<int32, [4]>([0, 3712, 0, 0])];
+            tensor<int32, [4]> var_1494_end_0 = const()[name = tensor<string, []>("op_1494_end_0"), val = tensor<int32, [4]>([1, 3840, 1, 512])];
+            tensor<bool, [4]> var_1494_end_mask_0 = const()[name = tensor<string, []>("op_1494_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1494_cast_fp16 = slice_by_index(begin = var_1494_begin_0, end = var_1494_end_0, end_mask = var_1494_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1494_cast_fp16")];
+            tensor<int32, [4]> var_1498_begin_0 = const()[name = tensor<string, []>("op_1498_begin_0"), val = tensor<int32, [4]>([0, 3840, 0, 0])];
+            tensor<int32, [4]> var_1498_end_0 = const()[name = tensor<string, []>("op_1498_end_0"), val = tensor<int32, [4]>([1, 3968, 1, 512])];
+            tensor<bool, [4]> var_1498_end_mask_0 = const()[name = tensor<string, []>("op_1498_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1498_cast_fp16 = slice_by_index(begin = var_1498_begin_0, end = var_1498_end_0, end_mask = var_1498_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1498_cast_fp16")];
+            tensor<int32, [4]> var_1502_begin_0 = const()[name = tensor<string, []>("op_1502_begin_0"), val = tensor<int32, [4]>([0, 3968, 0, 0])];
+            tensor<int32, [4]> var_1502_end_0 = const()[name = tensor<string, []>("op_1502_end_0"), val = tensor<int32, [4]>([1, 4096, 1, 512])];
+            tensor<bool, [4]> var_1502_end_mask_0 = const()[name = tensor<string, []>("op_1502_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1502_cast_fp16 = slice_by_index(begin = var_1502_begin_0, end = var_1502_end_0, end_mask = var_1502_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1502_cast_fp16")];
+            tensor<string, []> var_1506_equation_0 = const()[name = tensor<string, []>("op_1506_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1506_cast_fp16 = einsum(equation = var_1506_equation_0, values = (var_1252_cast_fp16, var_1122_cast_fp16))[name = tensor<string, []>("op_1506_cast_fp16")];
+            tensor<fp16, []> var_1507_to_fp16 = const()[name = tensor<string, []>("op_1507_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1508_cast_fp16 = mul(x = var_1506_cast_fp16, y = var_1507_to_fp16)[name = tensor<string, []>("op_1508_cast_fp16")];
+            tensor<string, []> var_1510_equation_0 = const()[name = tensor<string, []>("op_1510_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1510_cast_fp16 = einsum(equation = var_1510_equation_0, values = (var_1256_cast_fp16, var_1126_cast_fp16))[name = tensor<string, []>("op_1510_cast_fp16")];
+            tensor<fp16, []> var_1511_to_fp16 = const()[name = tensor<string, []>("op_1511_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1512_cast_fp16 = mul(x = var_1510_cast_fp16, y = var_1511_to_fp16)[name = tensor<string, []>("op_1512_cast_fp16")];
+            tensor<string, []> var_1514_equation_0 = const()[name = tensor<string, []>("op_1514_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1514_cast_fp16 = einsum(equation = var_1514_equation_0, values = (var_1260_cast_fp16, var_1130_cast_fp16))[name = tensor<string, []>("op_1514_cast_fp16")];
+            tensor<fp16, []> var_1515_to_fp16 = const()[name = tensor<string, []>("op_1515_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1516_cast_fp16 = mul(x = var_1514_cast_fp16, y = var_1515_to_fp16)[name = tensor<string, []>("op_1516_cast_fp16")];
+            tensor<string, []> var_1518_equation_0 = const()[name = tensor<string, []>("op_1518_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1518_cast_fp16 = einsum(equation = var_1518_equation_0, values = (var_1264_cast_fp16, var_1134_cast_fp16))[name = tensor<string, []>("op_1518_cast_fp16")];
+            tensor<fp16, []> var_1519_to_fp16 = const()[name = tensor<string, []>("op_1519_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1520_cast_fp16 = mul(x = var_1518_cast_fp16, y = var_1519_to_fp16)[name = tensor<string, []>("op_1520_cast_fp16")];
+            tensor<string, []> var_1522_equation_0 = const()[name = tensor<string, []>("op_1522_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1522_cast_fp16 = einsum(equation = var_1522_equation_0, values = (var_1268_cast_fp16, var_1138_cast_fp16))[name = tensor<string, []>("op_1522_cast_fp16")];
+            tensor<fp16, []> var_1523_to_fp16 = const()[name = tensor<string, []>("op_1523_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1524_cast_fp16 = mul(x = var_1522_cast_fp16, y = var_1523_to_fp16)[name = tensor<string, []>("op_1524_cast_fp16")];
+            tensor<string, []> var_1526_equation_0 = const()[name = tensor<string, []>("op_1526_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1526_cast_fp16 = einsum(equation = var_1526_equation_0, values = (var_1272_cast_fp16, var_1142_cast_fp16))[name = tensor<string, []>("op_1526_cast_fp16")];
+            tensor<fp16, []> var_1527_to_fp16 = const()[name = tensor<string, []>("op_1527_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1528_cast_fp16 = mul(x = var_1526_cast_fp16, y = var_1527_to_fp16)[name = tensor<string, []>("op_1528_cast_fp16")];
+            tensor<string, []> var_1530_equation_0 = const()[name = tensor<string, []>("op_1530_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1530_cast_fp16 = einsum(equation = var_1530_equation_0, values = (var_1276_cast_fp16, var_1146_cast_fp16))[name = tensor<string, []>("op_1530_cast_fp16")];
+            tensor<fp16, []> var_1531_to_fp16 = const()[name = tensor<string, []>("op_1531_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1532_cast_fp16 = mul(x = var_1530_cast_fp16, y = var_1531_to_fp16)[name = tensor<string, []>("op_1532_cast_fp16")];
+            tensor<string, []> var_1534_equation_0 = const()[name = tensor<string, []>("op_1534_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1534_cast_fp16 = einsum(equation = var_1534_equation_0, values = (var_1280_cast_fp16, var_1150_cast_fp16))[name = tensor<string, []>("op_1534_cast_fp16")];
+            tensor<fp16, []> var_1535_to_fp16 = const()[name = tensor<string, []>("op_1535_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1536_cast_fp16 = mul(x = var_1534_cast_fp16, y = var_1535_to_fp16)[name = tensor<string, []>("op_1536_cast_fp16")];
+            tensor<string, []> var_1538_equation_0 = const()[name = tensor<string, []>("op_1538_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1538_cast_fp16 = einsum(equation = var_1538_equation_0, values = (var_1284_cast_fp16, var_1154_cast_fp16))[name = tensor<string, []>("op_1538_cast_fp16")];
+            tensor<fp16, []> var_1539_to_fp16 = const()[name = tensor<string, []>("op_1539_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1540_cast_fp16 = mul(x = var_1538_cast_fp16, y = var_1539_to_fp16)[name = tensor<string, []>("op_1540_cast_fp16")];
+            tensor<string, []> var_1542_equation_0 = const()[name = tensor<string, []>("op_1542_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1542_cast_fp16 = einsum(equation = var_1542_equation_0, values = (var_1288_cast_fp16, var_1158_cast_fp16))[name = tensor<string, []>("op_1542_cast_fp16")];
+            tensor<fp16, []> var_1543_to_fp16 = const()[name = tensor<string, []>("op_1543_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1544_cast_fp16 = mul(x = var_1542_cast_fp16, y = var_1543_to_fp16)[name = tensor<string, []>("op_1544_cast_fp16")];
+            tensor<string, []> var_1546_equation_0 = const()[name = tensor<string, []>("op_1546_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1546_cast_fp16 = einsum(equation = var_1546_equation_0, values = (var_1292_cast_fp16, var_1162_cast_fp16))[name = tensor<string, []>("op_1546_cast_fp16")];
+            tensor<fp16, []> var_1547_to_fp16 = const()[name = tensor<string, []>("op_1547_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1548_cast_fp16 = mul(x = var_1546_cast_fp16, y = var_1547_to_fp16)[name = tensor<string, []>("op_1548_cast_fp16")];
+            tensor<string, []> var_1550_equation_0 = const()[name = tensor<string, []>("op_1550_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1550_cast_fp16 = einsum(equation = var_1550_equation_0, values = (var_1296_cast_fp16, var_1166_cast_fp16))[name = tensor<string, []>("op_1550_cast_fp16")];
+            tensor<fp16, []> var_1551_to_fp16 = const()[name = tensor<string, []>("op_1551_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1552_cast_fp16 = mul(x = var_1550_cast_fp16, y = var_1551_to_fp16)[name = tensor<string, []>("op_1552_cast_fp16")];
+            tensor<string, []> var_1554_equation_0 = const()[name = tensor<string, []>("op_1554_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1554_cast_fp16 = einsum(equation = var_1554_equation_0, values = (var_1300_cast_fp16, var_1170_cast_fp16))[name = tensor<string, []>("op_1554_cast_fp16")];
+            tensor<fp16, []> var_1555_to_fp16 = const()[name = tensor<string, []>("op_1555_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1556_cast_fp16 = mul(x = var_1554_cast_fp16, y = var_1555_to_fp16)[name = tensor<string, []>("op_1556_cast_fp16")];
+            tensor<string, []> var_1558_equation_0 = const()[name = tensor<string, []>("op_1558_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1558_cast_fp16 = einsum(equation = var_1558_equation_0, values = (var_1304_cast_fp16, var_1174_cast_fp16))[name = tensor<string, []>("op_1558_cast_fp16")];
+            tensor<fp16, []> var_1559_to_fp16 = const()[name = tensor<string, []>("op_1559_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1560_cast_fp16 = mul(x = var_1558_cast_fp16, y = var_1559_to_fp16)[name = tensor<string, []>("op_1560_cast_fp16")];
+            tensor<string, []> var_1562_equation_0 = const()[name = tensor<string, []>("op_1562_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1562_cast_fp16 = einsum(equation = var_1562_equation_0, values = (var_1308_cast_fp16, var_1178_cast_fp16))[name = tensor<string, []>("op_1562_cast_fp16")];
+            tensor<fp16, []> var_1563_to_fp16 = const()[name = tensor<string, []>("op_1563_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1564_cast_fp16 = mul(x = var_1562_cast_fp16, y = var_1563_to_fp16)[name = tensor<string, []>("op_1564_cast_fp16")];
+            tensor<string, []> var_1566_equation_0 = const()[name = tensor<string, []>("op_1566_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1566_cast_fp16 = einsum(equation = var_1566_equation_0, values = (var_1312_cast_fp16, var_1182_cast_fp16))[name = tensor<string, []>("op_1566_cast_fp16")];
+            tensor<fp16, []> var_1567_to_fp16 = const()[name = tensor<string, []>("op_1567_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1568_cast_fp16 = mul(x = var_1566_cast_fp16, y = var_1567_to_fp16)[name = tensor<string, []>("op_1568_cast_fp16")];
+            tensor<string, []> var_1570_equation_0 = const()[name = tensor<string, []>("op_1570_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1570_cast_fp16 = einsum(equation = var_1570_equation_0, values = (var_1316_cast_fp16, var_1186_cast_fp16))[name = tensor<string, []>("op_1570_cast_fp16")];
+            tensor<fp16, []> var_1571_to_fp16 = const()[name = tensor<string, []>("op_1571_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1572_cast_fp16 = mul(x = var_1570_cast_fp16, y = var_1571_to_fp16)[name = tensor<string, []>("op_1572_cast_fp16")];
+            tensor<string, []> var_1574_equation_0 = const()[name = tensor<string, []>("op_1574_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1574_cast_fp16 = einsum(equation = var_1574_equation_0, values = (var_1320_cast_fp16, var_1190_cast_fp16))[name = tensor<string, []>("op_1574_cast_fp16")];
+            tensor<fp16, []> var_1575_to_fp16 = const()[name = tensor<string, []>("op_1575_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1576_cast_fp16 = mul(x = var_1574_cast_fp16, y = var_1575_to_fp16)[name = tensor<string, []>("op_1576_cast_fp16")];
+            tensor<string, []> var_1578_equation_0 = const()[name = tensor<string, []>("op_1578_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1578_cast_fp16 = einsum(equation = var_1578_equation_0, values = (var_1324_cast_fp16, var_1194_cast_fp16))[name = tensor<string, []>("op_1578_cast_fp16")];
+            tensor<fp16, []> var_1579_to_fp16 = const()[name = tensor<string, []>("op_1579_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1580_cast_fp16 = mul(x = var_1578_cast_fp16, y = var_1579_to_fp16)[name = tensor<string, []>("op_1580_cast_fp16")];
+            tensor<string, []> var_1582_equation_0 = const()[name = tensor<string, []>("op_1582_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1582_cast_fp16 = einsum(equation = var_1582_equation_0, values = (var_1328_cast_fp16, var_1198_cast_fp16))[name = tensor<string, []>("op_1582_cast_fp16")];
+            tensor<fp16, []> var_1583_to_fp16 = const()[name = tensor<string, []>("op_1583_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1584_cast_fp16 = mul(x = var_1582_cast_fp16, y = var_1583_to_fp16)[name = tensor<string, []>("op_1584_cast_fp16")];
+            tensor<string, []> var_1586_equation_0 = const()[name = tensor<string, []>("op_1586_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1586_cast_fp16 = einsum(equation = var_1586_equation_0, values = (var_1332_cast_fp16, var_1202_cast_fp16))[name = tensor<string, []>("op_1586_cast_fp16")];
+            tensor<fp16, []> var_1587_to_fp16 = const()[name = tensor<string, []>("op_1587_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1588_cast_fp16 = mul(x = var_1586_cast_fp16, y = var_1587_to_fp16)[name = tensor<string, []>("op_1588_cast_fp16")];
+            tensor<string, []> var_1590_equation_0 = const()[name = tensor<string, []>("op_1590_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1590_cast_fp16 = einsum(equation = var_1590_equation_0, values = (var_1336_cast_fp16, var_1206_cast_fp16))[name = tensor<string, []>("op_1590_cast_fp16")];
+            tensor<fp16, []> var_1591_to_fp16 = const()[name = tensor<string, []>("op_1591_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1592_cast_fp16 = mul(x = var_1590_cast_fp16, y = var_1591_to_fp16)[name = tensor<string, []>("op_1592_cast_fp16")];
+            tensor<string, []> var_1594_equation_0 = const()[name = tensor<string, []>("op_1594_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1594_cast_fp16 = einsum(equation = var_1594_equation_0, values = (var_1340_cast_fp16, var_1210_cast_fp16))[name = tensor<string, []>("op_1594_cast_fp16")];
+            tensor<fp16, []> var_1595_to_fp16 = const()[name = tensor<string, []>("op_1595_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1596_cast_fp16 = mul(x = var_1594_cast_fp16, y = var_1595_to_fp16)[name = tensor<string, []>("op_1596_cast_fp16")];
+            tensor<string, []> var_1598_equation_0 = const()[name = tensor<string, []>("op_1598_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1598_cast_fp16 = einsum(equation = var_1598_equation_0, values = (var_1344_cast_fp16, var_1214_cast_fp16))[name = tensor<string, []>("op_1598_cast_fp16")];
+            tensor<fp16, []> var_1599_to_fp16 = const()[name = tensor<string, []>("op_1599_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1600_cast_fp16 = mul(x = var_1598_cast_fp16, y = var_1599_to_fp16)[name = tensor<string, []>("op_1600_cast_fp16")];
+            tensor<string, []> var_1602_equation_0 = const()[name = tensor<string, []>("op_1602_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1602_cast_fp16 = einsum(equation = var_1602_equation_0, values = (var_1348_cast_fp16, var_1218_cast_fp16))[name = tensor<string, []>("op_1602_cast_fp16")];
+            tensor<fp16, []> var_1603_to_fp16 = const()[name = tensor<string, []>("op_1603_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1604_cast_fp16 = mul(x = var_1602_cast_fp16, y = var_1603_to_fp16)[name = tensor<string, []>("op_1604_cast_fp16")];
+            tensor<string, []> var_1606_equation_0 = const()[name = tensor<string, []>("op_1606_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1606_cast_fp16 = einsum(equation = var_1606_equation_0, values = (var_1352_cast_fp16, var_1222_cast_fp16))[name = tensor<string, []>("op_1606_cast_fp16")];
+            tensor<fp16, []> var_1607_to_fp16 = const()[name = tensor<string, []>("op_1607_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1608_cast_fp16 = mul(x = var_1606_cast_fp16, y = var_1607_to_fp16)[name = tensor<string, []>("op_1608_cast_fp16")];
+            tensor<string, []> var_1610_equation_0 = const()[name = tensor<string, []>("op_1610_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1610_cast_fp16 = einsum(equation = var_1610_equation_0, values = (var_1356_cast_fp16, var_1226_cast_fp16))[name = tensor<string, []>("op_1610_cast_fp16")];
+            tensor<fp16, []> var_1611_to_fp16 = const()[name = tensor<string, []>("op_1611_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1612_cast_fp16 = mul(x = var_1610_cast_fp16, y = var_1611_to_fp16)[name = tensor<string, []>("op_1612_cast_fp16")];
+            tensor<string, []> var_1614_equation_0 = const()[name = tensor<string, []>("op_1614_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1614_cast_fp16 = einsum(equation = var_1614_equation_0, values = (var_1360_cast_fp16, var_1230_cast_fp16))[name = tensor<string, []>("op_1614_cast_fp16")];
+            tensor<fp16, []> var_1615_to_fp16 = const()[name = tensor<string, []>("op_1615_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1616_cast_fp16 = mul(x = var_1614_cast_fp16, y = var_1615_to_fp16)[name = tensor<string, []>("op_1616_cast_fp16")];
+            tensor<string, []> var_1618_equation_0 = const()[name = tensor<string, []>("op_1618_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1618_cast_fp16 = einsum(equation = var_1618_equation_0, values = (var_1364_cast_fp16, var_1234_cast_fp16))[name = tensor<string, []>("op_1618_cast_fp16")];
+            tensor<fp16, []> var_1619_to_fp16 = const()[name = tensor<string, []>("op_1619_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1620_cast_fp16 = mul(x = var_1618_cast_fp16, y = var_1619_to_fp16)[name = tensor<string, []>("op_1620_cast_fp16")];
+            tensor<string, []> var_1622_equation_0 = const()[name = tensor<string, []>("op_1622_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1622_cast_fp16 = einsum(equation = var_1622_equation_0, values = (var_1368_cast_fp16, var_1238_cast_fp16))[name = tensor<string, []>("op_1622_cast_fp16")];
+            tensor<fp16, []> var_1623_to_fp16 = const()[name = tensor<string, []>("op_1623_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1624_cast_fp16 = mul(x = var_1622_cast_fp16, y = var_1623_to_fp16)[name = tensor<string, []>("op_1624_cast_fp16")];
+            tensor<string, []> var_1626_equation_0 = const()[name = tensor<string, []>("op_1626_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1626_cast_fp16 = einsum(equation = var_1626_equation_0, values = (var_1372_cast_fp16, var_1242_cast_fp16))[name = tensor<string, []>("op_1626_cast_fp16")];
+            tensor<fp16, []> var_1627_to_fp16 = const()[name = tensor<string, []>("op_1627_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1628_cast_fp16 = mul(x = var_1626_cast_fp16, y = var_1627_to_fp16)[name = tensor<string, []>("op_1628_cast_fp16")];
+            tensor<string, []> var_1630_equation_0 = const()[name = tensor<string, []>("op_1630_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1630_cast_fp16 = einsum(equation = var_1630_equation_0, values = (var_1376_cast_fp16, var_1246_cast_fp16))[name = tensor<string, []>("op_1630_cast_fp16")];
+            tensor<fp16, []> var_1631_to_fp16 = const()[name = tensor<string, []>("op_1631_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1632_cast_fp16 = mul(x = var_1630_cast_fp16, y = var_1631_to_fp16)[name = tensor<string, []>("op_1632_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_65_cast_fp16 = add(x = var_1508_cast_fp16, y = mask)[name = tensor<string, []>("aw_65_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_67_cast_fp16 = add(x = var_1512_cast_fp16, y = mask)[name = tensor<string, []>("aw_67_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_69_cast_fp16 = add(x = var_1516_cast_fp16, y = mask)[name = tensor<string, []>("aw_69_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_71_cast_fp16 = add(x = var_1520_cast_fp16, y = mask)[name = tensor<string, []>("aw_71_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_73_cast_fp16 = add(x = var_1524_cast_fp16, y = mask)[name = tensor<string, []>("aw_73_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_75_cast_fp16 = add(x = var_1528_cast_fp16, y = mask)[name = tensor<string, []>("aw_75_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_77_cast_fp16 = add(x = var_1532_cast_fp16, y = mask)[name = tensor<string, []>("aw_77_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_79_cast_fp16 = add(x = var_1536_cast_fp16, y = mask)[name = tensor<string, []>("aw_79_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_81_cast_fp16 = add(x = var_1540_cast_fp16, y = mask)[name = tensor<string, []>("aw_81_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_83_cast_fp16 = add(x = var_1544_cast_fp16, y = mask)[name = tensor<string, []>("aw_83_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_85_cast_fp16 = add(x = var_1548_cast_fp16, y = mask)[name = tensor<string, []>("aw_85_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_87_cast_fp16 = add(x = var_1552_cast_fp16, y = mask)[name = tensor<string, []>("aw_87_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_89_cast_fp16 = add(x = var_1556_cast_fp16, y = mask)[name = tensor<string, []>("aw_89_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_91_cast_fp16 = add(x = var_1560_cast_fp16, y = mask)[name = tensor<string, []>("aw_91_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_93_cast_fp16 = add(x = var_1564_cast_fp16, y = mask)[name = tensor<string, []>("aw_93_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_95_cast_fp16 = add(x = var_1568_cast_fp16, y = mask)[name = tensor<string, []>("aw_95_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_97_cast_fp16 = add(x = var_1572_cast_fp16, y = mask)[name = tensor<string, []>("aw_97_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_99_cast_fp16 = add(x = var_1576_cast_fp16, y = mask)[name = tensor<string, []>("aw_99_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_101_cast_fp16 = add(x = var_1580_cast_fp16, y = mask)[name = tensor<string, []>("aw_101_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_103_cast_fp16 = add(x = var_1584_cast_fp16, y = mask)[name = tensor<string, []>("aw_103_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_105_cast_fp16 = add(x = var_1588_cast_fp16, y = mask)[name = tensor<string, []>("aw_105_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_107_cast_fp16 = add(x = var_1592_cast_fp16, y = mask)[name = tensor<string, []>("aw_107_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_109_cast_fp16 = add(x = var_1596_cast_fp16, y = mask)[name = tensor<string, []>("aw_109_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_111_cast_fp16 = add(x = var_1600_cast_fp16, y = mask)[name = tensor<string, []>("aw_111_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_113_cast_fp16 = add(x = var_1604_cast_fp16, y = mask)[name = tensor<string, []>("aw_113_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_115_cast_fp16 = add(x = var_1608_cast_fp16, y = mask)[name = tensor<string, []>("aw_115_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_117_cast_fp16 = add(x = var_1612_cast_fp16, y = mask)[name = tensor<string, []>("aw_117_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_119_cast_fp16 = add(x = var_1616_cast_fp16, y = mask)[name = tensor<string, []>("aw_119_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_121_cast_fp16 = add(x = var_1620_cast_fp16, y = mask)[name = tensor<string, []>("aw_121_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_123_cast_fp16 = add(x = var_1624_cast_fp16, y = mask)[name = tensor<string, []>("aw_123_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_125_cast_fp16 = add(x = var_1628_cast_fp16, y = mask)[name = tensor<string, []>("aw_125_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_127_cast_fp16 = add(x = var_1632_cast_fp16, y = mask)[name = tensor<string, []>("aw_127_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1665_cast_fp16 = softmax(axis = var_974, x = aw_65_cast_fp16)[name = tensor<string, []>("op_1665_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1666_cast_fp16 = softmax(axis = var_974, x = aw_67_cast_fp16)[name = tensor<string, []>("op_1666_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1667_cast_fp16 = softmax(axis = var_974, x = aw_69_cast_fp16)[name = tensor<string, []>("op_1667_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1668_cast_fp16 = softmax(axis = var_974, x = aw_71_cast_fp16)[name = tensor<string, []>("op_1668_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1669_cast_fp16 = softmax(axis = var_974, x = aw_73_cast_fp16)[name = tensor<string, []>("op_1669_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1670_cast_fp16 = softmax(axis = var_974, x = aw_75_cast_fp16)[name = tensor<string, []>("op_1670_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1671_cast_fp16 = softmax(axis = var_974, x = aw_77_cast_fp16)[name = tensor<string, []>("op_1671_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1672_cast_fp16 = softmax(axis = var_974, x = aw_79_cast_fp16)[name = tensor<string, []>("op_1672_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1673_cast_fp16 = softmax(axis = var_974, x = aw_81_cast_fp16)[name = tensor<string, []>("op_1673_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1674_cast_fp16 = softmax(axis = var_974, x = aw_83_cast_fp16)[name = tensor<string, []>("op_1674_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1675_cast_fp16 = softmax(axis = var_974, x = aw_85_cast_fp16)[name = tensor<string, []>("op_1675_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1676_cast_fp16 = softmax(axis = var_974, x = aw_87_cast_fp16)[name = tensor<string, []>("op_1676_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1677_cast_fp16 = softmax(axis = var_974, x = aw_89_cast_fp16)[name = tensor<string, []>("op_1677_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1678_cast_fp16 = softmax(axis = var_974, x = aw_91_cast_fp16)[name = tensor<string, []>("op_1678_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1679_cast_fp16 = softmax(axis = var_974, x = aw_93_cast_fp16)[name = tensor<string, []>("op_1679_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1680_cast_fp16 = softmax(axis = var_974, x = aw_95_cast_fp16)[name = tensor<string, []>("op_1680_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1681_cast_fp16 = softmax(axis = var_974, x = aw_97_cast_fp16)[name = tensor<string, []>("op_1681_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1682_cast_fp16 = softmax(axis = var_974, x = aw_99_cast_fp16)[name = tensor<string, []>("op_1682_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1683_cast_fp16 = softmax(axis = var_974, x = aw_101_cast_fp16)[name = tensor<string, []>("op_1683_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1684_cast_fp16 = softmax(axis = var_974, x = aw_103_cast_fp16)[name = tensor<string, []>("op_1684_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1685_cast_fp16 = softmax(axis = var_974, x = aw_105_cast_fp16)[name = tensor<string, []>("op_1685_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1686_cast_fp16 = softmax(axis = var_974, x = aw_107_cast_fp16)[name = tensor<string, []>("op_1686_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1687_cast_fp16 = softmax(axis = var_974, x = aw_109_cast_fp16)[name = tensor<string, []>("op_1687_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1688_cast_fp16 = softmax(axis = var_974, x = aw_111_cast_fp16)[name = tensor<string, []>("op_1688_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1689_cast_fp16 = softmax(axis = var_974, x = aw_113_cast_fp16)[name = tensor<string, []>("op_1689_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1690_cast_fp16 = softmax(axis = var_974, x = aw_115_cast_fp16)[name = tensor<string, []>("op_1690_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1691_cast_fp16 = softmax(axis = var_974, x = aw_117_cast_fp16)[name = tensor<string, []>("op_1691_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1692_cast_fp16 = softmax(axis = var_974, x = aw_119_cast_fp16)[name = tensor<string, []>("op_1692_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1693_cast_fp16 = softmax(axis = var_974, x = aw_121_cast_fp16)[name = tensor<string, []>("op_1693_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1694_cast_fp16 = softmax(axis = var_974, x = aw_123_cast_fp16)[name = tensor<string, []>("op_1694_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1695_cast_fp16 = softmax(axis = var_974, x = aw_125_cast_fp16)[name = tensor<string, []>("op_1695_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1696_cast_fp16 = softmax(axis = var_974, x = aw_127_cast_fp16)[name = tensor<string, []>("op_1696_cast_fp16")];
+            tensor<string, []> var_1698_equation_0 = const()[name = tensor<string, []>("op_1698_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1698_cast_fp16 = einsum(equation = var_1698_equation_0, values = (var_1378_cast_fp16, var_1665_cast_fp16))[name = tensor<string, []>("op_1698_cast_fp16")];
+            tensor<string, []> var_1700_equation_0 = const()[name = tensor<string, []>("op_1700_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1700_cast_fp16 = einsum(equation = var_1700_equation_0, values = (var_1382_cast_fp16, var_1666_cast_fp16))[name = tensor<string, []>("op_1700_cast_fp16")];
+            tensor<string, []> var_1702_equation_0 = const()[name = tensor<string, []>("op_1702_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1702_cast_fp16 = einsum(equation = var_1702_equation_0, values = (var_1386_cast_fp16, var_1667_cast_fp16))[name = tensor<string, []>("op_1702_cast_fp16")];
+            tensor<string, []> var_1704_equation_0 = const()[name = tensor<string, []>("op_1704_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1704_cast_fp16 = einsum(equation = var_1704_equation_0, values = (var_1390_cast_fp16, var_1668_cast_fp16))[name = tensor<string, []>("op_1704_cast_fp16")];
+            tensor<string, []> var_1706_equation_0 = const()[name = tensor<string, []>("op_1706_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1706_cast_fp16 = einsum(equation = var_1706_equation_0, values = (var_1394_cast_fp16, var_1669_cast_fp16))[name = tensor<string, []>("op_1706_cast_fp16")];
+            tensor<string, []> var_1708_equation_0 = const()[name = tensor<string, []>("op_1708_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1708_cast_fp16 = einsum(equation = var_1708_equation_0, values = (var_1398_cast_fp16, var_1670_cast_fp16))[name = tensor<string, []>("op_1708_cast_fp16")];
+            tensor<string, []> var_1710_equation_0 = const()[name = tensor<string, []>("op_1710_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1710_cast_fp16 = einsum(equation = var_1710_equation_0, values = (var_1402_cast_fp16, var_1671_cast_fp16))[name = tensor<string, []>("op_1710_cast_fp16")];
+            tensor<string, []> var_1712_equation_0 = const()[name = tensor<string, []>("op_1712_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1712_cast_fp16 = einsum(equation = var_1712_equation_0, values = (var_1406_cast_fp16, var_1672_cast_fp16))[name = tensor<string, []>("op_1712_cast_fp16")];
+            tensor<string, []> var_1714_equation_0 = const()[name = tensor<string, []>("op_1714_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1714_cast_fp16 = einsum(equation = var_1714_equation_0, values = (var_1410_cast_fp16, var_1673_cast_fp16))[name = tensor<string, []>("op_1714_cast_fp16")];
+            tensor<string, []> var_1716_equation_0 = const()[name = tensor<string, []>("op_1716_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1716_cast_fp16 = einsum(equation = var_1716_equation_0, values = (var_1414_cast_fp16, var_1674_cast_fp16))[name = tensor<string, []>("op_1716_cast_fp16")];
+            tensor<string, []> var_1718_equation_0 = const()[name = tensor<string, []>("op_1718_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1718_cast_fp16 = einsum(equation = var_1718_equation_0, values = (var_1418_cast_fp16, var_1675_cast_fp16))[name = tensor<string, []>("op_1718_cast_fp16")];
+            tensor<string, []> var_1720_equation_0 = const()[name = tensor<string, []>("op_1720_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1720_cast_fp16 = einsum(equation = var_1720_equation_0, values = (var_1422_cast_fp16, var_1676_cast_fp16))[name = tensor<string, []>("op_1720_cast_fp16")];
+            tensor<string, []> var_1722_equation_0 = const()[name = tensor<string, []>("op_1722_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1722_cast_fp16 = einsum(equation = var_1722_equation_0, values = (var_1426_cast_fp16, var_1677_cast_fp16))[name = tensor<string, []>("op_1722_cast_fp16")];
+            tensor<string, []> var_1724_equation_0 = const()[name = tensor<string, []>("op_1724_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1724_cast_fp16 = einsum(equation = var_1724_equation_0, values = (var_1430_cast_fp16, var_1678_cast_fp16))[name = tensor<string, []>("op_1724_cast_fp16")];
+            tensor<string, []> var_1726_equation_0 = const()[name = tensor<string, []>("op_1726_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1726_cast_fp16 = einsum(equation = var_1726_equation_0, values = (var_1434_cast_fp16, var_1679_cast_fp16))[name = tensor<string, []>("op_1726_cast_fp16")];
+            tensor<string, []> var_1728_equation_0 = const()[name = tensor<string, []>("op_1728_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1728_cast_fp16 = einsum(equation = var_1728_equation_0, values = (var_1438_cast_fp16, var_1680_cast_fp16))[name = tensor<string, []>("op_1728_cast_fp16")];
+            tensor<string, []> var_1730_equation_0 = const()[name = tensor<string, []>("op_1730_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1730_cast_fp16 = einsum(equation = var_1730_equation_0, values = (var_1442_cast_fp16, var_1681_cast_fp16))[name = tensor<string, []>("op_1730_cast_fp16")];
+            tensor<string, []> var_1732_equation_0 = const()[name = tensor<string, []>("op_1732_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1732_cast_fp16 = einsum(equation = var_1732_equation_0, values = (var_1446_cast_fp16, var_1682_cast_fp16))[name = tensor<string, []>("op_1732_cast_fp16")];
+            tensor<string, []> var_1734_equation_0 = const()[name = tensor<string, []>("op_1734_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1734_cast_fp16 = einsum(equation = var_1734_equation_0, values = (var_1450_cast_fp16, var_1683_cast_fp16))[name = tensor<string, []>("op_1734_cast_fp16")];
+            tensor<string, []> var_1736_equation_0 = const()[name = tensor<string, []>("op_1736_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1736_cast_fp16 = einsum(equation = var_1736_equation_0, values = (var_1454_cast_fp16, var_1684_cast_fp16))[name = tensor<string, []>("op_1736_cast_fp16")];
+            tensor<string, []> var_1738_equation_0 = const()[name = tensor<string, []>("op_1738_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1738_cast_fp16 = einsum(equation = var_1738_equation_0, values = (var_1458_cast_fp16, var_1685_cast_fp16))[name = tensor<string, []>("op_1738_cast_fp16")];
+            tensor<string, []> var_1740_equation_0 = const()[name = tensor<string, []>("op_1740_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1740_cast_fp16 = einsum(equation = var_1740_equation_0, values = (var_1462_cast_fp16, var_1686_cast_fp16))[name = tensor<string, []>("op_1740_cast_fp16")];
+            tensor<string, []> var_1742_equation_0 = const()[name = tensor<string, []>("op_1742_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1742_cast_fp16 = einsum(equation = var_1742_equation_0, values = (var_1466_cast_fp16, var_1687_cast_fp16))[name = tensor<string, []>("op_1742_cast_fp16")];
+            tensor<string, []> var_1744_equation_0 = const()[name = tensor<string, []>("op_1744_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1744_cast_fp16 = einsum(equation = var_1744_equation_0, values = (var_1470_cast_fp16, var_1688_cast_fp16))[name = tensor<string, []>("op_1744_cast_fp16")];
+            tensor<string, []> var_1746_equation_0 = const()[name = tensor<string, []>("op_1746_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1746_cast_fp16 = einsum(equation = var_1746_equation_0, values = (var_1474_cast_fp16, var_1689_cast_fp16))[name = tensor<string, []>("op_1746_cast_fp16")];
+            tensor<string, []> var_1748_equation_0 = const()[name = tensor<string, []>("op_1748_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1748_cast_fp16 = einsum(equation = var_1748_equation_0, values = (var_1478_cast_fp16, var_1690_cast_fp16))[name = tensor<string, []>("op_1748_cast_fp16")];
+            tensor<string, []> var_1750_equation_0 = const()[name = tensor<string, []>("op_1750_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1750_cast_fp16 = einsum(equation = var_1750_equation_0, values = (var_1482_cast_fp16, var_1691_cast_fp16))[name = tensor<string, []>("op_1750_cast_fp16")];
+            tensor<string, []> var_1752_equation_0 = const()[name = tensor<string, []>("op_1752_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1752_cast_fp16 = einsum(equation = var_1752_equation_0, values = (var_1486_cast_fp16, var_1692_cast_fp16))[name = tensor<string, []>("op_1752_cast_fp16")];
+            tensor<string, []> var_1754_equation_0 = const()[name = tensor<string, []>("op_1754_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1754_cast_fp16 = einsum(equation = var_1754_equation_0, values = (var_1490_cast_fp16, var_1693_cast_fp16))[name = tensor<string, []>("op_1754_cast_fp16")];
+            tensor<string, []> var_1756_equation_0 = const()[name = tensor<string, []>("op_1756_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1756_cast_fp16 = einsum(equation = var_1756_equation_0, values = (var_1494_cast_fp16, var_1694_cast_fp16))[name = tensor<string, []>("op_1756_cast_fp16")];
+            tensor<string, []> var_1758_equation_0 = const()[name = tensor<string, []>("op_1758_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1758_cast_fp16 = einsum(equation = var_1758_equation_0, values = (var_1498_cast_fp16, var_1695_cast_fp16))[name = tensor<string, []>("op_1758_cast_fp16")];
+            tensor<string, []> var_1760_equation_0 = const()[name = tensor<string, []>("op_1760_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1760_cast_fp16 = einsum(equation = var_1760_equation_0, values = (var_1502_cast_fp16, var_1696_cast_fp16))[name = tensor<string, []>("op_1760_cast_fp16")];
+            tensor<bool, []> x_27_interleave_0 = const()[name = tensor<string, []>("x_27_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 4096, 1, 64]> x_27_cast_fp16 = concat(axis = var_974, interleave = x_27_interleave_0, values = (var_1698_cast_fp16, var_1700_cast_fp16, var_1702_cast_fp16, var_1704_cast_fp16, var_1706_cast_fp16, var_1708_cast_fp16, var_1710_cast_fp16, var_1712_cast_fp16, var_1714_cast_fp16, var_1716_cast_fp16, var_1718_cast_fp16, var_1720_cast_fp16, var_1722_cast_fp16, var_1724_cast_fp16, var_1726_cast_fp16, var_1728_cast_fp16, var_1730_cast_fp16, var_1732_cast_fp16, var_1734_cast_fp16, var_1736_cast_fp16, var_1738_cast_fp16, var_1740_cast_fp16, var_1742_cast_fp16, var_1744_cast_fp16, var_1746_cast_fp16, var_1748_cast_fp16, var_1750_cast_fp16, var_1752_cast_fp16, var_1754_cast_fp16, var_1756_cast_fp16, var_1758_cast_fp16, var_1760_cast_fp16))[name = tensor<string, []>("x_27_cast_fp16")];
+            tensor<int32, [4]> var_1765 = const()[name = tensor<string, []>("op_1765"), val = tensor<int32, [4]>([1, 4096, -1, 8])];
+            tensor<fp16, [1, 4096, 8, 8]> input_13_cast_fp16 = reshape(shape = var_1765, x = x_27_cast_fp16)[name = tensor<string, []>("input_13_cast_fp16")];
+            tensor<int32, [2]> var_1769 = const()[name = tensor<string, []>("op_1769"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1771 = const()[name = tensor<string, []>("op_1771"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1773_pad_type_0 = const()[name = tensor<string, []>("op_1773_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1773_pad_0 = const()[name = tensor<string, []>("op_1773_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 8, 8]> var_1773_cast_fp16 = conv(dilations = var_1771, groups = var_974, pad = var_1773_pad_0, pad_type = var_1773_pad_type_0, strides = var_1769, weight = blocks_1_attn_proj_weight_palettized_cast_fp16, x = input_13_cast_fp16)[name = tensor<string, []>("op_1773_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_attn_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303702400)))];
+            tensor<fp16, [1, 4096, 8, 8]> attention_output_3_cast_fp16 = mul(x = var_1773_cast_fp16, y = blocks_1_attn_proj_output_scales_to_fp16)[name = tensor<string, []>("attention_output_3_cast_fp16")];
+            tensor<fp16, [1, 4096, 8, 8]> x_29_cast_fp16 = add(x = attention_output_3_cast_fp16, y = x_17_cast_fp16)[name = tensor<string, []>("x_29_cast_fp16")];
+            tensor<bool, []> x_eps_7_interleave_0 = const()[name = tensor<string, []>("x_eps_7_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_7_to_fp16 = const()[name = tensor<string, []>("eps_chan_7_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303710656)))];
+            tensor<fp16, [1, 4097, 8, 8]> x_eps_7_cast_fp16 = concat(axis = var_974, interleave = x_eps_7_interleave_0, values = (x_29_cast_fp16, eps_chan_7_to_fp16))[name = tensor<string, []>("x_eps_7_cast_fp16")];
+            tensor<int32, [1]> norm_x_7_axes_0 = const()[name = tensor<string, []>("norm_x_7_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_7_cast_fp16 = reduce_l2_norm(axes = norm_x_7_axes_0, keep_dims = var_977, x = x_eps_7_cast_fp16)[name = tensor<string, []>("norm_x_7_cast_fp16")];
+            tensor<fp16, [1, 4096, 8, 8]> x_normed_19_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_7_cast_fp16)[name = tensor<string, []>("x_normed_19_cast_fp16")];
+            tensor<fp16, []> var_1798_to_fp16 = const()[name = tensor<string, []>("op_1798_to_fp16"), val = tensor<fp16, []>(0x1p+6)];
+            tensor<fp16, [1, 4096, 8, 8]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_1798_to_fp16)[name = tensor<string, []>("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303710848)))];
+            tensor<fp16, [1, 4096, 8, 8]> input_15_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = tensor<string, []>("input_15_cast_fp16")];
+            tensor<int32, [2]> var_1810 = const()[name = tensor<string, []>("op_1810"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1812 = const()[name = tensor<string, []>("op_1812"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1814_pad_type_0 = const()[name = tensor<string, []>("op_1814_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1814_pad_0 = const()[name = tensor<string, []>("op_1814_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 8, 8]> var_1814_cast_fp16 = conv(dilations = var_1812, groups = var_974, pad = var_1814_pad_0, pad_type = var_1814_pad_type_0, strides = var_1810, weight = blocks_1_mlp_fc_1_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("op_1814_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_1_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303719104)))];
+            tensor<fp16, [1, 11008, 8, 8]> input_17_cast_fp16 = mul(x = var_1814_cast_fp16, y = blocks_1_mlp_fc_1_output_scales_to_fp16)[name = tensor<string, []>("input_17_cast_fp16")];
+            tensor<int32, [2]> var_1818 = const()[name = tensor<string, []>("op_1818"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1820 = const()[name = tensor<string, []>("op_1820"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1822_pad_type_0 = const()[name = tensor<string, []>("op_1822_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1822_pad_0 = const()[name = tensor<string, []>("op_1822_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 8, 8]> var_1822_cast_fp16 = conv(dilations = var_1820, groups = var_974, pad = var_1822_pad_0, pad_type = var_1822_pad_type_0, strides = var_1818, weight = blocks_1_mlp_fc_2_weight_palettized_cast_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("op_1822_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_1_mlp_fc_2_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303741184)))];
+            tensor<fp16, [1, 11008, 8, 8]> x_fc_2_3_cast_fp16 = mul(x = var_1822_cast_fp16, y = blocks_1_mlp_fc_2_output_scales_to_fp16)[name = tensor<string, []>("x_fc_2_3_cast_fp16")];
+            tensor<fp16, [1, 11008, 8, 8]> var_1824_cast_fp16 = silu(x = input_17_cast_fp16)[name = tensor<string, []>("op_1824_cast_fp16")];
+            tensor<fp16, [1, 11008, 8, 8]> input_19_cast_fp16 = mul(x = var_1824_cast_fp16, y = x_fc_2_3_cast_fp16)[name = tensor<string, []>("input_19_cast_fp16")];
+            tensor<int32, [2]> var_1828 = const()[name = tensor<string, []>("op_1828"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1830 = const()[name = tensor<string, []>("op_1830"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1832_pad_type_0 = const()[name = tensor<string, []>("op_1832_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1832_pad_0 = const()[name = tensor<string, []>("op_1832_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 8, 8]> var_1832_cast_fp16 = conv(dilations = var_1830, groups = var_974, pad = var_1832_pad_0, pad_type = var_1832_pad_type_0, strides = var_1828, weight = blocks_1_mlp_proj_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = tensor<string, []>("op_1832_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_1_mlp_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303763264)))];
+            tensor<fp16, [1, 4096, 8, 8]> var_1833_cast_fp16 = mul(x = var_1832_cast_fp16, y = blocks_1_mlp_proj_output_scales_to_fp16)[name = tensor<string, []>("op_1833_cast_fp16")];
+            tensor<fp16, [1, 4096, 8, 8]> x_33_cast_fp16 = add(x = var_1833_cast_fp16, y = x_29_cast_fp16)[name = tensor<string, []>("x_33_cast_fp16")];
+            tensor<int32, []> var_1839 = const()[name = tensor<string, []>("op_1839"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_1843 = const()[name = tensor<string, []>("op_1843"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_1845 = const()[name = tensor<string, []>("op_1845"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_1886 = const()[name = tensor<string, []>("op_1886"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_1889 = const()[name = tensor<string, []>("op_1889"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_9_interleave_0 = const()[name = tensor<string, []>("x_eps_9_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_9_to_fp16 = const()[name = tensor<string, []>("eps_chan_9_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303771520)))];
+            tensor<fp16, [1, 4097, 8, 8]> x_eps_9_cast_fp16 = concat(axis = var_1886, interleave = x_eps_9_interleave_0, values = (x_33_cast_fp16, eps_chan_9_to_fp16))[name = tensor<string, []>("x_eps_9_cast_fp16")];
+            tensor<int32, [1]> norm_x_9_axes_0 = const()[name = tensor<string, []>("norm_x_9_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_9_cast_fp16 = reduce_l2_norm(axes = norm_x_9_axes_0, keep_dims = var_1889, x = x_eps_9_cast_fp16)[name = tensor<string, []>("norm_x_9_cast_fp16")];
+            tensor<fp16, [1, 4096, 8, 8]> x_normed_25_cast_fp16 = real_div(x = x_33_cast_fp16, y = norm_x_9_cast_fp16)[name = tensor<string, []>("x_normed_25_cast_fp16")];
+            tensor<fp16, []> var_1912_to_fp16 = const()[name = tensor<string, []>("op_1912_to_fp16"), val = tensor<fp16, []>(0x1p+6)];
+            tensor<fp16, [1, 4096, 8, 8]> x_normed_27_cast_fp16 = mul(x = x_normed_25_cast_fp16, y = var_1912_to_fp16)[name = tensor<string, []>("x_normed_27_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_2_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303771712)))];
+            tensor<fp16, [1, 4096, 8, 8]> x_37_cast_fp16 = mul(x = x_normed_27_cast_fp16, y = blocks_2_norm_1_weight_to_fp16)[name = tensor<string, []>("x_37_cast_fp16")];
+            tensor<int32, [4]> var_1937 = const()[name = tensor<string, []>("op_1937"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 4096, 1, 64]> input_21_cast_fp16 = reshape(shape = var_1937, x = x_37_cast_fp16)[name = tensor<string, []>("input_21_cast_fp16")];
+            tensor<int32, [2]> var_1941 = const()[name = tensor<string, []>("op_1941"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1943 = const()[name = tensor<string, []>("op_1943"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1945_pad_type_0 = const()[name = tensor<string, []>("op_1945_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1945_pad_0 = const()[name = tensor<string, []>("op_1945_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 64]> var_1945_cast_fp16 = conv(dilations = var_1943, groups = var_1886, pad = var_1945_pad_0, pad_type = var_1945_pad_type_0, strides = var_1941, weight = blocks_2_attn_q_proj_weight_palettized_cast_fp16, x = input_21_cast_fp16)[name = tensor<string, []>("op_1945_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_q_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_2_attn_q_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303779968)))];
+            tensor<fp16, [1, 4096, 1, 64]> q_17_cast_fp16 = mul(x = var_1945_cast_fp16, y = blocks_2_attn_q_proj_output_scales_to_fp16)[name = tensor<string, []>("q_17_cast_fp16")];
+            tensor<int32, [2]> var_1949 = const()[name = tensor<string, []>("op_1949"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1951 = const()[name = tensor<string, []>("op_1951"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1953_pad_type_0 = const()[name = tensor<string, []>("op_1953_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1953_pad_0 = const()[name = tensor<string, []>("op_1953_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 64]> var_1953_cast_fp16 = conv(dilations = var_1951, groups = var_1886, pad = var_1953_pad_0, pad_type = var_1953_pad_type_0, strides = var_1949, weight = blocks_2_attn_k_proj_weight_palettized_cast_fp16, x = input_21_cast_fp16)[name = tensor<string, []>("op_1953_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_k_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_2_attn_k_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303788224)))];
+            tensor<fp16, [1, 4096, 1, 64]> k_21_cast_fp16 = mul(x = var_1953_cast_fp16, y = blocks_2_attn_k_proj_output_scales_to_fp16)[name = tensor<string, []>("k_21_cast_fp16")];
+            tensor<int32, [2]> var_1957 = const()[name = tensor<string, []>("op_1957"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1959 = const()[name = tensor<string, []>("op_1959"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1961_pad_type_0 = const()[name = tensor<string, []>("op_1961_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1961_pad_0 = const()[name = tensor<string, []>("op_1961_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 1, 64]> var_1961_cast_fp16 = conv(dilations = var_1959, groups = var_1886, pad = var_1961_pad_0, pad_type = var_1961_pad_type_0, strides = var_1957, weight = blocks_2_attn_v_proj_weight_palettized_cast_fp16, x = input_21_cast_fp16)[name = tensor<string, []>("op_1961_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_v_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_2_attn_v_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303796480)))];
+            tensor<fp16, [1, 4096, 1, 64]> v_21_cast_fp16 = mul(x = var_1961_cast_fp16, y = blocks_2_attn_v_proj_output_scales_to_fp16)[name = tensor<string, []>("v_21_cast_fp16")];
+            tensor<int32, [4]> var_1963 = const()[name = tensor<string, []>("op_1963"), val = tensor<int32, [4]>([1, 32, 128, 64])];
+            tensor<fp16, [1, 32, 128, 64]> q_19_cast_fp16 = reshape(shape = var_1963, x = q_17_cast_fp16)[name = tensor<string, []>("q_19_cast_fp16")];
+            tensor<int32, [4]> var_1965 = const()[name = tensor<string, []>("op_1965"), val = tensor<int32, [4]>([1, 32, 128, 64])];
+            tensor<fp16, [1, 32, 128, 64]> k_23_cast_fp16 = reshape(shape = var_1965, x = k_21_cast_fp16)[name = tensor<string, []>("k_23_cast_fp16")];
+            tensor<int32, [4]> var_1979_begin_0 = const()[name = tensor<string, []>("op_1979_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1979_end_0 = const()[name = tensor<string, []>("op_1979_end_0"), val = tensor<int32, [4]>([1, 32, 64, 64])];
+            tensor<bool, [4]> var_1979_end_mask_0 = const()[name = tensor<string, []>("op_1979_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 64]> var_1979_cast_fp16 = slice_by_index(begin = var_1979_begin_0, end = var_1979_end_0, end_mask = var_1979_end_mask_0, x = q_19_cast_fp16)[name = tensor<string, []>("op_1979_cast_fp16")];
+            tensor<int32, [4]> var_1985_begin_0 = const()[name = tensor<string, []>("op_1985_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_1985_end_0 = const()[name = tensor<string, []>("op_1985_end_0"), val = tensor<int32, [4]>([1, 32, 128, 64])];
+            tensor<bool, [4]> var_1985_end_mask_0 = const()[name = tensor<string, []>("op_1985_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 64]> var_1985_cast_fp16 = slice_by_index(begin = var_1985_begin_0, end = var_1985_end_0, end_mask = var_1985_end_mask_0, x = q_19_cast_fp16)[name = tensor<string, []>("op_1985_cast_fp16")];
+            tensor<fp16, []> const_53_promoted_to_fp16 = const()[name = tensor<string, []>("const_53_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 32, 64, 64]> var_1987_cast_fp16 = mul(x = var_1985_cast_fp16, y = const_53_promoted_to_fp16)[name = tensor<string, []>("op_1987_cast_fp16")];
             tensor<bool, []> rotated_9_interleave_0 = const()[name = tensor<string, []>("rotated_9_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 128, 64]> rotated_9_cast_fp16 = concat(axis = var_453, interleave = rotated_9_interleave_0, values = (var_536_cast_fp16, var_528_cast_fp16))[name = tensor<string, []>("rotated_9_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 64]> var_539_cast_fp16 = mul(x = q_15_cast_fp16, y = cos)[name = tensor<string, []>("op_539_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 64]> var_540_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = tensor<string, []>("op_540_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 64]> roped_9_cast_fp16 = add(x = var_539_cast_fp16, y = var_540_cast_fp16)[name = tensor<string, []>("roped_9_cast_fp16")];
-            tensor<int32, [4]> var_553_begin_0 = const()[name = tensor<string, []>("op_553_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<int32, [4]> var_553_end_0 = const()[name = tensor<string, []>("op_553_end_0"), val = tensor<int32, [4]>([1, 32, 64, 64])];
-            tensor<bool, [4]> var_553_end_mask_0 = const()[name = tensor<string, []>("op_553_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
-            tensor<fp16, [1, 32, 64, 64]> var_553_cast_fp16 = slice_by_index(begin = var_553_begin_0, end = var_553_end_0, end_mask = var_553_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_553_cast_fp16")];
-            tensor<int32, [4]> var_559_begin_0 = const()[name = tensor<string, []>("op_559_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
-            tensor<int32, [4]> var_559_end_0 = const()[name = tensor<string, []>("op_559_end_0"), val = tensor<int32, [4]>([1, 32, 128, 64])];
-            tensor<bool, [4]> var_559_end_mask_0 = const()[name = tensor<string, []>("op_559_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
-            tensor<fp16, [1, 32, 64, 64]> var_559_cast_fp16 = slice_by_index(begin = var_559_begin_0, end = var_559_end_0, end_mask = var_559_end_mask_0, x = k_19_cast_fp16)[name = tensor<string, []>("op_559_cast_fp16")];
-            tensor<fp16, []> const_19_promoted_to_fp16 = const()[name = tensor<string, []>("const_19_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
-            tensor<fp16, [1, 32, 64, 64]> var_561_cast_fp16 = mul(x = var_559_cast_fp16, y = const_19_promoted_to_fp16)[name = tensor<string, []>("op_561_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> rotated_9_cast_fp16 = concat(axis = var_1843, interleave = rotated_9_interleave_0, values = (var_1987_cast_fp16, var_1979_cast_fp16))[name = tensor<string, []>("rotated_9_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> var_1990_cast_fp16 = mul(x = q_19_cast_fp16, y = cos)[name = tensor<string, []>("op_1990_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> var_1991_cast_fp16 = mul(x = rotated_9_cast_fp16, y = sin)[name = tensor<string, []>("op_1991_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> roped_9_cast_fp16 = add(x = var_1990_cast_fp16, y = var_1991_cast_fp16)[name = tensor<string, []>("roped_9_cast_fp16")];
+            tensor<int32, [4]> var_2004_begin_0 = const()[name = tensor<string, []>("op_2004_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2004_end_0 = const()[name = tensor<string, []>("op_2004_end_0"), val = tensor<int32, [4]>([1, 32, 64, 64])];
+            tensor<bool, [4]> var_2004_end_mask_0 = const()[name = tensor<string, []>("op_2004_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 32, 64, 64]> var_2004_cast_fp16 = slice_by_index(begin = var_2004_begin_0, end = var_2004_end_0, end_mask = var_2004_end_mask_0, x = k_23_cast_fp16)[name = tensor<string, []>("op_2004_cast_fp16")];
+            tensor<int32, [4]> var_2010_begin_0 = const()[name = tensor<string, []>("op_2010_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_2010_end_0 = const()[name = tensor<string, []>("op_2010_end_0"), val = tensor<int32, [4]>([1, 32, 128, 64])];
+            tensor<bool, [4]> var_2010_end_mask_0 = const()[name = tensor<string, []>("op_2010_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 32, 64, 64]> var_2010_cast_fp16 = slice_by_index(begin = var_2010_begin_0, end = var_2010_end_0, end_mask = var_2010_end_mask_0, x = k_23_cast_fp16)[name = tensor<string, []>("op_2010_cast_fp16")];
+            tensor<fp16, []> const_55_promoted_to_fp16 = const()[name = tensor<string, []>("const_55_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 32, 64, 64]> var_2012_cast_fp16 = mul(x = var_2010_cast_fp16, y = const_55_promoted_to_fp16)[name = tensor<string, []>("op_2012_cast_fp16")];
             tensor<bool, []> rotated_interleave_0 = const()[name = tensor<string, []>("rotated_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 128, 64]> rotated_cast_fp16 = concat(axis = var_453, interleave = rotated_interleave_0, values = (var_561_cast_fp16, var_553_cast_fp16))[name = tensor<string, []>("rotated_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 64]> var_564_cast_fp16 = mul(x = k_19_cast_fp16, y = cos)[name = tensor<string, []>("op_564_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 64]> var_565_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = tensor<string, []>("op_565_cast_fp16")];
-            tensor<fp16, [1, 32, 128, 64]> roped_cast_fp16 = add(x = var_564_cast_fp16, y = var_565_cast_fp16)[name = tensor<string, []>("roped_cast_fp16")];
-            tensor<bool, []> q_interleave_0 = const()[name = tensor<string, []>("q_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 128, 64]> q_cast_fp16 = concat(axis = var_453, interleave = q_interleave_0, values = roped_9_cast_fp16)[name = tensor<string, []>("q_cast_fp16")];
-            tensor<bool, []> k_21_interleave_0 = const()[name = tensor<string, []>("k_21_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 128, 64]> new_k_cache_2 = concat(axis = var_453, interleave = k_21_interleave_0, values = roped_cast_fp16)[name = tensor<string, []>("k_21_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> rotated_cast_fp16 = concat(axis = var_1843, interleave = rotated_interleave_0, values = (var_2012_cast_fp16, var_2004_cast_fp16))[name = tensor<string, []>("rotated_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> var_2015_cast_fp16 = mul(x = k_23_cast_fp16, y = cos)[name = tensor<string, []>("op_2015_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> var_2016_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = tensor<string, []>("op_2016_cast_fp16")];
+            tensor<fp16, [1, 32, 128, 64]> roped_cast_fp16 = add(x = var_2015_cast_fp16, y = var_2016_cast_fp16)[name = tensor<string, []>("roped_cast_fp16")];
+            tensor<int32, [4]> var_2019 = const()[name = tensor<string, []>("op_2019"), val = tensor<int32, [4]>([1, 4096, 1, 64])];
+            tensor<fp16, [1, 4096, 1, 64]> var_2020_cast_fp16 = reshape(shape = var_2019, x = roped_cast_fp16)[name = tensor<string, []>("op_2020_cast_fp16")];
+            tensor<int32, [4]> k_27_perm_0 = const()[name = tensor<string, []>("k_27_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<int32, [4]> var_2022 = const()[name = tensor<string, []>("op_2022"), val = tensor<int32, [4]>([1, 4096, 1, 64])];
+            tensor<fp16, [1, 4096, 1, 64]> new_v_cache_2 = reshape(shape = var_2022, x = v_21_cast_fp16)[name = tensor<string, []>("new_v_cache_2_type_fp32_cast_fp16")];
             tensor<bool, []> k_interleave_0 = const()[name = tensor<string, []>("k_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 128, 512]> k_cast_fp16 = concat(axis = var_455, interleave = k_interleave_0, values = (k_cache_2, new_k_cache_2))[name = tensor<string, []>("k_cast_fp16")];
-            tensor<bool, []> v_interleave_0 = const()[name = tensor<string, []>("v_interleave_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 128, 512]> v_cast_fp16 = concat(axis = var_455, interleave = v_interleave_0, values = (v_cache_2, new_v_cache_2))[name = tensor<string, []>("v_cast_fp16")];
-            tensor<fp16, []> var_587_to_fp16 = const()[name = tensor<string, []>("op_587_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
-            tensor<fp16, [1, 32, 128, 64]> var_588_cast_fp16 = mul(x = q_cast_fp16, y = var_587_to_fp16)[name = tensor<string, []>("op_588_cast_fp16")];
-            tensor<bool, []> attn_weights_9_transpose_x_0 = const()[name = tensor<string, []>("attn_weights_9_transpose_x_0"), val = tensor<bool, []>(true)];
-            tensor<bool, []> attn_weights_9_transpose_y_0 = const()[name = tensor<string, []>("attn_weights_9_transpose_y_0"), val = tensor<bool, []>(false)];
-            tensor<fp16, [1, 32, 64, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = var_588_cast_fp16, y = k_cast_fp16)[name = tensor<string, []>("attn_weights_9_cast_fp16")];
-            tensor<fp16, [1, 32, 64, 512]> attn_weights_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = mask)[name = tensor<string, []>("attn_weights_cast_fp16")];
-            tensor<fp16, [1, 32, 64, 512]> var_596_cast_fp16 = softmax(axis = var_448, x = attn_weights_cast_fp16)[name = tensor<string, []>("op_596_cast_fp16")];
-            tensor<bool, []> attn_5_transpose_x_0 = const()[name = tensor<string, []>("attn_5_transpose_x_0"), val = tensor<bool, []>(false)];
-            tensor<bool, []> attn_5_transpose_y_0 = const()[name = tensor<string, []>("attn_5_transpose_y_0"), val = tensor<bool, []>(true)];
-            tensor<fp16, [1, 32, 128, 64]> attn_5_cast_fp16 = matmul(transpose_x = attn_5_transpose_x_0, transpose_y = attn_5_transpose_y_0, x = v_cast_fp16, y = var_596_cast_fp16)[name = tensor<string, []>("attn_5_cast_fp16")];
-            tensor<int32, [4]> var_600 = const()[name = tensor<string, []>("op_600"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
-            tensor<fp16, [1, 4096, 1, 64]> input_17_cast_fp16 = reshape(shape = var_600, x = attn_5_cast_fp16)[name = tensor<string, []>("input_17_cast_fp16")];
-            tensor<int32, [2]> var_604 = const()[name = tensor<string, []>("op_604"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_606 = const()[name = tensor<string, []>("op_606"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_608_pad_type_0 = const()[name = tensor<string, []>("op_608_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_608_pad_0 = const()[name = tensor<string, []>("op_608_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 64]> var_608_cast_fp16 = conv(dilations = var_606, groups = var_462, pad = var_608_pad_0, pad_type = var_608_pad_type_0, strides = var_604, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_17_cast_fp16)[name = tensor<string, []>("op_608_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_2_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303803776)))];
-            tensor<fp16, [1, 4096, 1, 64]> attention_output_cast_fp16 = mul(x = var_608_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = tensor<string, []>("attention_output_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 64]> x_39_cast_fp16 = add(x = attention_output_cast_fp16, y = x_29_cast_fp16)[name = tensor<string, []>("x_39_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 64]> var_617_cast_fp16 = mul(x = x_39_cast_fp16, y = x_39_cast_fp16)[name = tensor<string, []>("op_617_cast_fp16")];
-            tensor<int32, [1]> var_618 = const()[name = tensor<string, []>("op_618"), val = tensor<int32, [1]>([1])];
-            tensor<fp16, [1, 1, 1, 64]> norm_x_cast_fp16 = reduce_mean(axes = var_618, keep_dims = var_463, x = var_617_cast_fp16)[name = tensor<string, []>("norm_x_cast_fp16")];
-            tensor<fp16, []> var_620_to_fp16 = const()[name = tensor<string, []>("op_620_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
-            tensor<fp16, [1, 1, 1, 64]> var_621_cast_fp16 = add(x = norm_x_cast_fp16, y = var_620_to_fp16)[name = tensor<string, []>("op_621_cast_fp16")];
-            tensor<fp16, []> var_622_epsilon_0_to_fp16 = const()[name = tensor<string, []>("op_622_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1p-24)];
-            tensor<fp16, [1, 1, 1, 64]> var_622_cast_fp16 = rsqrt(epsilon = var_622_epsilon_0_to_fp16, x = var_621_cast_fp16)[name = tensor<string, []>("op_622_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 64]> x_normed_21_cast_fp16 = mul(x = x_39_cast_fp16, y = var_622_cast_fp16)[name = tensor<string, []>("x_normed_21_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_2_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303812032)))];
-            tensor<fp16, [1, 4096, 1, 64]> input_19_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = tensor<string, []>("input_19_cast_fp16")];
-            tensor<int32, [2]> var_634 = const()[name = tensor<string, []>("op_634"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_636 = const()[name = tensor<string, []>("op_636"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_638_pad_type_0 = const()[name = tensor<string, []>("op_638_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_638_pad_0 = const()[name = tensor<string, []>("op_638_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 64]> var_638_cast_fp16 = conv(dilations = var_636, groups = var_462, pad = var_638_pad_0, pad_type = var_638_pad_type_0, strides = var_634, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = tensor<string, []>("op_638_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303820288)))];
-            tensor<fp16, [1, 11008, 1, 64]> input_21_cast_fp16 = mul(x = var_638_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = tensor<string, []>("input_21_cast_fp16")];
-            tensor<int32, [2]> var_642 = const()[name = tensor<string, []>("op_642"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_644 = const()[name = tensor<string, []>("op_644"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_646_pad_type_0 = const()[name = tensor<string, []>("op_646_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_646_pad_0 = const()[name = tensor<string, []>("op_646_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 11008, 1, 64]> var_646_cast_fp16 = conv(dilations = var_644, groups = var_462, pad = var_646_pad_0, pad_type = var_646_pad_type_0, strides = var_642, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_19_cast_fp16)[name = tensor<string, []>("op_646_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303842368)))];
-            tensor<fp16, [1, 11008, 1, 64]> x_fc_2_cast_fp16 = mul(x = var_646_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = tensor<string, []>("x_fc_2_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 64]> var_648_cast_fp16 = silu(x = input_21_cast_fp16)[name = tensor<string, []>("op_648_cast_fp16")];
-            tensor<fp16, [1, 11008, 1, 64]> input_cast_fp16 = mul(x = var_648_cast_fp16, y = x_fc_2_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
-            tensor<int32, [2]> var_652 = const()[name = tensor<string, []>("op_652"), val = tensor<int32, [2]>([1, 1])];
-            tensor<int32, [2]> var_654 = const()[name = tensor<string, []>("op_654"), val = tensor<int32, [2]>([1, 1])];
-            tensor<string, []> var_656_pad_type_0 = const()[name = tensor<string, []>("op_656_pad_type_0"), val = tensor<string, []>("custom")];
-            tensor<int32, [4]> var_656_pad_0 = const()[name = tensor<string, []>("op_656_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
-            tensor<fp16, [1, 4096, 1, 64]> var_656_cast_fp16 = conv(dilations = var_654, groups = var_462, pad = var_656_pad_0, pad_type = var_656_pad_type_0, strides = var_652, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = tensor<string, []>("op_656_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 1]> blocks_2_mlp_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_2_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303864448)))];
-            tensor<fp16, [1, 4096, 1, 64]> var_657_cast_fp16 = mul(x = var_656_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = tensor<string, []>("op_657_cast_fp16")];
-            tensor<fp16, [1, 4096, 1, 64]> new_x = add(x = var_657_cast_fp16, y = x_39_cast_fp16)[name = tensor<string, []>("op_658_cast_fp16")];
+            tensor<fp16, [1, 64, 1, 4096]> new_k_cache_2 = transpose(perm = k_27_perm_0, x = var_2020_cast_fp16)[name = tensor<string, []>("transpose_0")];
+            tensor<fp16, [1, 512, 1, 4096]> k_cast_fp16 = concat(axis = var_1845, interleave = k_interleave_0, values = (k_cache_2, new_k_cache_2))[name = tensor<string, []>("k_cast_fp16")];
+            tensor<bool, []> v_27_interleave_0 = const()[name = tensor<string, []>("v_27_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 4096, 1, 512]> v_27_cast_fp16 = concat(axis = var_1839, interleave = v_27_interleave_0, values = (v_cache_2, new_v_cache_2))[name = tensor<string, []>("v_27_cast_fp16")];
+            tensor<int32, [4]> var_2029 = const()[name = tensor<string, []>("op_2029"), val = tensor<int32, [4]>([1, 4096, 1, -1])];
+            tensor<fp16, [1, 4096, 1, 64]> q_cast_fp16 = reshape(shape = var_2029, x = roped_9_cast_fp16)[name = tensor<string, []>("q_cast_fp16")];
+            tensor<int32, [4]> var_2034_begin_0 = const()[name = tensor<string, []>("op_2034_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2034_end_0 = const()[name = tensor<string, []>("op_2034_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_2034_end_mask_0 = const()[name = tensor<string, []>("op_2034_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2034_cast_fp16 = slice_by_index(begin = var_2034_begin_0, end = var_2034_end_0, end_mask = var_2034_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2034_cast_fp16")];
+            tensor<int32, [4]> var_2038_begin_0 = const()[name = tensor<string, []>("op_2038_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_2038_end_0 = const()[name = tensor<string, []>("op_2038_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_2038_end_mask_0 = const()[name = tensor<string, []>("op_2038_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2038_cast_fp16 = slice_by_index(begin = var_2038_begin_0, end = var_2038_end_0, end_mask = var_2038_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2038_cast_fp16")];
+            tensor<int32, [4]> var_2042_begin_0 = const()[name = tensor<string, []>("op_2042_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_2042_end_0 = const()[name = tensor<string, []>("op_2042_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_2042_end_mask_0 = const()[name = tensor<string, []>("op_2042_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2042_cast_fp16 = slice_by_index(begin = var_2042_begin_0, end = var_2042_end_0, end_mask = var_2042_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2042_cast_fp16")];
+            tensor<int32, [4]> var_2046_begin_0 = const()[name = tensor<string, []>("op_2046_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_2046_end_0 = const()[name = tensor<string, []>("op_2046_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_2046_end_mask_0 = const()[name = tensor<string, []>("op_2046_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2046_cast_fp16 = slice_by_index(begin = var_2046_begin_0, end = var_2046_end_0, end_mask = var_2046_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2046_cast_fp16")];
+            tensor<int32, [4]> var_2050_begin_0 = const()[name = tensor<string, []>("op_2050_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_2050_end_0 = const()[name = tensor<string, []>("op_2050_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_2050_end_mask_0 = const()[name = tensor<string, []>("op_2050_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2050_cast_fp16 = slice_by_index(begin = var_2050_begin_0, end = var_2050_end_0, end_mask = var_2050_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2050_cast_fp16")];
+            tensor<int32, [4]> var_2054_begin_0 = const()[name = tensor<string, []>("op_2054_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_2054_end_0 = const()[name = tensor<string, []>("op_2054_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_2054_end_mask_0 = const()[name = tensor<string, []>("op_2054_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2054_cast_fp16 = slice_by_index(begin = var_2054_begin_0, end = var_2054_end_0, end_mask = var_2054_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2054_cast_fp16")];
+            tensor<int32, [4]> var_2058_begin_0 = const()[name = tensor<string, []>("op_2058_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_2058_end_0 = const()[name = tensor<string, []>("op_2058_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_2058_end_mask_0 = const()[name = tensor<string, []>("op_2058_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2058_cast_fp16 = slice_by_index(begin = var_2058_begin_0, end = var_2058_end_0, end_mask = var_2058_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2058_cast_fp16")];
+            tensor<int32, [4]> var_2062_begin_0 = const()[name = tensor<string, []>("op_2062_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_2062_end_0 = const()[name = tensor<string, []>("op_2062_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_2062_end_mask_0 = const()[name = tensor<string, []>("op_2062_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2062_cast_fp16 = slice_by_index(begin = var_2062_begin_0, end = var_2062_end_0, end_mask = var_2062_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2062_cast_fp16")];
+            tensor<int32, [4]> var_2066_begin_0 = const()[name = tensor<string, []>("op_2066_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_2066_end_0 = const()[name = tensor<string, []>("op_2066_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_2066_end_mask_0 = const()[name = tensor<string, []>("op_2066_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2066_cast_fp16 = slice_by_index(begin = var_2066_begin_0, end = var_2066_end_0, end_mask = var_2066_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2066_cast_fp16")];
+            tensor<int32, [4]> var_2070_begin_0 = const()[name = tensor<string, []>("op_2070_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_2070_end_0 = const()[name = tensor<string, []>("op_2070_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_2070_end_mask_0 = const()[name = tensor<string, []>("op_2070_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2070_cast_fp16 = slice_by_index(begin = var_2070_begin_0, end = var_2070_end_0, end_mask = var_2070_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2070_cast_fp16")];
+            tensor<int32, [4]> var_2074_begin_0 = const()[name = tensor<string, []>("op_2074_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_2074_end_0 = const()[name = tensor<string, []>("op_2074_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_2074_end_mask_0 = const()[name = tensor<string, []>("op_2074_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2074_cast_fp16 = slice_by_index(begin = var_2074_begin_0, end = var_2074_end_0, end_mask = var_2074_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2074_cast_fp16")];
+            tensor<int32, [4]> var_2078_begin_0 = const()[name = tensor<string, []>("op_2078_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_2078_end_0 = const()[name = tensor<string, []>("op_2078_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_2078_end_mask_0 = const()[name = tensor<string, []>("op_2078_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2078_cast_fp16 = slice_by_index(begin = var_2078_begin_0, end = var_2078_end_0, end_mask = var_2078_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2078_cast_fp16")];
+            tensor<int32, [4]> var_2082_begin_0 = const()[name = tensor<string, []>("op_2082_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_2082_end_0 = const()[name = tensor<string, []>("op_2082_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_2082_end_mask_0 = const()[name = tensor<string, []>("op_2082_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2082_cast_fp16 = slice_by_index(begin = var_2082_begin_0, end = var_2082_end_0, end_mask = var_2082_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2082_cast_fp16")];
+            tensor<int32, [4]> var_2086_begin_0 = const()[name = tensor<string, []>("op_2086_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_2086_end_0 = const()[name = tensor<string, []>("op_2086_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_2086_end_mask_0 = const()[name = tensor<string, []>("op_2086_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2086_cast_fp16 = slice_by_index(begin = var_2086_begin_0, end = var_2086_end_0, end_mask = var_2086_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2086_cast_fp16")];
+            tensor<int32, [4]> var_2090_begin_0 = const()[name = tensor<string, []>("op_2090_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_2090_end_0 = const()[name = tensor<string, []>("op_2090_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_2090_end_mask_0 = const()[name = tensor<string, []>("op_2090_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2090_cast_fp16 = slice_by_index(begin = var_2090_begin_0, end = var_2090_end_0, end_mask = var_2090_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2090_cast_fp16")];
+            tensor<int32, [4]> var_2094_begin_0 = const()[name = tensor<string, []>("op_2094_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_2094_end_0 = const()[name = tensor<string, []>("op_2094_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_2094_end_mask_0 = const()[name = tensor<string, []>("op_2094_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2094_cast_fp16 = slice_by_index(begin = var_2094_begin_0, end = var_2094_end_0, end_mask = var_2094_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2094_cast_fp16")];
+            tensor<int32, [4]> var_2098_begin_0 = const()[name = tensor<string, []>("op_2098_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_2098_end_0 = const()[name = tensor<string, []>("op_2098_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_2098_end_mask_0 = const()[name = tensor<string, []>("op_2098_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2098_cast_fp16 = slice_by_index(begin = var_2098_begin_0, end = var_2098_end_0, end_mask = var_2098_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2098_cast_fp16")];
+            tensor<int32, [4]> var_2102_begin_0 = const()[name = tensor<string, []>("op_2102_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_2102_end_0 = const()[name = tensor<string, []>("op_2102_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_2102_end_mask_0 = const()[name = tensor<string, []>("op_2102_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2102_cast_fp16 = slice_by_index(begin = var_2102_begin_0, end = var_2102_end_0, end_mask = var_2102_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2102_cast_fp16")];
+            tensor<int32, [4]> var_2106_begin_0 = const()[name = tensor<string, []>("op_2106_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_2106_end_0 = const()[name = tensor<string, []>("op_2106_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_2106_end_mask_0 = const()[name = tensor<string, []>("op_2106_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2106_cast_fp16 = slice_by_index(begin = var_2106_begin_0, end = var_2106_end_0, end_mask = var_2106_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2106_cast_fp16")];
+            tensor<int32, [4]> var_2110_begin_0 = const()[name = tensor<string, []>("op_2110_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_2110_end_0 = const()[name = tensor<string, []>("op_2110_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_2110_end_mask_0 = const()[name = tensor<string, []>("op_2110_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2110_cast_fp16 = slice_by_index(begin = var_2110_begin_0, end = var_2110_end_0, end_mask = var_2110_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2110_cast_fp16")];
+            tensor<int32, [4]> var_2114_begin_0 = const()[name = tensor<string, []>("op_2114_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_2114_end_0 = const()[name = tensor<string, []>("op_2114_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_2114_end_mask_0 = const()[name = tensor<string, []>("op_2114_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2114_cast_fp16 = slice_by_index(begin = var_2114_begin_0, end = var_2114_end_0, end_mask = var_2114_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2114_cast_fp16")];
+            tensor<int32, [4]> var_2118_begin_0 = const()[name = tensor<string, []>("op_2118_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_2118_end_0 = const()[name = tensor<string, []>("op_2118_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_2118_end_mask_0 = const()[name = tensor<string, []>("op_2118_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2118_cast_fp16 = slice_by_index(begin = var_2118_begin_0, end = var_2118_end_0, end_mask = var_2118_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2118_cast_fp16")];
+            tensor<int32, [4]> var_2122_begin_0 = const()[name = tensor<string, []>("op_2122_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_2122_end_0 = const()[name = tensor<string, []>("op_2122_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_2122_end_mask_0 = const()[name = tensor<string, []>("op_2122_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2122_cast_fp16 = slice_by_index(begin = var_2122_begin_0, end = var_2122_end_0, end_mask = var_2122_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2122_cast_fp16")];
+            tensor<int32, [4]> var_2126_begin_0 = const()[name = tensor<string, []>("op_2126_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_2126_end_0 = const()[name = tensor<string, []>("op_2126_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_2126_end_mask_0 = const()[name = tensor<string, []>("op_2126_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2126_cast_fp16 = slice_by_index(begin = var_2126_begin_0, end = var_2126_end_0, end_mask = var_2126_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2126_cast_fp16")];
+            tensor<int32, [4]> var_2130_begin_0 = const()[name = tensor<string, []>("op_2130_begin_0"), val = tensor<int32, [4]>([0, 3072, 0, 0])];
+            tensor<int32, [4]> var_2130_end_0 = const()[name = tensor<string, []>("op_2130_end_0"), val = tensor<int32, [4]>([1, 3200, 1, 64])];
+            tensor<bool, [4]> var_2130_end_mask_0 = const()[name = tensor<string, []>("op_2130_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2130_cast_fp16 = slice_by_index(begin = var_2130_begin_0, end = var_2130_end_0, end_mask = var_2130_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2130_cast_fp16")];
+            tensor<int32, [4]> var_2134_begin_0 = const()[name = tensor<string, []>("op_2134_begin_0"), val = tensor<int32, [4]>([0, 3200, 0, 0])];
+            tensor<int32, [4]> var_2134_end_0 = const()[name = tensor<string, []>("op_2134_end_0"), val = tensor<int32, [4]>([1, 3328, 1, 64])];
+            tensor<bool, [4]> var_2134_end_mask_0 = const()[name = tensor<string, []>("op_2134_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2134_cast_fp16 = slice_by_index(begin = var_2134_begin_0, end = var_2134_end_0, end_mask = var_2134_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2134_cast_fp16")];
+            tensor<int32, [4]> var_2138_begin_0 = const()[name = tensor<string, []>("op_2138_begin_0"), val = tensor<int32, [4]>([0, 3328, 0, 0])];
+            tensor<int32, [4]> var_2138_end_0 = const()[name = tensor<string, []>("op_2138_end_0"), val = tensor<int32, [4]>([1, 3456, 1, 64])];
+            tensor<bool, [4]> var_2138_end_mask_0 = const()[name = tensor<string, []>("op_2138_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2138_cast_fp16 = slice_by_index(begin = var_2138_begin_0, end = var_2138_end_0, end_mask = var_2138_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2138_cast_fp16")];
+            tensor<int32, [4]> var_2142_begin_0 = const()[name = tensor<string, []>("op_2142_begin_0"), val = tensor<int32, [4]>([0, 3456, 0, 0])];
+            tensor<int32, [4]> var_2142_end_0 = const()[name = tensor<string, []>("op_2142_end_0"), val = tensor<int32, [4]>([1, 3584, 1, 64])];
+            tensor<bool, [4]> var_2142_end_mask_0 = const()[name = tensor<string, []>("op_2142_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2142_cast_fp16 = slice_by_index(begin = var_2142_begin_0, end = var_2142_end_0, end_mask = var_2142_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2142_cast_fp16")];
+            tensor<int32, [4]> var_2146_begin_0 = const()[name = tensor<string, []>("op_2146_begin_0"), val = tensor<int32, [4]>([0, 3584, 0, 0])];
+            tensor<int32, [4]> var_2146_end_0 = const()[name = tensor<string, []>("op_2146_end_0"), val = tensor<int32, [4]>([1, 3712, 1, 64])];
+            tensor<bool, [4]> var_2146_end_mask_0 = const()[name = tensor<string, []>("op_2146_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2146_cast_fp16 = slice_by_index(begin = var_2146_begin_0, end = var_2146_end_0, end_mask = var_2146_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2146_cast_fp16")];
+            tensor<int32, [4]> var_2150_begin_0 = const()[name = tensor<string, []>("op_2150_begin_0"), val = tensor<int32, [4]>([0, 3712, 0, 0])];
+            tensor<int32, [4]> var_2150_end_0 = const()[name = tensor<string, []>("op_2150_end_0"), val = tensor<int32, [4]>([1, 3840, 1, 64])];
+            tensor<bool, [4]> var_2150_end_mask_0 = const()[name = tensor<string, []>("op_2150_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2150_cast_fp16 = slice_by_index(begin = var_2150_begin_0, end = var_2150_end_0, end_mask = var_2150_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2150_cast_fp16")];
+            tensor<int32, [4]> var_2154_begin_0 = const()[name = tensor<string, []>("op_2154_begin_0"), val = tensor<int32, [4]>([0, 3840, 0, 0])];
+            tensor<int32, [4]> var_2154_end_0 = const()[name = tensor<string, []>("op_2154_end_0"), val = tensor<int32, [4]>([1, 3968, 1, 64])];
+            tensor<bool, [4]> var_2154_end_mask_0 = const()[name = tensor<string, []>("op_2154_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2154_cast_fp16 = slice_by_index(begin = var_2154_begin_0, end = var_2154_end_0, end_mask = var_2154_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2154_cast_fp16")];
+            tensor<int32, [4]> var_2158_begin_0 = const()[name = tensor<string, []>("op_2158_begin_0"), val = tensor<int32, [4]>([0, 3968, 0, 0])];
+            tensor<int32, [4]> var_2158_end_0 = const()[name = tensor<string, []>("op_2158_end_0"), val = tensor<int32, [4]>([1, 4096, 1, 64])];
+            tensor<bool, [4]> var_2158_end_mask_0 = const()[name = tensor<string, []>("op_2158_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_2158_cast_fp16 = slice_by_index(begin = var_2158_begin_0, end = var_2158_end_0, end_mask = var_2158_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_2158_cast_fp16")];
+            tensor<int32, [4]> var_2164_begin_0 = const()[name = tensor<string, []>("op_2164_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2164_end_0 = const()[name = tensor<string, []>("op_2164_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_2164_end_mask_0 = const()[name = tensor<string, []>("op_2164_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2164_cast_fp16 = slice_by_index(begin = var_2164_begin_0, end = var_2164_end_0, end_mask = var_2164_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2164_cast_fp16")];
+            tensor<int32, [4]> var_2168_begin_0 = const()[name = tensor<string, []>("op_2168_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_2168_end_0 = const()[name = tensor<string, []>("op_2168_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_2168_end_mask_0 = const()[name = tensor<string, []>("op_2168_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2168_cast_fp16 = slice_by_index(begin = var_2168_begin_0, end = var_2168_end_0, end_mask = var_2168_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2168_cast_fp16")];
+            tensor<int32, [4]> var_2172_begin_0 = const()[name = tensor<string, []>("op_2172_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_2172_end_0 = const()[name = tensor<string, []>("op_2172_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_2172_end_mask_0 = const()[name = tensor<string, []>("op_2172_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2172_cast_fp16 = slice_by_index(begin = var_2172_begin_0, end = var_2172_end_0, end_mask = var_2172_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2172_cast_fp16")];
+            tensor<int32, [4]> var_2176_begin_0 = const()[name = tensor<string, []>("op_2176_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_2176_end_0 = const()[name = tensor<string, []>("op_2176_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_2176_end_mask_0 = const()[name = tensor<string, []>("op_2176_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2176_cast_fp16 = slice_by_index(begin = var_2176_begin_0, end = var_2176_end_0, end_mask = var_2176_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2176_cast_fp16")];
+            tensor<int32, [4]> var_2180_begin_0 = const()[name = tensor<string, []>("op_2180_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_2180_end_0 = const()[name = tensor<string, []>("op_2180_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_2180_end_mask_0 = const()[name = tensor<string, []>("op_2180_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2180_cast_fp16 = slice_by_index(begin = var_2180_begin_0, end = var_2180_end_0, end_mask = var_2180_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2180_cast_fp16")];
+            tensor<int32, [4]> var_2184_begin_0 = const()[name = tensor<string, []>("op_2184_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_2184_end_0 = const()[name = tensor<string, []>("op_2184_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_2184_end_mask_0 = const()[name = tensor<string, []>("op_2184_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2184_cast_fp16 = slice_by_index(begin = var_2184_begin_0, end = var_2184_end_0, end_mask = var_2184_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2184_cast_fp16")];
+            tensor<int32, [4]> var_2188_begin_0 = const()[name = tensor<string, []>("op_2188_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_2188_end_0 = const()[name = tensor<string, []>("op_2188_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_2188_end_mask_0 = const()[name = tensor<string, []>("op_2188_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2188_cast_fp16 = slice_by_index(begin = var_2188_begin_0, end = var_2188_end_0, end_mask = var_2188_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2188_cast_fp16")];
+            tensor<int32, [4]> var_2192_begin_0 = const()[name = tensor<string, []>("op_2192_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_2192_end_0 = const()[name = tensor<string, []>("op_2192_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_2192_end_mask_0 = const()[name = tensor<string, []>("op_2192_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2192_cast_fp16 = slice_by_index(begin = var_2192_begin_0, end = var_2192_end_0, end_mask = var_2192_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2192_cast_fp16")];
+            tensor<int32, [4]> var_2196_begin_0 = const()[name = tensor<string, []>("op_2196_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1024])];
+            tensor<int32, [4]> var_2196_end_0 = const()[name = tensor<string, []>("op_2196_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1152])];
+            tensor<bool, [4]> var_2196_end_mask_0 = const()[name = tensor<string, []>("op_2196_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2196_cast_fp16 = slice_by_index(begin = var_2196_begin_0, end = var_2196_end_0, end_mask = var_2196_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2196_cast_fp16")];
+            tensor<int32, [4]> var_2200_begin_0 = const()[name = tensor<string, []>("op_2200_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1152])];
+            tensor<int32, [4]> var_2200_end_0 = const()[name = tensor<string, []>("op_2200_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1280])];
+            tensor<bool, [4]> var_2200_end_mask_0 = const()[name = tensor<string, []>("op_2200_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2200_cast_fp16 = slice_by_index(begin = var_2200_begin_0, end = var_2200_end_0, end_mask = var_2200_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2200_cast_fp16")];
+            tensor<int32, [4]> var_2204_begin_0 = const()[name = tensor<string, []>("op_2204_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1280])];
+            tensor<int32, [4]> var_2204_end_0 = const()[name = tensor<string, []>("op_2204_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1408])];
+            tensor<bool, [4]> var_2204_end_mask_0 = const()[name = tensor<string, []>("op_2204_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2204_cast_fp16 = slice_by_index(begin = var_2204_begin_0, end = var_2204_end_0, end_mask = var_2204_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2204_cast_fp16")];
+            tensor<int32, [4]> var_2208_begin_0 = const()[name = tensor<string, []>("op_2208_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1408])];
+            tensor<int32, [4]> var_2208_end_0 = const()[name = tensor<string, []>("op_2208_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1536])];
+            tensor<bool, [4]> var_2208_end_mask_0 = const()[name = tensor<string, []>("op_2208_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2208_cast_fp16 = slice_by_index(begin = var_2208_begin_0, end = var_2208_end_0, end_mask = var_2208_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2208_cast_fp16")];
+            tensor<int32, [4]> var_2212_begin_0 = const()[name = tensor<string, []>("op_2212_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1536])];
+            tensor<int32, [4]> var_2212_end_0 = const()[name = tensor<string, []>("op_2212_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1664])];
+            tensor<bool, [4]> var_2212_end_mask_0 = const()[name = tensor<string, []>("op_2212_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2212_cast_fp16 = slice_by_index(begin = var_2212_begin_0, end = var_2212_end_0, end_mask = var_2212_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2212_cast_fp16")];
+            tensor<int32, [4]> var_2216_begin_0 = const()[name = tensor<string, []>("op_2216_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1664])];
+            tensor<int32, [4]> var_2216_end_0 = const()[name = tensor<string, []>("op_2216_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1792])];
+            tensor<bool, [4]> var_2216_end_mask_0 = const()[name = tensor<string, []>("op_2216_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2216_cast_fp16 = slice_by_index(begin = var_2216_begin_0, end = var_2216_end_0, end_mask = var_2216_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2216_cast_fp16")];
+            tensor<int32, [4]> var_2220_begin_0 = const()[name = tensor<string, []>("op_2220_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1792])];
+            tensor<int32, [4]> var_2220_end_0 = const()[name = tensor<string, []>("op_2220_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1920])];
+            tensor<bool, [4]> var_2220_end_mask_0 = const()[name = tensor<string, []>("op_2220_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2220_cast_fp16 = slice_by_index(begin = var_2220_begin_0, end = var_2220_end_0, end_mask = var_2220_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2220_cast_fp16")];
+            tensor<int32, [4]> var_2224_begin_0 = const()[name = tensor<string, []>("op_2224_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1920])];
+            tensor<int32, [4]> var_2224_end_0 = const()[name = tensor<string, []>("op_2224_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2048])];
+            tensor<bool, [4]> var_2224_end_mask_0 = const()[name = tensor<string, []>("op_2224_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2224_cast_fp16 = slice_by_index(begin = var_2224_begin_0, end = var_2224_end_0, end_mask = var_2224_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2224_cast_fp16")];
+            tensor<int32, [4]> var_2228_begin_0 = const()[name = tensor<string, []>("op_2228_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2048])];
+            tensor<int32, [4]> var_2228_end_0 = const()[name = tensor<string, []>("op_2228_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2176])];
+            tensor<bool, [4]> var_2228_end_mask_0 = const()[name = tensor<string, []>("op_2228_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2228_cast_fp16 = slice_by_index(begin = var_2228_begin_0, end = var_2228_end_0, end_mask = var_2228_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2228_cast_fp16")];
+            tensor<int32, [4]> var_2232_begin_0 = const()[name = tensor<string, []>("op_2232_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2176])];
+            tensor<int32, [4]> var_2232_end_0 = const()[name = tensor<string, []>("op_2232_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2304])];
+            tensor<bool, [4]> var_2232_end_mask_0 = const()[name = tensor<string, []>("op_2232_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2232_cast_fp16 = slice_by_index(begin = var_2232_begin_0, end = var_2232_end_0, end_mask = var_2232_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2232_cast_fp16")];
+            tensor<int32, [4]> var_2236_begin_0 = const()[name = tensor<string, []>("op_2236_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2304])];
+            tensor<int32, [4]> var_2236_end_0 = const()[name = tensor<string, []>("op_2236_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2432])];
+            tensor<bool, [4]> var_2236_end_mask_0 = const()[name = tensor<string, []>("op_2236_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2236_cast_fp16 = slice_by_index(begin = var_2236_begin_0, end = var_2236_end_0, end_mask = var_2236_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2236_cast_fp16")];
+            tensor<int32, [4]> var_2240_begin_0 = const()[name = tensor<string, []>("op_2240_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2432])];
+            tensor<int32, [4]> var_2240_end_0 = const()[name = tensor<string, []>("op_2240_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2560])];
+            tensor<bool, [4]> var_2240_end_mask_0 = const()[name = tensor<string, []>("op_2240_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2240_cast_fp16 = slice_by_index(begin = var_2240_begin_0, end = var_2240_end_0, end_mask = var_2240_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2240_cast_fp16")];
+            tensor<int32, [4]> var_2244_begin_0 = const()[name = tensor<string, []>("op_2244_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2560])];
+            tensor<int32, [4]> var_2244_end_0 = const()[name = tensor<string, []>("op_2244_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2688])];
+            tensor<bool, [4]> var_2244_end_mask_0 = const()[name = tensor<string, []>("op_2244_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2244_cast_fp16 = slice_by_index(begin = var_2244_begin_0, end = var_2244_end_0, end_mask = var_2244_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2244_cast_fp16")];
+            tensor<int32, [4]> var_2248_begin_0 = const()[name = tensor<string, []>("op_2248_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2688])];
+            tensor<int32, [4]> var_2248_end_0 = const()[name = tensor<string, []>("op_2248_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2816])];
+            tensor<bool, [4]> var_2248_end_mask_0 = const()[name = tensor<string, []>("op_2248_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2248_cast_fp16 = slice_by_index(begin = var_2248_begin_0, end = var_2248_end_0, end_mask = var_2248_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2248_cast_fp16")];
+            tensor<int32, [4]> var_2252_begin_0 = const()[name = tensor<string, []>("op_2252_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2816])];
+            tensor<int32, [4]> var_2252_end_0 = const()[name = tensor<string, []>("op_2252_end_0"), val = tensor<int32, [4]>([1, 512, 1, 2944])];
+            tensor<bool, [4]> var_2252_end_mask_0 = const()[name = tensor<string, []>("op_2252_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2252_cast_fp16 = slice_by_index(begin = var_2252_begin_0, end = var_2252_end_0, end_mask = var_2252_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2252_cast_fp16")];
+            tensor<int32, [4]> var_2256_begin_0 = const()[name = tensor<string, []>("op_2256_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 2944])];
+            tensor<int32, [4]> var_2256_end_0 = const()[name = tensor<string, []>("op_2256_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3072])];
+            tensor<bool, [4]> var_2256_end_mask_0 = const()[name = tensor<string, []>("op_2256_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2256_cast_fp16 = slice_by_index(begin = var_2256_begin_0, end = var_2256_end_0, end_mask = var_2256_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2256_cast_fp16")];
+            tensor<int32, [4]> var_2260_begin_0 = const()[name = tensor<string, []>("op_2260_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3072])];
+            tensor<int32, [4]> var_2260_end_0 = const()[name = tensor<string, []>("op_2260_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3200])];
+            tensor<bool, [4]> var_2260_end_mask_0 = const()[name = tensor<string, []>("op_2260_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2260_cast_fp16 = slice_by_index(begin = var_2260_begin_0, end = var_2260_end_0, end_mask = var_2260_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2260_cast_fp16")];
+            tensor<int32, [4]> var_2264_begin_0 = const()[name = tensor<string, []>("op_2264_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3200])];
+            tensor<int32, [4]> var_2264_end_0 = const()[name = tensor<string, []>("op_2264_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3328])];
+            tensor<bool, [4]> var_2264_end_mask_0 = const()[name = tensor<string, []>("op_2264_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2264_cast_fp16 = slice_by_index(begin = var_2264_begin_0, end = var_2264_end_0, end_mask = var_2264_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2264_cast_fp16")];
+            tensor<int32, [4]> var_2268_begin_0 = const()[name = tensor<string, []>("op_2268_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3328])];
+            tensor<int32, [4]> var_2268_end_0 = const()[name = tensor<string, []>("op_2268_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3456])];
+            tensor<bool, [4]> var_2268_end_mask_0 = const()[name = tensor<string, []>("op_2268_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2268_cast_fp16 = slice_by_index(begin = var_2268_begin_0, end = var_2268_end_0, end_mask = var_2268_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2268_cast_fp16")];
+            tensor<int32, [4]> var_2272_begin_0 = const()[name = tensor<string, []>("op_2272_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3456])];
+            tensor<int32, [4]> var_2272_end_0 = const()[name = tensor<string, []>("op_2272_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3584])];
+            tensor<bool, [4]> var_2272_end_mask_0 = const()[name = tensor<string, []>("op_2272_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2272_cast_fp16 = slice_by_index(begin = var_2272_begin_0, end = var_2272_end_0, end_mask = var_2272_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2272_cast_fp16")];
+            tensor<int32, [4]> var_2276_begin_0 = const()[name = tensor<string, []>("op_2276_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3584])];
+            tensor<int32, [4]> var_2276_end_0 = const()[name = tensor<string, []>("op_2276_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3712])];
+            tensor<bool, [4]> var_2276_end_mask_0 = const()[name = tensor<string, []>("op_2276_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2276_cast_fp16 = slice_by_index(begin = var_2276_begin_0, end = var_2276_end_0, end_mask = var_2276_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2276_cast_fp16")];
+            tensor<int32, [4]> var_2280_begin_0 = const()[name = tensor<string, []>("op_2280_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3712])];
+            tensor<int32, [4]> var_2280_end_0 = const()[name = tensor<string, []>("op_2280_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3840])];
+            tensor<bool, [4]> var_2280_end_mask_0 = const()[name = tensor<string, []>("op_2280_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2280_cast_fp16 = slice_by_index(begin = var_2280_begin_0, end = var_2280_end_0, end_mask = var_2280_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2280_cast_fp16")];
+            tensor<int32, [4]> var_2284_begin_0 = const()[name = tensor<string, []>("op_2284_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3840])];
+            tensor<int32, [4]> var_2284_end_0 = const()[name = tensor<string, []>("op_2284_end_0"), val = tensor<int32, [4]>([1, 512, 1, 3968])];
+            tensor<bool, [4]> var_2284_end_mask_0 = const()[name = tensor<string, []>("op_2284_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2284_cast_fp16 = slice_by_index(begin = var_2284_begin_0, end = var_2284_end_0, end_mask = var_2284_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2284_cast_fp16")];
+            tensor<int32, [4]> var_2288_begin_0 = const()[name = tensor<string, []>("op_2288_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 3968])];
+            tensor<int32, [4]> var_2288_end_0 = const()[name = tensor<string, []>("op_2288_end_0"), val = tensor<int32, [4]>([1, 512, 1, 4096])];
+            tensor<bool, [4]> var_2288_end_mask_0 = const()[name = tensor<string, []>("op_2288_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_2288_cast_fp16 = slice_by_index(begin = var_2288_begin_0, end = var_2288_end_0, end_mask = var_2288_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_2288_cast_fp16")];
+            tensor<int32, [4]> var_2290_begin_0 = const()[name = tensor<string, []>("op_2290_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2290_end_0 = const()[name = tensor<string, []>("op_2290_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_2290_end_mask_0 = const()[name = tensor<string, []>("op_2290_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2290_cast_fp16 = slice_by_index(begin = var_2290_begin_0, end = var_2290_end_0, end_mask = var_2290_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2290_cast_fp16")];
+            tensor<int32, [4]> var_2294_begin_0 = const()[name = tensor<string, []>("op_2294_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_2294_end_0 = const()[name = tensor<string, []>("op_2294_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_2294_end_mask_0 = const()[name = tensor<string, []>("op_2294_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2294_cast_fp16 = slice_by_index(begin = var_2294_begin_0, end = var_2294_end_0, end_mask = var_2294_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2294_cast_fp16")];
+            tensor<int32, [4]> var_2298_begin_0 = const()[name = tensor<string, []>("op_2298_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_2298_end_0 = const()[name = tensor<string, []>("op_2298_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_2298_end_mask_0 = const()[name = tensor<string, []>("op_2298_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2298_cast_fp16 = slice_by_index(begin = var_2298_begin_0, end = var_2298_end_0, end_mask = var_2298_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2298_cast_fp16")];
+            tensor<int32, [4]> var_2302_begin_0 = const()[name = tensor<string, []>("op_2302_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_2302_end_0 = const()[name = tensor<string, []>("op_2302_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_2302_end_mask_0 = const()[name = tensor<string, []>("op_2302_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2302_cast_fp16 = slice_by_index(begin = var_2302_begin_0, end = var_2302_end_0, end_mask = var_2302_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2302_cast_fp16")];
+            tensor<int32, [4]> var_2306_begin_0 = const()[name = tensor<string, []>("op_2306_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_2306_end_0 = const()[name = tensor<string, []>("op_2306_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_2306_end_mask_0 = const()[name = tensor<string, []>("op_2306_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2306_cast_fp16 = slice_by_index(begin = var_2306_begin_0, end = var_2306_end_0, end_mask = var_2306_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2306_cast_fp16")];
+            tensor<int32, [4]> var_2310_begin_0 = const()[name = tensor<string, []>("op_2310_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_2310_end_0 = const()[name = tensor<string, []>("op_2310_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_2310_end_mask_0 = const()[name = tensor<string, []>("op_2310_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2310_cast_fp16 = slice_by_index(begin = var_2310_begin_0, end = var_2310_end_0, end_mask = var_2310_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2310_cast_fp16")];
+            tensor<int32, [4]> var_2314_begin_0 = const()[name = tensor<string, []>("op_2314_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_2314_end_0 = const()[name = tensor<string, []>("op_2314_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_2314_end_mask_0 = const()[name = tensor<string, []>("op_2314_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2314_cast_fp16 = slice_by_index(begin = var_2314_begin_0, end = var_2314_end_0, end_mask = var_2314_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2314_cast_fp16")];
+            tensor<int32, [4]> var_2318_begin_0 = const()[name = tensor<string, []>("op_2318_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_2318_end_0 = const()[name = tensor<string, []>("op_2318_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_2318_end_mask_0 = const()[name = tensor<string, []>("op_2318_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2318_cast_fp16 = slice_by_index(begin = var_2318_begin_0, end = var_2318_end_0, end_mask = var_2318_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2318_cast_fp16")];
+            tensor<int32, [4]> var_2322_begin_0 = const()[name = tensor<string, []>("op_2322_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_2322_end_0 = const()[name = tensor<string, []>("op_2322_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 512])];
+            tensor<bool, [4]> var_2322_end_mask_0 = const()[name = tensor<string, []>("op_2322_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2322_cast_fp16 = slice_by_index(begin = var_2322_begin_0, end = var_2322_end_0, end_mask = var_2322_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2322_cast_fp16")];
+            tensor<int32, [4]> var_2326_begin_0 = const()[name = tensor<string, []>("op_2326_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_2326_end_0 = const()[name = tensor<string, []>("op_2326_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 512])];
+            tensor<bool, [4]> var_2326_end_mask_0 = const()[name = tensor<string, []>("op_2326_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2326_cast_fp16 = slice_by_index(begin = var_2326_begin_0, end = var_2326_end_0, end_mask = var_2326_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2326_cast_fp16")];
+            tensor<int32, [4]> var_2330_begin_0 = const()[name = tensor<string, []>("op_2330_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_2330_end_0 = const()[name = tensor<string, []>("op_2330_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 512])];
+            tensor<bool, [4]> var_2330_end_mask_0 = const()[name = tensor<string, []>("op_2330_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2330_cast_fp16 = slice_by_index(begin = var_2330_begin_0, end = var_2330_end_0, end_mask = var_2330_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2330_cast_fp16")];
+            tensor<int32, [4]> var_2334_begin_0 = const()[name = tensor<string, []>("op_2334_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_2334_end_0 = const()[name = tensor<string, []>("op_2334_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 512])];
+            tensor<bool, [4]> var_2334_end_mask_0 = const()[name = tensor<string, []>("op_2334_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2334_cast_fp16 = slice_by_index(begin = var_2334_begin_0, end = var_2334_end_0, end_mask = var_2334_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2334_cast_fp16")];
+            tensor<int32, [4]> var_2338_begin_0 = const()[name = tensor<string, []>("op_2338_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_2338_end_0 = const()[name = tensor<string, []>("op_2338_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 512])];
+            tensor<bool, [4]> var_2338_end_mask_0 = const()[name = tensor<string, []>("op_2338_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2338_cast_fp16 = slice_by_index(begin = var_2338_begin_0, end = var_2338_end_0, end_mask = var_2338_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2338_cast_fp16")];
+            tensor<int32, [4]> var_2342_begin_0 = const()[name = tensor<string, []>("op_2342_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_2342_end_0 = const()[name = tensor<string, []>("op_2342_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 512])];
+            tensor<bool, [4]> var_2342_end_mask_0 = const()[name = tensor<string, []>("op_2342_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2342_cast_fp16 = slice_by_index(begin = var_2342_begin_0, end = var_2342_end_0, end_mask = var_2342_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2342_cast_fp16")];
+            tensor<int32, [4]> var_2346_begin_0 = const()[name = tensor<string, []>("op_2346_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_2346_end_0 = const()[name = tensor<string, []>("op_2346_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 512])];
+            tensor<bool, [4]> var_2346_end_mask_0 = const()[name = tensor<string, []>("op_2346_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2346_cast_fp16 = slice_by_index(begin = var_2346_begin_0, end = var_2346_end_0, end_mask = var_2346_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2346_cast_fp16")];
+            tensor<int32, [4]> var_2350_begin_0 = const()[name = tensor<string, []>("op_2350_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_2350_end_0 = const()[name = tensor<string, []>("op_2350_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 512])];
+            tensor<bool, [4]> var_2350_end_mask_0 = const()[name = tensor<string, []>("op_2350_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2350_cast_fp16 = slice_by_index(begin = var_2350_begin_0, end = var_2350_end_0, end_mask = var_2350_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2350_cast_fp16")];
+            tensor<int32, [4]> var_2354_begin_0 = const()[name = tensor<string, []>("op_2354_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_2354_end_0 = const()[name = tensor<string, []>("op_2354_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 512])];
+            tensor<bool, [4]> var_2354_end_mask_0 = const()[name = tensor<string, []>("op_2354_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2354_cast_fp16 = slice_by_index(begin = var_2354_begin_0, end = var_2354_end_0, end_mask = var_2354_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2354_cast_fp16")];
+            tensor<int32, [4]> var_2358_begin_0 = const()[name = tensor<string, []>("op_2358_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_2358_end_0 = const()[name = tensor<string, []>("op_2358_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 512])];
+            tensor<bool, [4]> var_2358_end_mask_0 = const()[name = tensor<string, []>("op_2358_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2358_cast_fp16 = slice_by_index(begin = var_2358_begin_0, end = var_2358_end_0, end_mask = var_2358_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2358_cast_fp16")];
+            tensor<int32, [4]> var_2362_begin_0 = const()[name = tensor<string, []>("op_2362_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_2362_end_0 = const()[name = tensor<string, []>("op_2362_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 512])];
+            tensor<bool, [4]> var_2362_end_mask_0 = const()[name = tensor<string, []>("op_2362_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2362_cast_fp16 = slice_by_index(begin = var_2362_begin_0, end = var_2362_end_0, end_mask = var_2362_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2362_cast_fp16")];
+            tensor<int32, [4]> var_2366_begin_0 = const()[name = tensor<string, []>("op_2366_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_2366_end_0 = const()[name = tensor<string, []>("op_2366_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 512])];
+            tensor<bool, [4]> var_2366_end_mask_0 = const()[name = tensor<string, []>("op_2366_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2366_cast_fp16 = slice_by_index(begin = var_2366_begin_0, end = var_2366_end_0, end_mask = var_2366_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2366_cast_fp16")];
+            tensor<int32, [4]> var_2370_begin_0 = const()[name = tensor<string, []>("op_2370_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_2370_end_0 = const()[name = tensor<string, []>("op_2370_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 512])];
+            tensor<bool, [4]> var_2370_end_mask_0 = const()[name = tensor<string, []>("op_2370_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2370_cast_fp16 = slice_by_index(begin = var_2370_begin_0, end = var_2370_end_0, end_mask = var_2370_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2370_cast_fp16")];
+            tensor<int32, [4]> var_2374_begin_0 = const()[name = tensor<string, []>("op_2374_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_2374_end_0 = const()[name = tensor<string, []>("op_2374_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 512])];
+            tensor<bool, [4]> var_2374_end_mask_0 = const()[name = tensor<string, []>("op_2374_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2374_cast_fp16 = slice_by_index(begin = var_2374_begin_0, end = var_2374_end_0, end_mask = var_2374_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2374_cast_fp16")];
+            tensor<int32, [4]> var_2378_begin_0 = const()[name = tensor<string, []>("op_2378_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_2378_end_0 = const()[name = tensor<string, []>("op_2378_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 512])];
+            tensor<bool, [4]> var_2378_end_mask_0 = const()[name = tensor<string, []>("op_2378_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2378_cast_fp16 = slice_by_index(begin = var_2378_begin_0, end = var_2378_end_0, end_mask = var_2378_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2378_cast_fp16")];
+            tensor<int32, [4]> var_2382_begin_0 = const()[name = tensor<string, []>("op_2382_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_2382_end_0 = const()[name = tensor<string, []>("op_2382_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 512])];
+            tensor<bool, [4]> var_2382_end_mask_0 = const()[name = tensor<string, []>("op_2382_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2382_cast_fp16 = slice_by_index(begin = var_2382_begin_0, end = var_2382_end_0, end_mask = var_2382_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2382_cast_fp16")];
+            tensor<int32, [4]> var_2386_begin_0 = const()[name = tensor<string, []>("op_2386_begin_0"), val = tensor<int32, [4]>([0, 3072, 0, 0])];
+            tensor<int32, [4]> var_2386_end_0 = const()[name = tensor<string, []>("op_2386_end_0"), val = tensor<int32, [4]>([1, 3200, 1, 512])];
+            tensor<bool, [4]> var_2386_end_mask_0 = const()[name = tensor<string, []>("op_2386_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2386_cast_fp16 = slice_by_index(begin = var_2386_begin_0, end = var_2386_end_0, end_mask = var_2386_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2386_cast_fp16")];
+            tensor<int32, [4]> var_2390_begin_0 = const()[name = tensor<string, []>("op_2390_begin_0"), val = tensor<int32, [4]>([0, 3200, 0, 0])];
+            tensor<int32, [4]> var_2390_end_0 = const()[name = tensor<string, []>("op_2390_end_0"), val = tensor<int32, [4]>([1, 3328, 1, 512])];
+            tensor<bool, [4]> var_2390_end_mask_0 = const()[name = tensor<string, []>("op_2390_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2390_cast_fp16 = slice_by_index(begin = var_2390_begin_0, end = var_2390_end_0, end_mask = var_2390_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2390_cast_fp16")];
+            tensor<int32, [4]> var_2394_begin_0 = const()[name = tensor<string, []>("op_2394_begin_0"), val = tensor<int32, [4]>([0, 3328, 0, 0])];
+            tensor<int32, [4]> var_2394_end_0 = const()[name = tensor<string, []>("op_2394_end_0"), val = tensor<int32, [4]>([1, 3456, 1, 512])];
+            tensor<bool, [4]> var_2394_end_mask_0 = const()[name = tensor<string, []>("op_2394_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2394_cast_fp16 = slice_by_index(begin = var_2394_begin_0, end = var_2394_end_0, end_mask = var_2394_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2394_cast_fp16")];
+            tensor<int32, [4]> var_2398_begin_0 = const()[name = tensor<string, []>("op_2398_begin_0"), val = tensor<int32, [4]>([0, 3456, 0, 0])];
+            tensor<int32, [4]> var_2398_end_0 = const()[name = tensor<string, []>("op_2398_end_0"), val = tensor<int32, [4]>([1, 3584, 1, 512])];
+            tensor<bool, [4]> var_2398_end_mask_0 = const()[name = tensor<string, []>("op_2398_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2398_cast_fp16 = slice_by_index(begin = var_2398_begin_0, end = var_2398_end_0, end_mask = var_2398_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2398_cast_fp16")];
+            tensor<int32, [4]> var_2402_begin_0 = const()[name = tensor<string, []>("op_2402_begin_0"), val = tensor<int32, [4]>([0, 3584, 0, 0])];
+            tensor<int32, [4]> var_2402_end_0 = const()[name = tensor<string, []>("op_2402_end_0"), val = tensor<int32, [4]>([1, 3712, 1, 512])];
+            tensor<bool, [4]> var_2402_end_mask_0 = const()[name = tensor<string, []>("op_2402_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2402_cast_fp16 = slice_by_index(begin = var_2402_begin_0, end = var_2402_end_0, end_mask = var_2402_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2402_cast_fp16")];
+            tensor<int32, [4]> var_2406_begin_0 = const()[name = tensor<string, []>("op_2406_begin_0"), val = tensor<int32, [4]>([0, 3712, 0, 0])];
+            tensor<int32, [4]> var_2406_end_0 = const()[name = tensor<string, []>("op_2406_end_0"), val = tensor<int32, [4]>([1, 3840, 1, 512])];
+            tensor<bool, [4]> var_2406_end_mask_0 = const()[name = tensor<string, []>("op_2406_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2406_cast_fp16 = slice_by_index(begin = var_2406_begin_0, end = var_2406_end_0, end_mask = var_2406_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2406_cast_fp16")];
+            tensor<int32, [4]> var_2410_begin_0 = const()[name = tensor<string, []>("op_2410_begin_0"), val = tensor<int32, [4]>([0, 3840, 0, 0])];
+            tensor<int32, [4]> var_2410_end_0 = const()[name = tensor<string, []>("op_2410_end_0"), val = tensor<int32, [4]>([1, 3968, 1, 512])];
+            tensor<bool, [4]> var_2410_end_mask_0 = const()[name = tensor<string, []>("op_2410_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2410_cast_fp16 = slice_by_index(begin = var_2410_begin_0, end = var_2410_end_0, end_mask = var_2410_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2410_cast_fp16")];
+            tensor<int32, [4]> var_2414_begin_0 = const()[name = tensor<string, []>("op_2414_begin_0"), val = tensor<int32, [4]>([0, 3968, 0, 0])];
+            tensor<int32, [4]> var_2414_end_0 = const()[name = tensor<string, []>("op_2414_end_0"), val = tensor<int32, [4]>([1, 4096, 1, 512])];
+            tensor<bool, [4]> var_2414_end_mask_0 = const()[name = tensor<string, []>("op_2414_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_2414_cast_fp16 = slice_by_index(begin = var_2414_begin_0, end = var_2414_end_0, end_mask = var_2414_end_mask_0, x = v_27_cast_fp16)[name = tensor<string, []>("op_2414_cast_fp16")];
+            tensor<string, []> var_2418_equation_0 = const()[name = tensor<string, []>("op_2418_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2418_cast_fp16 = einsum(equation = var_2418_equation_0, values = (var_2164_cast_fp16, var_2034_cast_fp16))[name = tensor<string, []>("op_2418_cast_fp16")];
+            tensor<fp16, []> var_2419_to_fp16 = const()[name = tensor<string, []>("op_2419_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2420_cast_fp16 = mul(x = var_2418_cast_fp16, y = var_2419_to_fp16)[name = tensor<string, []>("op_2420_cast_fp16")];
+            tensor<string, []> var_2422_equation_0 = const()[name = tensor<string, []>("op_2422_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2422_cast_fp16 = einsum(equation = var_2422_equation_0, values = (var_2168_cast_fp16, var_2038_cast_fp16))[name = tensor<string, []>("op_2422_cast_fp16")];
+            tensor<fp16, []> var_2423_to_fp16 = const()[name = tensor<string, []>("op_2423_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2424_cast_fp16 = mul(x = var_2422_cast_fp16, y = var_2423_to_fp16)[name = tensor<string, []>("op_2424_cast_fp16")];
+            tensor<string, []> var_2426_equation_0 = const()[name = tensor<string, []>("op_2426_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2426_cast_fp16 = einsum(equation = var_2426_equation_0, values = (var_2172_cast_fp16, var_2042_cast_fp16))[name = tensor<string, []>("op_2426_cast_fp16")];
+            tensor<fp16, []> var_2427_to_fp16 = const()[name = tensor<string, []>("op_2427_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2428_cast_fp16 = mul(x = var_2426_cast_fp16, y = var_2427_to_fp16)[name = tensor<string, []>("op_2428_cast_fp16")];
+            tensor<string, []> var_2430_equation_0 = const()[name = tensor<string, []>("op_2430_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2430_cast_fp16 = einsum(equation = var_2430_equation_0, values = (var_2176_cast_fp16, var_2046_cast_fp16))[name = tensor<string, []>("op_2430_cast_fp16")];
+            tensor<fp16, []> var_2431_to_fp16 = const()[name = tensor<string, []>("op_2431_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2432_cast_fp16 = mul(x = var_2430_cast_fp16, y = var_2431_to_fp16)[name = tensor<string, []>("op_2432_cast_fp16")];
+            tensor<string, []> var_2434_equation_0 = const()[name = tensor<string, []>("op_2434_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2434_cast_fp16 = einsum(equation = var_2434_equation_0, values = (var_2180_cast_fp16, var_2050_cast_fp16))[name = tensor<string, []>("op_2434_cast_fp16")];
+            tensor<fp16, []> var_2435_to_fp16 = const()[name = tensor<string, []>("op_2435_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2436_cast_fp16 = mul(x = var_2434_cast_fp16, y = var_2435_to_fp16)[name = tensor<string, []>("op_2436_cast_fp16")];
+            tensor<string, []> var_2438_equation_0 = const()[name = tensor<string, []>("op_2438_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2438_cast_fp16 = einsum(equation = var_2438_equation_0, values = (var_2184_cast_fp16, var_2054_cast_fp16))[name = tensor<string, []>("op_2438_cast_fp16")];
+            tensor<fp16, []> var_2439_to_fp16 = const()[name = tensor<string, []>("op_2439_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2440_cast_fp16 = mul(x = var_2438_cast_fp16, y = var_2439_to_fp16)[name = tensor<string, []>("op_2440_cast_fp16")];
+            tensor<string, []> var_2442_equation_0 = const()[name = tensor<string, []>("op_2442_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2442_cast_fp16 = einsum(equation = var_2442_equation_0, values = (var_2188_cast_fp16, var_2058_cast_fp16))[name = tensor<string, []>("op_2442_cast_fp16")];
+            tensor<fp16, []> var_2443_to_fp16 = const()[name = tensor<string, []>("op_2443_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2444_cast_fp16 = mul(x = var_2442_cast_fp16, y = var_2443_to_fp16)[name = tensor<string, []>("op_2444_cast_fp16")];
+            tensor<string, []> var_2446_equation_0 = const()[name = tensor<string, []>("op_2446_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2446_cast_fp16 = einsum(equation = var_2446_equation_0, values = (var_2192_cast_fp16, var_2062_cast_fp16))[name = tensor<string, []>("op_2446_cast_fp16")];
+            tensor<fp16, []> var_2447_to_fp16 = const()[name = tensor<string, []>("op_2447_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2448_cast_fp16 = mul(x = var_2446_cast_fp16, y = var_2447_to_fp16)[name = tensor<string, []>("op_2448_cast_fp16")];
+            tensor<string, []> var_2450_equation_0 = const()[name = tensor<string, []>("op_2450_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2450_cast_fp16 = einsum(equation = var_2450_equation_0, values = (var_2196_cast_fp16, var_2066_cast_fp16))[name = tensor<string, []>("op_2450_cast_fp16")];
+            tensor<fp16, []> var_2451_to_fp16 = const()[name = tensor<string, []>("op_2451_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2452_cast_fp16 = mul(x = var_2450_cast_fp16, y = var_2451_to_fp16)[name = tensor<string, []>("op_2452_cast_fp16")];
+            tensor<string, []> var_2454_equation_0 = const()[name = tensor<string, []>("op_2454_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2454_cast_fp16 = einsum(equation = var_2454_equation_0, values = (var_2200_cast_fp16, var_2070_cast_fp16))[name = tensor<string, []>("op_2454_cast_fp16")];
+            tensor<fp16, []> var_2455_to_fp16 = const()[name = tensor<string, []>("op_2455_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2456_cast_fp16 = mul(x = var_2454_cast_fp16, y = var_2455_to_fp16)[name = tensor<string, []>("op_2456_cast_fp16")];
+            tensor<string, []> var_2458_equation_0 = const()[name = tensor<string, []>("op_2458_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2458_cast_fp16 = einsum(equation = var_2458_equation_0, values = (var_2204_cast_fp16, var_2074_cast_fp16))[name = tensor<string, []>("op_2458_cast_fp16")];
+            tensor<fp16, []> var_2459_to_fp16 = const()[name = tensor<string, []>("op_2459_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2460_cast_fp16 = mul(x = var_2458_cast_fp16, y = var_2459_to_fp16)[name = tensor<string, []>("op_2460_cast_fp16")];
+            tensor<string, []> var_2462_equation_0 = const()[name = tensor<string, []>("op_2462_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2462_cast_fp16 = einsum(equation = var_2462_equation_0, values = (var_2208_cast_fp16, var_2078_cast_fp16))[name = tensor<string, []>("op_2462_cast_fp16")];
+            tensor<fp16, []> var_2463_to_fp16 = const()[name = tensor<string, []>("op_2463_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2464_cast_fp16 = mul(x = var_2462_cast_fp16, y = var_2463_to_fp16)[name = tensor<string, []>("op_2464_cast_fp16")];
+            tensor<string, []> var_2466_equation_0 = const()[name = tensor<string, []>("op_2466_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2466_cast_fp16 = einsum(equation = var_2466_equation_0, values = (var_2212_cast_fp16, var_2082_cast_fp16))[name = tensor<string, []>("op_2466_cast_fp16")];
+            tensor<fp16, []> var_2467_to_fp16 = const()[name = tensor<string, []>("op_2467_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2468_cast_fp16 = mul(x = var_2466_cast_fp16, y = var_2467_to_fp16)[name = tensor<string, []>("op_2468_cast_fp16")];
+            tensor<string, []> var_2470_equation_0 = const()[name = tensor<string, []>("op_2470_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2470_cast_fp16 = einsum(equation = var_2470_equation_0, values = (var_2216_cast_fp16, var_2086_cast_fp16))[name = tensor<string, []>("op_2470_cast_fp16")];
+            tensor<fp16, []> var_2471_to_fp16 = const()[name = tensor<string, []>("op_2471_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2472_cast_fp16 = mul(x = var_2470_cast_fp16, y = var_2471_to_fp16)[name = tensor<string, []>("op_2472_cast_fp16")];
+            tensor<string, []> var_2474_equation_0 = const()[name = tensor<string, []>("op_2474_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2474_cast_fp16 = einsum(equation = var_2474_equation_0, values = (var_2220_cast_fp16, var_2090_cast_fp16))[name = tensor<string, []>("op_2474_cast_fp16")];
+            tensor<fp16, []> var_2475_to_fp16 = const()[name = tensor<string, []>("op_2475_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2476_cast_fp16 = mul(x = var_2474_cast_fp16, y = var_2475_to_fp16)[name = tensor<string, []>("op_2476_cast_fp16")];
+            tensor<string, []> var_2478_equation_0 = const()[name = tensor<string, []>("op_2478_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2478_cast_fp16 = einsum(equation = var_2478_equation_0, values = (var_2224_cast_fp16, var_2094_cast_fp16))[name = tensor<string, []>("op_2478_cast_fp16")];
+            tensor<fp16, []> var_2479_to_fp16 = const()[name = tensor<string, []>("op_2479_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2480_cast_fp16 = mul(x = var_2478_cast_fp16, y = var_2479_to_fp16)[name = tensor<string, []>("op_2480_cast_fp16")];
+            tensor<string, []> var_2482_equation_0 = const()[name = tensor<string, []>("op_2482_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2482_cast_fp16 = einsum(equation = var_2482_equation_0, values = (var_2228_cast_fp16, var_2098_cast_fp16))[name = tensor<string, []>("op_2482_cast_fp16")];
+            tensor<fp16, []> var_2483_to_fp16 = const()[name = tensor<string, []>("op_2483_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2484_cast_fp16 = mul(x = var_2482_cast_fp16, y = var_2483_to_fp16)[name = tensor<string, []>("op_2484_cast_fp16")];
+            tensor<string, []> var_2486_equation_0 = const()[name = tensor<string, []>("op_2486_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2486_cast_fp16 = einsum(equation = var_2486_equation_0, values = (var_2232_cast_fp16, var_2102_cast_fp16))[name = tensor<string, []>("op_2486_cast_fp16")];
+            tensor<fp16, []> var_2487_to_fp16 = const()[name = tensor<string, []>("op_2487_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2488_cast_fp16 = mul(x = var_2486_cast_fp16, y = var_2487_to_fp16)[name = tensor<string, []>("op_2488_cast_fp16")];
+            tensor<string, []> var_2490_equation_0 = const()[name = tensor<string, []>("op_2490_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2490_cast_fp16 = einsum(equation = var_2490_equation_0, values = (var_2236_cast_fp16, var_2106_cast_fp16))[name = tensor<string, []>("op_2490_cast_fp16")];
+            tensor<fp16, []> var_2491_to_fp16 = const()[name = tensor<string, []>("op_2491_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2492_cast_fp16 = mul(x = var_2490_cast_fp16, y = var_2491_to_fp16)[name = tensor<string, []>("op_2492_cast_fp16")];
+            tensor<string, []> var_2494_equation_0 = const()[name = tensor<string, []>("op_2494_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2494_cast_fp16 = einsum(equation = var_2494_equation_0, values = (var_2240_cast_fp16, var_2110_cast_fp16))[name = tensor<string, []>("op_2494_cast_fp16")];
+            tensor<fp16, []> var_2495_to_fp16 = const()[name = tensor<string, []>("op_2495_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2496_cast_fp16 = mul(x = var_2494_cast_fp16, y = var_2495_to_fp16)[name = tensor<string, []>("op_2496_cast_fp16")];
+            tensor<string, []> var_2498_equation_0 = const()[name = tensor<string, []>("op_2498_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2498_cast_fp16 = einsum(equation = var_2498_equation_0, values = (var_2244_cast_fp16, var_2114_cast_fp16))[name = tensor<string, []>("op_2498_cast_fp16")];
+            tensor<fp16, []> var_2499_to_fp16 = const()[name = tensor<string, []>("op_2499_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2500_cast_fp16 = mul(x = var_2498_cast_fp16, y = var_2499_to_fp16)[name = tensor<string, []>("op_2500_cast_fp16")];
+            tensor<string, []> var_2502_equation_0 = const()[name = tensor<string, []>("op_2502_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2502_cast_fp16 = einsum(equation = var_2502_equation_0, values = (var_2248_cast_fp16, var_2118_cast_fp16))[name = tensor<string, []>("op_2502_cast_fp16")];
+            tensor<fp16, []> var_2503_to_fp16 = const()[name = tensor<string, []>("op_2503_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2504_cast_fp16 = mul(x = var_2502_cast_fp16, y = var_2503_to_fp16)[name = tensor<string, []>("op_2504_cast_fp16")];
+            tensor<string, []> var_2506_equation_0 = const()[name = tensor<string, []>("op_2506_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2506_cast_fp16 = einsum(equation = var_2506_equation_0, values = (var_2252_cast_fp16, var_2122_cast_fp16))[name = tensor<string, []>("op_2506_cast_fp16")];
+            tensor<fp16, []> var_2507_to_fp16 = const()[name = tensor<string, []>("op_2507_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2508_cast_fp16 = mul(x = var_2506_cast_fp16, y = var_2507_to_fp16)[name = tensor<string, []>("op_2508_cast_fp16")];
+            tensor<string, []> var_2510_equation_0 = const()[name = tensor<string, []>("op_2510_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2510_cast_fp16 = einsum(equation = var_2510_equation_0, values = (var_2256_cast_fp16, var_2126_cast_fp16))[name = tensor<string, []>("op_2510_cast_fp16")];
+            tensor<fp16, []> var_2511_to_fp16 = const()[name = tensor<string, []>("op_2511_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2512_cast_fp16 = mul(x = var_2510_cast_fp16, y = var_2511_to_fp16)[name = tensor<string, []>("op_2512_cast_fp16")];
+            tensor<string, []> var_2514_equation_0 = const()[name = tensor<string, []>("op_2514_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2514_cast_fp16 = einsum(equation = var_2514_equation_0, values = (var_2260_cast_fp16, var_2130_cast_fp16))[name = tensor<string, []>("op_2514_cast_fp16")];
+            tensor<fp16, []> var_2515_to_fp16 = const()[name = tensor<string, []>("op_2515_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2516_cast_fp16 = mul(x = var_2514_cast_fp16, y = var_2515_to_fp16)[name = tensor<string, []>("op_2516_cast_fp16")];
+            tensor<string, []> var_2518_equation_0 = const()[name = tensor<string, []>("op_2518_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2518_cast_fp16 = einsum(equation = var_2518_equation_0, values = (var_2264_cast_fp16, var_2134_cast_fp16))[name = tensor<string, []>("op_2518_cast_fp16")];
+            tensor<fp16, []> var_2519_to_fp16 = const()[name = tensor<string, []>("op_2519_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2520_cast_fp16 = mul(x = var_2518_cast_fp16, y = var_2519_to_fp16)[name = tensor<string, []>("op_2520_cast_fp16")];
+            tensor<string, []> var_2522_equation_0 = const()[name = tensor<string, []>("op_2522_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2522_cast_fp16 = einsum(equation = var_2522_equation_0, values = (var_2268_cast_fp16, var_2138_cast_fp16))[name = tensor<string, []>("op_2522_cast_fp16")];
+            tensor<fp16, []> var_2523_to_fp16 = const()[name = tensor<string, []>("op_2523_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2524_cast_fp16 = mul(x = var_2522_cast_fp16, y = var_2523_to_fp16)[name = tensor<string, []>("op_2524_cast_fp16")];
+            tensor<string, []> var_2526_equation_0 = const()[name = tensor<string, []>("op_2526_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2526_cast_fp16 = einsum(equation = var_2526_equation_0, values = (var_2272_cast_fp16, var_2142_cast_fp16))[name = tensor<string, []>("op_2526_cast_fp16")];
+            tensor<fp16, []> var_2527_to_fp16 = const()[name = tensor<string, []>("op_2527_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2528_cast_fp16 = mul(x = var_2526_cast_fp16, y = var_2527_to_fp16)[name = tensor<string, []>("op_2528_cast_fp16")];
+            tensor<string, []> var_2530_equation_0 = const()[name = tensor<string, []>("op_2530_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2530_cast_fp16 = einsum(equation = var_2530_equation_0, values = (var_2276_cast_fp16, var_2146_cast_fp16))[name = tensor<string, []>("op_2530_cast_fp16")];
+            tensor<fp16, []> var_2531_to_fp16 = const()[name = tensor<string, []>("op_2531_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2532_cast_fp16 = mul(x = var_2530_cast_fp16, y = var_2531_to_fp16)[name = tensor<string, []>("op_2532_cast_fp16")];
+            tensor<string, []> var_2534_equation_0 = const()[name = tensor<string, []>("op_2534_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2534_cast_fp16 = einsum(equation = var_2534_equation_0, values = (var_2280_cast_fp16, var_2150_cast_fp16))[name = tensor<string, []>("op_2534_cast_fp16")];
+            tensor<fp16, []> var_2535_to_fp16 = const()[name = tensor<string, []>("op_2535_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2536_cast_fp16 = mul(x = var_2534_cast_fp16, y = var_2535_to_fp16)[name = tensor<string, []>("op_2536_cast_fp16")];
+            tensor<string, []> var_2538_equation_0 = const()[name = tensor<string, []>("op_2538_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2538_cast_fp16 = einsum(equation = var_2538_equation_0, values = (var_2284_cast_fp16, var_2154_cast_fp16))[name = tensor<string, []>("op_2538_cast_fp16")];
+            tensor<fp16, []> var_2539_to_fp16 = const()[name = tensor<string, []>("op_2539_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2540_cast_fp16 = mul(x = var_2538_cast_fp16, y = var_2539_to_fp16)[name = tensor<string, []>("op_2540_cast_fp16")];
+            tensor<string, []> var_2542_equation_0 = const()[name = tensor<string, []>("op_2542_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_2542_cast_fp16 = einsum(equation = var_2542_equation_0, values = (var_2288_cast_fp16, var_2158_cast_fp16))[name = tensor<string, []>("op_2542_cast_fp16")];
+            tensor<fp16, []> var_2543_to_fp16 = const()[name = tensor<string, []>("op_2543_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_2544_cast_fp16 = mul(x = var_2542_cast_fp16, y = var_2543_to_fp16)[name = tensor<string, []>("op_2544_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_129_cast_fp16 = add(x = var_2420_cast_fp16, y = mask)[name = tensor<string, []>("aw_129_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_131_cast_fp16 = add(x = var_2424_cast_fp16, y = mask)[name = tensor<string, []>("aw_131_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_133_cast_fp16 = add(x = var_2428_cast_fp16, y = mask)[name = tensor<string, []>("aw_133_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_135_cast_fp16 = add(x = var_2432_cast_fp16, y = mask)[name = tensor<string, []>("aw_135_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_137_cast_fp16 = add(x = var_2436_cast_fp16, y = mask)[name = tensor<string, []>("aw_137_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_139_cast_fp16 = add(x = var_2440_cast_fp16, y = mask)[name = tensor<string, []>("aw_139_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_141_cast_fp16 = add(x = var_2444_cast_fp16, y = mask)[name = tensor<string, []>("aw_141_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_143_cast_fp16 = add(x = var_2448_cast_fp16, y = mask)[name = tensor<string, []>("aw_143_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_145_cast_fp16 = add(x = var_2452_cast_fp16, y = mask)[name = tensor<string, []>("aw_145_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_147_cast_fp16 = add(x = var_2456_cast_fp16, y = mask)[name = tensor<string, []>("aw_147_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_149_cast_fp16 = add(x = var_2460_cast_fp16, y = mask)[name = tensor<string, []>("aw_149_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_151_cast_fp16 = add(x = var_2464_cast_fp16, y = mask)[name = tensor<string, []>("aw_151_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_153_cast_fp16 = add(x = var_2468_cast_fp16, y = mask)[name = tensor<string, []>("aw_153_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_155_cast_fp16 = add(x = var_2472_cast_fp16, y = mask)[name = tensor<string, []>("aw_155_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_157_cast_fp16 = add(x = var_2476_cast_fp16, y = mask)[name = tensor<string, []>("aw_157_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_159_cast_fp16 = add(x = var_2480_cast_fp16, y = mask)[name = tensor<string, []>("aw_159_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_161_cast_fp16 = add(x = var_2484_cast_fp16, y = mask)[name = tensor<string, []>("aw_161_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_163_cast_fp16 = add(x = var_2488_cast_fp16, y = mask)[name = tensor<string, []>("aw_163_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_165_cast_fp16 = add(x = var_2492_cast_fp16, y = mask)[name = tensor<string, []>("aw_165_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_167_cast_fp16 = add(x = var_2496_cast_fp16, y = mask)[name = tensor<string, []>("aw_167_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_169_cast_fp16 = add(x = var_2500_cast_fp16, y = mask)[name = tensor<string, []>("aw_169_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_171_cast_fp16 = add(x = var_2504_cast_fp16, y = mask)[name = tensor<string, []>("aw_171_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_173_cast_fp16 = add(x = var_2508_cast_fp16, y = mask)[name = tensor<string, []>("aw_173_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_175_cast_fp16 = add(x = var_2512_cast_fp16, y = mask)[name = tensor<string, []>("aw_175_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_177_cast_fp16 = add(x = var_2516_cast_fp16, y = mask)[name = tensor<string, []>("aw_177_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_179_cast_fp16 = add(x = var_2520_cast_fp16, y = mask)[name = tensor<string, []>("aw_179_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_181_cast_fp16 = add(x = var_2524_cast_fp16, y = mask)[name = tensor<string, []>("aw_181_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_183_cast_fp16 = add(x = var_2528_cast_fp16, y = mask)[name = tensor<string, []>("aw_183_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_185_cast_fp16 = add(x = var_2532_cast_fp16, y = mask)[name = tensor<string, []>("aw_185_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_187_cast_fp16 = add(x = var_2536_cast_fp16, y = mask)[name = tensor<string, []>("aw_187_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_189_cast_fp16 = add(x = var_2540_cast_fp16, y = mask)[name = tensor<string, []>("aw_189_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_cast_fp16 = add(x = var_2544_cast_fp16, y = mask)[name = tensor<string, []>("aw_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2577_cast_fp16 = softmax(axis = var_1886, x = aw_129_cast_fp16)[name = tensor<string, []>("op_2577_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2578_cast_fp16 = softmax(axis = var_1886, x = aw_131_cast_fp16)[name = tensor<string, []>("op_2578_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2579_cast_fp16 = softmax(axis = var_1886, x = aw_133_cast_fp16)[name = tensor<string, []>("op_2579_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2580_cast_fp16 = softmax(axis = var_1886, x = aw_135_cast_fp16)[name = tensor<string, []>("op_2580_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2581_cast_fp16 = softmax(axis = var_1886, x = aw_137_cast_fp16)[name = tensor<string, []>("op_2581_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2582_cast_fp16 = softmax(axis = var_1886, x = aw_139_cast_fp16)[name = tensor<string, []>("op_2582_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2583_cast_fp16 = softmax(axis = var_1886, x = aw_141_cast_fp16)[name = tensor<string, []>("op_2583_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2584_cast_fp16 = softmax(axis = var_1886, x = aw_143_cast_fp16)[name = tensor<string, []>("op_2584_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2585_cast_fp16 = softmax(axis = var_1886, x = aw_145_cast_fp16)[name = tensor<string, []>("op_2585_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2586_cast_fp16 = softmax(axis = var_1886, x = aw_147_cast_fp16)[name = tensor<string, []>("op_2586_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2587_cast_fp16 = softmax(axis = var_1886, x = aw_149_cast_fp16)[name = tensor<string, []>("op_2587_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2588_cast_fp16 = softmax(axis = var_1886, x = aw_151_cast_fp16)[name = tensor<string, []>("op_2588_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2589_cast_fp16 = softmax(axis = var_1886, x = aw_153_cast_fp16)[name = tensor<string, []>("op_2589_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2590_cast_fp16 = softmax(axis = var_1886, x = aw_155_cast_fp16)[name = tensor<string, []>("op_2590_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2591_cast_fp16 = softmax(axis = var_1886, x = aw_157_cast_fp16)[name = tensor<string, []>("op_2591_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2592_cast_fp16 = softmax(axis = var_1886, x = aw_159_cast_fp16)[name = tensor<string, []>("op_2592_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2593_cast_fp16 = softmax(axis = var_1886, x = aw_161_cast_fp16)[name = tensor<string, []>("op_2593_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2594_cast_fp16 = softmax(axis = var_1886, x = aw_163_cast_fp16)[name = tensor<string, []>("op_2594_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2595_cast_fp16 = softmax(axis = var_1886, x = aw_165_cast_fp16)[name = tensor<string, []>("op_2595_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2596_cast_fp16 = softmax(axis = var_1886, x = aw_167_cast_fp16)[name = tensor<string, []>("op_2596_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2597_cast_fp16 = softmax(axis = var_1886, x = aw_169_cast_fp16)[name = tensor<string, []>("op_2597_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2598_cast_fp16 = softmax(axis = var_1886, x = aw_171_cast_fp16)[name = tensor<string, []>("op_2598_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2599_cast_fp16 = softmax(axis = var_1886, x = aw_173_cast_fp16)[name = tensor<string, []>("op_2599_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2600_cast_fp16 = softmax(axis = var_1886, x = aw_175_cast_fp16)[name = tensor<string, []>("op_2600_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2601_cast_fp16 = softmax(axis = var_1886, x = aw_177_cast_fp16)[name = tensor<string, []>("op_2601_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2602_cast_fp16 = softmax(axis = var_1886, x = aw_179_cast_fp16)[name = tensor<string, []>("op_2602_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2603_cast_fp16 = softmax(axis = var_1886, x = aw_181_cast_fp16)[name = tensor<string, []>("op_2603_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2604_cast_fp16 = softmax(axis = var_1886, x = aw_183_cast_fp16)[name = tensor<string, []>("op_2604_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2605_cast_fp16 = softmax(axis = var_1886, x = aw_185_cast_fp16)[name = tensor<string, []>("op_2605_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2606_cast_fp16 = softmax(axis = var_1886, x = aw_187_cast_fp16)[name = tensor<string, []>("op_2606_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2607_cast_fp16 = softmax(axis = var_1886, x = aw_189_cast_fp16)[name = tensor<string, []>("op_2607_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_2608_cast_fp16 = softmax(axis = var_1886, x = aw_cast_fp16)[name = tensor<string, []>("op_2608_cast_fp16")];
+            tensor<string, []> var_2610_equation_0 = const()[name = tensor<string, []>("op_2610_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2610_cast_fp16 = einsum(equation = var_2610_equation_0, values = (var_2290_cast_fp16, var_2577_cast_fp16))[name = tensor<string, []>("op_2610_cast_fp16")];
+            tensor<string, []> var_2612_equation_0 = const()[name = tensor<string, []>("op_2612_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2612_cast_fp16 = einsum(equation = var_2612_equation_0, values = (var_2294_cast_fp16, var_2578_cast_fp16))[name = tensor<string, []>("op_2612_cast_fp16")];
+            tensor<string, []> var_2614_equation_0 = const()[name = tensor<string, []>("op_2614_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2614_cast_fp16 = einsum(equation = var_2614_equation_0, values = (var_2298_cast_fp16, var_2579_cast_fp16))[name = tensor<string, []>("op_2614_cast_fp16")];
+            tensor<string, []> var_2616_equation_0 = const()[name = tensor<string, []>("op_2616_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2616_cast_fp16 = einsum(equation = var_2616_equation_0, values = (var_2302_cast_fp16, var_2580_cast_fp16))[name = tensor<string, []>("op_2616_cast_fp16")];
+            tensor<string, []> var_2618_equation_0 = const()[name = tensor<string, []>("op_2618_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2618_cast_fp16 = einsum(equation = var_2618_equation_0, values = (var_2306_cast_fp16, var_2581_cast_fp16))[name = tensor<string, []>("op_2618_cast_fp16")];
+            tensor<string, []> var_2620_equation_0 = const()[name = tensor<string, []>("op_2620_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2620_cast_fp16 = einsum(equation = var_2620_equation_0, values = (var_2310_cast_fp16, var_2582_cast_fp16))[name = tensor<string, []>("op_2620_cast_fp16")];
+            tensor<string, []> var_2622_equation_0 = const()[name = tensor<string, []>("op_2622_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2622_cast_fp16 = einsum(equation = var_2622_equation_0, values = (var_2314_cast_fp16, var_2583_cast_fp16))[name = tensor<string, []>("op_2622_cast_fp16")];
+            tensor<string, []> var_2624_equation_0 = const()[name = tensor<string, []>("op_2624_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2624_cast_fp16 = einsum(equation = var_2624_equation_0, values = (var_2318_cast_fp16, var_2584_cast_fp16))[name = tensor<string, []>("op_2624_cast_fp16")];
+            tensor<string, []> var_2626_equation_0 = const()[name = tensor<string, []>("op_2626_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2626_cast_fp16 = einsum(equation = var_2626_equation_0, values = (var_2322_cast_fp16, var_2585_cast_fp16))[name = tensor<string, []>("op_2626_cast_fp16")];
+            tensor<string, []> var_2628_equation_0 = const()[name = tensor<string, []>("op_2628_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2628_cast_fp16 = einsum(equation = var_2628_equation_0, values = (var_2326_cast_fp16, var_2586_cast_fp16))[name = tensor<string, []>("op_2628_cast_fp16")];
+            tensor<string, []> var_2630_equation_0 = const()[name = tensor<string, []>("op_2630_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2630_cast_fp16 = einsum(equation = var_2630_equation_0, values = (var_2330_cast_fp16, var_2587_cast_fp16))[name = tensor<string, []>("op_2630_cast_fp16")];
+            tensor<string, []> var_2632_equation_0 = const()[name = tensor<string, []>("op_2632_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2632_cast_fp16 = einsum(equation = var_2632_equation_0, values = (var_2334_cast_fp16, var_2588_cast_fp16))[name = tensor<string, []>("op_2632_cast_fp16")];
+            tensor<string, []> var_2634_equation_0 = const()[name = tensor<string, []>("op_2634_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2634_cast_fp16 = einsum(equation = var_2634_equation_0, values = (var_2338_cast_fp16, var_2589_cast_fp16))[name = tensor<string, []>("op_2634_cast_fp16")];
+            tensor<string, []> var_2636_equation_0 = const()[name = tensor<string, []>("op_2636_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2636_cast_fp16 = einsum(equation = var_2636_equation_0, values = (var_2342_cast_fp16, var_2590_cast_fp16))[name = tensor<string, []>("op_2636_cast_fp16")];
+            tensor<string, []> var_2638_equation_0 = const()[name = tensor<string, []>("op_2638_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2638_cast_fp16 = einsum(equation = var_2638_equation_0, values = (var_2346_cast_fp16, var_2591_cast_fp16))[name = tensor<string, []>("op_2638_cast_fp16")];
+            tensor<string, []> var_2640_equation_0 = const()[name = tensor<string, []>("op_2640_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2640_cast_fp16 = einsum(equation = var_2640_equation_0, values = (var_2350_cast_fp16, var_2592_cast_fp16))[name = tensor<string, []>("op_2640_cast_fp16")];
+            tensor<string, []> var_2642_equation_0 = const()[name = tensor<string, []>("op_2642_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2642_cast_fp16 = einsum(equation = var_2642_equation_0, values = (var_2354_cast_fp16, var_2593_cast_fp16))[name = tensor<string, []>("op_2642_cast_fp16")];
+            tensor<string, []> var_2644_equation_0 = const()[name = tensor<string, []>("op_2644_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2644_cast_fp16 = einsum(equation = var_2644_equation_0, values = (var_2358_cast_fp16, var_2594_cast_fp16))[name = tensor<string, []>("op_2644_cast_fp16")];
+            tensor<string, []> var_2646_equation_0 = const()[name = tensor<string, []>("op_2646_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2646_cast_fp16 = einsum(equation = var_2646_equation_0, values = (var_2362_cast_fp16, var_2595_cast_fp16))[name = tensor<string, []>("op_2646_cast_fp16")];
+            tensor<string, []> var_2648_equation_0 = const()[name = tensor<string, []>("op_2648_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2648_cast_fp16 = einsum(equation = var_2648_equation_0, values = (var_2366_cast_fp16, var_2596_cast_fp16))[name = tensor<string, []>("op_2648_cast_fp16")];
+            tensor<string, []> var_2650_equation_0 = const()[name = tensor<string, []>("op_2650_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2650_cast_fp16 = einsum(equation = var_2650_equation_0, values = (var_2370_cast_fp16, var_2597_cast_fp16))[name = tensor<string, []>("op_2650_cast_fp16")];
+            tensor<string, []> var_2652_equation_0 = const()[name = tensor<string, []>("op_2652_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2652_cast_fp16 = einsum(equation = var_2652_equation_0, values = (var_2374_cast_fp16, var_2598_cast_fp16))[name = tensor<string, []>("op_2652_cast_fp16")];
+            tensor<string, []> var_2654_equation_0 = const()[name = tensor<string, []>("op_2654_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2654_cast_fp16 = einsum(equation = var_2654_equation_0, values = (var_2378_cast_fp16, var_2599_cast_fp16))[name = tensor<string, []>("op_2654_cast_fp16")];
+            tensor<string, []> var_2656_equation_0 = const()[name = tensor<string, []>("op_2656_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2656_cast_fp16 = einsum(equation = var_2656_equation_0, values = (var_2382_cast_fp16, var_2600_cast_fp16))[name = tensor<string, []>("op_2656_cast_fp16")];
+            tensor<string, []> var_2658_equation_0 = const()[name = tensor<string, []>("op_2658_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2658_cast_fp16 = einsum(equation = var_2658_equation_0, values = (var_2386_cast_fp16, var_2601_cast_fp16))[name = tensor<string, []>("op_2658_cast_fp16")];
+            tensor<string, []> var_2660_equation_0 = const()[name = tensor<string, []>("op_2660_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2660_cast_fp16 = einsum(equation = var_2660_equation_0, values = (var_2390_cast_fp16, var_2602_cast_fp16))[name = tensor<string, []>("op_2660_cast_fp16")];
+            tensor<string, []> var_2662_equation_0 = const()[name = tensor<string, []>("op_2662_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2662_cast_fp16 = einsum(equation = var_2662_equation_0, values = (var_2394_cast_fp16, var_2603_cast_fp16))[name = tensor<string, []>("op_2662_cast_fp16")];
+            tensor<string, []> var_2664_equation_0 = const()[name = tensor<string, []>("op_2664_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2664_cast_fp16 = einsum(equation = var_2664_equation_0, values = (var_2398_cast_fp16, var_2604_cast_fp16))[name = tensor<string, []>("op_2664_cast_fp16")];
+            tensor<string, []> var_2666_equation_0 = const()[name = tensor<string, []>("op_2666_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2666_cast_fp16 = einsum(equation = var_2666_equation_0, values = (var_2402_cast_fp16, var_2605_cast_fp16))[name = tensor<string, []>("op_2666_cast_fp16")];
+            tensor<string, []> var_2668_equation_0 = const()[name = tensor<string, []>("op_2668_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2668_cast_fp16 = einsum(equation = var_2668_equation_0, values = (var_2406_cast_fp16, var_2606_cast_fp16))[name = tensor<string, []>("op_2668_cast_fp16")];
+            tensor<string, []> var_2670_equation_0 = const()[name = tensor<string, []>("op_2670_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2670_cast_fp16 = einsum(equation = var_2670_equation_0, values = (var_2410_cast_fp16, var_2607_cast_fp16))[name = tensor<string, []>("op_2670_cast_fp16")];
+            tensor<string, []> var_2672_equation_0 = const()[name = tensor<string, []>("op_2672_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_2672_cast_fp16 = einsum(equation = var_2672_equation_0, values = (var_2414_cast_fp16, var_2608_cast_fp16))[name = tensor<string, []>("op_2672_cast_fp16")];
+            tensor<bool, []> x_43_interleave_0 = const()[name = tensor<string, []>("x_43_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 4096, 1, 64]> x_43_cast_fp16 = concat(axis = var_1886, interleave = x_43_interleave_0, values = (var_2610_cast_fp16, var_2612_cast_fp16, var_2614_cast_fp16, var_2616_cast_fp16, var_2618_cast_fp16, var_2620_cast_fp16, var_2622_cast_fp16, var_2624_cast_fp16, var_2626_cast_fp16, var_2628_cast_fp16, var_2630_cast_fp16, var_2632_cast_fp16, var_2634_cast_fp16, var_2636_cast_fp16, var_2638_cast_fp16, var_2640_cast_fp16, var_2642_cast_fp16, var_2644_cast_fp16, var_2646_cast_fp16, var_2648_cast_fp16, var_2650_cast_fp16, var_2652_cast_fp16, var_2654_cast_fp16, var_2656_cast_fp16, var_2658_cast_fp16, var_2660_cast_fp16, var_2662_cast_fp16, var_2664_cast_fp16, var_2666_cast_fp16, var_2668_cast_fp16, var_2670_cast_fp16, var_2672_cast_fp16))[name = tensor<string, []>("x_43_cast_fp16")];
+            tensor<int32, [4]> var_2677 = const()[name = tensor<string, []>("op_2677"), val = tensor<int32, [4]>([1, 4096, -1, 8])];
+            tensor<fp16, [1, 4096, 8, 8]> input_23_cast_fp16 = reshape(shape = var_2677, x = x_43_cast_fp16)[name = tensor<string, []>("input_23_cast_fp16")];
+            tensor<int32, [2]> var_2681 = const()[name = tensor<string, []>("op_2681"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_2683 = const()[name = tensor<string, []>("op_2683"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_2685_pad_type_0 = const()[name = tensor<string, []>("op_2685_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_2685_pad_0 = const()[name = tensor<string, []>("op_2685_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 8, 8]> var_2685_cast_fp16 = conv(dilations = var_2683, groups = var_1886, pad = var_2685_pad_0, pad_type = var_2685_pad_type_0, strides = var_2681, weight = blocks_2_attn_proj_weight_palettized_cast_fp16, x = input_23_cast_fp16)[name = tensor<string, []>("op_2685_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_attn_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_2_attn_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303804736)))];
+            tensor<fp16, [1, 4096, 8, 8]> attention_output_cast_fp16 = mul(x = var_2685_cast_fp16, y = blocks_2_attn_proj_output_scales_to_fp16)[name = tensor<string, []>("attention_output_cast_fp16")];
+            tensor<fp16, [1, 4096, 8, 8]> x_45_cast_fp16 = add(x = attention_output_cast_fp16, y = x_33_cast_fp16)[name = tensor<string, []>("x_45_cast_fp16")];
+            tensor<bool, []> x_eps_interleave_0 = const()[name = tensor<string, []>("x_eps_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_to_fp16 = const()[name = tensor<string, []>("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303812992)))];
+            tensor<fp16, [1, 4097, 8, 8]> x_eps_cast_fp16 = concat(axis = var_1886, interleave = x_eps_interleave_0, values = (x_45_cast_fp16, eps_chan_to_fp16))[name = tensor<string, []>("x_eps_cast_fp16")];
+            tensor<int32, [1]> norm_x_axes_0 = const()[name = tensor<string, []>("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_1889, x = x_eps_cast_fp16)[name = tensor<string, []>("norm_x_cast_fp16")];
+            tensor<fp16, [1, 4096, 8, 8]> x_normed_31_cast_fp16 = real_div(x = x_45_cast_fp16, y = norm_x_cast_fp16)[name = tensor<string, []>("x_normed_31_cast_fp16")];
+            tensor<fp16, []> var_2710_to_fp16 = const()[name = tensor<string, []>("op_2710_to_fp16"), val = tensor<fp16, []>(0x1p+6)];
+            tensor<fp16, [1, 4096, 8, 8]> x_normed_33_cast_fp16 = mul(x = x_normed_31_cast_fp16, y = var_2710_to_fp16)[name = tensor<string, []>("x_normed_33_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_2_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303813184)))];
+            tensor<fp16, [1, 4096, 8, 8]> input_25_cast_fp16 = mul(x = x_normed_33_cast_fp16, y = blocks_2_norm_2_weight_to_fp16)[name = tensor<string, []>("input_25_cast_fp16")];
+            tensor<int32, [2]> var_2722 = const()[name = tensor<string, []>("op_2722"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_2724 = const()[name = tensor<string, []>("op_2724"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_2726_pad_type_0 = const()[name = tensor<string, []>("op_2726_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_2726_pad_0 = const()[name = tensor<string, []>("op_2726_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 8, 8]> var_2726_cast_fp16 = conv(dilations = var_2724, groups = var_1886, pad = var_2726_pad_0, pad_type = var_2726_pad_type_0, strides = var_2722, weight = blocks_2_mlp_fc_1_weight_palettized_cast_fp16, x = input_25_cast_fp16)[name = tensor<string, []>("op_2726_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_1_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_2_mlp_fc_1_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303821440)))];
+            tensor<fp16, [1, 11008, 8, 8]> input_27_cast_fp16 = mul(x = var_2726_cast_fp16, y = blocks_2_mlp_fc_1_output_scales_to_fp16)[name = tensor<string, []>("input_27_cast_fp16")];
+            tensor<int32, [2]> var_2730 = const()[name = tensor<string, []>("op_2730"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_2732 = const()[name = tensor<string, []>("op_2732"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_2734_pad_type_0 = const()[name = tensor<string, []>("op_2734_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_2734_pad_0 = const()[name = tensor<string, []>("op_2734_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 11008, 8, 8]> var_2734_cast_fp16 = conv(dilations = var_2732, groups = var_1886, pad = var_2734_pad_0, pad_type = var_2734_pad_type_0, strides = var_2730, weight = blocks_2_mlp_fc_2_weight_palettized_cast_fp16, x = input_25_cast_fp16)[name = tensor<string, []>("op_2734_cast_fp16")];
+            tensor<fp16, [1, 11008, 1, 1]> blocks_2_mlp_fc_2_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_2_mlp_fc_2_output_scales_to_fp16"), val = tensor<fp16, [1, 11008, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303843520)))];
+            tensor<fp16, [1, 11008, 8, 8]> x_fc_2_cast_fp16 = mul(x = var_2734_cast_fp16, y = blocks_2_mlp_fc_2_output_scales_to_fp16)[name = tensor<string, []>("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 11008, 8, 8]> var_2736_cast_fp16 = silu(x = input_27_cast_fp16)[name = tensor<string, []>("op_2736_cast_fp16")];
+            tensor<fp16, [1, 11008, 8, 8]> input_cast_fp16 = mul(x = var_2736_cast_fp16, y = x_fc_2_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
+            tensor<int32, [2]> var_2740 = const()[name = tensor<string, []>("op_2740"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_2742 = const()[name = tensor<string, []>("op_2742"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_2744_pad_type_0 = const()[name = tensor<string, []>("op_2744_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_2744_pad_0 = const()[name = tensor<string, []>("op_2744_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1, 4096, 8, 8]> var_2744_cast_fp16 = conv(dilations = var_2742, groups = var_1886, pad = var_2744_pad_0, pad_type = var_2744_pad_type_0, strides = var_2740, weight = blocks_2_mlp_proj_weight_palettized_cast_fp16, x = input_cast_fp16)[name = tensor<string, []>("op_2744_cast_fp16")];
+            tensor<fp16, [1, 4096, 1, 1]> blocks_2_mlp_proj_output_scales_to_fp16 = const()[name = tensor<string, []>("blocks_2_mlp_proj_output_scales_to_fp16"), val = tensor<fp16, [1, 4096, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303865600)))];
+            tensor<fp16, [1, 4096, 8, 8]> var_2745_cast_fp16 = mul(x = var_2744_cast_fp16, y = blocks_2_mlp_proj_output_scales_to_fp16)[name = tensor<string, []>("op_2745_cast_fp16")];
+            tensor<fp16, [1, 4096, 8, 8]> new_x = add(x = var_2745_cast_fp16, y = x_45_cast_fp16)[name = tensor<string, []>("op_2746_cast_fp16")];
         } -> (new_x, new_k_cache_0, new_k_cache_1, new_k_cache_2, new_v_cache_0, new_v_cache_1, new_v_cache_2);
 }
\ No newline at end of file