Make `configuration_phi4flash.py` and `modeling_phi4flash.py` compatible with standard sliding window config (#7)
- `sliding_window: list[Optional[int]]` -> `sliding_window: int` + `layer_types: list[str]` (8928fa33c0a966cf0822f7601ec722d38dd61285)
- configuration_phi4flash.py +10 -4
- modeling_phi4flash.py +9 -9
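The point of the change: generic Transformers utilities expect `config.sliding_window` to be a single integer (or `None`), so the previous per-layer list tripped them up. Below is a minimal sketch of the difference, using made-up example values; the `min()` call mirrors the cache-sizing change in `modeling_phi4flash.py` further down.

```python
# Old format: one Optional[int] per layer -- standard helpers cannot min() it against an int.
old_sliding_window = [None, 2047, None, 2047, None, None, None, None]

# New format: a single int plus per-layer type tags (same 8-layer example).
sliding_window = 2047
layer_types = ["full_attention", "sliding_attention"] * 2 + ["full_attention"] * 4

max_cache_len = 4096
print(min(sliding_window, max_cache_len))    # 2047 -- works with the new int field

try:
    min(old_sliding_window, max_cache_len)   # int-vs-list comparison
except TypeError as err:
    print(err)  # "'<' not supported between instances of 'int' and 'list'"
```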
configuration_phi4flash.py
CHANGED

@@ -112,6 +112,7 @@ class Phi4FlashConfig(PretrainedConfig):
         bos_token_id=1,
         eos_token_id=2,
         sliding_window=2047,
+        layer_types=None,
         mb_per_layer= 2,
         mamba_d_state=16,
         mamba_d_conv=4,
@@ -141,11 +142,16 @@ class Phi4FlashConfig(PretrainedConfig):
         self.use_cache = use_cache
         self.rope_theta = rope_theta
         self.mb_per_layer = mb_per_layer
-        self.sliding_window = [
-            sliding_window if … else None
-            for layer_idx in range(num_hidden_layers)
-        ]
+        self.sliding_window = sliding_window
+        self.layer_types = layer_types
 
+        if self.layer_types is None:
+            is_sliding = lambda i: i < num_hidden_layers // 2 and i % 2 == 1
+            self.layer_types = [
+                "sliding_attention" if is_sliding(layer_idx) else "full_attention"
+                for layer_idx in range(num_hidden_layers)
+            ]
+
         self.mamba_d_state = mamba_d_state
         self.mamba_d_conv = mamba_d_conv
         self.mamba_expand = mamba_expand

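The fallback above keeps existing checkpoints loadable: a `config.json` that predates the change has no `layer_types`, so the schedule is re-derived from `num_hidden_layers`, reproducing the layout the removed list encoded. A standalone sketch of that default, using 8 layers as an example count rather than the model's real depth:

```python
# Mirrors the layer_types fallback added in the hunk above, outside the config class.
num_hidden_layers = 8   # example value only
layer_types = None      # what an older config.json effectively provides

if layer_types is None:
    is_sliding = lambda i: i < num_hidden_layers // 2 and i % 2 == 1
    layer_types = [
        "sliding_attention" if is_sliding(layer_idx) else "full_attention"
        for layer_idx in range(num_hidden_layers)
    ]

print(layer_types)
# ['full_attention', 'sliding_attention', 'full_attention', 'sliding_attention',
#  'full_attention', 'full_attention', 'full_attention', 'full_attention']
```

Passing `layer_types` explicitly skips the fallback, so configs written by newer tooling round-trip unchanged.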
modeling_phi4flash.py
CHANGED

@@ -129,7 +129,7 @@ def _get_cache(
         cache_to_check = self._cache.self_attention_cache if requires_cross_attention_cache else self._cache
 
         if cache_implementation == "sliding_window":
-            max_cache_len = min(self.config.sliding_window[…], max_cache_len)
+            max_cache_len = min(self.config.sliding_window, max_cache_len)
 
         need_new_cache = (
             not hasattr(self, "_cache")
@@ -243,7 +243,7 @@ class SambaYCache(Cache):
         sliding_cache_shape = (
             self.max_batch_size,
             self.num_key_value_heads,
-            min(config.sliding_window[…], max_cache_len),
+            min(config.sliding_window, max_cache_len),
             self.head_dim,
         )
         conv_cache_shape = (self.max_batch_size, intermediate_size, conv_kernel_size)
@@ -573,7 +573,7 @@ class SambaYFlashAttention2(SambaYAttention):
         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
 
-        use_sliding_windows = self.config.sliding_window is not None and self.config.sliding_window[self.layer_idx] is not None
+        use_sliding_windows = self.config.sliding_window is not None and self.config.layer_types[self.layer_idx] == "sliding_attention"
 
         if past_key_value is not None:
 
@@ -710,8 +710,8 @@ class SambaYFlashAttention2(SambaYAttention):
                 softmax_scale=softmax_scale,
                 causal=causal,
                 window_size=(
-                    self.config.sliding_window[self.layer_idx] - 1,
-                    self.config.sliding_window[self.layer_idx] - 1,
+                    self.config.sliding_window - 1,
+                    self.config.sliding_window - 1,
                 ),
             )
 
@@ -735,8 +735,8 @@ class SambaYFlashAttention2(SambaYAttention):
                 softmax_scale=softmax_scale,
                 causal=causal,
                 window_size=(
-                    self.config.sliding_window[self.layer_idx] - 1,
-                    self.config.sliding_window[self.layer_idx] - 1,
+                    self.config.sliding_window - 1,
+                    self.config.sliding_window - 1,
                 ),
             )
 
@@ -1085,9 +1085,9 @@ class SambaYDecoderLayer(nn.Module):
             residual = residual.to(torch.float32)
             self_attn_weights = None
         else:
-            if self.config.sliding_window is not None and self.config.sliding_window[self.layer_idx] is not None and attention_mask is not None: # efficient SDPA and no padding
+            if self.config.sliding_window is not None and self.config.layer_types[self.layer_idx] == "sliding_attention" and attention_mask is not None: # efficient SDPA and no padding
                 if past_key_value is not None and cache_position[0] > 0: # when decoding
-                    attention_mask = attention_mask[:, -self.config.sliding_window[self.layer_idx]:]
+                    attention_mask = attention_mask[:, -self.config.sliding_window:]
             #hidden_states = self.input_layernorm2(hidden_states.to(dtype=self.input_layernorm2.weight.dtype))
             # Self Attention
             attn_outputs, self_attn_weights, yoco_key_values = self.attn(
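Taken together, the modeling changes derive every per-layer decision from the two config fields instead of indexing a per-layer window list. The loop below is an illustrative standalone sketch of that logic with made-up layer count, cache budget, and mask length; the real code additionally gates on the cache state and the attention implementation.

```python
# Sketch: per-layer sliding-window decisions driven by config.sliding_window + config.layer_types.
import torch

sliding_window = 2047
layer_types = ["full_attention", "sliding_attention", "full_attention", "sliding_attention"]
max_cache_len = 4096  # example generation budget

# Cache sizing (as in _get_cache / SambaYCache): a plain int now works with min().
sliding_cache_len = min(sliding_window, max_cache_len)  # 2047

for layer_idx, layer_type in enumerate(layer_types):
    use_sliding_windows = sliding_window is not None and layer_type == "sliding_attention"

    # flash-attn takes a (left, right) window; the calls above pass sliding_window - 1
    # for both, while (-1, -1) disables windowing on full-attention layers.
    window_size = (sliding_window - 1, sliding_window - 1) if use_sliding_windows else (-1, -1)

    # During decoding, the SDPA path keeps only the last sliding_window mask columns.
    attention_mask = torch.ones(1, 5000, dtype=torch.bool)  # example padding mask
    if use_sliding_windows:
        attention_mask = attention_mask[:, -sliding_window:]

    print(layer_idx, layer_type, window_size, attention_mask.shape[-1])
```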