Commit df388cc · Parent(s): c572a14

Corrected rotary embedding

attention.py  CHANGED  (+36 -16)
@@ -28,7 +28,7 @@ class RotaryEmbedding(nn.Module):
         d_rotary: int,
         rotary_base: float = 10000.0,
         initial_cos_sin_cache_len: int = 2048,
-        device: torch.device =
+        device: torch.device | None = None,
     ) -> None:
         super().__init__()
         self.d_rotary = d_rotary
@@ -37,31 +37,37 @@ class RotaryEmbedding(nn.Module):
         self.dtype = torch.float32
         self._update_cos_sin_cache(seqlen=initial_cos_sin_cache_len)

-    def _update_cos_sin_cache(
+    def _update_cos_sin_cache(
+        self,
+        seqlen: int,
+        device: str | None = None,
+        dtype: torch.dtype | None = None,
+    ) -> None:
         # only call this function when seqlen is larger than _max_seqlen
         self._max_seqlen = seqlen

         # m * theta_i = m * base^(-2i/d) = m * (1 / base^(2i/d)), where i in [1, d/2]
         m = torch.arange(
             seqlen,
-            device=
-            dtype=
+            device=device,
+            dtype=torch.float32,
         )
         theta_i = 1.0 / (
             self.rotary_base ** (
                 torch.arange(
                     start=0,
                     end=self.d_rotary,
-
-
+                    step=2,
+                    device=device,
+                    dtype=torch.float32,
                 ) / self.d_rotary
             )
         )
         # torch.outer, since torch.einsum converts from fp32 to fp16 if used with torch.amp
         # TODO: does this matter if I'm disabling torch.autocast?
         m_theta_i = torch.outer(m, theta_i)
-        self._cos_cached = torch.cos(m_theta_i).to(
-        self._sin_cached = torch.sin(m_theta_i).to(
+        self._cos_cached = torch.cos(m_theta_i).to(dtype)
+        self._sin_cached = torch.sin(m_theta_i).to(dtype)

         # TODO: scale_base caching is labelled as not yet done in Phi2
         """
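For reference, a minimal standalone sketch of the table this cache now builds: with step=2 the inner arange yields d_rotary/2 frequencies theta_i = base^(-2i/d), and the outer product with the positions m gives cos/sin tables of shape (seqlen, d_rotary // 2). The helper name and shapes below are illustrative, not taken from attention.py.

import torch

def rope_cos_sin(seqlen: int, d_rotary: int, base: float = 10000.0):
    # hypothetical helper: positions m = 0..seqlen-1, frequencies theta_i = base^(-2i/d)
    m = torch.arange(seqlen, dtype=torch.float32)
    theta_i = 1.0 / (base ** (torch.arange(0, d_rotary, 2, dtype=torch.float32) / d_rotary))
    m_theta_i = torch.outer(m, theta_i)  # (seqlen, d_rotary // 2)
    return torch.cos(m_theta_i), torch.sin(m_theta_i)

cos, sin = rope_cos_sin(seqlen=2048, d_rotary=32)
print(cos.shape)  # torch.Size([2048, 16])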
@@ -90,14 +96,17 @@ class RotaryEmbedding(nn.Module):
         sin: torch.FloatTensor, # dim: (_max_seqlen, d_rotary)
     ) -> torch.FloatTensor:
         seqlen = x.shape[1]
-
+        x_to_rotate = x[..., :self.d_rotary]
+        x_to_keep_unrotated = x[..., self.d_rotary:]
+        x1, x2 = x_to_rotate.chunk(2, dim=-1) # dim: (batch_size, seqlen, Optional[n_qkv], n_heads, d_rotary/2)
         broadcast_rearrange = "s d -> s 1 d" if x1.ndim == 4 else "s d -> s 1 1 d"
         c, s = rearrange(cos[:seqlen], broadcast_rearrange), rearrange(sin[:seqlen], broadcast_rearrange)
         x1, x2, c, s = [t.to(dtype=torch.float32) for t in [x1, x2, c, s]] # make sure rotary embedding is in float32
-
+        x_rotated = cast(
             torch.FloatTensor,
             torch.cat([x1 * c - x2 * s, x1 * s + x2 * c], dim=-1).to(x.dtype)
         )
+        return torch.cat([x_rotated, x_to_keep_unrotated], axis=-1)

     def forward(
         self,
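The rotation applied here is the usual RoPE pairing, restricted to the first d_rotary channels of each head: split them into halves (x1, x2), form (x1*cos - x2*sin, x1*sin + x2*cos), and pass the remaining channels through unchanged. A small self-contained sketch under those assumptions (apply_partial_rope is a hypothetical name, and broadcasting is done with view instead of einops.rearrange):

import torch

def apply_partial_rope(x, cos, sin, d_rotary):
    # x: (batch, seqlen, n_heads, d_head); cos/sin: (>= seqlen, d_rotary // 2)
    x_rot, x_pass = x[..., :d_rotary], x[..., d_rotary:]
    x1, x2 = x_rot.chunk(2, dim=-1)
    c = cos[: x.shape[1]].view(1, x.shape[1], 1, -1)
    s = sin[: x.shape[1]].view(1, x.shape[1], 1, -1)
    rotated = torch.cat([x1 * c - x2 * s, x1 * s + x2 * c], dim=-1)
    return torch.cat([rotated, x_pass], dim=-1)

seqlen, d_rotary = 8, 32
theta = 1.0 / (10000.0 ** (torch.arange(0, d_rotary, 2, dtype=torch.float32) / d_rotary))
angles = torch.outer(torch.arange(seqlen, dtype=torch.float32), theta)
out = apply_partial_rope(torch.randn(2, seqlen, 4, 64), torch.cos(angles), torch.sin(angles), d_rotary)
print(out.shape)  # torch.Size([2, 8, 4, 64])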
@@ -107,9 +116,11 @@ class RotaryEmbedding(nn.Module):
         if (
             not self._max_seqlen
             or self._max_seqlen < x.shape[1] + seqlen_offset
+            or self._cos_cached.device != x.device
+            or self._cos_cached.dtype != x.dtype
             or (self.training and self._cos_cached.is_inference())
         ):
-            self._update_cos_sin_cache(seqlen=x.shape[1] + seqlen_offset)
+            self._update_cos_sin_cache(seqlen=x.shape[1] + seqlen_offset, device=x.device, dtype=x.dtype)
         return self._apply_rotary_emb_qkv(
             x,
             cast(torch.FloatTensor, self._cos_cached[seqlen_offset:]),
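The new device/dtype conditions simply force a cache rebuild when the module is moved or its dtype changes after construction (and, as before, when the cache is too short or was built under torch.inference_mode). A hedged standalone restatement of the check, with hypothetical names:

import torch

def cache_is_stale(cos_cached, x, seqlen_needed, max_seqlen, training):
    # mirrors the rebuild condition in forward()
    return (
        not max_seqlen
        or max_seqlen < seqlen_needed
        or cos_cached.device != x.device  # module was moved after the cache was built
        or cos_cached.dtype != x.dtype    # dtype changed, e.g. .half() after init
        or (training and cos_cached.is_inference())  # cache built under torch.inference_mode()
    )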
@@ -269,7 +280,8 @@ class MHA(nn.Module):
             else RotaryEmbedding
         )
         self.rotary_emb = rotary_cls(
-            d_rotary=math.ceil((d_embedding // n_attn_heads) / 2), # d_rotary is half of d_head
+            # d_rotary=math.ceil((d_embedding // n_attn_heads) / 2), # d_rotary is half of d_head
+            d_rotary=32, # TODO: figure out why Phi2 uses this
             initial_cos_sin_cache_len=initial_cos_sin_cache_len,
         )

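On the TODO: Phi-2 uses partial rotary embeddings, and its released config (hidden size 2560, 32 attention heads, so head_dim 80, with a partial rotary factor of 0.4) works out to a rotary dimension of 32; the commented-out half-of-d_head formula would give 40 instead. A quick arithmetic check, assuming those config values:

d_embedding, n_attn_heads, partial_rotary_factor = 2560, 32, 0.4  # Phi-2 config values (assumed)
d_head = d_embedding // n_attn_heads            # 80
d_rotary = int(d_head * partial_rotary_factor)  # 32, matching the hard-coded value
half_d_head = d_head // 2                       # 40, what the commented-out formula would give
print(d_head, d_rotary, half_d_head)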
@@ -378,12 +390,20 @@ class MHA(nn.Module):
         kv_cache: KVCache,
         key_padding_mask: torch.BoolTensor | None,
     ) -> torch.FloatTensor:
-
-
-
+        qk = qkv[:, :, :2, :, :]
+        qk = self.rotary_emb(
+            qk,
             seqlen_offset = 0 if kv_cache is None else kv_cache.seqlen_offset,
         )
-
+        v = cast(torch.FloatTensor, qkv[:, :, 2, :, :])
+        q = qk[:, :, 0, :, :]
+        kv = torch.cat(
+            [
+                qk[:, :, 1, :, :].unsqueeze(2),
+                v.unsqueeze(2),
+            ],
+            dim=2,
+        )
         self._update_kv_cache(kv, kv_cache, self.block_n)
         causal = False # turning off causal mask for cross attention

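For the cross-attention path, the packed qkv tensor is assumed to have shape (batch, seqlen, 3, n_heads, d_head); q and k are sliced out for the rotary embedding, and k/v are re-packed for the KV cache. A shape-only sketch under that layout assumption:

import torch

batch, seqlen, n_heads, d_head = 2, 8, 4, 64
qkv = torch.randn(batch, seqlen, 3, n_heads, d_head)  # assumed packed layout

qk = qkv[:, :, :2, :, :]   # q and k, the only tensors that get rotary embeddings
v = qkv[:, :, 2, :, :]
q = qk[:, :, 0, :, :]
kv = torch.cat([qk[:, :, 1, :, :].unsqueeze(2), v.unsqueeze(2)], dim=2)
print(q.shape, kv.shape)   # torch.Size([2, 8, 4, 64]) torch.Size([2, 8, 2, 4, 64])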