Upload modeling_moonshine.py

Add full support for batching. Update decoding loop and input mask logic.

modeling_moonshine.py  +47 -22  (changed)
@@ -113,11 +113,11 @@ class MultiHeadCrossAttentionWithKVCache(MultiHeadAttention):
     def __init__(self, dim, inner_dim, n_head):
         super().__init__(dim, inner_dim, n_head)
 
-    def forward(self, q, k_cache, v_cache):
+    def forward(self, q, k_cache, v_cache, mask):
         q = self.to_q(q)
         q = rearrange(q, "b n (h d) -> b h n d", h=self.n_head)
 
-        return super().sdp_attention(q, k_cache, v_cache)
+        return super().sdp_attention(q, k_cache, v_cache, mask=mask)
 
 
 class FFLinearGelu(nn.Module):
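Note on the new mask argument: the base-class sdp_attention that consumes it is not part of this diff, so the snippet below is only a generic illustration of how a broadcastable key-padding mask of shape (batch, 1, 1, n_keys), with True marking padded positions (the convention implied by the mask inversion later in generate()), is commonly applied to attention scores. All shapes and values here are made up.

import torch

# Toy shapes: 2 samples, 8 heads, 5 queries, 7 keys, 64-dim heads.
b, h, n_q, n_k, d = 2, 8, 5, 7, 64
q = torch.randn(b, h, n_q, d)
k = torch.randn(b, h, n_k, d)
v = torch.randn(b, h, n_k, d)

# True = padded key position; shape (b, 1, 1, n_k) broadcasts over heads and queries.
pad_mask = torch.zeros(b, 1, 1, n_k, dtype=torch.bool)
pad_mask[1, ..., 5:] = True  # last two keys of sample 1 are padding

scores = q @ k.transpose(-2, -1) / d**0.5
scores = scores.masked_fill(pad_mask, float("-inf"))
out = torch.softmax(scores, dim=-1) @ v  # padded keys receive zero attention weight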
@@ -162,10 +162,10 @@ class EncoderLayer(nn.Module):
 
         self.ff = FFSwiGLU(dim, ff_mult) if ff_swiglu else FFLinearGelu(dim, ff_mult)
 
-    def forward(self, x, rot_pos_emb):
+    def forward(self, x, rot_pos_emb, mask):
         _x = x
         x = self.norm1(x)
-        x, _, _ = self.attention(q=x, k=x, v=x, rot_pos_emb=rot_pos_emb)
+        x, _, _ = self.attention(q=x, k=x, v=x, rot_pos_emb=rot_pos_emb, mask=mask)
         x = x + _x
 
         _x = x
@@ -187,12 +187,12 @@ class Encoder(nn.Module):
         )
         self.post_norm = nn.LayerNorm(dim, bias=False)
 
-    def forward(self, x):
-        pos = torch.arange(x.shape[1], device=x.device)
+    def forward(self, x, mask):
+        pos = torch.arange(x.shape[-2], device=x.device)
         rot_pos_emb = self.rot_pos_emb(pos)
 
-        for layer in self.layers:
-            x = layer(x, rot_pos_emb=rot_pos_emb)
+        for idx, layer in enumerate(self.layers):
+            x = layer(x, rot_pos_emb=rot_pos_emb, mask=mask)
         return self.post_norm(x)
 
 
@@ -214,7 +214,7 @@ class DecoderLayer(nn.Module):
         self.norm3 = nn.LayerNorm(dim, bias=False)
         self.ff = FFSwiGLU(dim, ff_mult) if ff_swiglu else FFLinearGelu(dim, ff_mult)
 
-    def forward(self, x, k_cache, v_cache, x_attn_k_cache, x_attn_v_cache, rot_pos_emb):
+    def forward(self, x, k_cache, v_cache, x_attn_k_cache, x_attn_v_cache, rot_pos_emb, input_mask):
         dim = x.size()[1]
         causal_mask = torch.ones((dim, dim), dtype=torch.bool).triu(1).to(x.device)
         _x = x
@@ -232,7 +232,7 @@ class DecoderLayer(nn.Module):
 
         _x = x
         x = self.norm2(x)
-        x = self.cross_attention(q=x, k_cache=x_attn_k_cache, v_cache=x_attn_v_cache)
+        x = self.cross_attention(q=x, k_cache=x_attn_k_cache, v_cache=x_attn_v_cache, mask=input_mask)
         x = x + _x
 
         _x = x
@@ -259,7 +259,7 @@ class Decoder(nn.Module):
         self.final_norm = nn.LayerNorm(dim, bias=False)
         self.token_embedding = nn.Embedding(dec_voc_size, dim)
 
-    def forward(self, x, *args):
+    def forward(self, x, input_mask, *args):
         pos = torch.arange(x.shape[1], device=x.device)
         rot_pos_emb = self.rot_pos_emb(pos)
         x = self.token_embedding(x)
@@ -279,6 +279,7 @@ class Decoder(nn.Module):
                 x_attn_k_cache=x_attn_k_cache[idx],
                 x_attn_v_cache=x_attn_v_cache[idx],
                 rot_pos_emb=rot_pos_emb,
+                input_mask=input_mask,
             )
             k_cache_new.append(new_k_line)
             v_cache_new.append(new_v_line)
@@ -306,7 +307,7 @@ class InitialDecoderLayer(nn.Module):
         self.norm3 = nn.LayerNorm(dim, bias=False)
         self.ff = FFSwiGLU(dim, ff_mult) if ff_swiglu else FFLinearGelu(dim, ff_mult)
 
-    def forward(self, x, context, rot_pos_emb):
+    def forward(self, x, context, rot_pos_emb, input_mask):
         dim = x.size()[1]
         causal_mask = torch.ones((dim, dim), dtype=torch.bool).triu(1).to(x.device)
         _x = x
@@ -323,7 +324,7 @@ class InitialDecoderLayer(nn.Module):
         _x = x
         x = self.norm2(x)
         x, x_attn_k_cache, x_attn_v_cache = self.cross_attention(
-            q=x, k=context, v=context
+            q=x, k=context, v=context, mask=input_mask,
         )
         x = x + _x
 
@@ -345,7 +346,7 @@ class DecoderInitial(Decoder):
             ]
         )
 
-    def forward(self, x, enc_src):
+    def forward(self, x, enc_src, input_mask):
         pos = torch.arange(x.shape[1], device=x.device)
         rot_pos_emb = self.rot_pos_emb(pos)
         x = self.token_embedding(x)
@@ -362,6 +363,7 @@ class DecoderInitial(Decoder):
                 x,
                 enc_src,
                 rot_pos_emb,
+                input_mask,
             )
 
             k_cache.append(new_k_line)
@@ -429,16 +431,34 @@ class MoonshineModelTorch(nn.Module):
         self.n_head = n_head
         self.d_head = inner_dim // n_head
 
-    def generate(self, src):
+    def generate(self, src, mask):
         preprocessed = self.preprocessor(src)
-        enc = self.encoder(preprocessed)
+        batch_size = preprocessed.shape[0]
+
+        # Get max sequence length based on number of unmasked inputs for each sample in batch.
+        token_limit_factor = 6.5 / 16000.0  # Maximum of 6.5 tokens per second.
+        if mask is not None:
+            seq_lens = torch.sum(mask, dim=-1, keepdim=True) * token_limit_factor
+        else:
+            token_limit = torch.tensor([src.shape[-1] * token_limit_factor])
+            seq_lens = torch.stack([token_limit for _ in range(batch_size)])
+        seq_lens = seq_lens.to(torch.int32).to(src.device).squeeze()
+
+        # Preprocess mask so that it matches preprocessed audio.
+        if mask is not None:
+            mask = mask[..., :-127:64][..., :-7:3][..., :-3:2].to(torch.bool)
+            mask = ~mask.reshape((batch_size, 1, 1, -1))
+            mask = torch.nn.functional.pad(mask, (0, preprocessed.shape[-2] - mask.shape[-1]))
+
+        enc = self.encoder(preprocessed, mask)
+
         sot_token = 1
         eot_token = 2
 
-        sot_array = [[sot_token] for _ in range(
+        sot_array = [[sot_token] for _ in range(batch_size)]
         seq = torch.as_tensor(sot_array).to(src.device)
 
-        vals = self.decoder_initial(x=seq, enc_src=enc)
+        vals = self.decoder_initial(x=seq, enc_src=enc, input_mask=mask)
         logits = vals[0]
         k_cache, v_cache, x_attn_k_cache, x_attn_v_cache = [
             vals[i : i + self.dec_depth]
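The new bookkeeping in generate() is easiest to see with concrete numbers. The sketch below uses hypothetical clip lengths: the per-sample token budget is the count of unmasked samples times 6.5 / 16000, and the three slicing steps appear to mirror the preprocessor's strided downsampling before the mask is padded to the exact feature length (the pad call in the diff absorbs any small mismatch).

import torch

sample_rate = 16_000
lengths = [10 * sample_rate, 7 * sample_rate]  # hypothetical true lengths per clip
max_len = max(lengths)
mask = torch.zeros((2, max_len), dtype=torch.int32)
for i, n in enumerate(lengths):
    mask[i, :n] = 1  # 1 = real audio, 0 = padding

# Token budget: at most 6.5 tokens per second of unpadded audio.
token_limit_factor = 6.5 / sample_rate
seq_lens = (mask.sum(dim=-1) * token_limit_factor).to(torch.int32)
print(seq_lens)  # tensor([65, 45], dtype=torch.int32)

# Downsample the sample-level mask the same way generate() does so it roughly
# lines up with the preprocessed feature frames.
frame_mask = mask[..., :-127:64][..., :-7:3][..., :-3:2]
print(mask.shape[-1], "->", frame_mask.shape[-1])  # 160000 -> a few hundred frames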
@@ -448,10 +468,11 @@ class MoonshineModelTorch(nn.Module):
         sample = logits[:, -1].argmax(dim=-1, keepdim=True)
         seq = torch.cat((seq, sample), dim=-1)
 
-
-        while
+        eot_mask = torch.zeros((batch_size), dtype=torch.bool).to(src.device)
+        while not torch.all(eot_mask):
             vals = self.decoder(
                 seq,
+                mask,
                 *k_cache,
                 *v_cache,
                 *x_attn_k_cache,
@@ -462,6 +483,10 @@ class MoonshineModelTorch(nn.Module):
             v_cache = vals[self.dec_depth + 1 :]
             logits = logits[:, -1]  # get last token
             sample = logits.argmax(dim=-1, keepdim=True)
+            # For each sample in batch detect EOT or token limit reached.
+            eot_mask = eot_mask | (sample.squeeze() == eot_token)
+            eot_mask = eot_mask | (seq.shape[-1] >= seq_lens)
+            sample = sample.masked_fill(eot_mask.reshape((-1, 1)), eot_token)
             seq = torch.cat((seq, sample), dim=-1)
 
         return seq
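A toy run of the per-sample stopping logic, with made-up values: once a row's eot_mask is set, masked_fill keeps writing eot_token into that row, so finished samples are padded with EOT while the rest of the batch keeps decoding.

import torch

eot_token = 2
eot_mask = torch.tensor([False, True, False])       # row 1 finished on an earlier step
sample = torch.tensor([[7], [9], [2]])              # row 2 has just emitted EOT
seq_len, seq_lens = 12, torch.tensor([65, 10, 45])  # current length vs. per-row budgets

eot_mask = eot_mask | (sample.squeeze() == eot_token)  # [False, True, True]
eot_mask = eot_mask | (seq_len >= seq_lens)            # row 1 is also over budget
sample = sample.masked_fill(eot_mask.reshape((-1, 1)), eot_token)
print(sample.squeeze().tolist())  # [7, 2, 2]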
@@ -483,5 +508,5 @@ class MoonshineModel(PreTrainedModel):
             dec_ff_swiglu = config.dec_ff_swiglu,
         )
 
-    def forward(self, tensor):
-        return self.model.generate(tensor)
+    def forward(self, tensor, input_mask=None):
+        return self.model.generate(tensor, input_mask)
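A hedged usage sketch, not part of this commit: assuming model is an already-loaded MoonshineModel and the clips are 16 kHz mono, a zero-padded batch plus a 0/1 input mask can be passed straight to forward(), which now hands the mask to generate().

import torch

wav_a = torch.randn(16_000 * 10)  # placeholder 10 s clip
wav_b = torch.randn(16_000 * 7)   # placeholder 7 s clip
max_len = max(wav_a.shape[-1], wav_b.shape[-1])

batch = torch.zeros((2, max_len))
input_mask = torch.zeros((2, max_len), dtype=torch.int32)
for i, wav in enumerate((wav_a, wav_b)):
    batch[i, : wav.shape[-1]] = wav
    input_mask[i, : wav.shape[-1]] = 1  # 1 = real audio, 0 = padding

# `model`: an already-loaded MoonshineModel (assumed, not constructed here).
tokens = model(batch, input_mask)  # MoonshineModel.forward -> generate(tensor, input_mask)
# Rows that finish early come back padded with the EOT token (id 2).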