UsefulSensors
/

moonshine-base

@@ -398,7 +398,7 @@ class AudioPreprocessor(nn.Module):
         assert (
             src.shape[-1] >= 1023
         ), f"src shape[-1] {src.shape[-1]} should be at least 1023"
-        src = src.unsqueeze(-2)
         return self.audio_preprocess(src)
@@ -435,7 +435,8 @@ class MoonshineModelTorch(nn.Module):
         sot_token = 1
         eot_token = 2
-        seq = torch.as_tensor([[sot_token]]).to(src.device)
         vals = self.decoder_initial(x=seq, enc_src=enc)
         logits = vals[0]
@@ -448,7 +449,7 @@ class MoonshineModelTorch(nn.Module):
         seq = torch.cat((seq, sample), dim=-1)
         seq_len = int(src.shape[-1] * 6.5 / 16000)
-        while sample != eot_token and len(seq.flatten()) <= seq_len:
             vals = self.decoder(
                 seq,
                 *k_cache,

         assert (
             src.shape[-1] >= 1023
         ), f"src shape[-1] {src.shape[-1]} should be at least 1023"
+        src = src.reshape((-1, 1, src.shape[-1]))
         return self.audio_preprocess(src)
         sot_token = 1
         eot_token = 2
+        sot_array = [[sot_token] for _ in range(enc.shape[0])]
+        seq = torch.as_tensor(sot_array).to(src.device)
         vals = self.decoder_initial(x=seq, enc_src=enc)
         logits = vals[0]
         seq = torch.cat((seq, sample), dim=-1)
         seq_len = int(src.shape[-1] * 6.5 / 16000)
+        while any([eot_token not in sub_seq for sub_seq in seq]) and seq.shape[-1] <= seq_len:
             vals = self.decoder(
                 seq,
                 *k_cache,