Spaces:

Hematej
/

llasatts-tej

Running

App Files Community

Hematej committed on Jun 1

Commit

902f49d

verified ·

1 Parent(s): 0e999cf

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -12

app.py CHANGED Viewed

@@ -33,18 +33,26 @@ whisper_turbo_pipe = pipeline(
 )
 def ids_to_speech_tokens(speech_ids):
-    return [f"<|s_{speech_id}|>" for speech_id in speech_ids]
 def extract_speech_ids(speech_tokens_str):
     speech_ids = []
     for token_str in speech_tokens_str:
         if token_str.startswith('<|s_') and token_str.endswith('|>'):
-            try:
-                speech_ids.append(int(token_str[4:-2]))
-            except ValueError:
-                print(f"Unexpected token: {token_str}")
     return speech_ids
 @spaces.GPU(duration=60)
 def infer(sample_audio_path, target_text, progress=gr.Progress()):
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
@@ -54,10 +62,12 @@ def infer(sample_audio_path, target_text, progress=gr.Progress()):
             gr.Warning("Trimming audio to first 15secs.")
             waveform = waveform[:, :sample_rate*15]
-        # ✅ Convert stereo to mono dynamically
-        waveform_mono = waveform.mean(dim=0, keepdim=True) if waveform.size(0) > 1 else waveform
-        waveform_mono = waveform_mono.to(device)
         prompt_wav = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform_mono)
         prompt_text = whisper_turbo_pipe(prompt_wav[0].cpu().numpy(), language="en")['text'].strip()  # ✅ Force English transcription
         progress(0.5, 'Transcribed! Generating speech...')
@@ -68,10 +78,10 @@ def infer(sample_audio_path, target_text, progress=gr.Progress()):
             gr.Warning("Text is too long. Please keep it under 300 characters.")
             target_text = target_text[:300]
-        input_text = f"{prompt_text} {target_text}"
         with torch.no_grad():
-            vq_code_prompt = Codec_model.encode_code(prompt_wav)
             vq_code_prompt = vq_code_prompt[0,0,:]
             speech_ids_prefix = ids_to_speech_tokens(vq_code_prompt)
@@ -110,7 +120,7 @@ def infer(sample_audio_path, target_text, progress=gr.Progress()):
             if not speech_tokens:
                 raise ValueError("Error: No valid speech tokens extracted!")
-            speech_tensor = torch.tensor(speech_tokens).unsqueeze(0).unsqueeze(0).to(device)
             gen_wav = Codec_model.decode_code(speech_tensor)
             gen_wav = gen_wav[:,:,prompt_wav.shape[1]:]

 )
 def ids_to_speech_tokens(speech_ids):
+    speech_tokens_str = []
+    for speech_id in speech_ids:
+        speech_tokens_str.append(f"<|s_{speech_id}|>")
+    return speech_tokens_str
 def extract_speech_ids(speech_tokens_str):
     speech_ids = []
     for token_str in speech_tokens_str:
         if token_str.startswith('<|s_') and token_str.endswith('|>'):
+            num_str = token_str[4:-2]
+            num = int(num_str)
+            speech_ids.append(num)
+        else:
+            print(f"Unexpected token: {token_str}")
     return speech_ids
 @spaces.GPU(duration=60)
 def infer(sample_audio_path, target_text, progress=gr.Progress()):
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
             gr.Warning("Trimming audio to first 15secs.")
             waveform = waveform[:, :sample_rate*15]
+        if waveform.size(0) > 1:
+            # Convert stereo to mono by averaging the channels
+            waveform_mono = torch.mean(waveform, dim=0, keepdim=True)
+        else:
+            # If already mono, just use the original waveform
+            waveform_mono = waveform
         prompt_wav = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform_mono)
         prompt_text = whisper_turbo_pipe(prompt_wav[0].cpu().numpy(), language="en")['text'].strip()  # ✅ Force English transcription
         progress(0.5, 'Transcribed! Generating speech...')
             gr.Warning("Text is too long. Please keep it under 300 characters.")
             target_text = target_text[:300]
+        input_text = prompt_text + ' ' + target_text
         with torch.no_grad():
+            vq_code_prompt = Codec_model.encode_code(input_waveform=prompt_wav)
             vq_code_prompt = vq_code_prompt[0,0,:]
             speech_ids_prefix = ids_to_speech_tokens(vq_code_prompt)
             if not speech_tokens:
                 raise ValueError("Error: No valid speech tokens extracted!")
+            speech_tokens = torch.tensor(speech_tokens).unsqueeze(0).unsqueeze(0).to(device)
             gen_wav = Codec_model.decode_code(speech_tensor)
             gen_wav = gen_wav[:,:,prompt_wav.shape[1]:]