Update app.py
app.py
CHANGED
@@ -26,9 +26,8 @@ Codec_model.eval().to(device)
 whisper_turbo_pipe = pipeline(
     "automatic-speech-recognition",
     model="openai/whisper-large-v3-turbo",
-    torch_dtype=torch.float16 if …
-    device=…
-    …
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.bfloat16,
+    device="cuda" if torch.cuda.is_available() else "cpu"
 )
 
 def ids_to_speech_tokens(speech_ids):
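Note on the hunk above: the new dtype branch selects bfloat16 on CPU, which is slow or unsupported on machines without native bf16 kernels. A minimal sketch of a more conservative selection (not part of this commit; float32 is the usual safe CPU default):

```python
import torch

# Sketch only: compute the device/dtype pair once so the pipeline config and
# the later .to(...) calls in infer() cannot drift apart.
USE_CUDA = torch.cuda.is_available()
DEVICE = "cuda" if USE_CUDA else "cpu"
DTYPE = torch.float16 if USE_CUDA else torch.float32  # float32: safe CPU default
```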
@@ -47,56 +46,54 @@ def extract_speech_ids(speech_tokens_str):
     return speech_ids
 
 def infer(sample_audio_path, target_text, progress=gr.Progress()):
-    global tokenizer
+    global tokenizer
 
     if tokenizer is None:
         print("Warning: Tokenizer is missing, reloading...")
-        tokenizer = AutoTokenizer.from_pretrained(llasa_3b)
+        tokenizer = AutoTokenizer.from_pretrained("llasa_3b")
 
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
         progress(0.2, 'Loading audio...')
         waveform, sample_rate = torchaudio.load(sample_audio_path)
 
-        …
-        …
+        # ✅ Trim audio for compatibility
+        if waveform.size(1) / sample_rate > 60:
             waveform = waveform[:, :sample_rate * 60]
 
         progress(0.4, 'Trimming audio...')
         if waveform.shape[1] / sample_rate > 30:
-            print("Trimming audio to 30 seconds for Whisper ASR.")
             waveform = waveform[:, :sample_rate * 30]
 
         if waveform.size(0) > 1:
             waveform_mono = torch.mean(waveform, dim=0, keepdim=True)
         else:
             waveform_mono = waveform
-        waveform_mono = waveform_mono.to(…
+        waveform_mono = waveform_mono.to("cuda" if torch.cuda.is_available() else "cpu")
 
         prompt_wav = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform_mono)
         prompt_wav_np = prompt_wav[0].detach().cpu().numpy()
 
         progress(0.6, 'Transcribing speech...')
         try:
-            prompt_text = whisper_turbo_pipe(prompt_wav_np)['text'].strip()  # ✅
+            prompt_text = whisper_turbo_pipe(prompt_wav_np, language="en")['text'].strip()  # ✅ Force English transcription
         except Exception:
-            print("Whisper ASR failed. Retrying…
-            prompt_text = whisper_turbo_pipe(prompt_wav_np…
+            print("Whisper ASR failed. Retrying...")
+            prompt_text = whisper_turbo_pipe(prompt_wav_np)['text'].strip()
 
         if not prompt_text or prompt_text.lower() in ["error: unable to transcribe", ""]:
-            print("Warning: Whisper ASR output is empty. Defaulting to target text.")
             prompt_text = target_text
 
         progress(0.8, 'Generating synthesized audio...')
-
         if len(target_text) == 0:
             return None
         elif len(target_text) > 500:
             target_text = target_text[:500]
-            print("Text…
+            print("Text truncated to 500 characters.")
 
         input_text = " ".join(filter(None, [prompt_text.strip(), target_text.strip()]))
+
         with torch.no_grad():
-            vq_code_prompt = Codec_model.encode_code(…
+            vq_code_prompt = Codec_model.encode_code(prompt_wav)
             vq_code_prompt = vq_code_prompt[0,0,:]
             speech_ids_prefix = ids_to_speech_tokens(vq_code_prompt)
 
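Two notes on the hunk above. First, `AutoTokenizer.from_pretrained("llasa_3b")` now passes a string literal where the removed line passed the `llasa_3b` variable; the literal only resolves if a local directory named `llasa_3b` exists. Second, depending on the installed transformers version, the ASR pipeline may not accept `language` as a direct call argument (the documented route is `generate_kwargs`), in which case the `try` branch raises immediately and every call falls through to the auto-detect retry. A minimal sketch of the intended behavior, assuming the `whisper_turbo_pipe` defined in this file:

```python
def transcribe(pipe, audio, language="en"):
    """Sketch only: force the transcription language via generate_kwargs,
    falling back to Whisper's language auto-detection if the forced run fails."""
    try:
        return pipe(audio, generate_kwargs={"language": language})["text"].strip()
    except Exception as exc:
        print(f"Forced-language ASR failed ({exc}); retrying with auto-detect.")
        return pipe(audio)["text"].strip()
```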
@@ -108,11 +105,11 @@ def infer(sample_audio_path, target_text, progress=gr.Progress()):
         ]
 
         input_ids = tokenizer.apply_chat_template(chat, tokenize=True, return_tensors='pt', continue_final_message=True)
-        input_ids = input_ids.to(…
+        input_ids = input_ids.to("cuda" if torch.cuda.is_available() else "cpu")
         speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')
 
         if speech_end_id is None:
-            raise ValueError("Error: `<|SPEECH_GENERATION_END|>` token not found…
+            raise ValueError("Error: `<|SPEECH_GENERATION_END|>` token not found!")
 
         outputs = model.generate(
             input_ids,
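Note on the `speech_end_id` guard above: many tokenizers return the unknown-token id rather than `None` from `convert_tokens_to_ids`, so the `is None` check can pass even when the token is missing. A stricter sketch (assumes the `tokenizer` from this file):

```python
speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')
# Sketch only: also reject the unk id, which convert_tokens_to_ids returns
# for out-of-vocabulary tokens on many tokenizers.
if speech_end_id is None or speech_end_id == tokenizer.unk_token_id:
    raise ValueError("`<|SPEECH_GENERATION_END|>` is missing from the tokenizer vocabulary.")
```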
@@ -125,19 +122,14 @@ def infer(sample_audio_path, target_text, progress=gr.Progress()):
             temperature=0.7,
         )
 
-        …
-            print("Warning: Generated output is shorter than expected.")
-            generated_ids = outputs[0]
-        else:
-            generated_ids = outputs[0][input_ids.shape[1] - len(speech_ids_prefix):-1]
-
+        generated_ids = outputs[0][input_ids.shape[1] - len(speech_ids_prefix):-1]
         speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
         speech_tokens = extract_speech_ids(speech_tokens)
 
         if not speech_tokens:
-            raise ValueError("Error: No valid speech tokens extracted…
+            raise ValueError("Error: No valid speech tokens extracted!")
         else:
-            speech_tokens = torch.tensor(speech_tokens).to(…
+            speech_tokens = torch.tensor(speech_tokens).to("cuda" if torch.cuda.is_available() else "cpu").unsqueeze(0).unsqueeze(0)
 
         gen_wav = Codec_model.decode_code(speech_tokens)
         gen_wav = gen_wav[:,:,prompt_wav.shape[1]:]
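Note on the hunk above: the removed branch guarded against generations shorter than the prompt prefix; without it, a short output slices to an empty tensor and the failure only surfaces later at the `if not speech_tokens` check. A sketch that restores an explicit guard (assumes `outputs` has shape `[1, seq_len]`, as returned by `model.generate`):

```python
# Sketch only: fail fast when generation ended before any speech tokens.
start = input_ids.shape[1] - len(speech_ids_prefix)
if outputs.shape[1] <= start + 1:
    raise ValueError("Generated output is shorter than the speech prefix.")
generated_ids = outputs[0][start:-1]  # drop the final end-of-speech token
```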
@@ -145,7 +137,6 @@ def infer(sample_audio_path, target_text, progress=gr.Progress()):
         progress(1.0, 'Complete!')
 
         return (16000, gen_wav[0, 0, :].cpu().numpy())
-
 with gr.Blocks() as app_tts:
     gr.Markdown("# Zero Shot Voice Clone TTS")
     ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
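For reference, `infer()` returns a `(sample_rate, np.ndarray)` tuple, which `gr.Audio` accepts directly as an output value. A sketch of the wiring (the text box, button, and output names are placeholders, not components shown in this diff):

```python
with gr.Blocks() as app_tts:
    gr.Markdown("# Zero Shot Voice Clone TTS")
    ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
    gen_text_input = gr.Textbox(label="Text to Generate")  # placeholder name
    audio_output = gr.Audio(label="Synthesized Audio")     # placeholder name
    generate_btn = gr.Button("Generate")
    # The gr.Progress() default in infer's signature is injected by Gradio.
    generate_btn.click(infer, inputs=[ref_audio_input, gen_text_input], outputs=audio_output)
```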