Update app.py
app.py CHANGED
@@ -172,9 +172,30 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
     elif exp_name == "Deutsh":
         ema_model = E2TTS_ema_model2
     #ref_audio, ref_text = preprocess_ref_audio_text(ref_audio, ref_text, show_info=show_info)
-
+
 
     audio, sr = ref_audio
+
+    # Convert audio to PyDub AudioSegment (assuming it's a NumPy array in [-1, 1] range)
+    audio_segment = AudioSegment(
+        (audio * (2**15)).astype(np.int16).tobytes(),  # Scale to 16-bit PCM range
+        frame_rate=sr,
+        sample_width=2,  # 16-bit audio
+        channels=1 if len(audio.shape) == 1 else audio.shape[0]  # Mono or multi-channel
+    )
+
+    # Remove silence using your custom function
+    audio_segment = remove_silence_edges(audio_segment, silence_threshold=-42)
+
+    # Convert back to NumPy array for further processing
+    audio_trimmed = np.array(audio_segment.get_array_of_samples(), dtype=np.float32) / (2**15)
+
+    # Continue processing with trimmed audio
+    audio = torch.from_numpy(audio_trimmed).unsqueeze(0)  # Add batch/channel dimension
+
+
+
+
     if audio.shape[0] > 1:
         audio = torch.mean(audio, dim=0, keepdim=True)
 
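For readers following the change: the added block scales the float reference audio (assumed to lie in [-1, 1]) to 16-bit PCM, wraps it in a PyDub AudioSegment, trims silent edges with the repo's remove_silence_edges helper at a -42 dB threshold, and converts the result back to a float tensor for the rest of infer_batch. The sketch below reproduces that round-trip in isolation. It is illustrative only: it assumes mono float32 NumPy input, and it substitutes pydub's detect_leading_silence for remove_silence_edges, whose actual implementation lives elsewhere in this repo; trim_edges and trim_reference are hypothetical names introduced just for this example.

    # Standalone sketch of the NumPy -> AudioSegment -> trim -> NumPy -> torch round-trip.
    # Assumes mono float audio in [-1, 1]; trim_edges() stands in for the app's
    # remove_silence_edges helper and is illustrative only.
    import numpy as np
    import torch
    from pydub import AudioSegment
    from pydub.silence import detect_leading_silence

    def trim_edges(seg: AudioSegment, silence_threshold: float = -42.0) -> AudioSegment:
        # Drop silence at the start, then at the end (by measuring the reversed segment).
        start = detect_leading_silence(seg, silence_threshold=silence_threshold)
        end = detect_leading_silence(seg.reverse(), silence_threshold=silence_threshold)
        return seg[start:len(seg) - end]

    def trim_reference(audio: np.ndarray, sr: int) -> torch.Tensor:
        # Float [-1, 1] -> 16-bit PCM bytes -> AudioSegment
        seg = AudioSegment(
            (audio * (2 ** 15)).astype(np.int16).tobytes(),
            frame_rate=sr,
            sample_width=2,  # 16-bit samples
            channels=1,      # mono assumed for this sketch
        )
        seg = trim_edges(seg, silence_threshold=-42)
        # AudioSegment -> int16 samples -> float [-1, 1] -> tensor with a channel dimension
        trimmed = np.array(seg.get_array_of_samples(), dtype=np.float32) / (2 ** 15)
        return torch.from_numpy(trimmed).unsqueeze(0)

Two caveats worth keeping in mind: pydub's get_array_of_samples() returns interleaved samples, so multi-channel input would need reshaping before the tensor conversion, and if ref_audio arrives as a torch tensor (e.g. from torchaudio.load) it would need a .numpy() call before the int16 scaling step.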