Update app.py
app.py CHANGED
@@ -172,9 +172,30 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
     elif exp_name == "Deutsh":
         ema_model = E2TTS_ema_model2
     #ref_audio, ref_text = preprocess_ref_audio_text(ref_audio, ref_text, show_info=show_info)
-
+
 
     audio, sr = ref_audio
+
+    # Convert audio to PyDub AudioSegment (assuming it's a NumPy array in [-1, 1] range)
+    audio_segment = AudioSegment(
+        (audio * (2**15)).astype(np.int16).tobytes(),  # Scale to 16-bit PCM range
+        frame_rate=sr,
+        sample_width=2,  # 16-bit audio
+        channels=1 if len(audio.shape) == 1 else audio.shape[0]  # Mono or multi-channel
+    )
+
+    # Remove silence using your custom function
+    audio_segment = remove_silence_edges(audio_segment, silence_threshold=-42)
+
+    # Convert back to NumPy array for further processing
+    audio_trimmed = np.array(audio_segment.get_array_of_samples(), dtype=np.float32) / (2**15)
+
+    # Continue processing with trimmed audio
+    audio = torch.from_numpy(audio_trimmed).unsqueeze(0)  # Add batch/channel dimension
+
+
+
+
     if audio.shape[0] > 1:
         audio = torch.mean(audio, dim=0, keepdim=True)
 
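For readers following the change: the added block scales the float reference audio (assumed to lie in [-1, 1]) to 16-bit PCM, wraps it in a PyDub AudioSegment, trims silent edges with the repo's remove_silence_edges helper at a -42 dB threshold, and converts the result back to a float tensor for the rest of infer_batch. The sketch below reproduces that round-trip in isolation. It is illustrative only: it assumes mono float32 NumPy input, and it substitutes pydub's detect_leading_silence for remove_silence_edges, whose actual implementation lives elsewhere in this repo; trim_edges and trim_reference are hypothetical names introduced just for this example.

    # Standalone sketch of the NumPy -> AudioSegment -> trim -> NumPy -> torch round-trip.
    # Assumes mono float audio in [-1, 1]; trim_edges() stands in for the app's
    # remove_silence_edges helper and is illustrative only.
    import numpy as np
    import torch
    from pydub import AudioSegment
    from pydub.silence import detect_leading_silence

    def trim_edges(seg: AudioSegment, silence_threshold: float = -42.0) -> AudioSegment:
        # Drop silence at the start, then at the end (by measuring the reversed segment).
        start = detect_leading_silence(seg, silence_threshold=silence_threshold)
        end = detect_leading_silence(seg.reverse(), silence_threshold=silence_threshold)
        return seg[start:len(seg) - end]

    def trim_reference(audio: np.ndarray, sr: int) -> torch.Tensor:
        # Float [-1, 1] -> 16-bit PCM bytes -> AudioSegment
        seg = AudioSegment(
            (audio * (2 ** 15)).astype(np.int16).tobytes(),
            frame_rate=sr,
            sample_width=2,  # 16-bit samples
            channels=1,      # mono assumed for this sketch
        )
        seg = trim_edges(seg, silence_threshold=-42)
        # AudioSegment -> int16 samples -> float [-1, 1] -> tensor with a channel dimension
        trimmed = np.array(seg.get_array_of_samples(), dtype=np.float32) / (2 ** 15)
        return torch.from_numpy(trimmed).unsqueeze(0)

Two caveats worth keeping in mind: pydub's get_array_of_samples() returns interleaved samples, so multi-channel input would need reshaping before the tensor conversion, and if ref_audio arrives as a torch tensor (e.g. from torchaudio.load) it would need a .numpy() call before the int16 scaling step.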