Gregniuki committed on
Commit
4d14e15
·
verified ·
1 Parent(s): 391f60a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -1
app.py CHANGED
@@ -172,9 +172,30 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
172
  elif exp_name == "Deutsh":
173
  ema_model = E2TTS_ema_model2
174
  #ref_audio, ref_text = preprocess_ref_audio_text(ref_audio, ref_text, show_info=show_info)
175
- ref_audio = remove_silence_edges(ref_audio) + AudioSegment.silent(duration=50)
176
 
177
  audio, sr = ref_audio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  if audio.shape[0] > 1:
179
  audio = torch.mean(audio, dim=0, keepdim=True)
180
 
 
172
  elif exp_name == "Deutsh":
173
  ema_model = E2TTS_ema_model2
174
  #ref_audio, ref_text = preprocess_ref_audio_text(ref_audio, ref_text, show_info=show_info)
175
+
176
 
177
  audio, sr = ref_audio
178
+
179
+ # Convert audio to PyDub AudioSegment (assuming it's a NumPy array in [-1, 1] range)
180
+ audio_segment = AudioSegment(
181
+ (audio * (2**15)).astype(np.int16).tobytes(), # Scale to 16-bit PCM range
182
+ frame_rate=sr,
183
+ sample_width=2, # 16-bit audio
184
+ channels=1 if len(audio.shape) == 1 else audio.shape[0] # Mono or multi-channel
185
+ )
186
+
187
+ # Remove silence using your custom function
188
+ audio_segment = remove_silence_edges(audio_segment, silence_threshold=-42)
189
+
190
+ # Convert back to NumPy array for further processing
191
+ audio_trimmed = np.array(audio_segment.get_array_of_samples(), dtype=np.float32) / (2**15)
192
+
193
+ # Continue processing with trimmed audio
194
+ audio = torch.from_numpy(audio_trimmed).unsqueeze(0) # Add batch/channel dimension
195
+
196
+
197
+
198
+
199
  if audio.shape[0] > 1:
200
  audio = torch.mean(audio, dim=0, keepdim=True)
201