Gregniuki committed on
Commit
1ba672e
·
verified ·
1 Parent(s): 8380400

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -29
app.py CHANGED
@@ -175,40 +175,18 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
175
 
176
 
177
  audio, sr = ref_audio
178
-
179
- # Convert PyTorch tensor to NumPy array (ensure it's dtype=float16)
180
- audio_np = audio.cpu().numpy().astype(np.float16)
181
-
182
- # Convert NumPy audio array to PyDub AudioSegment
183
- audio_segment = AudioSegment(
184
- (audio_np * 32768).astype(np.int16).tobytes(), # Scale to 16-bit PCM
185
- frame_rate=sr,
186
- sample_width=2, # 16-bit audio
187
- channels=1 if len(audio_np.shape) == 1 else audio_np.shape[0] # Mono or multi-channel
188
- )
189
-
190
- # Remove silence using the custom function
191
- audio_trimmed = remove_silence_edges(audio_segment, silence_threshold=-42)
192
-
193
- # Convert trimmed audio back to a PyTorch tensor with dtype=float16
194
- audio = torch.tensor(
195
- np.array(audio_trimmed.get_array_of_samples(), dtype=np.float16) / 32768,
196
- dtype=torch.float16
197
- ).unsqueeze(0) # Add batch/channel dimension
198
-
199
- # Normalize and resample
200
  if audio.shape[0] > 1:
201
- audio = audio.mean(dim=0, keepdim=True) # Convert to mono
202
 
203
- rms = torch.sqrt((audio**2).mean()) # Compute RMS
204
  if rms < target_rms:
205
- audio *= target_rms / rms # Adjust RMS
206
-
207
  if sr != target_sample_rate:
208
- audio = torchaudio.transforms.Resample(sr, target_sample_rate)(audio)
209
-
210
- audio = audio.to(device) # Move to target device
211
 
 
212
 
213
 
214
  generated_waves = []
@@ -341,6 +319,7 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fa
341
  gr.Info("Converting audio...")
342
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
343
  aseg = AudioSegment.from_file(ref_audio_orig)
 
344
 
345
  non_silent_segs = silence.split_on_silence(
346
  aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000
 
175
 
176
 
177
  audio, sr = ref_audio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  if audio.shape[0] > 1:
179
+ audio = torch.mean(audio, dim=0, keepdim=True)
180
 
181
+ rms = torch.sqrt(torch.mean(torch.square(audio)))
182
  if rms < target_rms:
183
+ audio = audio * target_rms / rms
 
184
  if sr != target_sample_rate:
185
+ resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
186
+ audio = resampler(audio)
187
+
188
 
189
+ audio = audio.to(device)
190
 
191
 
192
  generated_waves = []
 
319
  gr.Info("Converting audio...")
320
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
321
  aseg = AudioSegment.from_file(ref_audio_orig)
322
+ aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
323
 
324
  non_silent_segs = silence.split_on_silence(
325
  aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000