Gregniuki committed on
Commit f387293 · verified · 1 Parent(s): 4d14e15

Update app.py

Files changed (1):
  1. app.py +20 -20
app.py CHANGED
@@ -176,38 +176,38 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
 
     audio, sr = ref_audio
 
-    # Convert audio to PyDub AudioSegment (assuming it's a NumPy array in [-1, 1] range)
+    # Convert PyTorch tensor to NumPy array before scaling and processing
+    audio_np = audio.cpu().numpy()  # Convert to NumPy (if it's a tensor)
+
+    # Convert NumPy audio array to PyDub AudioSegment
     audio_segment = AudioSegment(
-        (audio * (2**15)).astype(np.int16).tobytes(),  # Scale to 16-bit PCM range
+        (audio_np * 32768).astype(np.int16).tobytes(),  # Scale to 16-bit PCM
         frame_rate=sr,
         sample_width=2,  # 16-bit audio
-        channels=1 if len(audio.shape) == 1 else audio.shape[0]  # Mono or multi-channel
+        channels=1 if len(audio_np.shape) == 1 else audio_np.shape[0]  # Mono or multi-channel
     )
 
-    # Remove silence using your custom function
-    audio_segment = remove_silence_edges(audio_segment, silence_threshold=-42)
-
-    # Convert back to NumPy array for further processing
-    audio_trimmed = np.array(audio_segment.get_array_of_samples(), dtype=np.float32) / (2**15)
-
-    # Continue processing with trimmed audio
-    audio = torch.from_numpy(audio_trimmed).unsqueeze(0)  # Add batch/channel dimension
-
+    # Remove silence using the custom function
+    audio_trimmed = remove_silence_edges(audio_segment, silence_threshold=-42)
 
+    # Convert trimmed audio back to a PyTorch tensor
+    audio = torch.tensor(
+        np.array(audio_trimmed.get_array_of_samples(), dtype=np.float32) / 32768
+    ).unsqueeze(0)  # Add batch/channel dimension
 
-
+    # Normalize and resample
     if audio.shape[0] > 1:
-        audio = torch.mean(audio, dim=0, keepdim=True)
+        audio = audio.mean(dim=0, keepdim=True)  # Convert to mono
 
-    rms = torch.sqrt(torch.mean(torch.square(audio)))
+    rms = torch.sqrt((audio**2).mean())  # Compute RMS
     if rms < target_rms:
-        audio = audio * target_rms / rms
+        audio *= target_rms / rms  # Adjust RMS
+
     if sr != target_sample_rate:
-        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
-        audio = resampler(audio)
-
+        audio = torchaudio.transforms.Resample(sr, target_sample_rate)(audio)
+
+    audio = audio.to(device)  # Move to target device
 
-    audio = audio.to(device)
 
     generated_waves = []
     spectrograms = []
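
Taken together, the updated block round-trips the reference audio: PyTorch tensor -> 16-bit PCM AudioSegment -> silence-trimmed AudioSegment -> normalized, resampled tensor. Below is a minimal standalone sketch of that pipeline for anyone who wants to try it outside app.py. It is an approximation, not the repo's code: remove_silence_edges is stubbed here with pydub's detect_leading_silence (the real implementation lives elsewhere in app.py), preprocess_ref_audio is a hypothetical wrapper name, and target_rms, target_sample_rate, and device are placeholder values.

import numpy as np
import torch
import torchaudio
from pydub import AudioSegment
from pydub.silence import detect_leading_silence

# Hypothetical stand-in for app.py's remove_silence_edges: drop audio quieter
# than silence_threshold (dBFS) from both edges of the segment.
def remove_silence_edges(seg, silence_threshold=-42):
    start = detect_leading_silence(seg, silence_threshold=silence_threshold)
    end = detect_leading_silence(seg.reverse(), silence_threshold=silence_threshold)
    return seg[start:len(seg) - end]

# Placeholder settings; app.py defines its own values elsewhere.
target_rms = 0.1
target_sample_rate = 24000
device = "cpu"

def preprocess_ref_audio(audio, sr):
    # Tensor -> NumPy (values assumed in [-1, 1]) -> 16-bit PCM AudioSegment
    audio_np = audio.cpu().numpy()
    seg = AudioSegment(
        (audio_np * 32768).astype(np.int16).tobytes(),
        frame_rate=sr,
        sample_width=2,
        channels=1 if audio_np.ndim == 1 else audio_np.shape[0],
    )
    trimmed = remove_silence_edges(seg, silence_threshold=-42)

    # Back to a float32 tensor with a leading channel dimension
    audio = torch.tensor(
        np.array(trimmed.get_array_of_samples(), dtype=np.float32) / 32768
    ).unsqueeze(0)

    if audio.shape[0] > 1:  # downmix multi-channel to mono
        audio = audio.mean(dim=0, keepdim=True)
    rms = torch.sqrt((audio**2).mean())
    if rms < target_rms:  # boost quiet references toward the target RMS
        audio = audio * target_rms / rms
    if sr != target_sample_rate:  # resample to the model's rate
        audio = torchaudio.transforms.Resample(sr, target_sample_rate)(audio)
    return audio.to(device)

# Example: one second of a quiet 440 Hz tone at 16 kHz
tone = torch.sin(2 * torch.pi * 440 * torch.arange(16000) / 16000) * 0.05
out = preprocess_ref_audio(tone, 16000)
print(out.shape, out.pow(2).mean().sqrt())  # (1, 24000), RMS near target_rms

The int16 round-trip quantizes the samples before trimming and rescales to float32 afterwards; at 16-bit depth the loss is negligible for reference audio, and doing the silence detection in pydub's integer domain keeps the trimming logic in one place.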