Update app.py
app.py CHANGED
@@ -176,38 +176,38 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
 
     audio, sr = ref_audio
 
-    # Convert …
+    # Convert PyTorch tensor to NumPy array before scaling and processing
+    audio_np = audio.cpu().numpy()  # Convert to NumPy (if it's a tensor)
+
+    # Convert NumPy audio array to PyDub AudioSegment
     audio_segment = AudioSegment(
-        (audio * 32768).astype(np.int16).tobytes(),
+        (audio_np * 32768).astype(np.int16).tobytes(),  # Scale to 16-bit PCM
         frame_rate=sr,
         sample_width=2,  # 16-bit audio
-        channels=1 if len(audio.shape) == 1 else audio.shape[0]
+        channels=1 if len(audio_np.shape) == 1 else audio_np.shape[0]  # Mono or multi-channel
     )
 
-    # Remove silence using …
-    …
-
-    # Convert back to NumPy array for further processing
-    audio_trimmed = np.array(audio_segment.get_array_of_samples(), dtype=np.float32) / (2**15)
-
-    # Continue processing with trimmed audio
-    audio = torch.from_numpy(audio_trimmed).unsqueeze(0)  # Add batch/channel dimension
-
+    # Remove silence using the custom function
+    audio_trimmed = remove_silence_edges(audio_segment, silence_threshold=-42)
+
+    # Convert trimmed audio back to a PyTorch tensor
+    audio = torch.tensor(
+        np.array(audio_trimmed.get_array_of_samples(), dtype=np.float32) / 32768
+    ).unsqueeze(0)  # Add batch/channel dimension
 
+    # Normalize and resample
     if audio.shape[0] > 1:
-        audio = torch.mean(audio, dim=0, keepdim=True)
+        audio = audio.mean(dim=0, keepdim=True)  # Convert to mono
 
-    rms = torch.sqrt(torch.mean(torch.square(audio)))
+    rms = torch.sqrt((audio**2).mean())  # Compute RMS
     if rms < target_rms:
-        audio = audio * target_rms / rms
+        audio *= target_rms / rms  # Adjust RMS
+
     if sr != target_sample_rate:
-        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
-        audio = resampler(audio)
-
-    audio = audio.to(device)
+        audio = torchaudio.transforms.Resample(sr, target_sample_rate)(audio)
+
+    audio = audio.to(device)  # Move to target device
 
     generated_waves = []
     spectrograms = []
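
For context, the reference-audio preprocessing path after this change can be read as one standalone function. This is a minimal sketch, not the app's exact code: it assumes remove_silence_edges is the silence-trimming helper used by app.py (AudioSegment in, trimmed AudioSegment out, as called in the diff), and the target_rms, target_sample_rate, and device defaults shown here are illustrative placeholders, not values confirmed by this commit.

import numpy as np
import torch
import torchaudio
from pydub import AudioSegment

def preprocess_ref_audio(audio, sr, target_rms=0.1, target_sample_rate=24000, device="cpu"):
    # Tensor -> NumPy first, so .astype()/.tobytes() work in the pydub constructor
    audio_np = audio.cpu().numpy()
    segment = AudioSegment(
        (audio_np * 32768).astype(np.int16).tobytes(),  # float [-1, 1] -> 16-bit PCM
        frame_rate=sr,
        sample_width=2,  # 16-bit audio
        channels=1 if audio_np.ndim == 1 else audio_np.shape[0],
    )
    # remove_silence_edges: helper from app.py, signature assumed from the diff
    trimmed = remove_silence_edges(segment, silence_threshold=-42)
    # AudioSegment -> float tensor with a leading channel dimension
    audio = torch.tensor(
        np.array(trimmed.get_array_of_samples(), dtype=np.float32) / 32768
    ).unsqueeze(0)
    if audio.shape[0] > 1:
        audio = audio.mean(dim=0, keepdim=True)  # Downmix to mono
    rms = torch.sqrt((audio**2).mean())
    if rms < target_rms:
        audio *= target_rms / rms  # Boost quiet references toward the target RMS
    if sr != target_sample_rate:
        audio = torchaudio.transforms.Resample(sr, target_sample_rate)(audio)
    return audio.to(device)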