Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -175,40 +175,18 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
 
 
     audio, sr = ref_audio
-
-    # Convert PyTorch tensor to NumPy array (ensure it's dtype=float16)
-    audio_np = audio.cpu().numpy().astype(np.float16)
-
-    # Convert NumPy audio array to PyDub AudioSegment
-    audio_segment = AudioSegment(
-        (audio_np * 32768).astype(np.int16).tobytes(),  # Scale to 16-bit PCM
-        frame_rate=sr,
-        sample_width=2,  # 16-bit audio
-        channels=1 if len(audio_np.shape) == 1 else audio_np.shape[0]  # Mono or multi-channel
-    )
-
-    # Remove silence using the custom function
-    audio_trimmed = remove_silence_edges(audio_segment, silence_threshold=-42)
-
-    # Convert trimmed audio back to a PyTorch tensor with dtype=float16
-    audio = torch.tensor(
-        np.array(audio_trimmed.get_array_of_samples(), dtype=np.float16) / 32768,
-        dtype=torch.float16
-    ).unsqueeze(0)  # Add batch/channel dimension
-
-    # Normalize and resample
     if audio.shape[0] > 1:
-        audio =
+        audio = torch.mean(audio, dim=0, keepdim=True)
 
-    rms = torch.sqrt(
+    rms = torch.sqrt(torch.mean(torch.square(audio)))
     if rms < target_rms:
-        audio
-
+        audio = audio * target_rms / rms
     if sr != target_sample_rate:
-
-
-
+        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
+        audio = resampler(audio)
+
 
+    audio = audio.to(device)
 
 
     generated_waves = []
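The hunk above removes the float16 PyDub round-trip and leaves infer_batch with plain tensor preprocessing of the reference clip: downmix to mono, raise quiet audio to target_rms, resample to target_sample_rate, and move it to the device. A minimal standalone sketch of that path, assuming typical values for target_rms, target_sample_rate, and device (these constants and the file path are assumptions, not taken from this diff):

import torch
import torchaudio

target_rms = 0.1            # assumed loudness floor for the reference clip
target_sample_rate = 24000  # assumed model sample rate
device = "cuda" if torch.cuda.is_available() else "cpu"

def preprocess_ref_audio(audio: torch.Tensor, sr: int) -> torch.Tensor:
    # Downmix multi-channel input to mono.
    if audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)

    # Boost quiet clips up to the target RMS; louder clips are left untouched.
    rms = torch.sqrt(torch.mean(torch.square(audio)))
    if rms < target_rms:
        audio = audio * target_rms / rms

    # Resample to the model's expected rate.
    if sr != target_sample_rate:
        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
        audio = resampler(audio)

    return audio.to(device)

audio, sr = torchaudio.load("ref.wav")  # float tensor of shape [channels, samples]
audio = preprocess_ref_audio(audio, sr)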
@@ -341,6 +319,7 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fa
     gr.Info("Converting audio...")
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
         aseg = AudioSegment.from_file(ref_audio_orig)
+        aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
 
         non_silent_segs = silence.split_on_silence(
             aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000
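The second hunk moves the edge-silence trim into infer: the reference clip is trimmed with remove_silence_edges (a helper defined elsewhere in this Space) and padded with 50 ms of silence before the existing split_on_silence pass. A rough pydub-only stand-in for that helper, reusing the -42 dBFS threshold that the removed infer_batch call passed (the function name, its default, and the file path here are illustrative, not the Space's actual implementation):

from pydub import AudioSegment, silence

def trim_silence_edges(seg: AudioSegment, silence_threshold: float = -42.0) -> AudioSegment:
    # Hypothetical stand-in for the Space's remove_silence_edges helper:
    # strip leading silence, then trailing silence via a reversed pass.
    lead = silence.detect_leading_silence(seg, silence_threshold=silence_threshold)
    tail = silence.detect_leading_silence(seg.reverse(), silence_threshold=silence_threshold)
    return seg[lead:len(seg) - tail]

aseg = AudioSegment.from_file("ref.wav")                            # reference clip (illustrative path)
aseg = trim_silence_edges(aseg) + AudioSegment.silent(duration=50)  # keep a short 50 ms tail
non_silent_segs = silence.split_on_silence(
    aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000
)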