Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -193,27 +193,19 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
|
|
| 193 |
|
| 194 |
|
| 195 |
for i, gen_text in enumerate(progress.tqdm(gen_text_batches)):
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
ref_text = gen_text
|
| 200 |
-
|
| 201 |
-
# Prepare the text for pinyin conversion
|
| 202 |
text_list = [ref_text + gen_text]
|
| 203 |
final_text_list = convert_char_to_pinyin(text_list)
|
| 204 |
|
| 205 |
-
|
| 206 |
-
print(f"ref len: {len(ref_text.encode('utf-8'))} chars")
|
| 207 |
-
print(f"gen len: {len(gen_text.encode('utf-8'))} chars")
|
| 208 |
-
|
| 209 |
ref_audio_len = audio.shape[-1] // hop_length
|
| 210 |
zh_pause_punc = r"。,、;:?!"
|
| 211 |
ref_text_len = len(ref_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, ref_text))
|
| 212 |
gen_text_len = len(gen_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, gen_text))
|
|
|
|
| 213 |
|
| 214 |
-
# Calculate duration based on the lengths of ref_text and gen_text
|
| 215 |
-
duration = min(2000, max(270, int( (ref_audio_len + (ref_audio_len / ref_text_len * gen_text_len / speed))))
|
| 216 |
-
|
| 217 |
# Print the calculated duration
|
| 218 |
print(f"Duration: {duration} seconds")
|
| 219 |
# inference
|
|
@@ -332,13 +324,13 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fa
|
|
| 332 |
|
| 333 |
if not ref_text.strip():
|
| 334 |
gr.Info("No reference text provided, transcribing reference audio...")
|
| 335 |
-
ref_text =
|
| 336 |
ref_audio,
|
| 337 |
chunk_length_s=30,
|
| 338 |
batch_size=128,
|
| 339 |
generate_kwargs={"task": "transcribe"},
|
| 340 |
return_timestamps=False,
|
| 341 |
-
)[
|
| 342 |
gr.Info("Finished transcription")
|
| 343 |
else:
|
| 344 |
gr.Info("Using custom reference text...")
|
|
|
|
| 193 |
|
| 194 |
|
| 195 |
for i, gen_text in enumerate(progress.tqdm(gen_text_batches)):
|
| 196 |
+
# Prepare the text
|
| 197 |
+
if len(ref_text[-1].encode('utf-8')) == 1:
|
| 198 |
+
ref_text = ref_text + " "
|
|
|
|
|
|
|
|
|
|
| 199 |
text_list = [ref_text + gen_text]
|
| 200 |
final_text_list = convert_char_to_pinyin(text_list)
|
| 201 |
|
| 202 |
+
# Calculate duration
|
|
|
|
|
|
|
|
|
|
| 203 |
ref_audio_len = audio.shape[-1] // hop_length
|
| 204 |
zh_pause_punc = r"。,、;:?!"
|
| 205 |
ref_text_len = len(ref_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, ref_text))
|
| 206 |
gen_text_len = len(gen_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, gen_text))
|
| 207 |
+
duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)
|
| 208 |
|
|
|
|
|
|
|
|
|
|
| 209 |
# Print the calculated duration
|
| 210 |
print(f"Duration: {duration} seconds")
|
| 211 |
# inference
|
|
|
|
| 324 |
|
| 325 |
if not ref_text.strip():
|
| 326 |
gr.Info("No reference text provided, transcribing reference audio...")
|
| 327 |
+
ref_text = pipe(
|
| 328 |
ref_audio,
|
| 329 |
chunk_length_s=30,
|
| 330 |
batch_size=128,
|
| 331 |
generate_kwargs={"task": "transcribe"},
|
| 332 |
return_timestamps=False,
|
| 333 |
+
)["text"].strip()
|
| 334 |
gr.Info("Finished transcription")
|
| 335 |
else:
|
| 336 |
gr.Info("Using custom reference text...")
|