Gregniuki committed on
Commit
646de05
·
verified ·
1 Parent(s): cc95ac6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -15
app.py CHANGED
@@ -193,27 +193,19 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
193
 
194
 
195
  for i, gen_text in enumerate(progress.tqdm(gen_text_batches)):
196
- # If the last character of ref_text is a single byte (likely punctuation or space), use gen_text as reference
197
-
198
- if len(ref_text.encode('utf-8')) <= 2:
199
- ref_text = gen_text
200
-
201
- # Prepare the text for pinyin conversion
202
  text_list = [ref_text + gen_text]
203
  final_text_list = convert_char_to_pinyin(text_list)
204
 
205
- # Calculate text lengths including punctuation-based adjustments
206
- print(f"ref len: {len(ref_text.encode('utf-8'))} chars")
207
- print(f"gen len: {len(gen_text.encode('utf-8'))} chars")
208
-
209
  ref_audio_len = audio.shape[-1] // hop_length
210
  zh_pause_punc = r"。,、;:?!"
211
  ref_text_len = len(ref_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, ref_text))
212
  gen_text_len = len(gen_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, gen_text))
 
213
 
214
- # Calculate duration based on the lengths of ref_text and gen_text
215
- duration = min(2000, max(270, int( (ref_audio_len + (ref_audio_len / ref_text_len * gen_text_len / speed))))
216
-
217
  # Print the calculated duration
218
  print(f"Duration: {duration} seconds")
219
  # inference
@@ -332,13 +324,13 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fa
332
 
333
  if not ref_text.strip():
334
  gr.Info("No reference text provided, transcribing reference audio...")
335
- ref_text = outputs = pipe(
336
  ref_audio,
337
  chunk_length_s=30,
338
  batch_size=128,
339
  generate_kwargs={"task": "transcribe"},
340
  return_timestamps=False,
341
- )['text'].strip()
342
  gr.Info("Finished transcription")
343
  else:
344
  gr.Info("Using custom reference text...")
 
193
 
194
 
195
  for i, gen_text in enumerate(progress.tqdm(gen_text_batches)):
196
+ # Prepare the text
197
+ if len(ref_text[-1].encode('utf-8')) == 1:
198
+ ref_text = ref_text + " "
 
 
 
199
  text_list = [ref_text + gen_text]
200
  final_text_list = convert_char_to_pinyin(text_list)
201
 
202
+ # Calculate duration
 
 
 
203
  ref_audio_len = audio.shape[-1] // hop_length
204
  zh_pause_punc = r"。,、;:?!"
205
  ref_text_len = len(ref_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, ref_text))
206
  gen_text_len = len(gen_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, gen_text))
207
+ duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)
208
 
 
 
 
209
  # Print the calculated duration
210
  print(f"Duration: {duration} seconds")
211
  # inference
 
324
 
325
  if not ref_text.strip():
326
  gr.Info("No reference text provided, transcribing reference audio...")
327
+ ref_text = pipe(
328
  ref_audio,
329
  chunk_length_s=30,
330
  batch_size=128,
331
  generate_kwargs={"task": "transcribe"},
332
  return_timestamps=False,
333
+ )["text"].strip()
334
  gr.Info("Finished transcription")
335
  else:
336
  gr.Info("Using custom reference text...")