MXNXVMadman committed
Commit bab94e4 · 1 Parent(s): eabeba1
Files changed (6)
  1. .gitattributes +1 -0
  2. README.md +8 -7
  3. app.py +679 -0
  4. ffmpeg.zip +3 -0
  5. packages.txt +1 -0
  6. requirements.txt +12 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/female.wav filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,13 +1,14 @@
 ---
-title: Space
-emoji: 🔥
-colorFrom: yellow
-colorTo: green
+title: XTTS
+emoji: 🐸
+colorFrom: green
+colorTo: red
 sdk: gradio
-sdk_version: 4.5.0
+sdk_version: 3.48.0
 app_file: app.py
 pinned: false
-license: mit
+models:
+- coqui/XTTS-v2
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,679 @@
+import sys
+import io, os, stat
+import subprocess
+import random
+from zipfile import ZipFile
+import uuid
+import time
+import torch
+import torchaudio
+
+
+# Download the UniDic dictionary used by MeCab (Japanese tokenization)
+os.system('python -m unidic download')
+
+# By using XTTS you agree to the CPML license: https://coqui.ai/cpml
+os.environ["COQUI_TOS_AGREED"] = "1"
+
+# langid is used to detect the language of longer text
+# Most users expect the text to be in their own language; there is a checkbox to disable detection
+import langid
+import base64
+import csv
+from io import StringIO
+import datetime
+import re
+
+import gradio as gr
+from scipy.io.wavfile import write
+from pydub import AudioSegment
+
+from TTS.api import TTS
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.tts.models.xtts import Xtts
+from TTS.utils.generic_utils import get_user_data_dir
+
+HF_TOKEN = os.environ.get("HF_TOKEN")
+
+from huggingface_hub import HfApi
+
+# The API is used to restart the Space on an unrecoverable error
+api = HfApi(token=HF_TOKEN)
+repo_id = "coqui/xtts"
+
+# Use a newer ffmpeg binary on Ubuntu 20 so denoising is available for microphone input
+print("Export newer ffmpeg binary for denoise filter")
+ZipFile("ffmpeg.zip").extractall()
+print("Make ffmpeg binary executable")
+st = os.stat("ffmpeg")
+os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
+
+# This will trigger downloading the model
+print("Downloading Coqui XTTS v2 (if not already downloaded)")
+from TTS.utils.manage import ModelManager
+
+model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
+ModelManager().download_model(model_name)
+model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
+print("XTTS downloaded")
+
+config = XttsConfig()
+config.load_json(os.path.join(model_path, "config.json"))
+
+model = Xtts.init_from_config(config)
+model.load_checkpoint(
+    config,
+    checkpoint_path=os.path.join(model_path, "model.pth"),
+    vocab_path=os.path.join(model_path, "vocab.json"),
+    eval=True,
+    use_deepspeed=True,
+)
+model.cuda()
+
+# These globals are for debugging purposes only
+DEVICE_ASSERT_DETECTED = 0
+DEVICE_ASSERT_PROMPT = None
+DEVICE_ASSERT_LANG = None
+
+supported_languages = config.languages
+
+def predict(
+    prompt,
+    language,
+    audio_file_pth,
+    mic_file_path,
+    use_mic,
+    voice_cleanup,
+    no_lang_auto_detect,
+    agree,
+):
+    if agree == True:
+        if language not in supported_languages:
+            gr.Warning(
+                f"The language you entered, {language}, is not in our list of supported languages; please choose one from the dropdown"
+            )
+
+            return (
+                None,
+                None,
+                None,
+                None,
+            )
+
+        language_predicted = langid.classify(prompt)[
+            0
+        ].strip()  # strip is needed as there is a space at the end!
+
+        # tts expects chinese as zh-cn
+        if language_predicted == "zh":
+            # we use zh-cn
+            language_predicted = "zh-cn"
+
+        print(f"Detected language:{language_predicted}, Chosen language:{language}")
+
+        # Language detection is triggered only for text longer than 15 characters
+        if len(prompt) > 15:
+            # allow any language for short text as some phrases may be common
+            # If the user unchecks language auto-detection, it will not trigger
+            # You may remove this completely for your own use
+            if language_predicted != language and not no_lang_auto_detect:
+                # Please duplicate this Space and remove the check if you really want this
+                # Or the auto-detector failed to identify the language (which it can on short or mixed text)
+                gr.Warning(
+                    "It looks like your text isn't in the language you chose. If you are sure the text is in the chosen language, please check the 'disable language auto-detection' checkbox"
+                )
+
+                return (
+                    None,
+                    None,
+                    None,
+                    None,
+                )
+
+        if use_mic == True:
+            if mic_file_path is not None:
+                speaker_wav = mic_file_path
+            else:
+                gr.Warning(
+                    "Please record your voice with the microphone, or uncheck 'Use Microphone' to use the reference audios"
+                )
+                return (
+                    None,
+                    None,
+                    None,
+                    None,
+                )
+
+        else:
+            speaker_wav = audio_file_pth
+
+        # Filtering for microphone input, as it has background noise and possibly silence at the beginning and end
+        # This is fast filtering, not perfect
+
+        # Apply all on demand
+        lowpassfilter = denoise = trim = loudness = True
+
+        if lowpassfilter:
+            lowpass_highpass = "lowpass=8000,highpass=75,"
+        else:
+            lowpass_highpass = ""
+
+        if trim:
+            # better to remove silence at the beginning and end for microphone input
+            trim_silence = "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
+        else:
+            trim_silence = ""
+
+        if voice_cleanup:
+            try:
+                out_filename = (
+                    speaker_wav + str(uuid.uuid4()) + ".wav"
+                )  # ffmpeg needs the extension to know the output format
+
+                # we use the newer ffmpeg as it has the afftdn denoise filter
+                shell_command = f"./ffmpeg -y -i {speaker_wav} -af {lowpass_highpass}{trim_silence} {out_filename}".split(
+                    " "
+                )
+
+                command_result = subprocess.run(
+                    [item for item in shell_command],
+                    capture_output=False,
+                    text=True,
+                    check=True,
+                )
+                speaker_wav = out_filename
+                print("Filtered microphone input")
+            except subprocess.CalledProcessError:
+                # There was an error - the command exited with a non-zero code
+                print("Error: failed filtering, using original microphone input")
+        else:
+            speaker_wav = speaker_wav
+
+        if len(prompt) < 2:
+            gr.Warning("Please give a longer prompt text")
+            return (
+                None,
+                None,
+                None,
+                None,
+            )
+        if len(prompt) > 200:
+            gr.Warning(
+                "Text length is limited to 200 characters for this demo; please try shorter text. You can clone this Space and edit the code for your own usage"
+            )
+            return (
+                None,
+                None,
+                None,
+                None,
+            )
+        global DEVICE_ASSERT_DETECTED
+        if DEVICE_ASSERT_DETECTED:
+            global DEVICE_ASSERT_PROMPT
+            global DEVICE_ASSERT_LANG
+            # It will likely never get here, as we now restart the Space on the first unrecoverable error
+            print(
+                f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}"
+            )
+
+            # HF Space specific: this error is unrecoverable, so the Space needs to be restarted
+            space = api.get_space_runtime(repo_id=repo_id)
+            if space.stage != "BUILDING":
+                api.restart_space(repo_id=repo_id)
+            else:
+                print("TRIED TO RESTART but space is building")
+
+        try:
+            metrics_text = ""
+            t_latent = time.time()
+
+            # note: diffusion_conditioning is not used with hifigan (the default mode); it will be empty, but it still needs to be passed to model.inference
+            try:
+                (
+                    gpt_cond_latent,
+                    speaker_embedding,
+                ) = model.get_conditioning_latents(audio_path=speaker_wav, gpt_cond_len=30, max_ref_length=60)
+            except Exception as e:
+                print("Speaker encoding error", str(e))
+                gr.Warning(
+                    "It appears something is wrong with the reference audio. Did you unmute your microphone?"
+                )
+                return (
+                    None,
+                    None,
+                    None,
+                    None,
+                )
+
+            latent_calculation_time = time.time() - t_latent
+            # metrics_text = f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"
+
+            # temporary comma fix: insert a space and double the sentence-ending punctuation (., 。, ?) after word or non-ASCII characters
+            prompt = re.sub(r"([^\x00-\x7F]|\w)(\.|。|\?)", r"\1 \2\2", prompt)
+
+            wav_chunks = []
+            ## Direct (non-streaming) mode, kept for reference
+            """
+            print("I: Generating new audio...")
+            t0 = time.time()
+            out = model.inference(
+                prompt,
+                language,
+                gpt_cond_latent,
+                speaker_embedding
+            )
+            inference_time = time.time() - t0
+            print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
+            metrics_text += f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
+            real_time_factor = (time.time() - t0) / out['wav'].shape[-1] * 24000
+            print(f"Real-time factor (RTF): {real_time_factor}")
+            metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
+            torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
+            """
+
+            print("I: Generating new audio in streaming mode...")
+            t0 = time.time()
+            chunks = model.inference_stream(
+                prompt,
+                language,
+                gpt_cond_latent,
+                speaker_embedding,
+                repetition_penalty=7.0,
+                temperature=0.85,
+            )
+
+            first_chunk = True
+            for i, chunk in enumerate(chunks):
+                if first_chunk:
+                    first_chunk_time = time.time() - t0
+                    metrics_text += f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
+                    first_chunk = False
+                wav_chunks.append(chunk)
+                print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+            inference_time = time.time() - t0
+            print(
+                f"I: Time to generate audio: {round(inference_time*1000)} milliseconds"
+            )
+            # metrics_text += (
+            #     f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
+            # )
+
+            wav = torch.cat(wav_chunks, dim=0)
+            print(wav.shape)
+            real_time_factor = (time.time() - t0) / wav.shape[0] * 24000
+            print(f"Real-time factor (RTF): {real_time_factor}")
+            metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
+
+            torchaudio.save("output.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
+
+        except RuntimeError as e:
+            if "device-side assert" in str(e):
+                # nothing can be done about a CUDA device-side assert; we need to restart
+                print(
+                    f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}",
+                    flush=True,
+                )
+                gr.Warning("Unhandled exception encountered, please retry in a minute")
+                print("Cuda device-assert Runtime encountered, need restart")
+                if not DEVICE_ASSERT_DETECTED:
+                    DEVICE_ASSERT_DETECTED = 1
+                    DEVICE_ASSERT_PROMPT = prompt
+                    DEVICE_ASSERT_LANG = language
+
+                # just before restarting, save what caused the issue so it can be handled in the future
+                # Uploading error data only happens for an unrecoverable error
+                error_time = datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
+                error_data = [
+                    error_time,
+                    prompt,
+                    language,
+                    audio_file_pth,
+                    mic_file_path,
+                    use_mic,
+                    voice_cleanup,
+                    no_lang_auto_detect,
+                    agree,
+                ]
+                error_data = [str(e) if type(e) != str else e for e in error_data]
+                print(error_data)
+                print(speaker_wav)
+                write_io = StringIO()
+                csv.writer(write_io).writerows([error_data])
+                csv_upload = write_io.getvalue().encode()
+
+                filename = error_time + "_" + str(uuid.uuid4()) + ".csv"
+                print("Writing error csv")
+                error_api = HfApi()
+                error_api.upload_file(
+                    path_or_fileobj=csv_upload,
+                    path_in_repo=filename,
+                    repo_id="coqui/xtts-flagged-dataset",
+                    repo_type="dataset",
+                )
+
+                # speaker_wav
+                print("Writing error reference audio")
+                speaker_filename = (
+                    error_time + "_reference_" + str(uuid.uuid4()) + ".wav"
+                )
+                error_api = HfApi()
+                error_api.upload_file(
+                    path_or_fileobj=speaker_wav,
+                    path_in_repo=speaker_filename,
+                    repo_id="coqui/xtts-flagged-dataset",
+                    repo_type="dataset",
+                )
+
+                # HF Space specific: this error is unrecoverable, so the Space needs to be restarted
+                space = api.get_space_runtime(repo_id=repo_id)
+                if space.stage != "BUILDING":
+                    api.restart_space(repo_id=repo_id)
+                else:
+                    print("TRIED TO RESTART but space is building")
+
+            else:
+                if "Failed to decode" in str(e):
+                    print("Speaker encoding error", str(e))
+                    gr.Warning(
+                        "It appears something is wrong with the reference audio. Did you unmute your microphone?"
+                    )
+                else:
+                    print("RuntimeError: non device-side assert error:", str(e))
+                    gr.Warning("Something unexpected happened, please retry.")
+                return (
+                    None,
+                    None,
+                    None,
+                    None,
+                )
+        return (
+            gr.make_waveform(
+                audio="output.wav",
+            ),
+            "output.wav",
+            metrics_text,
+            speaker_wav,
+        )
+    else:
+        gr.Warning("Please accept the Terms & Conditions!")
+        return (
+            None,
+            None,
+            None,
+            None,
+        )
+
+
+title = "Simple voice cloning using Coqui🐸 XTTS"
+
+description = """
+
+Simple voice model
+"""
+
+links = """
+
+
+"""
+
+article = """
+
+"""
+examples = [
+    [
+        "Once when I was six years old I saw a magnificent picture",
+        "en",
+        "examples/female.wav",
+        None,
+        False,
+        False,
+        False,
+        True,
+    ],
+    [
+        "Lorsque j'avais six ans j'ai vu, une fois, une magnifique image",
+        "fr",
+        "examples/male.wav",
+        None,
+        False,
+        False,
+        False,
+        True,
+    ],
+    [
+        "Als ich sechs war, sah ich einmal ein wunderbares Bild",
+        "de",
+        "examples/female.wav",
+        None,
+        False,
+        False,
+        False,
+        True,
+    ],
+    [
+        "Cuando tenía seis años, vi una vez una imagen magnífica",
+        "es",
+        "examples/male.wav",
+        None,
+        False,
+        False,
+        False,
+        True,
+    ],
+    [
+        "Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica",
+        "pt",
+        "examples/female.wav",
+        None,
+        False,
+        False,
+        False,
+        True,
+    ],
+    [
+        "Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek",
+        "pl",
+        "examples/male.wav",
+        None,
+        False,
+        False,
+        False,
+        True,
+    ],
+    [
+        "Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno",
+        "it",
+        "examples/female.wav",
+        None,
+        False,
+        False,
+        False,
+        True,
+    ],
+    [
+        "Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm",
+        "tr",
+        "examples/female.wav",
+        None,
+        False,
+        False,
+        False,
+        True,
+    ],
+    [
+        "Когда мне было шесть лет, я увидел однажды удивительную картинку",
+        "ru",
+        "examples/female.wav",
+        None,
+        False,
+        False,
+        False,
+        True,
+    ],
+    [
+        "Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat",
+        "nl",
+        "examples/male.wav",
+        None,
+        False,
+        False,
+        False,
+        True,
+    ],
+    [
+        "Když mi bylo šest let, viděl jsem jednou nádherný obrázek",
+        "cs",
+        "examples/female.wav",
+        None,
+        False,
+        False,
+        False,
+        True,
+    ],
+    [
+        "当我还只有六岁的时候, 看到了一副精彩的插画",
+        "zh-cn",
+        "examples/female.wav",
+        None,
+        False,
+        False,
+        False,
+        True,
+    ],
+    [
+        "かつて 六歳のとき、素晴らしい絵を見ました",
+        "ja",
+        "examples/female.wav",
+        None,
+        False,
+        True,
+        False,
+        True,
+    ],
+    [
+        "한번은 내가 여섯 살이었을 때 멋진 그림을 보았습니다.",
+        "ko",
+        "examples/female.wav",
+        None,
+        False,
+        True,
+        False,
+        True,
+    ],
+    [
+        "Egyszer hat éves koromban láttam egy csodálatos képet",
+        "hu",
+        "examples/male.wav",
+        None,
+        False,
+        True,
+        False,
+        True,
+    ],
+]
+
+
+
+with gr.Blocks(analytics_enabled=False) as demo:
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown(
+                """
+                ## <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
+                """
+            )
+        with gr.Column():
+            # placeholder to align the image
+            pass
+
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown(description)
+        with gr.Column():
+            gr.Markdown(links)
+
+    with gr.Row():
+        with gr.Column():
+            input_text_gr = gr.Textbox(
+                label="Text Prompt",
+                info="One or two sentences at a time is better. Up to 200 text characters.",
+                value="Hi there, I'm your new voice clone. Try your best to upload quality audio",
+            )
+            language_gr = gr.Dropdown(
+                label="Language",
+                info="Select an output language for the synthesised speech",
+                choices=[
+                    "en",
+                    "es",
+                    "fr",
+                    "de",
+                    "it",
+                    "pt",
+                    "pl",
+                    "tr",
+                    "ru",
+                    "nl",
+                    "cs",
+                    "ar",
+                    "zh-cn",
+                    "ja",
+                    "ko",
+                    "hu",
+                ],
+                max_choices=1,
+                value="en",
+            )
+            ref_gr = gr.Audio(
+                label="Reference Audio",
+                info="Click on the ✎ button to upload your own target speaker audio",
+                type="filepath",
+                value="examples/female.wav",
+            )
+            mic_gr = gr.Audio(
+                source="microphone",
+                type="filepath",
+                info="Use your microphone to record audio",
+                label="Use Microphone for Reference",
+            )
+            use_mic_gr = gr.Checkbox(
+                label="Use Microphone",
+                value=False,
+                info="Notice: Microphone input may not work properly under heavy traffic",
+            )
+            clean_ref_gr = gr.Checkbox(
+                label="Cleanup Reference Voice",
+                value=False,
+                info="This check can improve output if your microphone or reference voice is noisy",
+            )
+            auto_det_lang_gr = gr.Checkbox(
+                label="Do not use language auto-detect",
+                value=False,
+                info="Check to disable language auto-detection",
+            )
+            tos_gr = gr.Checkbox(
+                label="Agree",
+                value=False,
+                info="I agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml",
+            )
+
+            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
+
+        with gr.Column():
+            video_gr = gr.Video(label="Waveform Visual")
+            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
+            out_text_gr = gr.Text(label="Metrics")
+            ref_audio_gr = gr.Audio(label="Reference Audio Used")
+
+    with gr.Row():
+        gr.Examples(
+            examples,
+            label="Examples",
+            inputs=[input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr],
+            outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr],
+            fn=predict,
+            cache_examples=False,
+        )
+
+    tts_button.click(predict, [input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr], outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr])
+
+demo.queue()
+demo.launch(debug=True, show_api=True)
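
Because `demo.launch(..., show_api=True)` exposes the app's API, the Space can also be driven programmatically. Below is a minimal sketch using `gradio_client`; the endpoint name `/predict` and the argument order (mirroring the `inputs` list wired to `tts_button.click` above) are assumptions to verify against the Space's "Use via API" panel:

```python
# Hypothetical remote call to this Space via gradio_client.
# Argument order mirrors the inputs list in app.py:
# [text, language, reference audio, mic audio, use_mic, cleanup, no_auto_detect, agree]
from gradio_client import Client

client = Client("coqui/xtts")  # assumes the Space is running and public
video, wav_path, metrics, ref_used = client.predict(
    "Hi there, I'm your new voice clone.",  # prompt (max 200 chars in this demo)
    "en",                                   # language code
    "examples/female.wav",                  # reference speaker audio
    None,                                   # microphone recording (unused here)
    False,                                  # use_mic
    False,                                  # voice_cleanup
    False,                                  # no_lang_auto_detect
    True,                                   # agree to the CPML terms
    api_name="/predict",                    # assumed endpoint name
)
print(metrics)
```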
ffmpeg.zip ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6c04aa2958762686cf94a3bd1456b4738fd537d19bb0a9b622fc788a5e4ce723
+size 29207056
packages.txt ADDED
@@ -0,0 +1 @@
+unzip
requirements.txt ADDED
@@ -0,0 +1,12 @@
+# Preinstall requirements from TTS
+TTS @ git+https://github.com/coqui-ai/[email protected]
+pydantic==1.10.13
+python-multipart==0.0.6
+typing-extensions>=4.8.0
+cutlet
+mecab-python3==1.0.6
+unidic-lite==1.0.8
+unidic==1.1.0
+langid
+deepspeed
+pydub
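
For completeness, the core synthesis path in `app.py` can be exercised without the Gradio UI. This is a minimal sketch assuming the pinned TTS release above and a CUDA-capable machine; it reuses only calls that already appear in `app.py` (drop `model.cuda()` for a CPU-only run):

```python
import os
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir
from TTS.utils.manage import ModelManager

os.environ["COQUI_TOS_AGREED"] = "1"  # accept the CPML terms, as app.py does

# Download and load XTTS v2 exactly as app.py does
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
ModelManager().download_model(model_name)
model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))

config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,
)
model.cuda()

# Clone the voice from a short reference clip, then stream one sentence
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
    audio_path="examples/female.wav", gpt_cond_len=30, max_ref_length=60
)
chunks = model.inference_stream(
    "Hello, this is a test.", "en", gpt_cond_latent, speaker_embedding
)
wav = torch.cat(list(chunks), dim=0)  # concatenate streamed audio chunks
torchaudio.save("output.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
```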