Spaces:
Running
on
Zero
Commit
·
72d1bae
1
Parent(s):
1e5a7f2
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import torch
|
3 |
+
|
4 |
+
from transformers import VitsModel, VitsTokenizer, set_seed
|
5 |
+
|
6 |
+
|
# HTML heading rendered at the top of the demo page.
title = """
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
<div
style="display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;"
> <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
VITS TTS Demo
</h1> </div>
</div>
"""

# Markdown blurb describing the VITS architecture and the two checkpoints
# this Space serves.
description = """
VITS is an end-to-end speech synthesis model that predicts a speech waveform conditional on an input text sequence. It
is a conditional variational autoencoder (VAE) comprised of a posterior encoder, decoder, and conditional prior.

This demo showcases the official VITS checkpoints, trained on the [LJ
Speech](https://huggingface.co/kakao-enterprise/vits-ljs) and [VCTK](https://huggingface.co/kakao-enterprise/vits-vctk)
datasets.
"""

# Attribution footer shown at the bottom of the page.
article = "Model by Jaehyeon Kim et al. from Kakao Enterprise. Code and demo by 🤗 Hugging Face."
# Load the two official VITS checkpoints — single-speaker LJ Speech and
# multi-speaker VCTK — together with their matching tokenizers.
ljs_model = VitsModel.from_pretrained("kakao-enterprise/vits-ljs")
ljs_tokenizer = VitsTokenizer.from_pretrained("kakao-enterprise/vits-ljs")

vctk_model = VitsModel.from_pretrained("kakao-enterprise/vits-vctk")
vctk_tokenizer = VitsTokenizer.from_pretrained("kakao-enterprise/vits-vctk")

# Run on GPU when one is available; this is inference-only, so the models
# are moved once at import time and reused by both forward functions.
device = "cuda" if torch.cuda.is_available() else "cpu"
ljs_model.to(device)
vctk_model.to(device)
def ljs_forward(text, speaking_rate=1.0):
    """Synthesize speech for ``text`` with the single-speaker LJ Speech model.

    Args:
        text: Input sentence to vocalize.
        speaking_rate: Duration scale; >1 is faster speech, <1 slower.

    Returns:
        A Gradio waveform video built from the 22.05 kHz mono audio.
    """
    # BUG FIX: move the tokenized tensors onto the model's device. The
    # original left them on CPU, which raises a device-mismatch error
    # whenever device == "cuda".
    inputs = ljs_tokenizer(text, return_tensors="pt").to(device)

    ljs_model.speaking_rate = speaking_rate
    set_seed(555)  # fixed seed so repeated runs on the same text are deterministic
    with torch.no_grad():
        outputs = ljs_model(**inputs)[0]

    # Pull the first (only) waveform back to CPU for Gradio.
    waveform = outputs[0].cpu().float().numpy()
    return gr.make_waveform((22050, waveform))
def vctk_forward(text, speaking_rate=1.0, speaker_id=1):
    """Synthesize speech for ``text`` with the multi-speaker VCTK model.

    Args:
        text: Input sentence to vocalize.
        speaking_rate: Duration scale; >1 is faster speech, <1 slower.
        speaker_id: 1-based speaker index from the UI slider; converted to
            the model's 0-based index below.

    Returns:
        A Gradio waveform video built from the 22.05 kHz mono audio.
    """
    # BUG FIX: move the tokenized tensors onto the model's device. The
    # original left them on CPU, which raises a device-mismatch error
    # whenever device == "cuda".
    inputs = vctk_tokenizer(text, return_tensors="pt").to(device)

    vctk_model.speaking_rate = speaking_rate
    set_seed(555)  # fixed seed so repeated runs on the same text are deterministic
    with torch.no_grad():
        outputs = vctk_model(**inputs, speaker_id=speaker_id - 1)[0]

    # Pull the first (only) waveform back to CPU for Gradio.
    waveform = outputs[0].cpu().float().numpy()
    return gr.make_waveform((22050, waveform))
# Tab for the single-speaker LJ Speech checkpoint: free text plus a
# speaking-rate control, producing a single audio output.
ljs_text_box = gr.Textbox(
    value="Hey, it's Hugging Face on the phone",
    max_lines=1,
    label="Input text",
)
ljs_rate_slider = gr.Slider(
    0.5,
    1.5,
    value=1,
    step=0.1,
    label="Speaking rate",
)
ljs_inference = gr.Interface(
    fn=ljs_forward,
    inputs=[ljs_text_box, ljs_rate_slider],
    outputs=gr.Audio(),
)
# Tab for the multi-speaker VCTK checkpoint: same text and rate controls
# as the LJ Speech tab, plus a 1-based speaker-id selector.
vctk_text_box = gr.Textbox(
    value="Hey, it's Hugging Face on the phone",
    max_lines=1,
    label="Input text",
)
vctk_rate_slider = gr.Slider(
    0.5,
    1.5,
    value=1,
    step=0.1,
    label="Speaking rate",
)
vctk_speaker_slider = gr.Slider(
    1,
    vctk_model.config.num_speakers,
    value=1,
    step=1,
    label="Speaker id",
    info=f"The VCTK model is trained on {vctk_model.config.num_speakers} speakers. You can prompt the model using one of these speaker ids.",
)
vctk_inference = gr.Interface(
    fn=vctk_forward,
    inputs=[vctk_text_box, vctk_rate_slider, vctk_speaker_slider],
    outputs=gr.Audio(),
)
# Assemble the page: title, description, one tab per checkpoint, footer.
with gr.Blocks() as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.TabbedInterface([ljs_inference, vctk_inference], ["LJ Speech", "VCTK"])
    gr.Markdown(article)

# Cap the request queue so long generations don't pile up unboundedly.
demo.queue(max_size=10)
demo.launch()