Update app.py
app.py
CHANGED
@@ -3,13 +3,20 @@ import gradio as gr
 from hyper_parameters import tacotron_params as hparams
 from training import load_model
 
+from audio_processing import griffin_lim
+from nn_layers import TacotronSTFT
+
+
 from text import text_to_sequence
+from hifigan.env import AttrDict
+from examples_taco2 import *
 
-from
-from melgan.utils.hparams import load_hparam
+from hifigan.models import Generator
 
 import torch
 import numpy as np
+import json
+import os
 
 from matplotlib import pyplot as plt
 
@@ -28,69 +35,218 @@ The whole architecture has been trained from scratch with the LJSpeech dataset.
 of each style token, we configured the attention module as a single-head.
 
 Keep in mind that, for a better synthetic output, the sum of the three style weights should be around 1. A combination that sums less than 1 may work, but higher the
-generated speech may show more distortion and
+generated speech may show more distortion and mispronunciations.
 """
 
-# load trained tacotron2 + GST model:
-model = load_model(hparams)
-checkpoint_path = "trained_models/checkpoint_78000.model"
-model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")['state_dict'])
-# model.to('cuda')
-_ = model.eval()
 
-
-
-
-
-
-
-# vocoder_model = vocoder_model.to('cuda')
-vocoder_model.eval(inference=False)
+def load_checkpoint(filepath, device):
+    assert os.path.isfile(filepath)
+    print("Loading '{}'".format(filepath))
+    checkpoint_dict = torch.load(filepath, map_location=device)
+    print("Complete.")
+    return checkpoint_dict
 
 
-def
+def plot_spec_align_sep(mel, align):
+    plt.figure(figsize=(4, 3))
 
     fig_mel = plt.figure()
-    ax_mel = fig_mel.add_subplot(
+    ax_mel = fig_mel.add_subplot(111)
+    fig_mel.tight_layout()
     ax_mel.imshow(mel)
-
+    # fig_mel.set_title('Mel-Scale Spectrogram', fontsize=12)
 
-
-    ax_align =
+    fig_align = plt.figure()
+    ax_align = fig_align.add_subplot(111)  # fig_align
+    fig_align.tight_layout()
     ax_align.imshow(align)
-
+    # fig_align.set_title('Alignment', fontsize=12)
+
+    return fig_mel, fig_align
+
+
+# load trained tacotron2 + GST model:
+model = load_model(hparams)
+checkpoint_path = "models/checkpoint_78000.model"
+model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")['state_dict'])
+# model.to('cuda')
+_ = model.eval()
 
+# load pre-trained HiFi-GAN model for mel2audio:
+hifigan_checkpoint_path = "models/generator_v1"
+config_file = os.path.join(os.path.split(hifigan_checkpoint_path)[0], 'config.json')
+with open(config_file) as f:
+    data = f.read()
+json_config = json.loads(data)
+h = AttrDict(json_config)
+device = torch.device("cpu")
+
+generator = Generator(h).to(device)
+
+state_dict_g = load_checkpoint(hifigan_checkpoint_path, device)
+generator.load_state_dict(state_dict_g['generator'])
+generator.eval()
+generator.remove_weight_norm()
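+# weight normalization is only needed while training HiFi-GAN, so it is removed
+# here to lighten the generator for CPU inference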
+
+
-
+def synthesize(text, gst_1, gst_2, gst_3, voc):
     sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
     sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)
 
-    # gst_head_scores = np.array([0.5, 0.15, 0.35])
+    # gst_head_scores = np.array([0.5, 0.15, 0.35])
     gst_head_scores = np.array([gst_1, gst_2, gst_3])
     gst_scores = torch.from_numpy(gst_head_scores).float()
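+    # the three slider values weight the three learned style tokens (single-head
+    # GST attention), steering the style of the synthesized speech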
 
-    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)
-
-    # mel2wav inference:
     with torch.no_grad():
-
-
+        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)
+
+    if voc == 0:
+        # mel2wav inference:
+        with torch.no_grad():
+            y_g_hat = generator(mel_outputs_postnet)
+            audio = y_g_hat.squeeze()
+            audio = audio * MAX_WAV_VALUE
+            audio_numpy = audio.cpu().numpy().astype('int16')
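+            # MAX_WAV_VALUE rescales the generator's [-1, 1] float output to the
+            # 16-bit integer range expected for the waveform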
+            # audio = vocoder_model.inference(mel_outputs_postnet)
+            # audio_numpy = audio.data.cpu().detach().numpy()
+
+    else:
+        # Griffin Lim vocoder synthesis:
+        griffin_iters = 60
+        taco_stft = TacotronSTFT(hparams['filter_length'], hparams['hop_length'], hparams['win_length'],
+                                 sampling_rate=hparams['sampling_rate'])
+
+        mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
+        mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
+
+        spec_from_mel_scaling = 60
+        spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
+        spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
+        spec_from_mel = spec_from_mel * spec_from_mel_scaling
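+        # multiplying by the mel basis approximately inverts the mel filterbank,
+        # recovering a linear-frequency magnitude spectrogram; the empirical scaling
+        # factor compensates for energy lost in that projection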
+
+        audio = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]), taco_stft.stft_fn, griffin_iters)
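+        # Griffin-Lim iteratively re-estimates the phase that the magnitude
+        # spectrogram discards (60 iterations here)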
+
+        audio = audio.squeeze()
+        audio_numpy = audio.cpu().numpy()
 
     # prepare plot for the output:
     mel_outputs_postnet = torch.flip(mel_outputs_postnet.squeeze(), [0])
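+    # the flip reverses the mel-frequency axis so the spectrogram image is
+    # displayed with low frequencies at the bottom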
     mel_outputs_postnet = mel_outputs_postnet.detach().numpy()
     alignments = alignments.squeeze().T.detach().numpy()
-    fig_mel = plot_spec_align(mel_outputs_postnet, alignments)
-
-
-
-
-
-
-
-
-
-
+    # fig_mel = plot_spec_align(mel_outputs_postnet, alignments)
+    # fig_mel, fig_align = plot_spec_align_sep(mel_outputs_postnet, alignments)
+
+    # normalize numpy arrays between [-1, 1]
+    min_val = np.min(mel_outputs_postnet)
+    max_val = np.max(mel_outputs_postnet)
+    scaled_mel = (mel_outputs_postnet - min_val) / (max_val - min_val)
+    normalized_mel = 2 * scaled_mel - 1
+
+    min_val = np.min(alignments)
+    max_val = np.max(alignments)
+    scaled_align = (alignments - min_val) / (max_val - min_val)
+    normalized_align = 2 * scaled_align - 1
+
+    aw = gr.make_waveform((22050, audio_numpy), bg_image='background_images/wallpaper_test_1_crop_3.jpg',
+                          bars_color=('#f3df4b', '#63edb7'), bar_count=100, bar_width=0.7, animate=True)
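+    # gr.make_waveform renders the (sample_rate, samples) tuple as a short
+    # animated-waveform video for the gr.Video output component below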
+
+    return aw, normalized_mel, normalized_align  # (22050, audio_numpy), fig_mel, fig_align
+
+
+# Custom Demo Interface:
+# theme='ysharma/steampunk',
+# css=".gradio-container {background: url('file=background_images/wallpaper_test_mod_2.jpg')}"
+with gr.Blocks() as demo:
+    gr.Markdown("<center><h1>English Neural Text-to-Speech</h1> "
+                "<h2>Speech Synthesis with Partial Style Control</h2></center><br>")
+    # gr.Markdown("## <center>Unsupervised Style Tokens using Single-Head Attention Parallel Encoder "
+    #             "with Tacotron2</center>")
+    with gr.Row():
+        with gr.Column(scale=1):
+            inp = gr.Textbox(label="Input Text", value="Speech synthesis has evolved dramatically since the "
+                                                       "development of neural architectures capable of generating "
+                                                       "high quality samples.")
+            clear_btn = gr.ClearButton(value='Clear Text', size='sm', components=[inp])
+    # gr.Markdown("Next, calibrate the weights of the *style tokens*:")
+    with gr.Row():
+        with gr.Column(scale=2):
+            with gr.Tab("Global Style Tokens"):
+                gst_1 = gr.Slider(0.2, 0.45, label="GST 1", value=0.4)
+                gst_2 = gr.Slider(0.2, 0.45, label="GST 2", value=0.26)
+                gst_3 = gr.Slider(0.2, 0.45, label="GST 3", value=0.33)
+        with gr.Column(scale=0):
+            with gr.Tab("Vocoder"):
+                vocoder = gr.Radio([("HiFi-GAN", 0), ("Griffin-Lim", 1)],
+                                   container=False, value=0, min_width=300)  # label="Vocoder")
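+                # the (label, value) tuples mean synthesize() receives voc=0 for
+                # HiFi-GAN and voc=1 for Griffin-Lim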
+            greet_btn = gr.Button("Synthesize!", scale=1)
+        with gr.Column():
+            # wave_video = gr.make_waveform(audio)
+            with gr.Tab("Spectrogram"):
+                # spec_plot = gr.Plot()
+                spec_plot = gr.Image(container=False)
+            with gr.Tab("Alignment"):
+                # align_plot = gr.Plot()
+                align_plot = gr.Image(container=False)
+            wave_video = gr.Video(label="Waveform", height=150, width=800, container=False)
+            # play_video = gr.Button(label="Play", size='sm')
+            # audio_clip = gr.Audio(label="Generated Speech", type="numpy")
+
+    def display_video():
+        return wave_video
+    # play_video.click(fn=display_video)
+    greet_btn.click(fn=synthesize, inputs=[inp, gst_1, gst_2, gst_3, vocoder],
+                    outputs=[wave_video, spec_plot, align_plot],
+                    api_name="synthesize")
+
+    with gr.Row():
+        with gr.Column():
+            # gr.Markdown("### Audio Examples")
+            gr.Examples(examples=infer_from_text_examples,
+                        inputs=[inp, gst_1, gst_2, gst_3, vocoder],
+                        outputs=[wave_video, spec_plot, align_plot],
+                        fn=synthesize,
+                        cache_examples=True, )
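+            # infer_from_text_examples presumably comes from the wildcard import of
+            # examples_taco2 above; cache_examples=True precomputes outputs at startup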
+            gr.Markdown("""
+            ### Details and Indications
+            This Text-to-Speech (TTS) system consists of two modules: 1) a replicated Tacotron2 model, which
+            generates the spectrogram of the speech corresponding to the input text, and 2) a pre-trained HiFiGAN
+            vocoder that maps the spectrogram to a digital waveform. Global Style Tokens (GST) have been implemented
+            to capture style information from the female speaker on whose voice the model has been trained (see the
+            links below for more information).
+            Feel free to play with the GST scores and observe how the synthetic voice renders the input text.
+            Keep in mind that the GSTs have been trained in an unsupervised way, so there is no control over specific
+            style attributes. Moreover, try to balance the GST scores so that they add up to a value close to 1: sums
+            well below or above 1 may cause low energy, mispronunciations or distortion.
+            You can choose between the trained HiFiGAN vocoder and the iterative Griffin-Lim algorithm, which does
+            not need to be trained but produces rather "robotic" speech.
+
+            ### More Information
+            The spectrogram generator has been adapted and trained from
+            [NVIDIA's](https://github.com/NVIDIA/tacotron2) Tacotron2 replica, published in
+            <a href="https://arxiv.org/abs/1712.05884" style="display: inline-block;margin-top: .5em;margin-right: .25em;"
+            target="_blank"> <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
+            src="https://img.shields.io/badge/ArXiv-Tacotron2-b31b1b" alt="Tacotron2"></a>
+            <br>
+            The neural vocoder is a pre-trained model replicated from <a href="https://arxiv.org/abs/2010.05646"
+            style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"> <img style="margin-bottom:
+            0em;display: inline;margin-top: -.25em;" src="https://img.shields.io/badge/ArXiv-HiFi%20GAN-b31b1b"
+            alt="HiFiGAN"></a>
+            <br>
+            Unsupervised style control has been implemented based on <a href="https://arxiv.org/abs/1803.09017" style="display:
+            inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"> <img style="margin-bottom: 0em;display:
+            inline;margin-top: -.25em;" src="https://img.shields.io/badge/ArXiv-Global%20Style%20Tokens-b31b1b"
+            alt="Global Style Tokens"></a>
+            <br>
+            """)
+
+    """Instead of using multiple heads for the attention module, we set a single head,
+    both for simplicity and ease of control, and to observe whether the attention still
+    works with just one head."""
+
+    # gr.Markdown("This is a Tacotron2 model based on the NVIDIA's model plus three unsupervised Global Style Tokens "
+    #             "(GST). The whole architecture has been trained from scratch with the LJSpeech dataset. In order "
+    #             "to control the relevance of each style token, we configured the attention module as a single-head. "
+    #             "Keep in mind that, for a better synthetic output, the sum of the three style weights should be around "
+    #             "1. A combination that sums less than 1 may work, but higher the generated speech may show more "
+    #             "distortion and mispronunciations.")
+
+demo.launch()