import gradio as gr

from hyper_parameters import tacotron_params as hparams
from training import load_model

from audio_processing import griffin_lim
from nn_layers import TacotronSTFT


from text import text_to_sequence
from hifigan.env import AttrDict
from examples_taco2 import *

from hifigan.models import Generator

import torch
import numpy as np
import json
import os

from matplotlib import pyplot as plt

# Adjust vertical spacing between subplots and the white space (margins) around the plot.
# Note: these act on the current matplotlib figure; the figures created later in
# plot_spec_align_sep() call tight_layout() themselves.
plt.subplots_adjust(hspace=0.15)
plt.tight_layout(pad=0.5)

torch.manual_seed(1234)
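# Peak amplitude of 16-bit PCM; used to scale the vocoder's float output to int16.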
MAX_WAV_VALUE = 32768.0

DESCRIPTION = """
This is a Tacotron2 model based on NVIDIA's implementation plus three unsupervised Global Style Tokens (GST).
The whole architecture has been trained from scratch on the LJSpeech dataset. In order to control the relevance
of each style token, we configured the attention module with a single head.

Keep in mind that, for a better synthetic output, the sum of the three style weights should be around 1. A combination
that sums to less than 1 may work, but if the sum is higher the generated speech may show more distortion and
mispronunciations.
"""


def load_checkpoint(filepath, device):
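    """Load a serialized checkpoint dictionary onto the given device."""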
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    checkpoint_dict = torch.load(filepath, map_location=device)
    print("Complete.")
    return checkpoint_dict


def plot_spec_align_sep(mel, align):
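    """Return separate matplotlib figures for the mel spectrogram and the alignment matrix."""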
    plt.figure(figsize=(4, 3))

    fig_mel = plt.figure()
    ax_mel = fig_mel.add_subplot(111)
    fig_mel.tight_layout()
    ax_mel.imshow(mel)
    # fig_mel.set_title('Mel-Scale Spectrogram', fontsize=12)

    fig_align = plt.figure()
    ax_align = fig_align.add_subplot(111)  # fig_align
    fig_align.tight_layout()
    ax_align.imshow(align)
    # fig_align.set_title('Alignment', fontsize=12)

    return fig_mel, fig_align


# load trained tacotron2 + GST model:
model = load_model(hparams)
checkpoint_path = "models/checkpoint_78000.model"
model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")['state_dict'])
# model.to('cuda')
_ = model.eval()

# load pre-trained HiFi-GAN model for mel2audio:
hifigan_checkpoint_path = "models/generator_v1"
config_file = os.path.join(os.path.split(hifigan_checkpoint_path)[0], 'config.json')
with open(config_file) as f:
    data = f.read()
json_config = json.loads(data)
h = AttrDict(json_config)
device = torch.device("cpu")

generator = Generator(h).to(device)

state_dict_g = load_checkpoint(hifigan_checkpoint_path, device)
generator.load_state_dict(state_dict_g['generator'])
generator.eval()
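# Fold weight normalization into the convolution weights; it is only needed during training.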
generator.remove_weight_norm()


def synthesize(text, gst_1, gst_2, gst_3, voc):
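    """Run Tacotron2 + GST inference on `text` and vocode the predicted mel spectrogram.

    The three GST sliders weight the style tokens; `voc` selects HiFi-GAN (0) or Griffin-Lim (1).
    Returns a waveform video plus the normalized mel spectrogram and alignment images for display.
    """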
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)

    # gst_head_scores = np.array([0.5, 0.15, 0.35])
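    # Stack the three slider values into the style-token scores passed to the model.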
    gst_head_scores = np.array([gst_1, gst_2, gst_3])
    gst_scores = torch.from_numpy(gst_head_scores).float()

    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)

    if voc == 0:
        # mel2wav inference:
        with torch.no_grad():
            y_g_hat = generator(mel_outputs_postnet)
            audio = y_g_hat.squeeze()
            audio = audio * MAX_WAV_VALUE
            audio_numpy = audio.cpu().numpy().astype('int16')
            # audio = vocoder_model.inference(mel_outputs_postnet)
            # audio_numpy = audio.data.cpu().detach().numpy()

    else:
        # Griffin Lim vocoder synthesis:
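        # De-normalize the predicted mel, project it back to an approximate linear spectrogram
        # using the mel filterbank matrix, then estimate the phase iteratively with Griffin-Lim.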
        griffin_iters = 60
        taco_stft = TacotronSTFT(hparams['filter_length'], hparams['hop_length'], hparams['win_length'],
                                 sampling_rate=hparams['sampling_rate'])

        mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
        mel_decompress = mel_decompress.transpose(1, 2).data.cpu()

        spec_from_mel_scaling = 60  # empirical gain applied to the linear spectrogram before Griffin-Lim
        spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
        spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
        spec_from_mel = spec_from_mel * spec_from_mel_scaling

        audio = griffin_lim(spec_from_mel[:, :, :-1], taco_stft.stft_fn, griffin_iters)

        audio = audio.squeeze()
        audio_numpy = audio.cpu().numpy()

    # prepare plot for the output:
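    # Flip the frequency axis so low frequencies appear at the bottom of the displayed image.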
    mel_outputs_postnet = torch.flip(mel_outputs_postnet.squeeze(), [0])
    mel_outputs_postnet = mel_outputs_postnet.detach().numpy()
    alignments = alignments.squeeze().T.detach().numpy()
    # fig_mel = plot_spec_align(mel_outputs_postnet, alignments)
    # fig_mel, fig_align = plot_spec_align_sep(mel_outputs_postnet, alignments)

    # normalize numpy arrays between [-1, 1]
    min_val = np.min(mel_outputs_postnet)
    max_val = np.max(mel_outputs_postnet)
    scaled_mel = (mel_outputs_postnet - min_val) / (max_val - min_val)
    normalized_mel = 2 * scaled_mel - 1

    min_val = np.min(alignments)
    max_val = np.max(alignments)
    scaled_align = (alignments - min_val) / (max_val - min_val)
    normalized_align = 2 * scaled_align - 1

    # Render an animated waveform video (22050 Hz audio) over a background image for the UI.
    aw = gr.make_waveform((22050, audio_numpy), bg_image='background_images/wallpaper_test_1_crop_3.jpg',
                          bars_color=('#f3df4b', '#63edb7'), bar_count=100, bar_width=0.7, animate=True)

    return aw, normalized_mel, normalized_align  # (22050, audio_numpy), fig_mel, fig_align


# Custom Demo Interface:
# theme='ysharma/steampunk',
#                css=".gradio-container {background: url('file=background_images/wallpaper_test_mod_2.jpg')}"
with gr.Blocks() as demo:
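    # Layout: text input, GST sliders and vocoder selector on the left; spectrogram,
    # alignment and waveform-video outputs on the right.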
    gr.Markdown("<center><h1>English Neural Text-to-Speech</h1> "
                "<h2>Speech Synthesis with Partial Style Control</h2></center><br>")
    # gr.Markdown("## <center>Unsupervised Style Tokens using Single-Head Attention Parallel Encoder "
    #             "with Tacotron2</center>")
    with gr.Row():
        with gr.Column(scale=1):
            inp = gr.Textbox(label="Input Text", value="Speech synthesis has evolved dramatically since the "
                                                       "development of neural architectures capable of generating "
                                                       "high quality samples.")
            clear_btn = gr.ClearButton(value='Clear Text', size='sm', components=[inp])
            # gr.Markdown("Next, we calibrate the weights of the *style tokens*:")
            with gr.Row():
                with gr.Column(scale=2):
                    with gr.Tab("Global Style Tokens"):
                        gst_1 = gr.Slider(0.2, 0.45, label="GST 1", value=0.4)
                        gst_2 = gr.Slider(0.2, 0.45, label="GST 2", value=0.26)
                        gst_3 = gr.Slider(0.2, 0.45, label="GST 3", value=0.33)
                with gr.Column(scale=0):
                    with gr.Tab("Vocoder"):
                        vocoder = gr.Radio([("HiFi-GAN", 0), ("Griffin-Lim", 1)],
                                           container=False, value=0, min_width=300)  # label="Vocoder")
                    greet_btn = gr.Button("Synthesize!", scale=1)
        with gr.Column():
            # wave_video = gr.make_waveform(audio)
            with gr.Tab("Spectrogram"):
                # spec_plot = gr.Plot()
                spec_plot = gr.Image(container=False)
            with gr.Tab("Alignment"):
                # align_plot = gr.Plot()
                align_plot = gr.Image(container=False)
            wave_video = gr.Video(label="Waveform", height=150, width=800, container=False)
            # play_video = gr.Button(label="Play", size='sm')
            # audio_clip = gr.Audio(label="Generated Speech", type="numpy")

    def display_video():
        return wave_video
    # play_video.click(fn=display_video)
    greet_btn.click(fn=synthesize, inputs=[inp, gst_1, gst_2, gst_3, vocoder],
                    outputs=[wave_video, spec_plot, align_plot],
                    api_name="synthesize")

    with gr.Row():
        with gr.Column():
            # gr.Markdown("### Audio Examples")
            gr.Examples(examples=infer_from_text_examples,
                        inputs=[inp, gst_1, gst_2, gst_3, vocoder],
                        outputs=[wave_video, spec_plot, align_plot],
                        fn=synthesize,
                        cache_examples=True, )
    gr.Markdown("""
    ### Details and Indications
    This is a Text-to-Speech (TTS) system that consists of two modules: 1) a Tacotron2 replicated model, which generates
    the spectrogram of the speech corresponding to the input text. And 2) a pre-trained HiFiGAN vocoder that maps the 
    spectrogram to a digital waveform. Global Style Tokens (GST) have been implemented to catch style information from
    the female speaker with which the model has been trained (see the links below for more information).
    Please, feel free to play with the GST scores and observe how the synthetic voice spells the input text. 
    Keep in mind that GSTs have been trained in an unsupervised way, so there is no specific control of 
    style attributes. Moreover, try to balance the GST scores by making them add up to a value close to 1. Below or 
    higher than 1 may cause low energy, mispronunciations or distortion. 
    You can choose between the HiFiGAN trained vocoder and the iterative algorithm Griffin-Lim, which does not need 
    to be trained, but produces a speech quite "robotic".
    
    ### More Information
    The spectrogram generator has been adapted and trained from 
    [NVIDIA's](https://github.com/NVIDIA/tacotron2) Tacotron2 replica, published in 
    <a href="https://arxiv.org/abs/1712.05884" style="display: inline-block;margin-top: .5em;margin-right: .25em;" 
    target="_blank"> <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" 
    src="https://img.shields.io/badge/ArXiv-Tacotron2-b31b1b" alt="Tacotron2"></a> 
    <br> 
    The neural vocoder is a pre-trained model replicated from <a href="https://arxiv.org/abs/2010.05646" 
    style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"> <img style="margin-bottom: 
    0em;display: inline;margin-top: -.25em;" src="https://img.shields.io/badge/ArXiv-HiFi%20GAN-b31b1b" 
    alt="HiFiGAN"></a> 
    <br> 
    Unsupervised style control has been implemented based on <a href="https://arxiv.org/abs/1803.09017" style="display: 
    inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"> <img style="margin-bottom: 0em;display: 
    inline;margin-top: -.25em;" src="https://img.shields.io/badge/ArXiv-Global%20Style%20Tokens-b31b1b" 
    alt="Global Style Tokens"></a> 
    <br> 
    """)

    """Instead of using multiple heads for the attention module, we just set one single 
    head for simplicity, ease control purposes, but also to observer whether this attention still 
    works with just one head."""

    # gr.Markdown("This is a Tacotron2 model based on the NVIDIA's model plus three unsupervised Global Style Tokens "
    #             "(GST). The whole architecture has been trained from scratch with the LJSpeech dataset. In order "
    #             "to control the relevance of each style token, we configured the attention module as a single-head. "
    #             "Keep in mind that, for a better synthetic output, the sum of the three style weights should be around "
    #             "1. A combination that sums less than 1 may work, but higher the generated speech may show more "
    #             "distortion and miss-pronunciations.")

demo.launch()