AlexK-PL committed
Commit cc8638f
1 Parent(s): 97a9463

Update app.py

Files changed (1)
  1. app.py +199 -43
app.py CHANGED
@@ -3,13 +3,20 @@ import gradio as gr
  from hyper_parameters import tacotron_params as hparams
  from training import load_model

  from text import text_to_sequence

- from melgan.model.generator import Generator
- from melgan.utils.hparams import load_hparam

  import torch
  import numpy as np

  from matplotlib import pyplot as plt

@@ -28,69 +35,218 @@ The whole architecture has been trained from scratch with the LJSpeech dataset.
  of each style token, we configured the attention module as a single-head.

  Keep in mind that, for a better synthetic output, the sum of the three style weights should be around 1. A combination that sums less than 1 may work, but higher the
- generated speech may show more distortion and misspronunciations.
  """

- # load trained tacotron2 + GST model:
- model = load_model(hparams)
- checkpoint_path = "trained_models/checkpoint_78000.model"
- model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")['state_dict'])
- # model.to('cuda')
- _ = model.eval()

- # load pre trained MelGAN model for mel2audio:
- vocoder_checkpoint_path = "trained_models/nvidia_tacotron2_LJ11_epoch6400.pt"
- checkpoint = torch.load(vocoder_checkpoint_path, map_location="cpu")
- hp_melgan = load_hparam("melgan/config/default.yaml")
- vocoder_model = Generator(80)
- vocoder_model.load_state_dict(checkpoint['model_g'])
- # vocoder_model = vocoder_model.to('cuda')
- vocoder_model.eval(inference=False)


- def plot_spec_align(mel, align):

      fig_mel = plt.figure()
-     ax_mel = fig_mel.add_subplot(211)
      ax_mel.imshow(mel)
-     ax_mel.set_title('Mel-Scale Spectrogram', fontsize=12)

-     # fig_align = plt.figure()
-     ax_align = fig_mel.add_subplot(212)  # fig_align
      ax_align.imshow(align)
-     ax_align.set_title('Alignment', fontsize=12)

-     return fig_mel  # fig_align


- def synthesize(text, gst_1, gst_2, gst_3):
      sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
      sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)

-     # gst_head_scores = np.array([0.5, 0.15, 0.35])  # originally ([0.5, 0.15, 0.35])
      gst_head_scores = np.array([gst_1, gst_2, gst_3])
      gst_scores = torch.from_numpy(gst_head_scores).float()

-     mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)
-
-     # mel2wav inference:
      with torch.no_grad():
-         audio = vocoder_model.inference(mel_outputs_postnet)
-         audio_numpy = audio.data.cpu().detach().numpy()

      # prepare plot for the output:
      mel_outputs_postnet = torch.flip(mel_outputs_postnet.squeeze(), [0])
      mel_outputs_postnet = mel_outputs_postnet.detach().numpy()
      alignments = alignments.squeeze().T.detach().numpy()
-     fig_mel = plot_spec_align(mel_outputs_postnet, alignments)
-
-     return (22050, audio_numpy), fig_mel  # fig_align
-
-
- iface = gr.Interface(fn=synthesize, inputs=[gr.Textbox(label="Input Text"), gr.Slider(0.2, 0.45, label="First style token weight:", value=0.4),
-                                             gr.Slider(0.2, 0.45, label="Second style token weight:", value=0.26),
-                                             gr.Slider(0.2, 0.45, label="Third style token weight:", value=0.33)],
-                      outputs=[gr.Audio(label="Generated Speech", type="numpy"), gr.Plot(label="Output"),],
-                      title="Single-Head Attention Tacotron2 with Style Tokens", description=DESCRIPTION)
- iface.launch()
-
@@ -3,13 +3,20 @@ import gradio as gr
  from hyper_parameters import tacotron_params as hparams
  from training import load_model

+ from audio_processing import griffin_lim
+ from nn_layers import TacotronSTFT
+
+
  from text import text_to_sequence
+ from hifigan.env import AttrDict
+ from examples_taco2 import *

+ from hifigan.models import Generator

  import torch
  import numpy as np
+ import json
+ import os

  from matplotlib import pyplot as plt

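A note on the new `from examples_taco2 import *` line: that module is not part of this diff, so its contents are an assumption here, but the rest of the change implies it provides at least `MAX_WAV_VALUE` and the `infer_from_text_examples` list consumed by `gr.Examples` further down. A hypothetical sketch, with rows matching the interface inputs (text, three GST weights, vocoder flag):

# Hypothetical sketch of examples_taco2.py; the real file is not shown in this commit.
MAX_WAV_VALUE = 32768.0  # int16 full scale, as used by the HiFi-GAN branch below

infer_from_text_examples = [
    # [input text, GST 1, GST 2, GST 3, vocoder (0 = HiFi-GAN, 1 = Griffin-Lim)]
    ["Speech synthesis has evolved dramatically since neural vocoders appeared.", 0.40, 0.26, 0.33, 0],
    ["Griffin Lim needs no training but sounds noticeably more robotic.", 0.35, 0.30, 0.35, 1],
]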
@@ -28,69 +35,218 @@ The whole architecture has been trained from scratch with the LJSpeech dataset.
  of each style token, we configured the attention module as a single-head.

  Keep in mind that, for a better synthetic output, the sum of the three style weights should be around 1. A combination that sums less than 1 may work, but higher the
+ generated speech may show more distortion and mispronunciations.
  """

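The DESCRIPTION keeps its guidance that the three style-token weights should sum to roughly 1. As a purely illustrative sketch (these particular combinations are not in the commit), the 0.2-0.45 slider range allows sums between 0.6 and 1.35:

# Illustrative weight combinations within the 0.2-0.45 slider range (not from the source).
balanced = [0.40, 0.26, 0.33]   # sums to 0.99: the recommended regime
low_sum  = [0.20, 0.20, 0.20]   # sums to 0.60: may work, but can sound low in energy
high_sum = [0.45, 0.45, 0.45]   # sums to 1.35: more risk of distortion and mispronunciation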

+ def load_checkpoint(filepath, device):
+     assert os.path.isfile(filepath)
+     print("Loading '{}'".format(filepath))
+     checkpoint_dict = torch.load(filepath, map_location=device)
+     print("Complete.")
+     return checkpoint_dict


+ def plot_spec_align_sep(mel, align):
+     plt.figure(figsize=(4, 3))

      fig_mel = plt.figure()
+     ax_mel = fig_mel.add_subplot(111)
+     fig_mel.tight_layout()
      ax_mel.imshow(mel)
+     # fig_mel.set_title('Mel-Scale Spectrogram', fontsize=12)

+     fig_align = plt.figure()
+     ax_align = fig_align.add_subplot(111)  # fig_align
+     fig_align.tight_layout()
      ax_align.imshow(align)
+     # fig_align.set_title('Alignment', fontsize=12)
+
+     return fig_mel, fig_align
+
+
+ # load trained tacotron2 + GST model:
+ model = load_model(hparams)
+ checkpoint_path = "models/checkpoint_78000.model"
+ model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")['state_dict'])
+ # model.to('cuda')
+ _ = model.eval()

+ # load pre-trained HiFi-GAN model for mel2audio:
+ hifigan_checkpoint_path = "models/generator_v1"
+ config_file = os.path.join(os.path.split(hifigan_checkpoint_path)[0], 'config.json')
+ with open(config_file) as f:
+     data = f.read()
+ json_config = json.loads(data)
+ h = AttrDict(json_config)
+ device = torch.device("cpu")

+ generator = Generator(h).to(device)

+ state_dict_g = load_checkpoint(hifigan_checkpoint_path, device)
+ generator.load_state_dict(state_dict_g['generator'])
+ generator.eval()
+ generator.remove_weight_norm()
+
+
+ def synthesize(text, gst_1, gst_2, gst_3, voc):
      sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
      sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)

+     # gst_head_scores = np.array([0.5, 0.15, 0.35])
      gst_head_scores = np.array([gst_1, gst_2, gst_3])
      gst_scores = torch.from_numpy(gst_head_scores).float()

      with torch.no_grad():
+         mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)
+
+     if voc == 0:
+         # mel2wav inference:
+         with torch.no_grad():
+             y_g_hat = generator(mel_outputs_postnet)
+             audio = y_g_hat.squeeze()
+             audio = audio * MAX_WAV_VALUE
+             audio_numpy = audio.cpu().numpy().astype('int16')
+             # audio = vocoder_model.inference(mel_outputs_postnet)
+             # audio_numpy = audio.data.cpu().detach().numpy()
+
+     else:
+         # Griffin-Lim vocoder synthesis:
+         griffin_iters = 60
+         taco_stft = TacotronSTFT(hparams['filter_length'], hparams['hop_length'], hparams['win_length'],
+                                  sampling_rate=hparams['sampling_rate'])
+
+         mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
+         mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
+
+         spec_from_mel_scaling = 60
+         spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
+         spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
+         spec_from_mel = spec_from_mel * spec_from_mel_scaling
+
+         audio = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]), taco_stft.stft_fn, griffin_iters)
+
+         audio = audio.squeeze()
+         audio_numpy = audio.cpu().numpy()

      # prepare plot for the output:
      mel_outputs_postnet = torch.flip(mel_outputs_postnet.squeeze(), [0])
      mel_outputs_postnet = mel_outputs_postnet.detach().numpy()
      alignments = alignments.squeeze().T.detach().numpy()
+     # fig_mel = plot_spec_align(mel_outputs_postnet, alignments)
+     # fig_mel, fig_align = plot_spec_align_sep(mel_outputs_postnet, alignments)
+
+     # normalize numpy arrays between [-1, 1]
+     min_val = np.min(mel_outputs_postnet)
+     max_val = np.max(mel_outputs_postnet)
+     scaled_mel = (mel_outputs_postnet - min_val) / (max_val - min_val)
+     normalized_mel = 2 * scaled_mel - 1
+
+     min_val = np.min(alignments)
+     max_val = np.max(alignments)
+     scaled_align = (alignments - min_val) / (max_val - min_val)
+     normalized_align = 2 * scaled_align - 1
+
+     aw = gr.make_waveform((22050, audio_numpy), bg_image='background_images/wallpaper_test_1_crop_3.jpg',
+                           bars_color=('#f3df4b', '#63edb7'), bar_count=100, bar_width=0.7, animate=True)
+
+     return aw, normalized_mel, normalized_align  # (22050, audio_numpy), fig_mel, fig_align
+
+
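A minimal local sanity check of the updated synthesize() signature, assuming it is run inside this repo after the model-loading block above; the text and weights are arbitrary, voc=1 takes the Griffin-Lim branch, and ffmpeg must be available for gr.make_waveform:

# Sketch only: exercises the new function directly, outside the Gradio UI.
if __name__ == "__main__":
    waveform_video, mel_img, align_img = synthesize(
        "Testing single head style control.", 0.40, 0.26, 0.33, voc=1)  # Griffin-Lim branch
    print(waveform_video)                   # path of the rendered waveform video
    print(mel_img.shape, align_img.shape)   # normalized arrays fed to the two gr.Image outputs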
+ # Custom Demo Interface:
+ # theme='ysharma/steampunk',
+ # css=".gradio-container {background: url('file=background_images/wallpaper_test_mod_2.jpg')}"
+ with gr.Blocks() as demo:
+     gr.Markdown("<center><h1>English Neural Text-to-Speech</h1> "
+                 "<h2>Speech Synthesis with Partial Style Control</h2></center><br>")
+     # gr.Markdown("## <center>Unsupervised Style Tokens using Single-Head Attention Parallel Encoder "
+     #             "with Tacotron2</center>")
+     with gr.Row():
+         with gr.Column(scale=1):
+             inp = gr.Textbox(label="Input Text", value="Speech synthesis has evolved dramatically since the "
+                                                        "development of neural architectures capable of generating "
+                                                        "high quality samples.")
+             clear_btn = gr.ClearButton(value='Clear Text', size='sm', components=[inp])
+             # gr.Markdown("Next, calibrate the *style token* weights:")
+             with gr.Row():
+                 with gr.Column(scale=2):
+                     with gr.Tab("Global Style Tokens"):
+                         gst_1 = gr.Slider(0.2, 0.45, label="GST 1", value=0.4)
+                         gst_2 = gr.Slider(0.2, 0.45, label="GST 2", value=0.26)
+                         gst_3 = gr.Slider(0.2, 0.45, label="GST 3", value=0.33)
+                 with gr.Column(scale=0):
+                     with gr.Tab("Vocoder"):
+                         vocoder = gr.Radio([("HiFi-GAN", 0), ("Griffin-Lim", 1)],
+                                            container=False, value=0, min_width=300)  # label="Vocoder")
+             greet_btn = gr.Button("Synthesize!", scale=1)
+         with gr.Column():
+             # wave_video = gr.make_waveform(audio)
+             with gr.Tab("Spectrogram"):
+                 # spec_plot = gr.Plot()
+                 spec_plot = gr.Image(container=False)
+             with gr.Tab("Alignment"):
+                 # align_plot = gr.Plot()
+                 align_plot = gr.Image(container=False)
+             wave_video = gr.Video(label="Waveform", height=150, width=800, container=False)
+             # play_video = gr.Button(label="Play", size='sm')
+             # audio_clip = gr.Audio(label="Generated Speech", type="numpy")
+
+     def display_video():
+         return wave_video
+     # play_video.click(fn=display_video)
+     greet_btn.click(fn=synthesize, inputs=[inp, gst_1, gst_2, gst_3, vocoder],
+                     outputs=[wave_video, spec_plot, align_plot],
+                     api_name="synthesize")
+
+     with gr.Row():
+         with gr.Column():
+             # gr.Markdown("### Audio Examples")
+             gr.Examples(examples=infer_from_text_examples,
+                         inputs=[inp, gst_1, gst_2, gst_3, vocoder],
+                         outputs=[wave_video, spec_plot, align_plot],
+                         fn=synthesize,
+                         cache_examples=True, )
+             gr.Markdown("""
+ ### Details and Indications
+ This is a Text-to-Speech (TTS) system that consists of two modules: 1) a replicated Tacotron2 model, which generates
+ the spectrogram of the speech corresponding to the input text, and 2) a pre-trained HiFi-GAN vocoder that maps the
+ spectrogram to a digital waveform. Global Style Tokens (GST) have been implemented to capture style information from
+ the female speaker the model has been trained on (see the links below for more information).
+ Please feel free to play with the GST scores and observe how the synthetic voice pronounces the input text.
+ Keep in mind that the GSTs have been trained in an unsupervised way, so there is no specific control of
+ individual style attributes. Moreover, try to balance the GST scores so that they add up to a value close to 1:
+ sums below or above 1 may cause low energy, mispronunciations or distortion.
+ You can choose between the trained HiFi-GAN vocoder and the iterative Griffin-Lim algorithm, which does not need
+ to be trained but produces rather robotic-sounding speech.
+
+ ### More Information
+ The spectrogram generator has been adapted and trained from
+ [NVIDIA's](https://github.com/NVIDIA/tacotron2) Tacotron2 replica published in
+ <a href="https://arxiv.org/abs/1712.05884" style="display: inline-block;margin-top: .5em;margin-right: .25em;"
+ target="_blank"> <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
+ src="https://img.shields.io/badge/ArXiv-Tacotron2-b31b1b" alt="Tacotron2"></a>
+ <br>
+ The neural vocoder is a pre-trained model replicated from <a href="https://arxiv.org/abs/2010.05646"
+ style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"> <img style="margin-bottom:
+ 0em;display: inline;margin-top: -.25em;" src="https://img.shields.io/badge/ArXiv-HiFi%20GAN-b31b1b"
+ alt="HiFiGAN"></a>
+ <br>
+ Unsupervised style control has been implemented based on <a href="https://arxiv.org/abs/1803.09017" style="display:
+ inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"> <img style="margin-bottom: 0em;display:
+ inline;margin-top: -.25em;" src="https://img.shields.io/badge/ArXiv-Global%20Style%20Tokens-b31b1b"
+ alt="Global Style Tokens"></a>
+ <br>
+             """)
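The "Details and Indications" text above describes a two-stage pipeline: Tacotron2 with GSTs produces a mel spectrogram, and a vocoder turns it into a waveform. Condensed into a few lines using only objects already defined in this file (a sketch for orientation, not part of the diff):

# Stage 1: text -> mel spectrogram (Tacotron2 + GST); stage 2: mel -> waveform (HiFi-GAN).
seq = torch.from_numpy(np.array(text_to_sequence("A two stage pipeline.", ['english_cleaners']))[None, :]).long()
weights = torch.tensor([0.40, 0.26, 0.33]).float()            # GST scores, summing to ~1
with torch.no_grad():
    _, mel_postnet, _, _ = model.inference(seq, weights)       # stage 1
    wav = generator(mel_postnet).squeeze() * MAX_WAV_VALUE     # stage 2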
+
+     """Instead of using multiple heads for the attention module, we set just a single
+     head for simplicity and ease of control, but also to observe whether the attention still
+     works with just one head."""
+
+     # gr.Markdown("This is a Tacotron2 model based on the NVIDIA's model plus three unsupervised Global Style Tokens "
+     #             "(GST). The whole architecture has been trained from scratch with the LJSpeech dataset. In order "
+     #             "to control the relevance of each style token, we configured the attention module as a single-head. "
+     #             "Keep in mind that, for a better synthetic output, the sum of the three style weights should be around "
+     #             "1. A combination that sums less than 1 may work, but higher the generated speech may show more "
+     #             "distortion and miss-pronunciations.")
+
+ demo.launch()
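Because the click handler now registers api_name="synthesize", the Space also becomes callable programmatically. A hedged sketch using gradio_client; the Space id below is a placeholder, not taken from this commit:

# pip install gradio_client  (sketch; replace the Space id with the real one)
from gradio_client import Client

client = Client("user-name/space-name")                   # placeholder Space id
video, mel, align = client.predict(
    "Neural text to speech keeps improving.",             # input text
    0.40, 0.26, 0.33,                                      # GST weights, summing to ~1
    0,                                                     # vocoder: 0 = HiFi-GAN, 1 = Griffin-Lim
    api_name="/synthesize",
)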