import gradio as gr
from hyper_parameters import tacotron_params as hparams
from training import load_model
from audio_processing import griffin_lim
from nn_layers import TacotronSTFT
from text import text_to_sequence
from hifigan.env import AttrDict
from examples_taco2 import *
from hifigan.models import Generator
import torch
import numpy as np
import json
import os
from matplotlib import pyplot as plt
# Adjust vertical spacing between subplots
plt.subplots_adjust(hspace=0.15) # You can adjust the value as needed
# Adjust the white space (margins) around the plot
plt.tight_layout(pad=0.5) # You can adjust the pad value as needed
torch.manual_seed(1234)
MAX_WAV_VALUE = 32768.0
DESCRIPTION = """
This is a Tacotron2 model based on NVIDIA's implementation, plus three unsupervised Global Style Tokens (GST).
The whole architecture has been trained from scratch on the LJSpeech dataset. In order to control the relevance
of each style token, we configured the attention module as single-head.
Keep in mind that, for a better synthetic output, the sum of the three style weights should be around 1. A combination
that sums to less than 1 may work, but above 1 the generated speech may show more distortion and mispronunciations.
"""
def load_checkpoint(filepath, device):
assert os.path.isfile(filepath)
print("Loading '{}'".format(filepath))
checkpoint_dict = torch.load(filepath, map_location=device)
print("Complete.")
return checkpoint_dict
def plot_spec_align_sep(mel, align):
    fig_mel = plt.figure(figsize=(4, 3))
ax_mel = fig_mel.add_subplot(111)
fig_mel.tight_layout()
ax_mel.imshow(mel)
# fig_mel.set_title('Mel-Scale Spectrogram', fontsize=12)
fig_align = plt.figure()
ax_align = fig_align.add_subplot(111) # fig_align
fig_align.tight_layout()
ax_align.imshow(align)
# fig_align.set_title('Alignment', fontsize=12)
return fig_mel, fig_align
# load trained tacotron2 + GST model:
model = load_model(hparams)
checkpoint_path = "models/checkpoint_78000.model"
model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")['state_dict'])
# model.to('cuda')
_ = model.eval()
# load pre-trained HiFi-GAN model for mel2audio:
hifigan_checkpoint_path = "models/generator_v1"
config_file = os.path.join(os.path.split(hifigan_checkpoint_path)[0], 'config.json')
with open(config_file) as f:
data = f.read()
json_config = json.loads(data)
h = AttrDict(json_config)
device = torch.device("cpu")
generator = Generator(h).to(device)
state_dict_g = load_checkpoint(hifigan_checkpoint_path, device)
generator.load_state_dict(state_dict_g['generator'])
generator.eval()
generator.remove_weight_norm()
def synthesize(text, gst_1, gst_2, gst_3, voc):
sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)
# gst_head_scores = np.array([0.5, 0.15, 0.35])
gst_head_scores = np.array([gst_1, gst_2, gst_3])
gst_scores = torch.from_numpy(gst_head_scores).float()
with torch.no_grad():
mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)
if voc == 0:
# mel2wav inference:
with torch.no_grad():
y_g_hat = generator(mel_outputs_postnet)
audio = y_g_hat.squeeze()
audio = audio * MAX_WAV_VALUE
audio_numpy = audio.cpu().numpy().astype('int16')
# audio = vocoder_model.inference(mel_outputs_postnet)
# audio_numpy = audio.data.cpu().detach().numpy()
else:
# Griffin Lim vocoder synthesis:
griffin_iters = 60
taco_stft = TacotronSTFT(hparams['filter_length'], hparams['hop_length'], hparams['win_length'],
sampling_rate=hparams['sampling_rate'])
mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
spec_from_mel_scaling = 60
spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
spec_from_mel = spec_from_mel * spec_from_mel_scaling
        audio = griffin_lim(spec_from_mel[:, :, :-1], taco_stft.stft_fn, griffin_iters)
audio = audio.squeeze()
audio_numpy = audio.cpu().numpy()
# prepare plot for the output:
mel_outputs_postnet = torch.flip(mel_outputs_postnet.squeeze(), [0])
mel_outputs_postnet = mel_outputs_postnet.detach().numpy()
alignments = alignments.squeeze().T.detach().numpy()
# fig_mel = plot_spec_align(mel_outputs_postnet, alignments)
# fig_mel, fig_align = plot_spec_align_sep(mel_outputs_postnet, alignments)
# normalize numpy arrays between [-1, 1]
min_val = np.min(mel_outputs_postnet)
max_val = np.max(mel_outputs_postnet)
scaled_mel = (mel_outputs_postnet - min_val) / (max_val - min_val)
normalized_mel = 2 * scaled_mel - 1
min_val = np.min(alignments)
max_val = np.max(alignments)
scaled_align = (alignments - min_val) / (max_val - min_val)
normalized_align = 2 * scaled_align - 1
aw = gr.make_waveform((22050, audio_numpy), bg_image='background_images/wallpaper_test_1_crop_3.jpg',
bars_color=('#f3df4b', '#63edb7'), bar_count=100, bar_width=0.7, animate=True)
return aw, normalized_mel, normalized_align # (22050, audio_numpy), fig_mel, fig_align
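# Illustrative usage (kept commented out so it does not run at import time): the synthesis
# function above can also be called directly, outside the Gradio UI. The GST values here are
# arbitrary examples that simply sum to roughly 1, as recommended in DESCRIPTION.
# wave_path, mel_img, align_img = synthesize(
#     "The quick brown fox jumps over the lazy dog.",
#     gst_1=0.4, gst_2=0.3, gst_3=0.3, voc=0)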
# Custom Demo Interface:
# theme='ysharma/steampunk',
# css=".gradio-container {background: url('file=background_images/wallpaper_test_mod_2.jpg')}"
with gr.Blocks() as demo:
gr.Markdown("<center><h1>English Neural Text-to-Speech</h1> "
"<h2>Speech Synthesis with Partial Style Control</h2></center><br>")
# gr.Markdown("## <center>Unsupervised Style Tokens using Single-Head Attention Parallel Encoder "
# "with Tacotron2</center>")
with gr.Row():
with gr.Column(scale=1):
inp = gr.Textbox(label="Input Text", value="Speech synthesis has evolved dramatically since the "
"development of neural architectures capable of generating "
"high quality samples.")
clear_btn = gr.ClearButton(value='Clear Text', size='sm', components=[inp])
# gr.Markdown("A continuació, calibrem els pesos dels *style tokens*:")
with gr.Row():
with gr.Column(scale=2):
with gr.Tab("Global Style Tokens"):
gst_1 = gr.Slider(0.2, 0.45, label="GST 1", value=0.4)
gst_2 = gr.Slider(0.2, 0.45, label="GST 2", value=0.26)
gst_3 = gr.Slider(0.2, 0.45, label="GST 3", value=0.33)
with gr.Column(scale=0):
with gr.Tab("Vocoder"):
vocoder = gr.Radio([("HiFi-GAN", 0), ("Griffin-Lim", 1)],
container=False, value=0, min_width=300) # label="Vocoder")
greet_btn = gr.Button("Synthesize!", scale=1)
with gr.Column():
# wave_video = gr.make_waveform(audio)
with gr.Tab("Spectrogram"):
# spec_plot = gr.Plot()
spec_plot = gr.Image(container=False)
with gr.Tab("Alignment"):
# align_plot = gr.Plot()
align_plot = gr.Image(container=False)
wave_video = gr.Video(label="Waveform", height=150, width=800, container=False)
# play_video = gr.Button(label="Play", size='sm')
# audio_clip = gr.Audio(label="Generated Speech", type="numpy")
def display_video():
return wave_video
# play_video.click(fn=display_video)
greet_btn.click(fn=synthesize, inputs=[inp, gst_1, gst_2, gst_3, vocoder],
outputs=[wave_video, spec_plot, align_plot],
api_name="synthesize")
with gr.Row():
with gr.Column():
# gr.Markdown("### Audio Examples")
gr.Examples(examples=infer_from_text_examples,
inputs=[inp, gst_1, gst_2, gst_3, vocoder],
outputs=[wave_video, spec_plot, align_plot],
fn=synthesize,
cache_examples=True, )
gr.Markdown("""
### Details and Indications
This Text-to-Speech (TTS) system consists of two modules: 1) a replicated Tacotron2 model, which generates
the spectrogram of the speech corresponding to the input text, and 2) a pre-trained HiFi-GAN vocoder, which maps the
spectrogram to a digital waveform. Global Style Tokens (GST) have been implemented to capture style information from
the female speaker on which the model has been trained (see the links below for more information).
Please feel free to play with the GST scores and observe how the synthetic voice renders the input text.
Keep in mind that the GSTs have been trained in an unsupervised way, so there is no explicit control over
individual style attributes. Moreover, try to balance the GST scores so that they add up to a value close to 1; sums
well below or above 1 may cause low energy, mispronunciations or distortion.
You can choose between the trained HiFi-GAN vocoder and the iterative Griffin-Lim algorithm, which does not need
to be trained but produces rather robotic-sounding speech.
### More Information
The spectrogram generator has been adapted and trained from
[NVIDIA's](https://github.com/NVIDIA/tacotron2) Tacotron2 replica, published in
<a href="https://arxiv.org/abs/1712.05884" style="display: inline-block;margin-top: .5em;margin-right: .25em;"
target="_blank"> <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
src="https://img.shields.io/badge/ArXiv-Tacotron2-b31b1b" alt="Tacotron2"></a>
<br>
The neural vocoder is a pre-trained model replicated from <a href="https://arxiv.org/abs/2010.05646"
style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"> <img style="margin-bottom:
0em;display: inline;margin-top: -.25em;" src="https://img.shields.io/badge/ArXiv-HiFi%20GAN-b31b1b"
alt="HiFiGAN"></a>
<br>
Unsupervised style control has been implemented based on <a href="https://arxiv.org/abs/1803.09017" style="display:
inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"> <img style="margin-bottom: 0em;display:
inline;margin-top: -.25em;" src="https://img.shields.io/badge/ArXiv-Global%20Style%20Tokens-b31b1b"
alt="Global Style Tokens"></a>
<br>
""")
"""Instead of using multiple heads for the attention module, we just set one single
head for simplicity, ease control purposes, but also to observer whether this attention still
works with just one head."""
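# Minimal sketch (an illustration, not the model's actual implementation): with a single
# attention head, the user-provided GST scores act directly as attention weights, so the
# style embedding reduces to a weighted sum of the style-token embeddings. The names and
# shapes below are assumptions made only for this example.
def combine_style_tokens(token_embeddings, scores):
    # token_embeddings: (num_tokens, token_dim) tensor; scores: (num_tokens,) tensor.
    return torch.matmul(scores.unsqueeze(0), token_embeddings).squeeze(0)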
# gr.Markdown("This is a Tacotron2 model based on the NVIDIA's model plus three unsupervised Global Style Tokens "
# "(GST). The whole architecture has been trained from scratch with the LJSpeech dataset. In order "
# "to control the relevance of each style token, we configured the attention module as a single-head. "
# "Keep in mind that, for a better synthetic output, the sum of the three style weights should be around "
# "1. A combination that sums less than 1 may work, but higher the generated speech may show more "
# "distortion and miss-pronunciations.")
demo.launch()