import gradio as gr
from hyper_parameters import tacotron_params as hparams
from training import load_model
from text import text_to_sequence
from melgan.model.generator import Generator
from melgan.utils.hparams import load_hparam

import torch
import numpy as np
from matplotlib import pyplot as plt

torch.manual_seed(1234)

# Full-scale amplitude of 16-bit PCM audio.
MAX_WAV_VALUE = 32768.0
DESCRIPTION = """
This is a Tacotron 2 model based on NVIDIA's implementation, extended with three unsupervised Global Style Tokens (GST).
The whole architecture has been trained from scratch on the LJSpeech dataset. To make the contribution of each style
token controllable, the attention module is configured with a single head.
Keep in mind that, for a better synthetic output, the three style weights should sum to roughly 1. A combination that
sums to less than 1 may still work, but with higher sums the generated speech may show more distortion and
mispronunciations.
"""
# Load the trained Tacotron 2 + GST model:
model = load_model(hparams)
checkpoint_path = "trained_models/checkpoint_78000.model"
model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")['state_dict'])
# model.to('cuda')
_ = model.eval()

# Load the pre-trained MelGAN vocoder for mel-to-audio conversion:
vocoder_checkpoint_path = "trained_models/nvidia_tacotron2_LJ11_epoch6400.pt"
checkpoint = torch.load(vocoder_checkpoint_path, map_location="cpu")
hp_melgan = load_hparam("melgan/config/default.yaml")
vocoder_model = Generator(80)  # the vocoder expects 80-band mel spectrograms
vocoder_model.load_state_dict(checkpoint['model_g'])
# vocoder_model = vocoder_model.to('cuda')
vocoder_model.eval(inference=False)
def plot_spec_align(mel, align):
    # Plot the synthesized mel spectrogram and the encoder-decoder alignment
    # as two stacked subplots in a single figure.
    fig_mel = plt.figure()

    ax_mel = fig_mel.add_subplot(211)
    ax_mel.imshow(mel)
    ax_mel.set_title('Mel-Scale Spectrogram', fontsize=12)

    ax_align = fig_mel.add_subplot(212)
    ax_align.imshow(align)
    ax_align.set_title('Alignment', fontsize=12)

    # Adjust the vertical spacing between the subplots and the surrounding margins.
    fig_mel.subplots_adjust(hspace=0.15)
    fig_mel.tight_layout(pad=0.5)

    return fig_mel
def synthesize(text, gst_1, gst_2, gst_3):
    # Convert the input text to a sequence of symbol IDs.
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)

    # Weights for the three style tokens, taken from the UI sliders.
    gst_head_scores = np.array([gst_1, gst_2, gst_3])
    gst_scores = torch.from_numpy(gst_head_scores).float()

    # Text-to-mel inference with Tacotron 2 + GST.
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)

    # Mel-to-waveform inference with the MelGAN vocoder.
    with torch.no_grad():
        audio = vocoder_model.inference(mel_outputs_postnet)
    audio_numpy = audio.data.cpu().detach().numpy()

    # Prepare the output plot (flip the mel so low frequencies appear at the bottom).
    mel_outputs_postnet = torch.flip(mel_outputs_postnet.squeeze(), [0]).detach().numpy()
    alignments = alignments.squeeze().T.detach().numpy()
    fig_mel = plot_spec_align(mel_outputs_postnet, alignments)

    return (22050, audio_numpy), fig_mel
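
# Minimal offline-usage sketch (not part of the original app; scipy and the output
# path are assumptions). It calls synthesize() directly and writes the waveform to
# disk, bypassing the Gradio UI:
#
#   from scipy.io.wavfile import write as write_wav
#   (sr, wav), _ = synthesize("Hello world.", 0.4, 0.26, 0.33)
#   write_wav("sample.wav", sr, wav)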
iface = gr.Interface(
    fn=synthesize,
    inputs=[gr.Textbox(label="Input Text"),
            gr.Slider(0.2, 0.45, label="First style token weight:", value=0.4),
            gr.Slider(0.2, 0.45, label="Second style token weight:", value=0.26),
            gr.Slider(0.2, 0.45, label="Third style token weight:", value=0.33)],
    outputs=[gr.Audio(label="Generated Speech", type="numpy"), gr.Plot(label="Output")],
    title="Single-Head Attention Tacotron2 with Style Tokens",
    description=DESCRIPTION)

iface.launch()