import gradio as gr
import numpy as np
import torch
from matplotlib import pyplot as plt

from hyper_parameters import tacotron_params as hparams
from training import load_model
from text import text_to_sequence
from melgan.model.generator import Generator
from melgan.utils.hparams import load_hparam

torch.manual_seed(1234)  # fix the seed for reproducible synthesis

MAX_WAV_VALUE = 32768.0  # peak amplitude of 16-bit PCM audio

DESCRIPTION = """
This is a Tacotron2 model based on NVIDIA's implementation, extended with three unsupervised
Global Style Tokens (GST). The whole architecture has been trained from scratch on the LJSpeech
dataset. To control the relevance of each style token, we configured the attention module as
single-head. Keep in mind that, for a better synthetic output, the sum of the three style weights
should be around 1. A combination that sums to less than 1 may work, but with higher sums the
generated speech may show more distortion and mispronunciations.
"""

# Load the trained Tacotron2 + GST model:
model = load_model(hparams)
checkpoint_path = "trained_models/checkpoint_78000.model"
model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")['state_dict'])
# model.to('cuda')
_ = model.eval()

# Load the pre-trained MelGAN vocoder for mel-to-audio conversion:
vocoder_checkpoint_path = "trained_models/nvidia_tacotron2_LJ11_epoch6400.pt"
checkpoint = torch.load(vocoder_checkpoint_path, map_location="cpu")
hp_melgan = load_hparam("melgan/config/default.yaml")
vocoder_model = Generator(80)
vocoder_model.load_state_dict(checkpoint['model_g'])
# vocoder_model = vocoder_model.to('cuda')
vocoder_model.eval(inference=False)  # inference=True would additionally remove weight normalization


def plot_spec_align(mel, align):
    """Plot the mel spectrogram and the attention alignment in one figure."""
    fig_mel = plt.figure()

    ax_mel = fig_mel.add_subplot(211)
    ax_mel.imshow(mel)
    ax_mel.set_title('Mel-Scale Spectrogram', fontsize=12)

    ax_align = fig_mel.add_subplot(212)
    ax_align.imshow(align)
    ax_align.set_title('Alignment', fontsize=12)

    # Tighten the margins and adjust the vertical spacing between the two
    # subplots; these must act on this figure, after it has been populated.
    fig_mel.tight_layout(pad=0.5)  # adjust the pad value as needed
    fig_mel.subplots_adjust(hspace=0.15)  # adjust the value as needed

    return fig_mel


def synthesize(text, gst_1, gst_2, gst_3):
    # Convert the input text into a batch of symbol IDs.
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)

    # Collect the three style token weights chosen in the UI.
    gst_head_scores = np.array([gst_1, gst_2, gst_3])
    gst_scores = torch.from_numpy(gst_head_scores).float()

    # Text-to-mel inference:
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)

    # Mel-to-wav inference:
    with torch.no_grad():
        audio = vocoder_model.inference(mel_outputs_postnet)
    audio_numpy = audio.detach().cpu().numpy()

    # Prepare the plots: flip the mel so low frequencies end up at the bottom.
    mel_outputs_postnet = torch.flip(mel_outputs_postnet.squeeze(), [0])
    mel_outputs_postnet = mel_outputs_postnet.detach().numpy()
    alignments = alignments.squeeze().T.detach().numpy()
    fig_mel = plot_spec_align(mel_outputs_postnet, alignments)

    return (22050, audio_numpy), fig_mel


iface = gr.Interface(
    fn=synthesize,
    inputs=[gr.Textbox(label="Input Text"),
            gr.Slider(0.2, 0.45, label="First style token weight:", value=0.4),
            gr.Slider(0.2, 0.45, label="Second style token weight:", value=0.26),
            gr.Slider(0.2, 0.45, label="Third style token weight:", value=0.33)],
    outputs=[gr.Audio(label="Generated Speech", type="numpy"),
             gr.Plot(label="Output")],
    title="Single-Head Attention Tacotron2 with Style Tokens",
    description=DESCRIPTION)

iface.launch()
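
# A minimal smoke test of the synthesis pipeline, kept commented out so the
# script still just launches the Gradio UI. To try it, comment out
# iface.launch() above and uncomment the lines below. It assumes SciPy is
# available and that the checkpoints above are in place; "sample.wav" is an
# illustrative file name, and the weights match the slider defaults.
#
# if __name__ == "__main__":
#     from scipy.io import wavfile
#     sample_rate, waveform = synthesize("Hello, world.", 0.4, 0.26, 0.33)[0]
#     wavfile.write("sample.wav", sample_rate, waveform)
#     print(f"Wrote {waveform.size / sample_rate:.2f} s of audio to sample.wav")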