Spaces:
Sleeping
Sleeping
File size: 3,788 Bytes
bfcd0c0 080259f bfcd0c0 080259f cb82d78 080259f 2f6ba98 3ed419f cb82d78 4d5330a 848f2f7 2f6ba98 b48749f 2f6ba98 b48749f a25696f cb82d78 2f6ba98 92be68f 5b2ce7f 92be68f c628e3e 92be68f 5b2ce7f 92be68f c628e3e 2f6ba98 5b2ce7f 2f6ba98 6a0c3cb cb82d78 f95a0dc cb82d78 6a0c3cb 2f6ba98 f95a0dc c472fbf cb82d78 848f2f7 cb82d78 2f6ba98 c628e3e 5b2ce7f 2f6ba98 5b2ce7f cadb9c5 848f2f7 cadb9c5 5b2ce7f 2f6ba98 31277c7 848f2f7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
import gradio as gr
from hyper_parameters import tacotron_params as hparams
from training import load_model
from text import text_to_sequence
from melgan.model.generator import Generator
from melgan.utils.hparams import load_hparam
import torch
import numpy as np
from matplotlib import pyplot as plt
# Global pyplot layout tweaks: vertical gap between stacked subplots, then the
# padding around the figure edge.
# NOTE(review): both calls operate on pyplot's *current* figure at import time,
# whereas plot_spec_align() creates its own figure with plt.figure(). These
# settings therefore probably never reach the plots shown to the user - confirm,
# and apply them to that figure instead if the spacing is wanted.
plt.subplots_adjust(hspace=0.15)  # hspace: spacing between subplot rows
plt.tight_layout(pad=0.5)  # pad: margin around the figure
# Seed torch's random number generator with a fixed value.
torch.manual_seed(1234)
# 32768 == 2**15; presumably the full-scale magnitude of 16-bit PCM audio.
# NOTE(review): not referenced anywhere in the visible part of this file.
MAX_WAV_VALUE = 32768.0
# User-facing blurb handed to gr.Interface(description=...) at the bottom of
# this file.
# NOTE(review): the text has wording problems ("but higher the generated
# speech", "misspronunciations"); it is runtime text, so it is left untouched
# here - fix deliberately if desired.
DESCRIPTION = """
This is a Tacotron2 model based on the NVIDIA's model plus three unsupervised Global Style Tokens (GST).
The whole architecture has been trained from scratch with the LJSpeech dataset. In order to control the relevance
of each style token, we configured the attention module as a single-head.
Keep in mind that, for a better synthetic output, the sum of the three style weights should be around 1. A combination that sums less than 1 may work, but higher the
generated speech may show more distortion and misspronunciations.
"""
# Load the trained Tacotron2 + GST model.
# load_model() is given the project's hyper-parameters (presumably it builds
# the untrained network from them - confirm in training.py); the weights are
# then restored from the checkpoint's 'state_dict' entry, mapped onto the CPU.
model = load_model(hparams)
checkpoint_path = "trained_models/checkpoint_78000.model"
# NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")['state_dict'])
# GPU placement is disabled; the demo runs on CPU:
# model.to('cuda')
# Switch to evaluation mode; the returned module is deliberately discarded.
_ = model.eval()
# Load the pre-trained MelGAN generator used for mel-to-audio conversion.
vocoder_checkpoint_path = "trained_models/nvidia_tacotron2_LJ11_epoch6400.pt"
# NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
checkpoint = torch.load(vocoder_checkpoint_path, map_location="cpu")
# NOTE(review): hp_melgan is not used anywhere in the visible part of this file.
hp_melgan = load_hparam("melgan/config/default.yaml")
# 80 is the generator's first constructor argument - presumably the number of
# mel channels; confirm against melgan.model.generator.Generator.
vocoder_model = Generator(80)
# The generator weights live under the checkpoint's 'model_g' key.
vocoder_model.load_state_dict(checkpoint['model_g'])
# GPU placement is disabled; the demo runs on CPU:
# vocoder_model = vocoder_model.to('cuda')
# NOTE(review): this eval() accepts an `inference` keyword, which plain
# torch.nn.Module.eval() does not - it must be overridden by Generator. Check
# that class for what inference=False selects.
vocoder_model.eval(inference=False)
def plot_spec_align(mel, align):
    """Draw a mel spectrogram and an attention alignment on one figure.

    Args:
        mel: 2-D array-like shown with ``imshow`` in the upper panel.
        align: 2-D array-like shown with ``imshow`` in the lower panel.

    Returns:
        The matplotlib figure containing both panels, stacked vertically.
    """
    fig_mel = plt.figure()

    # Upper panel: spectrogram.
    ax_mel = fig_mel.add_subplot(211)
    ax_mel.imshow(mel)
    ax_mel.set_title('Mel-Scale Spectrogram', fontsize=12)

    # Lower panel: alignment, drawn on the same figure.
    ax_align = fig_mel.add_subplot(212)
    ax_align.imshow(align)
    ax_align.set_title('Alignment', fontsize=12)

    # pyplot keeps every figure it creates in a global registry until it is
    # closed. This function runs once per synthesis request, so without the
    # close the server would accumulate one live figure per request. Closing
    # only unregisters the figure from pyplot; the Figure object itself stays
    # valid and can still be rendered by the caller.
    plt.close(fig_mel)
    return fig_mel
def synthesize(text, gst_1, gst_2, gst_3):
    """Synthesize speech for ``text`` using the given style-token weights.

    Args:
        text: Input sentence; encoded with ``text_to_sequence`` using the
            'english_cleaners' cleaner.
        gst_1: Weight of the first style token.
        gst_2: Weight of the second style token.
        gst_3: Weight of the third style token.

    Returns:
        A pair ``((22050, audio), figure)``: the sample rate together with the
        vocoder output as a NumPy array, and the figure produced by
        ``plot_spec_align`` for the post-net mel spectrogram and alignment.

    Raises:
        ValueError: If ``text`` encodes to an empty symbol sequence.
    """
    symbol_ids = np.array(text_to_sequence(text, ['english_cleaners']))
    if symbol_ids.size == 0:
        # An empty sequence would reach the model as a zero-length tensor;
        # fail early with a message that says what is wrong.
        raise ValueError("Input text is empty; nothing to synthesize.")

    # Add a leading batch dimension and convert to an int64 tensor on CPU.
    sequence = torch.from_numpy(symbol_ids[None, :]).to(device='cpu', dtype=torch.int64)

    # One weight per style token, as float32.
    gst_scores = torch.tensor([gst_1, gst_2, gst_3], dtype=torch.float32)

    # Inference only: run both networks without autograd tracking so no
    # graph is built or kept alive for each request.
    with torch.no_grad():
        _, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)
        # mel2wav inference:
        audio = vocoder_model.inference(mel_outputs_postnet)

    audio_numpy = audio.detach().cpu().numpy()

    # Prepare the plot: flip the spectrogram along its first axis (after
    # squeezing) and transpose the alignment before handing both to imshow.
    mel_image = torch.flip(mel_outputs_postnet.squeeze(), [0]).detach().numpy()
    align_image = alignments.squeeze().T.detach().numpy()
    fig_mel = plot_spec_align(mel_image, align_image)

    # 22050 Hz is hard-coded; presumably the sample rate the models were
    # trained at - confirm against the hyper-parameters.
    return (22050, audio_numpy), fig_mel
# Demo UI: one text box plus one slider per style token feed synthesize();
# its two return values fill the audio player and the plot.
# Components are created in the same order as they appear in the interface.
_text_input = gr.Textbox(label="Input Text")
_style_sliders = [
    gr.Slider(0.2, 0.45, label=slider_label, value=slider_default)
    for slider_label, slider_default in (
        ("First style token weight:", 0.4),
        ("Second style token weight:", 0.26),
        ("Third style token weight:", 0.33),
    )
]
_audio_output = gr.Audio(label="Generated Speech", type="numpy")
_plot_output = gr.Plot(label="Output")

iface = gr.Interface(
    fn=synthesize,
    inputs=[_text_input, *_style_sliders],
    outputs=[_audio_output, _plot_output],
    title="Single-Head Attention Tacotron2 with Style Tokens",
    description=DESCRIPTION,
)

# Start the web server for the demo.
iface.launch()
|