import gradio as gr

from hyper_parameters import tacotron_params as hparams
from training import load_model

from text import text_to_sequence

from melgan.model.generator import Generator
from melgan.utils.hparams import load_hparam

import torch
import numpy as np

from matplotlib import pyplot as plt

torch.manual_seed(1234)
MAX_WAV_VALUE = 32768.0  # int16 full-scale amplitude; unused in this script, kept for reference

DESCRIPTION = """
This is a Tacotron2 model based on NVIDIA's implementation, extended with three unsupervised Global Style Tokens (GST).
The whole architecture was trained from scratch on the LJSpeech dataset. To control the relevance of each
style token, we configured the GST attention module with a single head.

Keep in mind that, for better synthetic output, the sum of the three style weights should be around 1. A combination
summing to less than 1 may still work, but with higher sums the generated speech may show more distortion and mispronunciations.
"""

# load trained tacotron2 + GST model:
model = load_model(hparams)
checkpoint_path = "trained_models/checkpoint_78000.model"
model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")['state_dict'])
# model.to('cuda')
_ = model.eval()

# load pre-trained MelGAN model for mel2audio:
vocoder_checkpoint_path = "trained_models/nvidia_tacotron2_LJ11_epoch6400.pt"
checkpoint = torch.load(vocoder_checkpoint_path, map_location="cpu")
hp_melgan = load_hparam("melgan/config/default.yaml")
vocoder_model = Generator(80)  # 80 mel channels
vocoder_model.load_state_dict(checkpoint['model_g'])
# vocoder_model = vocoder_model.to('cuda')
# In the reference MelGAN implementation, eval(inference=True) would also strip
# weight normalization; it is kept here, matching the original script.
vocoder_model.eval(inference=False)


def plot_spec_align(mel, align):
    """Plot the post-net mel spectrogram and the attention alignment in a single figure."""
    fig_mel = plt.figure()
    ax_mel = fig_mel.add_subplot(211)
    ax_mel.imshow(mel)
    ax_mel.set_title('Mel-Scale Spectrogram', fontsize=12)

    ax_align = fig_mel.add_subplot(212)
    ax_align.imshow(align)
    ax_align.set_title('Alignment', fontsize=12)

    # Trim the margins, then adjust the vertical spacing between the two subplots
    # (subplots_adjust must come after tight_layout for the hspace to take effect).
    fig_mel.tight_layout(pad=0.5)
    fig_mel.subplots_adjust(hspace=0.15)

    return fig_mel


def synthesize(text, gst_1, gst_2, gst_3):
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)

    # stack the three slider values into the single-head GST attention scores:
    gst_head_scores = np.array([gst_1, gst_2, gst_3])
    gst_scores = torch.from_numpy(gst_head_scores).float()

    # text2mel inference:
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)

    # mel2wav inference:
    with torch.no_grad():
        audio = vocoder_model.inference(mel_outputs_postnet)
    audio_numpy = audio.detach().cpu().numpy()

    # prepare the plot for the output (flip the mel so low frequencies sit at the bottom):
    mel_outputs_postnet = torch.flip(mel_outputs_postnet.squeeze(), [0]).detach().numpy()
    alignments = alignments.squeeze().T.detach().numpy()
    fig_mel = plot_spec_align(mel_outputs_postnet, alignments)

    return (22050, audio_numpy), fig_mel
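
# Example (sketch): calling synthesize() directly, without the Gradio UI, and
# saving the result to disk. scipy is an assumed extra dependency (it is not
# imported above), so the snippet is left commented out:
#
#   from scipy.io import wavfile
#   (sr, audio), _fig = synthesize("Hello world.", 0.4, 0.26, 0.33)
#   wavfile.write("sample.wav", sr, audio)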
 

iface = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Slider(0.2, 0.45, label="First style token weight:", value=0.4),
        gr.Slider(0.2, 0.45, label="Second style token weight:", value=0.26),
        gr.Slider(0.2, 0.45, label="Third style token weight:", value=0.33),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Plot(label="Output"),
    ],
    title="Single-Head Attention Tacotron2 with Style Tokens",
    description=DESCRIPTION,
)
iface.launch()
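# Note: iface.launch(share=True) would additionally expose a temporary public
# URL (a standard Gradio option), which can be handy for quick demos.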