Spaces:

AlexK-PL
/

Tacotron2_GST_eng

Sleeping

File size: 1,792 Bytes

bfcd0c0
 
080259f
 
bfcd0c0
080259f
cb82d78
 
 
 
080259f
 
 
cb82d78
4d5330a
848f2f7
 
 
 
 
 
f95a0dc
848f2f7
 
 
 
 
 
 
 
f95a0dc
848f2f7
cb82d78
 
 
f95a0dc
cb82d78
c472fbf
f95a0dc
c472fbf
cb82d78
 
 
 
 
 
848f2f7
cb82d78
c472fbf
848f2f7
c472fbf
 
 
 
848f2f7

import gradio as gr

from hyper_parameters import tacotron_params as hparams
from training import load_model

from text import text_to_sequence

from melgan.model.generator import Generator
from melgan.utils.hparams import load_hparam

import torch
import numpy as np

# Seed PyTorch's global RNG with a fixed value so repeated runs draw the same
# random numbers.
torch.manual_seed(1234)
# Presumably the full-scale magnitude of 16-bit PCM audio (2**15).
# NOTE(review): not referenced anywhere in the visible part of this file --
# confirm whether it is still needed.
MAX_WAV_VALUE = 32768.0

def init_models(hparams):
    """Load the Tacotron2 + GST synthesizer and the MelGAN vocoder.

    The two networks are bound to the module-level names ``model`` and
    ``vocoder_model`` (the names ``synthesize`` reads) and are also returned
    so callers can hold explicit references.

    Args:
        hparams: hyper-parameter object forwarded unchanged to ``load_model``.

    Returns:
        Tuple ``(model, vocoder_model)``; ``eval()`` has been called on both.
    """
    # synthesize() looks these names up at module scope. Without this
    # declaration they would stay local to this function and every request
    # would fail with NameError.
    global model, vocoder_model

    # load trained tacotron2 + GST model:
    model = load_model(hparams)
    checkpoint_path = "trained_models/checkpoint_78000.model"
    # map_location='cpu' keeps loading working on CPU-only hosts even when the
    # checkpoint was saved from GPU tensors; synthesize() builds its inputs on
    # the CPU as well.
    tacotron_checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(tacotron_checkpoint['state_dict'])
    model.eval()

    # load pre trained MelGAN model for mel2audio:
    vocoder_checkpoint_path = "trained_models/nvidia_tacotron2_LJ11_epoch6400.pt"
    checkpoint = torch.load(vocoder_checkpoint_path, map_location='cpu')
    # NOTE(review): hp_melgan is loaded but not used below; the generator is
    # built with a literal 80 -- confirm it matches the mel-channel count in
    # the MelGAN config.
    hp_melgan = load_hparam("melgan/config/default.yaml")
    vocoder_model = Generator(80)
    vocoder_model.load_state_dict(checkpoint['model_g'])
    vocoder_model.eval(inference=False)

    return model, vocoder_model

def synthesize(text):
    """Synthesize speech for ``text`` and return it in Gradio's numpy audio form.

    Reads the module-level names ``model`` and ``vocoder_model``; both must be
    bound before this function is called.

    Args:
        text: input string, converted to a symbol-id sequence by
            ``text_to_sequence`` with the ``'english_cleaners'`` cleaner.

    Returns:
        Tuple ``(22050, audio_numpy)``: the integer 22050 (used as the sample
        rate by the audio output) and the vocoder output as a numpy array.
    """
    # Add a leading batch axis of size 1, then move to a CPU int64 tensor.
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)

    gst_head_scores = np.array([0.5, 0.15, 0.35])  # originally ([0.5, 0.15, 0.35])
    gst_scores = torch.from_numpy(gst_head_scores).float()

    # Run BOTH networks without autograd tracking: this is inference only, so
    # recording a graph for the Tacotron pass would just waste memory.
    with torch.no_grad():
        # Only the post-net mel output is consumed; the other three returned
        # values are not needed here.
        _, mel_outputs_postnet, _, _ = model.inference(sequence, gst_scores)

        # mel2wav inference:
        audio = vocoder_model.inference(mel_outputs_postnet)

    audio_numpy = audio.detach().cpu().numpy()

    return (22050, audio_numpy)


# Load both networks once at start-up, then serve synthesize() through a
# Gradio interface that takes a text input and returns one audio output.
init_models(hparams)
iface = gr.Interface(
    fn=synthesize,
    inputs="text",
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
)
iface.launch()