import torch
import numpy as np
import gradio as gr

from hyper_parameters import tacotron_params as hparams
from training import load_model
from text import text_to_sequence
from melgan.model.generator import Generator
from melgan.utils.hparams import load_hparam

torch.manual_seed(1234)


def init_models(hparams):
    # load the trained Tacotron2 + GST model:
    model = load_model(hparams)
    checkpoint_path = "trained_models/checkpoint_78000.model"
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    model.to('cuda')
    model.eval()

    # load the pre-trained MelGAN vocoder for mel2audio:
    vocoder_checkpoint_path = "trained_models/nvidia_tacotron2_LJ11_epoch6400.pt"
    checkpoint = torch.load(vocoder_checkpoint_path)
    hp_melgan = load_hparam("melgan/config/default.yaml")
    vocoder_model = Generator(80)
    vocoder_model.load_state_dict(checkpoint['model_g'])
    vocoder_model = vocoder_model.to('cuda')
    vocoder_model.eval(inference=False)

    return model, vocoder_model


def synthesize(text):
    # convert the input text into a sequence of symbol ids:
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).to(device='cuda', dtype=torch.int64)

    # fixed GST attention-head scores used to condition the style embedding:
    gst_head_scores = np.array([0.5, 0.15, 0.35])
    gst_scores = torch.from_numpy(gst_head_scores).cuda().float()

    # text2mel inference:
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)

    # mel2wav inference:
    with torch.no_grad():
        audio = vocoder_model.inference(mel_outputs_postnet)

    audio_numpy = audio.detach().cpu().numpy()
    return 22050, audio_numpy


model, vocoder_model = init_models(hparams)

iface = gr.Interface(fn=synthesize,
                     inputs="text",
                     outputs=[gr.Audio(label="Generated Speech", type="numpy")])
iface.launch()