import gradio as gr
from hyper_parameters import tacotron_params as hparams
from training import load_model
from audio_processing import griffin_lim
from nn_layers import TacotronSTFT
from text import text_to_sequence
from hifigan.env import AttrDict
from examples_taco2 import *
from hifigan.models import Generator
import torch
import numpy as np
import json
import os
from matplotlib import pyplot as plt
# Adjust vertical spacing between subplots
plt.subplots_adjust(hspace=0.15) # You can adjust the value as needed
# Adjust the white space (margins) around the plot
plt.tight_layout(pad=0.5) # You can adjust the pad value as needed
torch.manual_seed(1234)
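# 32768.0 is the int16 full-scale value, used below to convert HiFi-GAN's float output to 16-bit PCM.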
MAX_WAV_VALUE = 32768.0
DESCRIPTION = """
This is a Tacotron2 model based on NVIDIA's implementation, plus three unsupervised Global Style Tokens (GST).
The whole architecture has been trained from scratch on the LJSpeech dataset. In order to control the relevance
of each style token, we configured the attention module as single-head.
Keep in mind that, for a better synthetic output, the sum of the three style weights should be around 1. A combination that sums to
less than 1 may work, but above 1 the generated speech may show more distortion and mispronunciations.
"""
def load_checkpoint(filepath, device):
assert os.path.isfile(filepath)
print("Loading '{}'".format(filepath))
checkpoint_dict = torch.load(filepath, map_location=device)
print("Complete.")
return checkpoint_dict
def plot_spec_align_sep(mel, align):
fig_mel = plt.figure(figsize=(4, 3))
ax_mel = fig_mel.add_subplot(111)
fig_mel.tight_layout()
ax_mel.imshow(mel)
# fig_mel.set_title('Mel-Scale Spectrogram', fontsize=12)
fig_align = plt.figure()
ax_align = fig_align.add_subplot(111) # fig_align
fig_align.tight_layout()
ax_align.imshow(align)
# fig_align.set_title('Alignment', fontsize=12)
return fig_mel, fig_align
# load trained tacotron2 + GST model:
model = load_model(hparams)
checkpoint_path = "models/checkpoint_78000.model"
model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")['state_dict'])
# model.to('cuda')
_ = model.eval()
# load pre-trained HiFi-GAN model for mel2audio:
hifigan_checkpoint_path = "models/generator_v1"
config_file = os.path.join(os.path.split(hifigan_checkpoint_path)[0], 'config.json')
with open(config_file) as f:
data = f.read()
json_config = json.loads(data)
h = AttrDict(json_config)
device = torch.device("cpu")
generator = Generator(h).to(device)
state_dict_g = load_checkpoint(hifigan_checkpoint_path, device)
generator.load_state_dict(state_dict_g['generator'])
generator.eval()
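# weight normalisation is only needed during training; remove it for inference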
generator.remove_weight_norm()
def synthesize(text, gst_1, gst_2, gst_3, voc):
sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)
# gst_head_scores = np.array([0.5, 0.15, 0.35])
gst_head_scores = np.array([gst_1, gst_2, gst_3])
gst_scores = torch.from_numpy(gst_head_scores).float()
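# run Tacotron2 inference conditioned on the three style-token scores (single-head GST attention)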
with torch.no_grad():
mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)
if voc == 0:
# mel2wav inference:
with torch.no_grad():
y_g_hat = generator(mel_outputs_postnet)
audio = y_g_hat.squeeze()
audio = audio * MAX_WAV_VALUE
audio_numpy = audio.cpu().numpy().astype('int16')
# audio = vocoder_model.inference(mel_outputs_postnet)
# audio_numpy = audio.data.cpu().detach().numpy()
else:
# Griffin Lim vocoder synthesis:
griffin_iters = 60
taco_stft = TacotronSTFT(hparams['filter_length'], hparams['hop_length'], hparams['win_length'],
sampling_rate=hparams['sampling_rate'])
mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
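# project the mel spectrogram back to a linear-frequency spectrogram and apply a fixed, empirically chosen gain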
spec_from_mel_scaling = 60
spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
spec_from_mel = spec_from_mel * spec_from_mel_scaling
audio = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]), taco_stft.stft_fn, griffin_iters)
audio = audio.squeeze()
audio_numpy = audio.cpu().numpy()
# prepare plot for the output:
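# flip the frequency axis so the displayed spectrogram shows low frequencies at the bottom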
mel_outputs_postnet = torch.flip(mel_outputs_postnet.squeeze(), [0])
mel_outputs_postnet = mel_outputs_postnet.detach().numpy()
alignments = alignments.squeeze().T.detach().numpy()
# fig_mel = plot_spec_align(mel_outputs_postnet, alignments)
# fig_mel, fig_align = plot_spec_align_sep(mel_outputs_postnet, alignments)
# normalize numpy arrays between [-1, 1]
min_val = np.min(mel_outputs_postnet)
max_val = np.max(mel_outputs_postnet)
scaled_mel = (mel_outputs_postnet - min_val) / (max_val - min_val)
normalized_mel = 2 * scaled_mel - 1
min_val = np.min(alignments)
max_val = np.max(alignments)
scaled_align = (alignments - min_val) / (max_val - min_val)
normalized_align = 2 * scaled_align - 1
aw = gr.make_waveform((22050, audio_numpy), bg_image='background_images/wallpaper_test_1_crop_3.jpg',
bars_color=('#f3df4b', '#63edb7'), bar_count=100, bar_width=0.7, animate=True)
return aw, normalized_mel, normalized_align # (22050, audio_numpy), fig_mel, fig_align
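# Example call (assumed local usage; the Gradio button below wires this same function to the UI):
# waveform_video, mel_img, align_img = synthesize("Hello world.", 0.4, 0.26, 0.33, voc=0)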
# Custom Demo Interface:
# theme='ysharma/steampunk',
# css=".gradio-container {background: url('file=background_images/wallpaper_test_mod_2.jpg')}"
with gr.Blocks() as demo:
gr.Markdown("<center><h1>English Neural Text-to-Speech</h1> "
"<h2>Speech Synthesis with Partial Style Control</h2></center><br>")
# gr.Markdown("## <center>Unsupervised Style Tokens using Single-Head Attention Parallel Encoder "
# "with Tacotron2</center>")
with gr.Row():
with gr.Column(scale=1):
inp = gr.Textbox(label="Input Text", value="Speech synthesis has evolved dramatically since the "
"development of neural architectures capable of generating "
"high quality samples.")
clear_btn = gr.ClearButton(value='Clear Text', size='sm', components=[inp])
# gr.Markdown("A continuaci贸, calibrem els pesos dels *style tokens*:")
with gr.Row():
with gr.Column(scale=2):
with gr.Tab("Global Style Tokens"):
gst_1 = gr.Slider(0.2, 0.45, label="GST 1", value=0.4)
gst_2 = gr.Slider(0.2, 0.45, label="GST 2", value=0.26)
gst_3 = gr.Slider(0.2, 0.45, label="GST 3", value=0.33)
with gr.Column(scale=0):
with gr.Tab("Vocoder"):
vocoder = gr.Radio([("HiFi-GAN", 0), ("Griffin-Lim", 1)],
container=False, value=0, min_width=300) # label="Vocoder")
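# radio value 0 selects the HiFi-GAN vocoder, 1 selects Griffin-Lim (see the voc branch in synthesize)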
greet_btn = gr.Button("Synthesize!", scale=1)
with gr.Column():
# wave_video = gr.make_waveform(audio)
with gr.Tab("Spectrogram"):
# spec_plot = gr.Plot()
spec_plot = gr.Image(container=False)
with gr.Tab("Alignment"):
# align_plot = gr.Plot()
align_plot = gr.Image(container=False)
wave_video = gr.Video(label="Waveform", height=150, width=800, container=False)
# play_video = gr.Button(label="Play", size='sm')
# audio_clip = gr.Audio(label="Generated Speech", type="numpy")
def display_video():
return wave_video
# play_video.click(fn=display_video)
greet_btn.click(fn=synthesize, inputs=[inp, gst_1, gst_2, gst_3, vocoder],
outputs=[wave_video, spec_plot, align_plot],
api_name="synthesize")
with gr.Row():
with gr.Column():
# gr.Markdown("### Audio Examples")
gr.Examples(examples=infer_from_text_examples,
inputs=[inp, gst_1, gst_2, gst_3, vocoder],
outputs=[wave_video, spec_plot, align_plot],
fn=synthesize,
cache_examples=True, )
gr.Markdown("""
### Details and Indications
This is a Text-to-Speech (TTS) system that consists of two modules: 1) a replicated Tacotron2 model, which generates
the spectrogram of the speech corresponding to the input text, and 2) a pre-trained HiFi-GAN vocoder that maps the
spectrogram to a digital waveform. Global Style Tokens (GST) have been implemented to capture style information from
the female speaker the model has been trained on (see the links below for more information).
Please feel free to play with the GST scores and observe how the synthetic voice renders the input text.
Keep in mind that the GSTs have been trained in an unsupervised way, so there is no explicit control over specific
style attributes. Moreover, try to balance the GST scores so that they add up to a value close to 1. Sums below or
above 1 may cause low energy, mispronunciations or distortion.
You can choose between the trained HiFi-GAN vocoder and the iterative Griffin-Lim algorithm, which does not need
to be trained but produces rather "robotic"-sounding speech.
### More Information
The spectrogram generator has been adapted and trained from [NVIDIA's](https://github.com/NVIDIA/tacotron2)
Tacotron2 replica, published in
<a href="https://arxiv.org/abs/1712.05884" style="display: inline-block;margin-top: .5em;margin-right: .25em;"
target="_blank"> <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
src="https://img.shields.io/badge/ArXiv-Tacotron2-b31b1b" alt="Tacotron2"></a>
<br>
The neural vocoder is a pre-trained model replicated from <a href="https://arxiv.org/abs/2010.05646"
style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"> <img style="margin-bottom:
0em;display: inline;margin-top: -.25em;" src="https://img.shields.io/badge/ArXiv-HiFi%20GAN-b31b1b"
alt="HiFiGAN"></a>
<br>
Unsupervised style control has been implemented based on <a href="https://arxiv.org/abs/1803.09017" style="display:
inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"> <img style="margin-bottom: 0em;display:
inline;margin-top: -.25em;" src="https://img.shields.io/badge/ArXiv-Global%20Style%20Tokens-b31b1b"
alt="Global Style Tokens"></a>
<br>
""")
"""Instead of using multiple heads for the attention module, we just set one single
head for simplicity, ease control purposes, but also to observer whether this attention still
works with just one head."""
# gr.Markdown("This is a Tacotron2 model based on the NVIDIA's model plus three unsupervised Global Style Tokens "
# "(GST). The whole architecture has been trained from scratch with the LJSpeech dataset. In order "
# "to control the relevance of each style token, we configured the attention module as a single-head. "
# "Keep in mind that, for a better synthetic output, the sum of the three style weights should be around "
# "1. A combination that sums less than 1 may work, but higher the generated speech may show more "
# "distortion and miss-pronunciations.")
demo.launch()