import math
from typing import Optional
import numpy as np
import torch
from torch import nn
from .vits_config import VitsConfig
#.............................................
# Copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock
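# HiFi-GAN residual block: pairs of dilated and non-dilated 1D convolutions with leaky-ReLU
# activations and a residual (skip) connection around each pair.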
class HifiGanResidualBlock(nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
super().__init__()
self.leaky_relu_slope = leaky_relu_slope
self.convs1 = nn.ModuleList(
[
nn.Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=dilation[i],
padding=self.get_padding(kernel_size, dilation[i]),
)
for i in range(len(dilation))
]
)
self.convs2 = nn.ModuleList(
[
nn.Conv1d(
channels,
channels,
kernel_size,
stride=1,
dilation=1,
padding=self.get_padding(kernel_size, 1),
)
for _ in range(len(dilation))
]
)
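    # "Same" padding so that stride-1 (dilated) convolutions preserve the sequence length.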
def get_padding(self, kernel_size, dilation=1):
return (kernel_size * dilation - dilation) // 2
def apply_weight_norm(self):
for layer in self.convs1:
nn.utils.weight_norm(layer)
for layer in self.convs2:
nn.utils.weight_norm(layer)
def remove_weight_norm(self):
for layer in self.convs1:
nn.utils.remove_weight_norm(layer)
for layer in self.convs2:
nn.utils.remove_weight_norm(layer)
def forward(self, hidden_states):
for conv1, conv2 in zip(self.convs1, self.convs2):
residual = hidden_states
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
hidden_states = conv1(hidden_states)
hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
hidden_states = conv2(hidden_states)
hidden_states = hidden_states + residual
return hidden_states
#.............................................................................................
class VitsHifiGan(nn.Module):
def __init__(self, config: VitsConfig):
super().__init__()
self.config = config
self.num_kernels = len(config.resblock_kernel_sizes)
self.num_upsamples = len(config.upsample_rates)
self.conv_pre = nn.Conv1d(
config.flow_size,
config.upsample_initial_channel,
kernel_size=7,
stride=1,
padding=3,
)
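        # Stack of transposed convolutions; each stage upsamples the time axis by its
        # upsample_rate and halves the channel count.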
self.upsampler = nn.ModuleList()
for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
self.upsampler.append(
nn.ConvTranspose1d(
config.upsample_initial_channel // (2**i),
config.upsample_initial_channel // (2 ** (i + 1)),
kernel_size=kernel_size,
stride=upsample_rate,
padding=(kernel_size - upsample_rate) // 2,
)
)
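        # One residual block per (upsample stage, resblock kernel size) pair,
        # i.e. num_upsamples * num_kernels blocks in total.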
self.resblocks = nn.ModuleList()
for i in range(len(self.upsampler)):
channels = config.upsample_initial_channel // (2 ** (i + 1))
for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))
self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3, bias=False)
if config.speaker_embedding_size != 0:
self.cond = nn.Conv1d(config.speaker_embedding_size, config.upsample_initial_channel, 1)
def resize_speaker_embedding(self, speaker_embedding_size):
self.config.speaker_embedding_size = speaker_embedding_size
self.cond = nn.Conv1d(speaker_embedding_size, self.config.upsample_initial_channel, 1)
nn.init.kaiming_normal_(self.cond.weight)
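        # Re-initialize the bias with the same uniform bound PyTorch uses by default for Conv1d.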
if self.cond.bias is not None:
k = math.sqrt(self.cond.groups / (self.cond.in_channels * self.cond.kernel_size[0]))
nn.init.uniform_(self.cond.bias, a=-k, b=k)
def apply_weight_norm(self):
for layer in self.upsampler:
nn.utils.weight_norm(layer)
for layer in self.resblocks:
layer.apply_weight_norm()
def remove_weight_norm(self):
for layer in self.upsampler:
nn.utils.remove_weight_norm(layer)
for layer in self.resblocks:
layer.remove_weight_norm()
def forward(
self, spectrogram: torch.FloatTensor, global_conditioning: Optional[torch.FloatTensor] = None
) -> torch.FloatTensor:
r"""
Converts a spectrogram into a speech waveform.
Args:
        spectrogram (`torch.FloatTensor` of shape `(batch_size, config.flow_size, sequence_length)`):
Tensor containing the spectrograms.
global_conditioning (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_size, 1)`, *optional*):
Tensor containing speaker embeddings, for multispeaker models.
Returns:
        `torch.FloatTensor`: Tensor of shape `(batch_size, 1, num_frames)` containing the speech waveform.
"""
hidden_states = self.conv_pre(spectrogram)
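        # Add the projected speaker embedding as global conditioning, if provided.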
if global_conditioning is not None:
hidden_states = hidden_states + self.cond(global_conditioning)
for i in range(self.num_upsamples):
hidden_states = nn.functional.leaky_relu(hidden_states, self.config.leaky_relu_slope)
hidden_states = self.upsampler[i](hidden_states)
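            # Multi-receptive-field fusion: average the outputs of the parallel residual blocks for this stage.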
res_state = self.resblocks[i * self.num_kernels](hidden_states)
for j in range(1, self.num_kernels):
res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
hidden_states = res_state / self.num_kernels
hidden_states = nn.functional.leaky_relu(hidden_states)
hidden_states = self.conv_post(hidden_states)
waveform = torch.tanh(hidden_states)
return waveform
#.............................................................................................
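# Minimal usage sketch (assumes VitsConfig provides the attributes referenced above, e.g.
# flow_size, upsample_rates, upsample_kernel_sizes, resblock_kernel_sizes,
# resblock_dilation_sizes, upsample_initial_channel, leaky_relu_slope and
# speaker_embedding_size, with sensible default values):
#
#   config = VitsConfig()
#   vocoder = VitsHifiGan(config)
#   latents = torch.randn(1, config.flow_size, 100)  # (batch, flow_size, num_frames)
#   waveform = vocoder(latents)                      # (batch, 1, num_frames * prod(upsample_rates))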