Spaces:

Steveeeeeeen
/

Step-Audio-2-mini

Running on Zero

App Files Files Community

Step-Audio-2-mini / flashcosyvoice /modules /hifigan_components /layers.py

Steveeeeeeen HF Staff

add model

7e6946d about 1 month ago

raw

history blame contribute delete

16.4 kB

	from typing import List

	import numpy as np
	import torch
	import torch.nn as nn
	from torch.distributions.uniform import Uniform
	from torch.nn import Conv1d
	from torch.nn.utils import remove_weight_norm

	try:
	from torch.nn.utils.parametrizations import weight_norm
	except ImportError:
	from torch.nn.utils import weight_norm # noqa


	def get_padding(kernel_size, dilation=1):
	    """Return the padding amount for a dilated convolution kernel.

	    The result is ``int((kernel_size * dilation - dilation) / 2)``, i.e. half
	    of the extra span a dilated kernel covers, truncated toward zero.

	    Args:
	        kernel_size: size of the convolution kernel.
	        dilation: dilation factor of the convolution (default 1).

	    Returns:
	        The padding as a Python ``int``.
	    """
	    extra_span = kernel_size * dilation - dilation
	    return int(extra_span / 2)


	def init_weights(m, mean=0.0, std=0.01):
	    """Fill the weight of a convolution-like module with normal samples.

	    A module is treated as convolution-like when its class name contains the
	    substring "Conv"; every other module is left untouched. Intended to be
	    used with ``Module.apply``.

	    Args:
	        m: the module to (possibly) re-initialize in place.
	        mean: mean of the normal distribution.
	        std: standard deviation of the normal distribution.
	    """
	    is_conv_like = "Conv" in m.__class__.__name__
	    if is_conv_like:
	        m.weight.data.normal_(mean, std)


	"""hifigan based generator implementation.

	This code is modified from https://github.com/jik876/hifi-gan
	,https://github.com/kan-bayashi/ParallelWaveGAN and
	https://github.com/NVIDIA/BigVGAN

	"""


	# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
	# LICENSE is in incl_licenses directory.
	class Snake(nn.Module):
	    """Sine-based periodic activation with a per-channel frequency ``alpha``.

	    The activation computes ``x + 1/alpha * sin^2(alpha * x)`` elementwise.

	    Shape:
	        - Input: (B, C, T)
	        - Output: (B, C, T), same shape as the input

	    References:
	        - Liu Ziyin, Tilman Hartwig, Masahito Ueda:
	          https://arxiv.org/abs/2006.08195

	    Examples:
	        >>> act = Snake(256)
	        >>> x = torch.randn(4, 256, 100)
	        >>> x = act(x)

	    Args:
	        in_features: number of channels C; one alpha is kept per channel.
	        alpha: initial scale for alpha (default 1.0); higher values mean
	            higher frequency. In log-scale mode the stored parameter starts
	            at zero regardless of this value.
	        alpha_trainable: whether alpha receives gradients.
	        alpha_logscale: when True, the stored parameter is exponentiated
	            before use in ``forward``.
	    """

	    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
	        super().__init__()
	        self.in_features = in_features
	        self.alpha_logscale = alpha_logscale

	        # Log-scale alphas start from zeros, linear-scale alphas from ones;
	        # both are then multiplied by the requested initial value.
	        if self.alpha_logscale:
	            initial = torch.zeros(in_features)
	        else:
	            initial = torch.ones(in_features)
	        self.alpha = nn.Parameter(initial * alpha)
	        self.alpha.requires_grad = alpha_trainable

	        # Small constant added to alpha in the denominator of forward().
	        self.no_div_by_zero = 0.000000001

	    def forward(self, x):
	        """Apply ``x + 1/alpha * sin^2(alpha * x)`` elementwise to ``x``."""
	        # Reshape alpha from (C,) to (1, C, 1) so it broadcasts over (B, C, T).
	        freq = self.alpha.unsqueeze(0).unsqueeze(-1)
	        if self.alpha_logscale:
	            freq = torch.exp(freq)
	        squared_sine = torch.pow(torch.sin(x * freq), 2)
	        inv_freq = 1.0 / (freq + self.no_div_by_zero)
	        return x + inv_freq * squared_sine


	class ResBlock(torch.nn.Module):
	    """Residual block module in HiFiGAN/BigVGAN.

	    For every entry of ``dilations`` the block owns two weight-normalized
	    ``Conv1d`` layers (the first dilated, the second with dilation 1), each
	    preceded by its own ``Snake`` activation. The output of each such pair is
	    added back onto its input.

	    Args:
	        channels: number of input and output channels of every convolution.
	        kernel_size: kernel size of every convolution.
	        dilations: dilation of the first convolution in each residual unit.
	            The default list is only iterated, never mutated.
	    """

	    def __init__(
	        self,
	        channels: int = 512,
	        kernel_size: int = 3,
	        dilations: List[int] = [1, 3, 5],  # noqa
	    ):
	        super().__init__()
	        self.convs1 = nn.ModuleList()
	        self.convs2 = nn.ModuleList()

	        for dilation in dilations:
	            dilated_conv = Conv1d(
	                channels,
	                channels,
	                kernel_size,
	                1,
	                dilation=dilation,
	                padding=get_padding(kernel_size, dilation),
	            )
	            self.convs1.append(weight_norm(dilated_conv))  # noqa

	            plain_conv = Conv1d(
	                channels,
	                channels,
	                kernel_size,
	                1,
	                dilation=1,
	                padding=get_padding(kernel_size, 1),
	            )
	            self.convs2.append(weight_norm(plain_conv))  # noqa

	        self.convs1.apply(init_weights)
	        self.convs2.apply(init_weights)

	        # One Snake activation in front of every convolution.
	        self.activations1 = nn.ModuleList([
	            Snake(channels, alpha_logscale=False)
	            for _ in range(len(self.convs1))
	        ])
	        self.activations2 = nn.ModuleList([
	            Snake(channels, alpha_logscale=False)
	            for _ in range(len(self.convs2))
	        ])

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Run every activation/conv pair and add its output to its input."""
	        units = zip(self.activations1, self.convs1, self.activations2, self.convs2)
	        for act1, conv1, act2, conv2 in units:
	            xt = conv2(act2(conv1(act1(x))))
	            x = xt + x
	        return x

	    def remove_weight_norm(self):
	        """Strip weight normalization from every convolution of the block."""
	        for conv in list(self.convs1) + list(self.convs2):
	            self._strip_weight_norm(conv)

	    @staticmethod
	    def _strip_weight_norm(conv):
	        """Remove weight norm from ``conv`` whichever API applied it.

	        ``torch.nn.utils.parametrizations.weight_norm`` registers a
	        parametrization, which the legacy ``torch.nn.utils.remove_weight_norm``
	        cannot find (it raises ``ValueError``). Parametrized layers therefore
	        go through ``remove_parametrizations``; hook-based layers keep using
	        the legacy function.
	        """
	        try:
	            from torch.nn.utils import parametrize
	        except ImportError:  # PyTorch without the parametrization API
	            parametrize = None

	        if parametrize is not None and parametrize.is_parametrized(conv, "weight"):
	            # leave_parametrized=True keeps the current effective weight.
	            parametrize.remove_parametrizations(conv, "weight", leave_parametrized=True)
	        else:
	            remove_weight_norm(conv)


	class SineGen(torch.nn.Module):
	    """Sine-wave generator driven by an F0 contour.

	    SineGen(samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003,
	            voiced_threshold=0)

	    Args:
	        samp_rate: sampling rate in Hz.
	        harmonic_num: number of harmonic overtones (default 0).
	        sine_amp: amplitude of the sine waveform (default 0.1).
	        noise_std: std of the Gaussian noise in voiced regions (default 0.003).
	        voiced_threshold: F0 threshold for U/V classification (default 0).
	    """

	    def __init__(self, samp_rate, harmonic_num=0,
	                 sine_amp=0.1, noise_std=0.003,
	                 voiced_threshold=0):
	        super().__init__()
	        self.sampling_rate = samp_rate
	        self.harmonic_num = harmonic_num
	        self.sine_amp = sine_amp
	        self.noise_std = noise_std
	        self.voiced_threshold = voiced_threshold

	    def _f02uv(self, f0):
	        """Return a float32 mask that is 1 where ``f0`` exceeds the threshold."""
	        return (f0 > self.voiced_threshold).type(torch.float32)

	    @torch.no_grad()
	    def forward(self, f0):
	        """Generate noisy harmonic sine waves from ``f0``.

	        :param f0: [B, 1, sample_len], Hz
	        :return: tuple ``(sine_waves, uv, noise)`` where ``sine_waves`` and
	            ``noise`` are [B, harmonic_num + 1, sample_len] and ``uv`` has
	            the shape of ``f0``
	        """
	        batch, length = f0.size(0), f0.size(-1)
	        n_tones = self.harmonic_num + 1

	        # Normalized frequency (cycles per sample) of the fundamental and
	        # each overtone, one channel per tone.
	        F_mat = torch.zeros((batch, n_tones, length)).to(f0.device)
	        for order in range(1, n_tones + 1):
	            F_mat[:, order - 1: order, :] = f0 * order / self.sampling_rate

	        # Accumulated phase; only the fractional number of cycles matters.
	        theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)

	        # Random initial phase per overtone; the fundamental starts at 0.
	        sampler = Uniform(low=-np.pi, high=np.pi)
	        phase_vec = sampler.sample(sample_shape=(batch, n_tones, 1)).to(F_mat.device)
	        phase_vec[:, 0, :] = 0

	        harmonics = self.sine_amp * torch.sin(theta_mat + phase_vec)
	        uv = self._f02uv(f0)

	        # Noise std is noise_std in voiced regions and sine_amp / 3 in
	        # unvoiced ones (so its peak is roughly sine_amp).
	        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
	        noise = noise_amp * torch.randn_like(harmonics)

	        # Zero the sines in unvoiced regions, then add the noise.
	        return harmonics * uv + noise, uv, noise


	class SourceModuleHnNSF(torch.nn.Module):
	    """Source module for hn-NSF: harmonic excitation plus a noise source.

	    SourceModuleHnNSF(sampling_rate, upsample_scale, harmonic_num=0,
	                      sine_amp=0.1, add_noise_std=0.003, voiced_threshod=0)

	    Args:
	        sampling_rate: sampling rate in Hz.
	        upsample_scale: accepted for signature compatibility; not used by
	            this class.
	        harmonic_num: number of harmonics above F0 (default: 0).
	        sine_amp: amplitude of the sine source signal (default: 0.1).
	        add_noise_std: std of the additive Gaussian noise (default: 0.003);
	            note that the noise amplitude in unvoiced regions is decided
	            by sine_amp.
	        voiced_threshod: threshold to set U/V given F0 (default: 0).

	    Usage:
	        sine_source, noise_source, uv = SourceModuleHnNSF(...)(F0_sampled)
	        F0_sampled (batchsize, length, 1)
	        sine_source (batchsize, length, 1)
	        noise_source (batchsize, length, 1)
	        uv (batchsize, length, 1)
	    """

	    def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
	                 add_noise_std=0.003, voiced_threshod=0):
	        super().__init__()

	        self.sine_amp = sine_amp
	        self.noise_std = add_noise_std

	        # Produces the raw sine waveforms (fundamental + overtones).
	        self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
	                                 sine_amp, add_noise_std, voiced_threshod)

	        # Merges the harmonics into a single excitation channel.
	        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
	        self.l_tanh = torch.nn.Tanh()

	    def forward(self, x):
	        """Build the harmonic and noise sources for an F0 contour.

	        Args:
	            x: F0_sampled (batchsize, length, 1)

	        Returns:
	            Tuple ``(sine_merge, noise, uv)``, each (batchsize, length, 1).
	        """
	        # Harmonic branch: the generator works channel-first, so transpose
	        # on the way in and back on the way out.
	        with torch.no_grad():
	            harmonics, voiced, _ = self.l_sin_gen(x.transpose(1, 2))
	            harmonics = harmonics.transpose(1, 2)
	            voiced = voiced.transpose(1, 2)
	        merged = self.l_linear(harmonics)
	        sine_merge = self.l_tanh(merged)

	        # Noise branch: Gaussian noise in the same shape as uv.
	        noise = torch.randn_like(voiced) * self.sine_amp / 3
	        return sine_merge, noise, voiced


	class SineGen2(torch.nn.Module):
	    """Sine-wave generator that accumulates phase at a reduced rate.

	    SineGen2(samp_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
	             noise_std=0.003, voiced_threshold=0, flag_for_pulse=False)

	    Args:
	        samp_rate: sampling rate in Hz.
	        upsample_scale: factor by which the per-sample phase increments are
	            downsampled before accumulation and upsampled again afterwards.
	        harmonic_num: number of harmonic overtones (default 0).
	        sine_amp: amplitude of the sine waveform (default 0.1).
	        noise_std: std of the Gaussian noise (default 0.003).
	        voiced_threshold: F0 threshold for U/V classification (default 0).
	        flag_for_pulse: this generator is used inside PulseGen (default False).

	    Note: when flag_for_pulse is True, the first time step of a voiced
	    segment is always sin(np.pi) or cos(0).
	    """

	    def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
	                 sine_amp=0.1, noise_std=0.003,
	                 voiced_threshold=0,
	                 flag_for_pulse=False):
	        super().__init__()
	        self.sampling_rate = samp_rate
	        self.upsample_scale = upsample_scale
	        self.harmonic_num = harmonic_num
	        self.dim = self.harmonic_num + 1
	        self.sine_amp = sine_amp
	        self.noise_std = noise_std
	        self.voiced_threshold = voiced_threshold
	        self.flag_for_pulse = flag_for_pulse

	    def _f02uv(self, f0):
	        """Return a float32 mask that is 1 where ``f0`` exceeds the threshold."""
	        return (f0 > self.voiced_threshold).type(torch.float32)

	    def _f02sine(self, f0_values):
	        """Turn per-tone F0 values into unit-amplitude sine waves.

	        f0_values: (batchsize, length, dim), where dim indexes the
	        fundamental tone and its overtones.
	        """
	        # Per-sample phase increment in cycles. The integer part can be
	        # dropped because whole cycles do not affect the phase.
	        rad_values = (f0_values / self.sampling_rate) % 1

	        # Random initial phase, added at the first time step only
	        # (no offset for the fundamental component).
	        initial_phase = torch.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device)
	        initial_phase[:, 0] = 0
	        rad_values[:, 0, :] = rad_values[:, 0, :] + initial_phase

	        if self.flag_for_pulse:
	            return self._pulse_sines(f0_values, rad_values)
	        return self._interpolated_sines(rad_values)

	    def _interpolated_sines(self, rad_values):
	        """Accumulate phase at the reduced rate and upsample it again."""
	        resize = torch.nn.functional.interpolate
	        coarse = resize(rad_values.transpose(1, 2),
	                        scale_factor=1 / self.upsample_scale,
	                        mode="linear").transpose(1, 2)

	        # Instantaneous phase: sine[t] = sin(2 * pi * sum_{i<=t} rad[i]).
	        phase = torch.cumsum(coarse, dim=1) * 2 * np.pi
	        # Each coarse step stands for upsample_scale samples, hence the
	        # multiplication before interpolating back to full length.
	        phase = resize(phase.transpose(1, 2) * self.upsample_scale,
	                       scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
	        return torch.sin(phase)

	    def _pulse_sines(self, f0_values, rad_values):
	        """Restart the phase at each voiced segment (pulse-train generation).

	        Makes the first time step of every voiced segment cos(0).
	        """
	        # Locate the last time step of every unvoiced segment.
	        voiced = self._f02uv(f0_values)
	        next_voiced = torch.roll(voiced, shifts=-1, dims=1)
	        next_voiced[:, -1, :] = 1
	        segment_ends = (voiced < 1) * (next_voiced > 0)

	        running = torch.cumsum(rad_values, dim=1)
	        # Every batch item has its own segment layout, so handle them one
	        # at a time.
	        for item in range(f0_values.shape[0]):
	            picks = segment_ends[item, :, 0]
	            at_ends = running[item, picks, :]
	            at_ends[1:, :] = at_ends[1:, :] - at_ends[0:-1, :]
	            # Keep only the phase accumulated within each voiced segment,
	            # stored at the step that closes the following unvoiced one.
	            running[item, :, :] = 0
	            running[item, picks, :] = at_ends

	        # Subtracting removes the phase accumulated in the previous
	        # voiced segment.
	        i_phase = torch.cumsum(rad_values - running, dim=1)
	        return torch.cos(i_phase * 2 * np.pi)

	    def forward(self, f0):
	        """sine_tensor, uv, noise = forward(f0)

	        input F0: tensor(batchsize=1, length, dim=1);
	            f0 for unvoiced steps should be 0
	        output sine_tensor: tensor(batchsize=1, length, dim)
	        output uv: tensor(batchsize=1, length, 1)
	        output noise: same shape as sine_tensor
	        """
	        # Frequencies of the fundamental and its overtones: f0 * [1, 2, ...].
	        multipliers = torch.arange(
	            1, self.harmonic_num + 2, dtype=torch.float32, device=f0.device
	        )[None, None, :]
	        fn = torch.multiply(f0, multipliers)

	        harmonics = self._f02sine(fn) * self.sine_amp
	        uv = self._f02uv(f0)

	        # Noise std is noise_std in voiced regions and sine_amp / 3 in
	        # unvoiced ones (so its peak is roughly sine_amp).
	        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
	        noise = noise_amp * torch.randn_like(harmonics)

	        # Zero the sines in unvoiced regions, then add the noise.
	        return harmonics * uv + noise, uv, noise


	class SourceModuleHnNSF2(torch.nn.Module):
	    """Source module for hn-NSF built on top of ``SineGen2``.

	    SourceModuleHnNSF2(sampling_rate, upsample_scale, harmonic_num=0,
	                       sine_amp=0.1, add_noise_std=0.003, voiced_threshod=0)

	    Args:
	        sampling_rate: sampling rate in Hz.
	        upsample_scale: forwarded to the sine generator.
	        harmonic_num: number of harmonics above F0 (default: 0).
	        sine_amp: amplitude of the sine source signal (default: 0.1).
	        add_noise_std: std of the additive Gaussian noise (default: 0.003);
	            note that the noise amplitude in unvoiced regions is decided
	            by sine_amp.
	        voiced_threshod: threshold to set U/V given F0 (default: 0).

	    Usage:
	        sine_source, noise_source, uv = SourceModuleHnNSF2(...)(F0_sampled)
	        F0_sampled (batchsize, length, 1)
	        sine_source (batchsize, length, 1)
	        noise_source (batchsize, length, 1)
	        uv (batchsize, length, 1)
	    """

	    def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
	                 add_noise_std=0.003, voiced_threshod=0):
	        super().__init__()

	        self.sine_amp = sine_amp
	        self.noise_std = add_noise_std

	        # Produces the raw sine waveforms (fundamental + overtones).
	        self.l_sin_gen = SineGen2(sampling_rate, upsample_scale, harmonic_num,
	                                  sine_amp, add_noise_std, voiced_threshod)

	        # Merges the harmonics into a single excitation channel.
	        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
	        self.l_tanh = torch.nn.Tanh()

	    def forward(self, x):
	        """Build the harmonic and noise sources for an F0 contour.

	        Args:
	            x: F0_sampled (batchsize, length, 1)

	        Returns:
	            Tuple ``(sine_merge, noise, uv)``, each (batchsize, length, 1).
	        """
	        # Harmonic branch: sine generation itself is not differentiated.
	        with torch.no_grad():
	            harmonics, voiced, _ = self.l_sin_gen(x)
	        merged = self.l_linear(harmonics)
	        sine_merge = self.l_tanh(merged)

	        # Noise branch: Gaussian noise in the same shape as uv.
	        noise = torch.randn_like(voiced) * self.sine_amp / 3
	        return sine_merge, noise, voiced