# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import math

import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer


class Transformer(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        dropout = self.cfg.dropout
        nhead = self.cfg.n_heads
        nlayers = self.cfg.n_layers
        input_dim = self.cfg.input_dim
        output_dim = self.cfg.output_dim

        d_model = input_dim
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(
            d_model, nhead, dropout=dropout, batch_first=True
        )
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.output_mlp = nn.Linear(d_model, output_dim)

    def forward(self, x, mask=None):
        """
        Args:
            x: (N, seq_len, input_dim)
        Returns:
            output: (N, seq_len, output_dim)
        """
        # (N, seq_len, d_model)
        src = self.pos_encoder(x)
        # model_stats["pos_embedding"] = x
        # (N, seq_len, d_model)
        output = self.transformer_encoder(src)
        # (N, seq_len, output_dim)
        output = self.output_mlp(output)
        return output


class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )

        # Assume that x is (seq_len, N, d)
        # pe = torch.zeros(max_len, 1, d_model)
        # pe[:, 0, 0::2] = torch.sin(position * div_term)
        # pe[:, 0, 1::2] = torch.cos(position * div_term)
        # Assume that x is (N, seq_len, d)
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer("pe", pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [N, seq_len, d]
        """
        # Old: assume that x is (seq_len, N, d) and self.pe is (max_len, 1, d_model)
        # x = x + self.pe[: x.size(0)]

        # Now: self.pe is (1, max_len, d)
        x = x + self.pe[:, : x.size(1), :]
        return self.dropout(x)
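

# Minimal usage sketch (illustrative only): cfg can be any object exposing the
# attributes read in __init__ (dropout, n_heads, n_layers, input_dim,
# output_dim); a SimpleNamespace stands in here for Amphion's real config
# object. Note that n_heads must divide input_dim, since d_model = input_dim.
if __name__ == "__main__":
    from types import SimpleNamespace

    cfg = SimpleNamespace(
        dropout=0.1, n_heads=4, n_layers=2, input_dim=256, output_dim=128
    )
    model = Transformer(cfg)

    # A batch of 8 sequences, 100 frames each, with input_dim features per frame.
    x = torch.randn(8, 100, cfg.input_dim)
    y = model(x)
    print(y.shape)  # torch.Size([8, 100, 128])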
