# File size: 8,282 Bytes (revision 17ed7d8)
import math
from typing import Tuple
import numpy as np
import torch
import torch.nn as nn # pylint: disable=consider-using-from-import
import torch.nn.functional as F
from TTS.tts.layers.delightful_tts.conv_layers import ConvNorm
def initialize_embeddings(shape: Tuple[int]) -> torch.Tensor:
    """Create a randomly initialized 2-D embedding matrix.

    Values are drawn from a standard normal distribution and scaled by
    ``sqrt(2 / shape[1])`` (Kaiming-style scaling).

    Args:
        shape: Two-element shape ``(rows, columns)`` of the matrix.

    Returns:
        A tensor of the requested shape.

    Raises:
        AssertionError: If ``shape`` does not have exactly two entries.
    """
    assert len(shape) == 2, "Can only initialize 2-D embedding matrices ..."
    # Kaiming initialization: the scale depends on the second dimension only.
    n_columns = shape[1]
    scale = np.sqrt(2 / n_columns)
    noise = torch.randn(shape)
    return noise * scale
def positional_encoding(d_model: int, length: int, device: torch.device) -> torch.Tensor:
    """Build a sinusoidal positional-encoding table.

    Even feature columns hold ``sin(position * div_term)`` and odd feature
    columns hold ``cos(position * div_term)``, where
    ``div_term[i] = exp(-(2 * i) * ln(10000) / d_model)``.

    Args:
        d_model: Number of feature columns in the encoding.
        length: Number of positions (rows) to encode.
        device: Device on which all tensors are created.

    Returns:
        A tensor of shape ``[1, length, d_model]``.
    """
    pe = torch.zeros(length, d_model, device=device)
    position = torch.arange(0, length, dtype=torch.float, device=device).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2, device=device).float() * -(math.log(10000.0) / d_model))
    angles = position * div_term  # [length, ceil(d_model / 2)]
    pe[:, 0::2] = torch.sin(angles)
    # With an odd d_model there is one fewer odd column than even columns, so
    # trim the angles to the number of odd columns to avoid a shape mismatch.
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
    return pe.unsqueeze(0)
class BottleneckLayer(nn.Module):
    """
    Bottleneck layer for reducing the dimensionality of a tensor.

    When ``reduction_factor`` is greater than 1 the input is projected with a
    ``ConvNorm`` layer (optionally followed by instance normalization) and
    passed through the chosen non-linearity. Otherwise the input is returned
    unchanged.

    Args:
        in_dim: The number of input dimensions.
        reduction_factor: The factor by which to reduce the number of dimensions.
        norm: The normalization method to use. Can be "weightnorm" or "instancenorm".
        non_linearity: The non-linearity to use. Can be "relu" or "leakyrelu".
        kernel_size: The size of the convolutional kernel.
        use_partial_padding: Whether to use partial padding with the convolutional kernel.
            Accepted for interface compatibility; not used by this layer.

    Shape:
        - Input: :math:`[N, in_dim]` where `N` is the batch size and `in_dim` is the number of input dimensions.

        - Output: :math:`[N, out_dim]` where `out_dim` is the number of output dimensions.
    """

    # NOTE(review): the parameter list was truncated in the extracted source.
    # Names and order follow the class docstring; the defaults for `norm`,
    # `non_linearity` and `kernel_size` are reconstructed - confirm against callers.
    def __init__(
        self,
        in_dim,
        reduction_factor,
        norm="weightnorm",
        non_linearity="relu",
        kernel_size=3,
        use_partial_padding=False,  # pylint: disable=unused-argument
    ):
        super(BottleneckLayer, self).__init__()  # pylint: disable=super-with-arguments

        self.reduction_factor = reduction_factor
        reduced_dim = int(in_dim / reduction_factor)
        self.out_dim = reduced_dim
        if self.reduction_factor > 1:
            fn = ConvNorm(in_dim, reduced_dim, kernel_size=kernel_size, use_weight_norm=(norm == "weightnorm"))
            if norm == "instancenorm":
                fn = nn.Sequential(fn, nn.InstanceNorm1d(reduced_dim, affine=True))

            self.projection_fn = fn
            # ReLU unless "leakyrelu" is requested explicitly.
            self.non_linearity = nn.ReLU()
            if non_linearity == "leakyrelu":
                self.non_linearity = nn.LeakyReLU()

    def forward(self, x):
        """Project and activate ``x`` when reducing; otherwise return it as is."""
        if self.reduction_factor > 1:
            x = self.projection_fn(x)
            x = self.non_linearity(x)
        return x
class GLUActivation(nn.Module):
    """Class that implements the Gated Linear Unit (GLU) activation function.

    The input is split into two equal halves along dimension 1. The first
    half is multiplied element-wise by the LeakyReLU of the second half,
    which acts as the gate.

    Args:
        slope: Negative slope passed to the LeakyReLU applied to the gate.
    """

    def __init__(self, slope: float):
        # nn.Module must be initialised before submodules can be assigned.
        super().__init__()
        self.lrelu = nn.LeakyReLU(slope)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``out * LeakyReLU(gate)`` where ``out, gate = x.chunk(2, dim=1)``."""
        out, gate = x.chunk(2, dim=1)
        x = out * self.lrelu(gate)
        return x
class StyleEmbedAttention(nn.Module):
    """Multi-head scaled dot-product attention over a set of key/value vectors.

    Queries, keys and values are produced by bias-free linear projections to
    ``num_units`` features, which are then split evenly across ``num_heads``.

    Args:
        query_dim: Size of the last dimension of the query input.
        key_dim: Size of the last dimension of the key input; also used to
            scale the attention scores (``sqrt(key_dim)``).
        num_units: Total number of projected features across all heads.
        num_heads: Number of attention heads; ``num_units`` is split by it.
    """

    def __init__(self, query_dim: int, key_dim: int, num_units: int, num_heads: int):
        # nn.Module must be initialised before submodules can be assigned.
        super().__init__()
        self.num_units = num_units
        self.num_heads = num_heads
        self.key_dim = key_dim

        self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False)
        self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
        self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)

    def forward(self, query: torch.Tensor, key_soft: torch.Tensor) -> torch.Tensor:
        """Attend from ``query`` over ``key_soft``.

        Args:
            query: Tensor of shape ``[N, T_q, query_dim]``.
            key_soft: Tensor of shape ``[N, T_k, key_dim]``; used for both keys and values.

        Returns:
            Tensor of shape ``[N, T_q, num_units]``.
        """
        values = self.W_value(key_soft)
        split_size = self.num_units // self.num_heads
        # [h, N, T_k, num_units/h]
        values = torch.stack(torch.split(values, split_size, dim=2), dim=0)

        querys = self.W_query(query)  # [N, T_q, num_units]
        keys = self.W_key(key_soft)  # [N, T_k, num_units]

        # [h, N, T_q, num_units/h]
        querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0)
        # [h, N, T_k, num_units/h]
        keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0)

        # score = softmax(QK^T / (d_k ** 0.5))
        scores_soft = torch.matmul(querys, keys.transpose(2, 3))  # [h, N, T_q, T_k]
        scores_soft = scores_soft / (self.key_dim**0.5)
        scores_soft = F.softmax(scores_soft, dim=3)

        # out = score * V
        # [h, N, T_q, num_units/h]
        out_soft = torch.matmul(scores_soft, values)
        # Merge heads: split along the head axis, concatenate on the feature axis.
        out_soft = torch.cat(torch.split(out_soft, 1, dim=0), dim=3).squeeze(0)  # [N, T_q, num_units]

        return out_soft  # , scores_soft
class EmbeddingPadded(nn.Module):
    """Embedding table whose ``padding_idx`` row is forced to zero on lookup.

    A ``[num_embeddings, 1]`` multiplier buffer holds 1 for every row except
    ``padding_idx``, which holds 0. The learnable table is multiplied by this
    buffer on every forward pass before the lookup.

    Args:
        num_embeddings: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        padding_idx: Index of the row that is zeroed out.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int):
        # nn.Module must be initialised before buffers/parameters are registered.
        super().__init__()
        padding_mult = torch.ones((num_embeddings, 1), dtype=torch.int64)
        padding_mult[padding_idx] = 0
        self.register_buffer("padding_mult", padding_mult)
        self.embeddings = nn.parameter.Parameter(initialize_embeddings((num_embeddings, embedding_dim)))

    def forward(self, idx: torch.Tensor) -> torch.Tensor:
        """Look up ``idx`` in the table with the padding row zeroed."""
        embeddings_zeroed = self.embeddings * self.padding_mult
        x = F.embedding(idx, embeddings_zeroed)
        return x
class EmbeddingProjBlock(nn.Module):
    """Residual projection block: stacked linear layers plus a skip connection.

    Args:
        embedding_dim: Input and output feature size of every linear layer.
    """

    def __init__(self, embedding_dim: int):
        # nn.Module must be initialised before submodules can be assigned.
        super().__init__()
        # NOTE(review): the extracted source shows only these two Linear layers
        # and had lost the list brackets; confirm that no other entries (for
        # example activation layers) were dropped along with them.
        self.layers = nn.ModuleList(
            [
                nn.Linear(embedding_dim, embedding_dim),
                nn.Linear(embedding_dim, embedding_dim),
            ]
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply every layer in order and add the original input back."""
        res = x
        for layer in self.layers:
            x = layer(x)
        x = x + res
        return x
class LinearNorm(nn.Module):
    """Thin wrapper around ``nn.Linear`` whose bias, when enabled, starts at zero.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        bias: Whether the linear layer has a bias term. Defaults to ``False``.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = False):
        # nn.Module must be initialised before submodules can be assigned.
        super().__init__()
        self.linear = nn.Linear(in_features, out_features, bias)
        if bias:
            nn.init.constant_(self.linear.bias, 0.0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the linear projection to ``x``."""
        x = self.linear(x)
        return x
class STL(nn.Module):
    """
    A PyTorch module for the Style Token Layer (STL).

    NOTE(review): the previous docstring cited "A Style-Based Generator
    Architecture for Generative Adversarial Networks"; style tokens appear to
    come from "Style Tokens: Unsupervised Style Modeling, Control and Transfer
    in End-to-End Speech Synthesis" (Wang et al., 2018) - confirm the reference.

    The STL applies an attention mechanism (a single head here) over the
    learned style tokens, using the input as the query and the tanh-squashed
    style tokens as the keys and values. The output of the attention mechanism
    is used as the style embedding.

    Args:
        token_num (int): The number of style tokens.
        n_hidden (int): Number of hidden dimensions.
    """

    def __init__(self, n_hidden: int, token_num: int):
        super(STL, self).__init__()  # pylint: disable=super-with-arguments

        num_heads = 1
        E = n_hidden
        self.token_num = token_num
        # Learnable style tokens; values are set by the normal_ init below.
        self.embed = nn.Parameter(torch.FloatTensor(self.token_num, E // num_heads))
        d_q = E // 2
        d_k = E // num_heads
        self.attention = StyleEmbedAttention(query_dim=d_q, key_dim=d_k, num_units=E, num_heads=num_heads)

        torch.nn.init.normal_(self.embed, mean=0, std=0.5)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the attention-weighted combination of style tokens for ``x``.

        Args:
            x: Query tensor; the attention's query projection expects a last
                dimension of ``n_hidden // 2``.
        """
        N = x.size(0)
        query = x.unsqueeze(1)  # [N, 1, E//2]

        keys_soft = torch.tanh(self.embed).unsqueeze(0).expand(N, -1, -1)  # [N, token_num, E // num_heads]

        # Weighted sum
        emotion_embed_soft = self.attention(query, keys_soft)

        return emotion_embed_soft