|
from torch import nn
|
|
|
|
from TTS.tts.layers.generic.res_conv_bn import Conv1dBN
|
|
|
|
|
|
class DurationPredictor(nn.Module):
    """Duration predictor of the Speedy Speech model.

    Maps encoder outputs to one predicted duration value per phoneme.

    Note:
        The network output is meant to be read as log(durations); apply
        ``exp`` to it to recover durations on the linear scale.

    Layer stack, in order:
        conv_BN_4x1 -> conv_BN_3x1 -> conv_BN_1x1 -> conv_1x1

    Args:
        hidden_channels (int): channel count of the input and of every
            inner layer. The last layer projects down to a single channel.
    """

    def __init__(self, hidden_channels):
        super().__init__()

        # Trailing two arguments for each Conv1dBN, in stacking order. They
        # mirror the "4x1 / 3x1 / 1x1" naming used in the class docstring.
        # NOTE(review): presumably (kernel_size, dilation) -- confirm against
        # the Conv1dBN signature.
        conv_bn_specs = ((4, 1), (3, 1), (1, 1))

        stack = [
            Conv1dBN(hidden_channels, hidden_channels, first, second)
            for first, second in conv_bn_specs
        ]
        # Final 1x1 convolution collapses the hidden channels to one output.
        stack.append(nn.Conv1d(hidden_channels, 1, 1))

        self.layers = nn.ModuleList(stack)

    def forward(self, x, x_mask):
        """Run the input through every layer, masking after each one.

        Shapes:
            x: [B, C, T]
            x_mask: [B, 1, T]

        Returns:
            The output of the last layer multiplied by ``x_mask``. With the
            layers built in ``__init__`` this has a single channel.
        """
        hidden = x
        for block in self.layers:
            hidden = block(hidden)
            # Re-apply the mask after every layer, as the original does.
            hidden = hidden * x_mask
        return hidden
|
|
|