Update ONNXVITS_infer.py

ONNXVITS_infer.py  CHANGED  (+6 -28)
@@ -125,6 +125,7 @@ class SynthesizerTrn(models.SynthesizerTrn):
                  gin_channels=0,
                  use_sdp=True,
                  emotion_embedding=False,
+                 ONNX_dir="./ONNX_net/",
                  **kwargs):
 
         super().__init__(
@@ -149,6 +150,7 @@ class SynthesizerTrn(models.SynthesizerTrn):
             use_sdp=use_sdp,
             **kwargs
         )
+        self.ONNX_dir = ONNX_dir
         self.enc_p = TextEncoder(n_vocab,
                                  inter_channels,
                                  hidden_channels,
@@ -172,7 +174,7 @@ class SynthesizerTrn(models.SynthesizerTrn):
             g = None
 
         # logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
-        logw = runonnx("ONNX_net/dp.onnx", x=x.numpy(), x_mask=x_mask.numpy(), g=g.numpy())
+        logw = runonnx(f"{self.ONNX_dir}dp.onnx", x=x.numpy(), x_mask=x_mask.numpy(), g=g.numpy())
         logw = torch.from_numpy(logw[0])
 
         w = torch.exp(logw) * x_mask * length_scale
@@ -189,35 +191,11 @@ class SynthesizerTrn(models.SynthesizerTrn):
         z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
 
         # z = self.flow(z_p, y_mask, g=g, reverse=True)
-        z = runonnx("ONNX_net/flow.onnx", z_p=z_p.numpy(), y_mask=y_mask.numpy(), g=g.numpy())
+        z = runonnx(f"{self.ONNX_dir}flow.onnx", z_p=z_p.numpy(), y_mask=y_mask.numpy(), g=g.numpy())
         z = torch.from_numpy(z[0])
 
         # o = self.dec((z * y_mask)[:,:,:max_len], g=g)
-        o = runonnx("ONNX_net/dec.onnx", z_in=(z * y_mask)[:, :, :max_len].numpy(), g=g.numpy())
+        o = runonnx(f"{self.ONNX_dir}dec.onnx", z_in=(z * y_mask)[:, :, :max_len].numpy(), g=g.numpy())
         o = torch.from_numpy(o[0])
 
-        return o, attn, y_mask, (z, z_p, m_p, logs_p)
-
-    def predict_duration(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None,
-                         emotion_embedding=None):
-        from ONNXVITS_utils import runonnx
-
-        # x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
-        x, m_p, logs_p, x_mask = runonnx("ONNX_net/enc_p.onnx", x=x.numpy(), x_lengths=x_lengths.numpy())
-        x = torch.from_numpy(x)
-        m_p = torch.from_numpy(m_p)
-        logs_p = torch.from_numpy(logs_p)
-        x_mask = torch.from_numpy(x_mask)
-
-        if self.n_speakers > 0:
-            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
-        else:
-            g = None
-
-        # logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
-        logw = runonnx("ONNX_net/dp.onnx", x=x.numpy(), x_mask=x_mask.numpy(), g=g.numpy())
-        logw = torch.from_numpy(logw[0])
-
-        w = torch.exp(logw) * x_mask * length_scale
-        w_ceil = torch.ceil(w)
-        return list(w_ceil.squeeze())
+        return o, attn, y_mask, (z, z_p, m_p, logs_p)
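For context, a minimal usage sketch of the new ONNX_dir argument. This is not part of the commit: the config path, the symbols import, and the hparams keys below follow the usual VITS setup and are assumptions about the surrounding project, not taken from this file.

import utils
from text.symbols import symbols
from ONNXVITS_infer import SynthesizerTrn

# Hypothetical config; the keys mirror the standard VITS hparams layout.
hps = utils.get_hparams_from_file("configs/config.json")

net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    ONNX_dir="exported_models/",  # new argument: directory holding dp.onnx, flow.onnx, dec.onnx
    **hps.model)

Because the model paths are built by plain string concatenation (f"{self.ONNX_dir}dp.onnx"), the directory value should end with a slash, as the default "./ONNX_net/" does.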
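The call sites above go through the runonnx helper imported from ONNXVITS_utils, which this commit does not modify. As a rough sketch of the contract they assume (an ONNX file path plus named NumPy inputs, returning the list of graph outputs), something along these lines would work with onnxruntime; it is an illustration, not the project's actual implementation.

import onnxruntime as ort

def runonnx(model_path, **kwargs):
    # Open a session for the requested ONNX file and feed the keyword
    # arguments directly as the graph's named inputs.
    sess = ort.InferenceSession(model_path)
    # run() returns one NumPy array per graph output, which matches the
    # logw[0] / z[0] / o[0] indexing used in the diff above.
    return sess.run(None, kwargs)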