Spaces:
Running
Running
support timbre confusion
Browse files- models/tts/vits/vits.py +16 -6
models/tts/vits/vits.py
CHANGED
|
@@ -317,12 +317,15 @@ class SynthesizerTrn(nn.Module):
|
|
| 317 |
"logs_q": logs_q,
|
| 318 |
}
|
| 319 |
return outputs
|
| 320 |
-
|
|
|
|
| 321 |
def infer(
|
| 322 |
self,
|
| 323 |
x,
|
| 324 |
x_lengths,
|
| 325 |
-
|
|
|
|
|
|
|
| 326 |
noise_scale=1,
|
| 327 |
length_scale=1,
|
| 328 |
noise_scale_w=1.0,
|
|
@@ -330,13 +333,20 @@ class SynthesizerTrn(nn.Module):
|
|
| 330 |
):
|
| 331 |
x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
|
| 332 |
if self.n_speakers > 0:
|
| 333 |
-
|
| 334 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
else:
|
| 336 |
g = None
|
| 337 |
-
|
| 338 |
-
print('g.shape: ', g.shape)
|
| 339 |
|
|
|
|
|
|
|
| 340 |
if self.use_sdp:
|
| 341 |
logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
|
| 342 |
else:
|
|
|
|
| 317 |
"logs_q": logs_q,
|
| 318 |
}
|
| 319 |
return outputs
|
| 320 |
+
|
| 321 |
+
|
| 322 |
def infer(
|
| 323 |
self,
|
| 324 |
x,
|
| 325 |
x_lengths,
|
| 326 |
+
sid_1=None,
|
| 327 |
+
sid_2=None,
|
| 328 |
+
alpha=0.5,
|
| 329 |
noise_scale=1,
|
| 330 |
length_scale=1,
|
| 331 |
noise_scale_w=1.0,
|
|
|
|
| 333 |
):
|
| 334 |
x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
|
| 335 |
if self.n_speakers > 0:
|
| 336 |
+
if sid_2 is None:
|
| 337 |
+
sid = sid_1.squeeze(-1)
|
| 338 |
+
g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
|
| 339 |
+
else:
|
| 340 |
+
sid_1= sid_1.squeeze(-1)
|
| 341 |
+
g_1 = self.emb_g(sid_1).unsqueeze(-1)
|
| 342 |
+
sid_2= sid_2.squeeze(-1)
|
| 343 |
+
g_2 = self.emb_g(sid_2).unsqueeze(-1)
|
| 344 |
+
g = interpolate_embeddings(g_1,g_2,alpha)
|
| 345 |
else:
|
| 346 |
g = None
|
|
|
|
|
|
|
| 347 |
|
| 348 |
+
print('g.shape: ', g.shape)
|
| 349 |
+
|
| 350 |
if self.use_sdp:
|
| 351 |
logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
|
| 352 |
else:
|