Spaces:
Running
on
T4
Running
on
T4
remove the spaces environment, since we now have a permanent GPU
Browse files
Architectures/ControllabilityGAN/wgan/wgan_qc.py
CHANGED
|
@@ -246,9 +246,9 @@ class WassersteinGanQuadraticCost(torch.nn.Module):
|
|
| 246 |
if nograd:
|
| 247 |
with torch.no_grad():
|
| 248 |
if isinstance(self.G, torch.nn.parallel.DataParallel):
|
| 249 |
-
generated_data = self.G.module(latent_samples)
|
| 250 |
else:
|
| 251 |
-
generated_data = self.G(latent_samples)
|
| 252 |
else:
|
| 253 |
generated_data = self.G(latent_samples)
|
| 254 |
self.G.train()
|
|
|
|
| 246 |
if nograd:
|
| 247 |
with torch.no_grad():
|
| 248 |
if isinstance(self.G, torch.nn.parallel.DataParallel):
|
| 249 |
+
generated_data = self.G.module(latent_samples, return_intermediate=return_intermediate)
|
| 250 |
else:
|
| 251 |
+
generated_data = self.G(latent_samples, return_intermediate=return_intermediate)
|
| 252 |
else:
|
| 253 |
generated_data = self.G(latent_samples)
|
| 254 |
self.G.train()
|
InferenceInterfaces/ToucanTTSInterface.py
CHANGED
|
@@ -1,14 +1,12 @@
|
|
| 1 |
import itertools
|
| 2 |
import os
|
| 3 |
import warnings
|
| 4 |
-
from typing import cast
|
| 5 |
|
| 6 |
import matplotlib.pyplot as plt
|
| 7 |
import pyloudnorm
|
| 8 |
import sounddevice
|
| 9 |
import soundfile
|
| 10 |
import torch
|
| 11 |
-
import spaces
|
| 12 |
with warnings.catch_warnings():
|
| 13 |
warnings.simplefilter("ignore")
|
| 14 |
from speechbrain.pretrained import EncoderClassifier
|
|
@@ -127,7 +125,6 @@ class ToucanTTSInterface(torch.nn.Module):
|
|
| 127 |
|
| 128 |
self.lang_id = get_language_id(lang_id).to(self.device)
|
| 129 |
|
| 130 |
-
@spaces.GPU
|
| 131 |
def forward(self,
|
| 132 |
text,
|
| 133 |
view=False,
|
|
@@ -153,19 +150,15 @@ class ToucanTTSInterface(torch.nn.Module):
|
|
| 153 |
1.0 means no scaling happens, higher values increase variance of the energy curve,
|
| 154 |
lower values decrease variance of the energy curve.
|
| 155 |
"""
|
| 156 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 157 |
-
self.device = device
|
| 158 |
-
self.to(device)
|
| 159 |
-
|
| 160 |
with torch.inference_mode():
|
| 161 |
phones = self.text2phone.string_to_tensor(text, input_phonemes=input_is_phones).to(torch.device(self.device))
|
| 162 |
mel, durations, pitch, energy = self.phone2mel(phones,
|
| 163 |
return_duration_pitch_energy=True,
|
| 164 |
-
utterance_embedding=self.default_utterance_embedding.to(device),
|
| 165 |
durations=durations,
|
| 166 |
pitch=pitch,
|
| 167 |
energy=energy,
|
| 168 |
-
lang_id=self.lang_id.to(device),
|
| 169 |
duration_scaling_factor=duration_scaling_factor,
|
| 170 |
pitch_variance_scale=pitch_variance_scale,
|
| 171 |
energy_variance_scale=energy_variance_scale,
|
|
@@ -228,8 +221,7 @@ class ToucanTTSInterface(torch.nn.Module):
|
|
| 228 |
if return_plot_as_filepath:
|
| 229 |
plt.savefig("tmp.png")
|
| 230 |
return wave, sr, "tmp.png"
|
| 231 |
-
|
| 232 |
-
self.device="cpu"
|
| 233 |
return wave, sr
|
| 234 |
|
| 235 |
def read_to_file(self,
|
|
|
|
| 1 |
import itertools
|
| 2 |
import os
|
| 3 |
import warnings
|
|
|
|
| 4 |
|
| 5 |
import matplotlib.pyplot as plt
|
| 6 |
import pyloudnorm
|
| 7 |
import sounddevice
|
| 8 |
import soundfile
|
| 9 |
import torch
|
|
|
|
| 10 |
with warnings.catch_warnings():
|
| 11 |
warnings.simplefilter("ignore")
|
| 12 |
from speechbrain.pretrained import EncoderClassifier
|
|
|
|
| 125 |
|
| 126 |
self.lang_id = get_language_id(lang_id).to(self.device)
|
| 127 |
|
|
|
|
| 128 |
def forward(self,
|
| 129 |
text,
|
| 130 |
view=False,
|
|
|
|
| 150 |
1.0 means no scaling happens, higher values increase variance of the energy curve,
|
| 151 |
lower values decrease variance of the energy curve.
|
| 152 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
with torch.inference_mode():
|
| 154 |
phones = self.text2phone.string_to_tensor(text, input_phonemes=input_is_phones).to(torch.device(self.device))
|
| 155 |
mel, durations, pitch, energy = self.phone2mel(phones,
|
| 156 |
return_duration_pitch_energy=True,
|
| 157 |
+
utterance_embedding=self.default_utterance_embedding.to(self.device),
|
| 158 |
durations=durations,
|
| 159 |
pitch=pitch,
|
| 160 |
energy=energy,
|
| 161 |
+
lang_id=self.lang_id.to(self.device),
|
| 162 |
duration_scaling_factor=duration_scaling_factor,
|
| 163 |
pitch_variance_scale=pitch_variance_scale,
|
| 164 |
energy_variance_scale=energy_variance_scale,
|
|
|
|
| 221 |
if return_plot_as_filepath:
|
| 222 |
plt.savefig("tmp.png")
|
| 223 |
return wave, sr, "tmp.png"
|
| 224 |
+
|
|
|
|
| 225 |
return wave, sr
|
| 226 |
|
| 227 |
def read_to_file(self,
|
app.py
CHANGED
|
@@ -1,7 +1,5 @@
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
-
import spaces
|
| 4 |
-
|
| 5 |
from run_model_downloader import download_models
|
| 6 |
|
| 7 |
if not os.path.exists("Models/ToucanTTS_Meta/best.pt"):
|
|
@@ -23,17 +21,12 @@ class ControllableInterface(torch.nn.Module):
|
|
| 23 |
|
| 24 |
def __init__(self, available_artificial_voices=1000):
|
| 25 |
super().__init__()
|
| 26 |
-
self.model = ToucanTTSInterface(device="cpu", tts_model_path="Meta", language="eng")
|
| 27 |
-
self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device="cpu")
|
| 28 |
self.generated_speaker_embeds = list()
|
| 29 |
self.available_artificial_voices = available_artificial_voices
|
| 30 |
self.current_language = ""
|
| 31 |
self.current_accent = ""
|
| 32 |
-
self.device = "cpu"
|
| 33 |
-
self.model.to("cpu")
|
| 34 |
-
self.model.device = "cpu"
|
| 35 |
-
self.wgan.to("cpu")
|
| 36 |
-
self.wgan.device = "cpu"
|
| 37 |
|
| 38 |
def read(self,
|
| 39 |
prompt,
|
|
|
|
| 1 |
import os
|
| 2 |
|
|
|
|
|
|
|
| 3 |
from run_model_downloader import download_models
|
| 4 |
|
| 5 |
if not os.path.exists("Models/ToucanTTS_Meta/best.pt"):
|
|
|
|
| 21 |
|
| 22 |
def __init__(self, available_artificial_voices=1000):
|
| 23 |
super().__init__()
|
| 24 |
+
self.model = ToucanTTSInterface(device="cuda", tts_model_path="Meta", language="eng")
|
| 25 |
+
self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device="cuda")
|
| 26 |
self.generated_speaker_embeds = list()
|
| 27 |
self.available_artificial_voices = available_artificial_voices
|
| 28 |
self.current_language = ""
|
| 29 |
self.current_accent = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
def read(self,
|
| 32 |
prompt,
|