Spaces:

Flux9665
/

MassivelyMultilingualTTS

Running on T4

App Files Files

Flux9665 committed on Jun 11, 2024

Commit

c69f215

1 Parent(s): 9c3ebc7

remove the spaces environment, since we now have a permanent GPU

Browse files

Files changed (3) hide show

Architectures/ControllabilityGAN/wgan/wgan_qc.py +2 -2
InferenceInterfaces/ToucanTTSInterface.py +3 -11
app.py +2 -9

Architectures/ControllabilityGAN/wgan/wgan_qc.py CHANGED Viewed

@@ -246,9 +246,9 @@ class WassersteinGanQuadraticCost(torch.nn.Module):
         if nograd:
             with torch.no_grad():
                 if isinstance(self.G, torch.nn.parallel.DataParallel):
-                    generated_data = self.G.module(latent_samples.to("cpu"), return_intermediate=return_intermediate)
                 else:
-                    generated_data = self.G(latent_samples.to("cpu"), return_intermediate=return_intermediate)
         else:
             generated_data = self.G(latent_samples)
         self.G.train()

         if nograd:
             with torch.no_grad():
                 if isinstance(self.G, torch.nn.parallel.DataParallel):
+                    generated_data = self.G.module(latent_samples, return_intermediate=return_intermediate)
                 else:
+                    generated_data = self.G(latent_samples, return_intermediate=return_intermediate)
         else:
             generated_data = self.G(latent_samples)
         self.G.train()

InferenceInterfaces/ToucanTTSInterface.py CHANGED Viewed

@@ -1,14 +1,12 @@
 import itertools
 import os
 import warnings
-from typing import cast
 import matplotlib.pyplot as plt
 import pyloudnorm
 import sounddevice
 import soundfile
 import torch
-import spaces
 with warnings.catch_warnings():
     warnings.simplefilter("ignore")
     from speechbrain.pretrained import EncoderClassifier
@@ -127,7 +125,6 @@ class ToucanTTSInterface(torch.nn.Module):
         self.lang_id = get_language_id(lang_id).to(self.device)
-    @spaces.GPU
     def forward(self,
                 text,
                 view=False,
@@ -153,19 +150,15 @@ class ToucanTTSInterface(torch.nn.Module):
                                    1.0 means no scaling happens, higher values increase variance of the energy curve,
                                    lower values decrease variance of the energy curve.
         """
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.device = device
-        self.to(device)
         with torch.inference_mode():
             phones = self.text2phone.string_to_tensor(text, input_phonemes=input_is_phones).to(torch.device(self.device))
             mel, durations, pitch, energy = self.phone2mel(phones,
                                                            return_duration_pitch_energy=True,
-                                                           utterance_embedding=self.default_utterance_embedding.to(device),
                                                            durations=durations,
                                                            pitch=pitch,
                                                            energy=energy,
-                                                           lang_id=self.lang_id.to(device),
                                                            duration_scaling_factor=duration_scaling_factor,
                                                            pitch_variance_scale=pitch_variance_scale,
                                                            energy_variance_scale=energy_variance_scale,
@@ -228,8 +221,7 @@ class ToucanTTSInterface(torch.nn.Module):
             if return_plot_as_filepath:
                 plt.savefig("tmp.png")
                 return wave, sr, "tmp.png"
-        self.to("cpu")
-        self.device="cpu"
         return wave, sr
     def read_to_file(self,

 import itertools
 import os
 import warnings
 import matplotlib.pyplot as plt
 import pyloudnorm
 import sounddevice
 import soundfile
 import torch
 with warnings.catch_warnings():
     warnings.simplefilter("ignore")
     from speechbrain.pretrained import EncoderClassifier
         self.lang_id = get_language_id(lang_id).to(self.device)
     def forward(self,
                 text,
                 view=False,
                                    1.0 means no scaling happens, higher values increase variance of the energy curve,
                                    lower values decrease variance of the energy curve.
         """
         with torch.inference_mode():
             phones = self.text2phone.string_to_tensor(text, input_phonemes=input_is_phones).to(torch.device(self.device))
             mel, durations, pitch, energy = self.phone2mel(phones,
                                                            return_duration_pitch_energy=True,
+                                                           utterance_embedding=self.default_utterance_embedding.to(self.device),
                                                            durations=durations,
                                                            pitch=pitch,
                                                            energy=energy,
+                                                           lang_id=self.lang_id.to(self.device),
                                                            duration_scaling_factor=duration_scaling_factor,
                                                            pitch_variance_scale=pitch_variance_scale,
                                                            energy_variance_scale=energy_variance_scale,
             if return_plot_as_filepath:
                 plt.savefig("tmp.png")
                 return wave, sr, "tmp.png"
         return wave, sr
     def read_to_file(self,

app.py CHANGED Viewed

@@ -1,7 +1,5 @@
 import os
-import spaces
 from run_model_downloader import download_models
 if not os.path.exists("Models/ToucanTTS_Meta/best.pt"):
@@ -23,17 +21,12 @@ class ControllableInterface(torch.nn.Module):
     def __init__(self, available_artificial_voices=1000):
         super().__init__()
-        self.model = ToucanTTSInterface(device="cpu", tts_model_path="Meta", language="eng")
-        self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device="cpu")
         self.generated_speaker_embeds = list()
         self.available_artificial_voices = available_artificial_voices
         self.current_language = ""
         self.current_accent = ""
-        self.device = "cpu"
-        self.model.to("cpu")
-        self.model.device = "cpu"
-        self.wgan.to("cpu")
-        self.wgan.device = "cpu"
     def read(self,
              prompt,

 import os
 from run_model_downloader import download_models
 if not os.path.exists("Models/ToucanTTS_Meta/best.pt"):
     def __init__(self, available_artificial_voices=1000):
         super().__init__()
+        self.model = ToucanTTSInterface(device="cuda", tts_model_path="Meta", language="eng")
+        self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device="cuda")
         self.generated_speaker_embeds = list()
         self.available_artificial_voices = available_artificial_voices
         self.current_language = ""
         self.current_accent = ""
     def read(self,
              prompt,