Update app.py
app.py CHANGED
@@ -1,13 +1,11 @@
 # Import the required modules
-import os
-import shutil
-import torch
-import torchaudio
 import gradio as gr
-
-from huggingface_hub import snapshot_download
-import spaces
+import torch
+from huggingface_hub import hf_hub_download
+import os
 import sys
+import tempfile
+from scipy.io.wavfile import write
 
 # Clone the repository if it has not been downloaded yet
 if not os.path.exists("XTTSv2-Finetuning-for-New-Languages"):
@@ -19,64 +17,36 @@ if os.path.exists("XTTSv2-Finetuning-for-New-Languages/TTS"):
 
 # Add the TTS folder to PYTHONPATH
 sys.path.append("./TTS")
-
+from tqdm import tqdm
 from underthesea import sent_tokenize
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 
+# Select the device (GPU if available, otherwise CPU)
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
-#
-if not os.path.exists("XTTSv2-Finetuning-for-New-Languages"):
-    os.system("git clone https://github.com/hellcatmon/XTTSv2-Finetuning-for-New-Languages.git")
-
-# Move the TTS directory to the current working directory
-if os.path.exists("XTTSv2-Finetuning-for-New-Languages/TTS"):
-    os.system("mv XTTSv2-Finetuning-for-New-Languages/TTS ./")
-
-# Add the TTS directory to the Python path
-import sys
-sys.path.append("./TTS")
-
-# Define repository and model paths
+# Model files hosted on Hugging Face
 repo_id = "archivartaunik/BE_XTTS_V2_60epoch3Dataset"
-
-local_repo_path = snapshot_download(repo_id)
-
-os.makedirs(destination_dir, exist_ok=True)
-
-print("Copying model files...")
-for root, _, files in os.walk(local_repo_path):
-    for file in files:
-        source_file = os.path.join(root, file)
-        relative_path = os.path.relpath(source_file, local_repo_path)
-        destination_file = os.path.join(destination_dir, relative_path)
-
-        os.makedirs(os.path.dirname(destination_file), exist_ok=True)
-        shutil.copy2(source_file, destination_file)
-
-print(f"Model files are saved in {destination_dir}.")
-
-# Load model
-print("Loading the model...")
-xtts_checkpoint = os.path.join(destination_dir, "model.pth")
-xtts_config = os.path.join(destination_dir, "config.json")
-xtts_vocab = os.path.join(destination_dir, "vocab.json")
+checkpoint_file = hf_hub_download(repo_id, filename="model.pth")
+config_file = hf_hub_download(repo_id, filename="config.json")
+vocab_file = hf_hub_download(repo_id, filename="vocab.json")
+default_voice_file = hf_hub_download(repo_id, filename="voice.wav")
 
+# Load the model configuration
 config = XttsConfig()
-config.load_json(xtts_config)
+config.load_json(config_file)
+
+# Initialize and load the model
 XTTS_MODEL = Xtts.init_from_config(config)
-XTTS_MODEL.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
+XTTS_MODEL.load_checkpoint(config, checkpoint_path=checkpoint_file, vocab_path=vocab_file, use_deepspeed=False)
+XTTS_MODEL.to(device)
 
-    speaker_audio_file = os.path.join(destination_dir, "voice.wav")
+def text_to_speech(belarusian_story, lang="be", speaker_audio_file=None):
+    # If no file is provided, use the default voice
+    if not speaker_audio_file or (not isinstance(speaker_audio_file, str) and speaker_audio_file.name == ""):
+        speaker_audio_file = default_voice_file
 
+    # Obtain the conditioning latents and speaker embedding
     gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
         audio_path=speaker_audio_file,
         gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
@@ -84,8 +54,10 @@ def tts_inference(belarusian_text):
         sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
     )
 
+    # Split the text into individual sentences
+    tts_texts = sent_tokenize(belarusian_story)
 
+    # Generate audio for each sentence
     wav_chunks = []
     for text in tqdm(tts_texts):
        wav_chunk = XTTS_MODEL.inference(
@@ -101,33 +73,27 @@ def tts_inference(belarusian_text):
        )
        wav_chunks.append(torch.tensor(wav_chunk["wav"]))
 
+    # Concatenate all audio chunks into a single array
+    out_wav = torch.cat(wav_chunks, dim=0).squeeze().cpu().numpy()
 
-    #
+    # Save the audio to a temporary file
+    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+    write(temp_file.name, 24000, out_wav)
 
-    return
+    return temp_file.name
 
-# Create Gradio app
-@spaces.GPU(duration=220)
-def gradio_app():
-    with gr.Blocks() as app:
-        gr.Markdown("# Belarusian TTS Inference App")
-        text_input = gr.Textbox(label="Enter Belarusian Text", placeholder="Быў раз...")
-        audio_output = gr.Audio(label="Generated Speech")
-
-        generate_button = gr.Button("Generate Speech")
-
-        generate_button.click(
-            fn=tts_inference,
-            inputs=text_input,
-            outputs=audio_output,
-        )
 
+demo = gr.Interface(
+    fn=text_to_speech,
+    inputs=[
+        gr.Textbox(lines=5, label="Тэкст на беларускай мове"),
+        gr.Textbox(value="be", label="Мова (па змаўчанні BE)", visible=False),
+        gr.Audio(type="filepath", label="Запішыце або загрузіце файл голасу (без іншых гукаў) не карацей 7 секунд", interactive=True),
+    ],
+    outputs="audio",
+    title="XTTS Belarusian TTS Demo",
+    description="Увядзіце тэкст, і мадэль пераўтворыць яго ў аўдыя. Вы можаце выкарыстоўваць голас па змаўчанні, загрузіць уласны файл або запісаць аўдыё.",
+)
 
-# Launch the app
 if __name__ == "__main__":
-    app.launch()
+    demo.launch(server_name="0.0.0.0", server_port=7860, enable_queue=True)
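For reference, once this revision is running, the gr.Interface defined above can also be called programmatically with gradio_client. The sketch below is a minimal illustration only: the Space id is a placeholder (the diff does not name the Space), and the three positional arguments mirror the interface inputs above (the Belarusian text, the hidden language textbox, and an optional reference voice file; None falls back to the voice.wav bundled with the model repo).

from gradio_client import Client

# Placeholder Space id -- substitute the actual <user>/<space> hosting this app.py
client = Client("user/space-name")

# Positional arguments follow the gr.Interface inputs defined above:
# text to synthesise, the hidden language value ("be"), and an optional
# speaker reference file (None -> the default voice.wav from the model repo)
wav_path = client.predict(
    "Прывітанне! Гэта тэставы сказ.",
    "be",
    None,
    api_name="/predict",
)
print("Generated audio saved at:", wav_path)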