archivartaunik committed on
Commit
5f7d4f2
·
verified ·
1 Parent(s): 18ea01f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -79
app.py CHANGED
@@ -1,13 +1,11 @@
1
  # Імпартуем патрэбныя модулі
2
- import os
3
- import shutil
4
- import torch
5
- import torchaudio
6
  import gradio as gr
7
- from tqdm import tqdm
8
- from huggingface_hub import snapshot_download
9
- import spaces
10
  import sys
 
 
11
 
12
  # Клонуем рэпазіторый, калі ён яшчэ не загружаны
13
  if not os.path.exists("XTTSv2-Finetuning-for-New-Languages"):
@@ -19,64 +17,36 @@ if os.path.exists("XTTSv2-Finetuning-for-New-Languages/TTS"):
19
 
20
  # Дадаем тэчку TTS у PYTHONPATH
21
  sys.path.append("./TTS")
22
-
23
  from underthesea import sent_tokenize
24
  from TTS.tts.configs.xtts_config import XttsConfig
25
  from TTS.tts.models.xtts import Xtts
26
 
 
 
27
 
28
- # Clone the repository if not already present
29
- if not os.path.exists("XTTSv2-Finetuning-for-New-Languages"):
30
- os.system("git clone https://github.com/hellcatmon/XTTSv2-Finetuning-for-New-Languages.git")
31
-
32
- # Move the TTS directory to the current working directory
33
- if os.path.exists("XTTSv2-Finetuning-for-New-Languages/TTS"):
34
- os.system("mv XTTSv2-Finetuning-for-New-Languages/TTS ./")
35
-
36
- # Add the TTS directory to the Python path
37
- import sys
38
- sys.path.append("./TTS")
39
-
40
- # Define repository and model paths
41
  repo_id = "archivartaunik/BE_XTTS_V2_60epoch3Dataset"
42
- destination_dir = "checkpoints/XTTS_v2.0_original_model_files/"
43
-
44
- # Download model files
45
- print("Downloading model files from Hugging Face...")
46
- local_repo_path = snapshot_download(repo_id)
47
-
48
- os.makedirs(destination_dir, exist_ok=True)
49
-
50
- print("Copying model files...")
51
- for root, _, files in os.walk(local_repo_path):
52
- for file in files:
53
- source_file = os.path.join(root, file)
54
- relative_path = os.path.relpath(source_file, local_repo_path)
55
- destination_file = os.path.join(destination_dir, relative_path)
56
-
57
- os.makedirs(os.path.dirname(destination_file), exist_ok=True)
58
- shutil.copy2(source_file, destination_file)
59
-
60
- print(f"Model files are saved in {destination_dir}.")
61
-
62
- # Load model
63
- print("Loading the model...")
64
- xtts_checkpoint = os.path.join(destination_dir, "model.pth")
65
- xtts_config = os.path.join(destination_dir, "config.json")
66
- xtts_vocab = os.path.join(destination_dir, "vocab.json")
67
 
 
68
  config = XttsConfig()
69
- config.load_json(xtts_config)
 
 
70
  XTTS_MODEL = Xtts.init_from_config(config)
71
- XTTS_MODEL.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
72
- print("Model loaded successfully!")
73
 
74
- # Function for inference
75
- @spaces.GPU
76
- def tts_inference(belarusian_text):
77
- lang = "be"
78
- speaker_audio_file = os.path.join(destination_dir, "voice.wav")
79
 
 
80
  gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
81
  audio_path=speaker_audio_file,
82
  gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
@@ -84,8 +54,10 @@ def tts_inference(belarusian_text):
84
  sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
85
  )
86
 
87
- tts_texts = sent_tokenize(belarusian_text)
 
88
 
 
89
  wav_chunks = []
90
  for text in tqdm(tts_texts):
91
  wav_chunk = XTTS_MODEL.inference(
@@ -101,33 +73,27 @@ def tts_inference(belarusian_text):
101
  )
102
  wav_chunks.append(torch.tensor(wav_chunk["wav"]))
103
 
104
- out_wav = torch.cat(wav_chunks, dim=0).unsqueeze(0).cpu()
 
105
 
106
- # Save the generated audio
107
- output_path = "output.wav"
108
- torchaudio.save(output_path, out_wav, sample_rate=24000)
109
 
110
- return output_path
111
 
112
- # Create Gradio app
113
- @spaces.GPU(duration=220)
114
- def gradio_app():
115
- with gr.Blocks() as app:
116
- gr.Markdown("# Belarusian TTS Inference App")
117
- text_input = gr.Textbox(label="Enter Belarusian Text", placeholder="Быў раз...")
118
- audio_output = gr.Audio(label="Generated Speech")
119
-
120
- generate_button = gr.Button("Generate Speech")
121
-
122
- generate_button.click(
123
- fn=tts_inference,
124
- inputs=text_input,
125
- outputs=audio_output,
126
- )
127
 
128
- return app
 
 
 
 
 
 
 
 
 
 
129
 
130
- # Launch the app
131
  if __name__ == "__main__":
132
- app = gradio_app()
133
- app.launch()
 
1
  # Імпартуем патрэбныя модулі
 
 
 
 
2
  import gradio as gr
3
+ import torch
4
+ from huggingface_hub import hf_hub_download
5
+ import os
6
  import sys
7
+ import tempfile
8
+ from scipy.io.wavfile import write
9
 
10
  # Клонуем рэпазіторый, калі ён яшчэ не загружаны
11
  if not os.path.exists("XTTSv2-Finetuning-for-New-Languages"):
 
17
 
18
  # Дадаем тэчку TTS у PYTHONPATH
19
  sys.path.append("./TTS")
20
+ from tqdm import tqdm
21
  from underthesea import sent_tokenize
22
  from TTS.tts.configs.xtts_config import XttsConfig
23
  from TTS.tts.models.xtts import Xtts
24
 
25
+ # Вызначэнне прылады (GPU, калі даступны, інакш CPU)
26
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
27
 
28
+ # Шлях да мадэлі ў Hugging Face
 
 
 
 
 
 
 
 
 
 
 
 
29
  repo_id = "archivartaunik/BE_XTTS_V2_60epoch3Dataset"
30
+ checkpoint_file = hf_hub_download(repo_id, filename="model.pth")
31
+ config_file = hf_hub_download(repo_id, filename="config.json")
32
+ vocab_file = hf_hub_download(repo_id, filename="vocab.json")
33
+ default_voice_file = hf_hub_download(repo_id, filename="voice.wav")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
+ # Загрузка канфігурацыі мадэлі
36
  config = XttsConfig()
37
+ config.load_json(config_file)
38
+
39
+ # Ініцыялізацыя і загрузка мадэлі
40
  XTTS_MODEL = Xtts.init_from_config(config)
41
+ XTTS_MODEL.load_checkpoint(config, checkpoint_path=checkpoint_file, vocab_path=vocab_file, use_deepspeed=False)
42
+ XTTS_MODEL.to(device)
43
 
44
+ def text_to_speech(belarusian_story, lang="be", speaker_audio_file=None):
45
+ # Калі файл не пададзены, выкарыстоўваем голас па змаўчанні
46
+ if not speaker_audio_file or (not isinstance(speaker_audio_file, str) and speaker_audio_file.name == ""):
47
+ speaker_audio_file = default_voice_file
 
48
 
49
+ # Атрыманне латэнтных умоў GPT і эмбедынгу дыктара (speaker embedding)
50
  gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
51
  audio_path=speaker_audio_file,
52
  gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
 
54
  sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
55
  )
56
 
57
+ # Токенізацыя тэксту на асобныя сказы
58
+ tts_texts = sent_tokenize(belarusian_story)
59
 
60
+ # Генерацыя аўдыё для кожнага сказа
61
  wav_chunks = []
62
  for text in tqdm(tts_texts):
63
  wav_chunk = XTTS_MODEL.inference(
 
73
  )
74
  wav_chunks.append(torch.tensor(wav_chunk["wav"]))
75
 
76
+ # Аб'ядноўваем усе часткі аўдыё ў адзін масіў
77
+ out_wav = torch.cat(wav_chunks, dim=0).squeeze().cpu().numpy()
78
 
79
+ # Захоўваем аўдыё ў часовы файл
80
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
81
+ write(temp_file.name, 24000, out_wav)
82
 
83
+ return temp_file.name
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
+ demo = gr.Interface(
87
+ fn=text_to_speech,
88
+ inputs=[
89
+ gr.Textbox(lines=5, label="Тэкст на беларускай мове"),
90
+ gr.Textbox(value="be", label="Мова (па змаўчанні BE)", visible=False),
91
+ gr.Audio(type="filepath", label="Запішыце або загрузіце файл голасу (без іншых гукаў) не карацей 7 секунд", interactive=True),
92
+ ],
93
+ outputs="audio",
94
+ title="XTTS Belarusian TTS Demo",
95
+ description="Увядзіце тэкст, і мадэль пераўтворыць яго ў аўдыя. Вы можаце выкарыстоўваць голас па змаўчанні, загрузіць уласны файл або запісаць аўдыё.",
96
+ )
97
 
 
98
  if __name__ == "__main__":
99
+ demo.launch(server_name="0.0.0.0", server_port=7860, enable_queue=True)