---
license: apache-2.0
datasets:
- ESpeech/ESpeech-webinars2
language:
- ru
tags:
- TTS
- F5-TTS
---

ESpeech-TTS is a Russian F5-TTS voice-cloning model (`ESpeech/ESpeech-TTS-1_RL-V2`). The script below launches a Gradio web demo with automatic stress placement via RUAccent.

Install the required dependencies (`soundfile` and `numpy` are imported directly by the script, so they are listed explicitly):
```
pip install f5-tts gradio ruaccent transformers torch torchaudio huggingface_hub soundfile numpy
```
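
The script downloads the checkpoint and vocabulary automatically on first run. If you prefer to pre-fetch them, here is a minimal sketch; the repo id and file names are the same constants the script uses:

```py
from huggingface_hub import hf_hub_download

# Same repo and file names as the constants in the demo script below.
model_path = hf_hub_download(repo_id="ESpeech/ESpeech-TTS-1_RL-V2", filename="espeech_tts_rlv2.pt")
vocab_path = hf_hub_download(repo_id="ESpeech/ESpeech-TTS-1_RL-V2", filename="vocab.txt")
print(model_path, vocab_path)
```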

Run the script and wait for the message with the web interface address:

```py
#!/usr/bin/env python3
import os
import gc
import tempfile
import traceback

import gradio as gr
import numpy as np
import soundfile as sf
import torch
import torchaudio
from huggingface_hub import hf_hub_download, snapshot_download
from ruaccent import RUAccent
from f5_tts.infer.utils_infer import (
    infer_process,
    load_model,
    load_vocoder,
    preprocess_ref_audio_text,
    remove_silence_for_generated_wav,
    save_spectrogram,
    tempfile_kwargs,
)
from f5_tts.model import DiT

# DiT config must match the checkpoint (F5-TTS base architecture).
MODEL_CFG = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
MODEL_REPO = "ESpeech/ESpeech-TTS-1_RL-V2"
MODEL_FILE = "espeech_tts_rlv2.pt"
VOCAB_FILE = "vocab.txt"

loaded_model = None

def ensure_model():
    """Download the checkpoint and vocab if needed, then load the model once."""
    global loaded_model
    if loaded_model is not None:
        return loaded_model
    model_path = None
    vocab_path = None
    print(f"Trying to download model file '{MODEL_FILE}' and '{VOCAB_FILE}' from hub '{MODEL_REPO}'")
    try:
        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        vocab_path = hf_hub_download(repo_id=MODEL_REPO, filename=VOCAB_FILE)
        print(f"Downloaded model to {model_path}")
        print(f"Downloaded vocab to {vocab_path}")
    except Exception as e:
        print("hf_hub_download failed:", e)
    if model_path is None or vocab_path is None:
        # Fallback: download the whole repo snapshot into a local directory.
        try:
            local_dir = f"cache_{MODEL_REPO.replace('/', '_')}"
            print(f"Attempting snapshot_download into {local_dir}...")
            snapshot_dir = snapshot_download(repo_id=MODEL_REPO, local_dir=local_dir, token=os.getenv("HF_TOKEN"))
            possible_model = os.path.join(snapshot_dir, MODEL_FILE)
            possible_vocab = os.path.join(snapshot_dir, VOCAB_FILE)
            if os.path.exists(possible_model):
                model_path = possible_model
            if os.path.exists(possible_vocab):
                vocab_path = possible_vocab
            print(f"Snapshot downloaded to {snapshot_dir}")
        except Exception as e:
            print("snapshot_download failed:", e)
    if not model_path or not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found after download attempts: {model_path}")
    if not vocab_path or not os.path.exists(vocab_path):
        raise FileNotFoundError(f"Vocab file not found after download attempts: {vocab_path}")
    print(f"Loading model from: {model_path}")
    loaded_model = load_model(DiT, MODEL_CFG, model_path, vocab_file=vocab_path)
    return loaded_model

print("Preloading model...")
try:
    ensure_model()
    print("Model preloaded.")
except Exception as e:
    print(f"Model preload failed: {e}")

print("Loading RUAccent...")
accentizer = RUAccent()
accentizer.load(omograph_model_size='turbo3.1', use_dictionary=True, tiny_mode=False)
print("RUAccent loaded.")

print("Loading vocoder...")
vocoder = load_vocoder()
print("Vocoder loaded.")

def process_text_with_accent(text, accentizer):
    """Add stress marks via RUAccent; texts that already contain '+' are left as-is."""
    if not text or not text.strip():
        return text
    if '+' in text:
        return text
    return accentizer.process_all(text)

def process_texts_only(ref_text, gen_text):
    processed_ref_text = process_text_with_accent(ref_text, accentizer)
    processed_gen_text = process_text_with_accent(gen_text, accentizer)
    return processed_ref_text, processed_gen_text

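# Note: accentizer.process_all() is expected to return text with '+' inserted
# before the stressed vowels, i.e. the same convention as manual 'прив+ет' marks.
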
def synthesize(
    ref_audio,
    ref_text,
    gen_text,
    remove_silence,
    seed,
    cross_fade_duration=0.15,
    nfe_step=32,
    speed=1.0,
):
    if not ref_audio:
        gr.Warning("Please provide reference audio.")
        return None, None, ref_text, gen_text
    if seed is None or seed < 0 or seed > 2**31 - 1:
        seed = np.random.randint(0, 2**31 - 1)
    torch.manual_seed(int(seed))
    if not gen_text or not gen_text.strip():
        gr.Warning("Please enter text to generate.")
        return None, None, ref_text, gen_text
    if not ref_text or not ref_text.strip():
        gr.Warning("Please provide reference text.")
        return None, None, ref_text, gen_text
    processed_ref_text = process_text_with_accent(ref_text, accentizer)
    processed_gen_text = process_text_with_accent(gen_text, accentizer)
    try:
        model = ensure_model()
    except Exception as e:
        gr.Warning(f"Failed to load model: {e}")
        return None, None, processed_ref_text, processed_gen_text
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    try:
        # Keep the model on CPU between requests; move it to the GPU only for inference.
        if device.type == "cuda":
            try:
                model.to(device)
                vocoder.to(device)
            except Exception as e:
                print("Warning: failed to move model/vocoder to cuda:", e)
        try:
            ref_audio_proc, processed_ref_text_final = preprocess_ref_audio_text(
                ref_audio,
                processed_ref_text,
                show_info=gr.Info
            )
        except Exception as e:
            gr.Warning(f"Preprocess failed: {e}")
            traceback.print_exc()
            return None, None, processed_ref_text, processed_gen_text
        try:
            final_wave, final_sample_rate, combined_spectrogram = infer_process(
                ref_audio_proc,
                processed_ref_text_final,
                processed_gen_text,
                model,
                vocoder,
                cross_fade_duration=cross_fade_duration,
                nfe_step=nfe_step,
                speed=speed,
                show_info=gr.Info,
                progress=gr.Progress(),
            )
        except Exception as e:
            gr.Warning(f"Infer failed: {e}")
            traceback.print_exc()
            return None, None, processed_ref_text, processed_gen_text
        if remove_silence:
            try:
                # The "with" block only reserves a persistent temp file name.
                with tempfile.NamedTemporaryFile(suffix=".wav", **tempfile_kwargs) as f:
                    temp_path = f.name
                sf.write(temp_path, final_wave, final_sample_rate)
                remove_silence_for_generated_wav(temp_path)
                final_wave_tensor, _ = torchaudio.load(temp_path)
                final_wave = final_wave_tensor.squeeze().cpu().numpy()
            except Exception as e:
                print("Remove silence failed:", e)
        try:
            with tempfile.NamedTemporaryFile(suffix=".png", **tempfile_kwargs) as tmp_spectrogram:
                spectrogram_path = tmp_spectrogram.name
            save_spectrogram(combined_spectrogram, spectrogram_path)
        except Exception as e:
            print("Save spectrogram failed:", e)
            spectrogram_path = None
        return (final_sample_rate, final_wave), spectrogram_path, processed_ref_text_final, processed_gen_text
    finally:
        if device.type == "cuda":
            try:
                model.to("cpu")
                vocoder.to("cpu")
                torch.cuda.empty_cache()
                gc.collect()
            except Exception as e:
                print("Warning during cuda cleanup:", e)

with gr.Blocks(title="ESpeech-TTS") as app:
    gr.Markdown("# ESpeech-TTS")
    gr.Markdown("💡 **Tip:** add '+' before the stressed vowel to place the stress manually (e.g. 'прив+ет')")
    gr.Markdown("❌ **Tip:** the reference audio should be no longer than 12 seconds")
    with gr.Row():
        with gr.Column():
            ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
            ref_text_input = gr.Textbox(
                label="Reference Text",
                lines=2,
                placeholder="Text corresponding to reference audio"
            )
        with gr.Column():
            gen_text_input = gr.Textbox(
                label="Text to Generate",
                lines=5,
                max_lines=20,
                placeholder="Enter text to synthesize..."
            )
            process_text_btn = gr.Button("✏️ Process Text (Add Accents)", variant="secondary")
    with gr.Accordion("Advanced Settings", open=False):
        with gr.Row():
            seed_input = gr.Number(label="Seed (-1 for random)", value=-1, precision=0)
            remove_silence = gr.Checkbox(label="Remove Silences", value=False)
        with gr.Row():
            speed_slider = gr.Slider(label="Speed", minimum=0.3, maximum=2.0, value=1.0, step=0.1)
            nfe_slider = gr.Slider(label="NFE Steps", minimum=4, maximum=64, value=48, step=2)
            cross_fade_slider = gr.Slider(label="Cross-Fade Duration (s)", minimum=0.0, maximum=1.0, value=0.15, step=0.01)
    generate_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
    with gr.Row():
        audio_output = gr.Audio(label="Generated Audio", type="numpy")
        spectrogram_output = gr.Image(label="Spectrogram", type="filepath")
    process_text_btn.click(
        process_texts_only,
        inputs=[ref_text_input, gen_text_input],
        outputs=[ref_text_input, gen_text_input]
    )
    generate_btn.click(
        synthesize,
        inputs=[
            ref_audio_input,
            ref_text_input,
            gen_text_input,
            remove_silence,
            seed_input,
            cross_fade_slider,
            nfe_slider,
            speed_slider,
        ],
        outputs=[audio_output, spectrogram_output, ref_text_input, gen_text_input]
    )

if __name__ == "__main__":
    app.launch()
```
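
For scripted synthesis without the web UI, here is a minimal sketch built from the same `f5_tts` calls the demo uses. It is an illustration, not part of the released code: `ref.wav`, the Russian strings, and `output.wav` are placeholders, and inference runs with the library's default settings:

```py
# Minimal batch-synthesis sketch (assumptions: same model/config as the demo;
# "ref.wav" and the Russian strings below are placeholders).
import soundfile as sf
from huggingface_hub import hf_hub_download
from ruaccent import RUAccent
from f5_tts.infer.utils_infer import infer_process, load_model, load_vocoder, preprocess_ref_audio_text
from f5_tts.model import DiT

model = load_model(
    DiT,
    dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
    hf_hub_download("ESpeech/ESpeech-TTS-1_RL-V2", "espeech_tts_rlv2.pt"),
    vocab_file=hf_hub_download("ESpeech/ESpeech-TTS-1_RL-V2", "vocab.txt"),
)
vocoder = load_vocoder()

accentizer = RUAccent()
accentizer.load(omograph_model_size='turbo3.1', use_dictionary=True, tiny_mode=False)

# Stress-mark both texts, then synthesize with the library defaults.
ref_audio, ref_text = preprocess_ref_audio_text("ref.wav", accentizer.process_all("Текст референса."))
wave, sample_rate, _ = infer_process(ref_audio, ref_text, accentizer.process_all("Привет, мир!"), model, vocoder)
sf.write("output.wav", wave, sample_rate)
```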