---
license: apache-2.0
datasets:
- ESpeech/ESpeech-webinars2
language:
- ru
tags:
- TTS
- F5-TTS
---

Install the required dependencies:
```
pip install f5-tts gradio ruaccent transformers torch torchaudio huggingface_hub soundfile
```

Run the code below and wait for the message with the web interface address.

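Once the model, RUAccent and the vocoder have loaded, Gradio prints the local address of the interface (typically `http://127.0.0.1:7860`). If you also want a temporary public link, one optional tweak (a standard Gradio option, not something this model requires) is to enable sharing in the last lines of the script:

```py
# Optional: replace the final lines of the script below to get a temporary public Gradio URL.
if __name__ == "__main__":
    app.launch(share=True)
```
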
```py
#!/usr/bin/env python3
import os
import gc
import tempfile
import traceback
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
import torch
import torchaudio
from huggingface_hub import hf_hub_download, snapshot_download
from ruaccent import RUAccent
from f5_tts.infer.utils_infer import (
    infer_process,
    load_model,
    load_vocoder,
    preprocess_ref_audio_text,
    remove_silence_for_generated_wav,
    save_spectrogram,
    tempfile_kwargs,
)
from f5_tts.model import DiT

MODEL_CFG = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
MODEL_REPO = "ESpeech/ESpeech-TTS-1_RL-V2"
MODEL_FILE = "espeech_tts_rlv2.pt"
VOCAB_FILE = "vocab.txt"

loaded_model = None

def ensure_model():
    """Download the checkpoint and vocab (if needed) and cache the loaded model in a global."""
    global loaded_model
    if loaded_model is not None:
        return loaded_model
    model_path = None
    vocab_path = None
    print(f"Trying to download model file '{MODEL_FILE}' and '{VOCAB_FILE}' from hub '{MODEL_REPO}'")
    try:
        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        vocab_path = hf_hub_download(repo_id=MODEL_REPO, filename=VOCAB_FILE)
        print(f"Downloaded model to {model_path}")
        print(f"Downloaded vocab to {vocab_path}")
    except Exception as e:
        print("hf_hub_download failed:", e)
    if model_path is None or vocab_path is None:
        try:
            local_dir = f"cache_{MODEL_REPO.replace('/', '_')}"
            print(f"Attempting snapshot_download into {local_dir}...")
            # Fall back to downloading the whole repo; pick up an access token from the environment if one is set.
            snapshot_dir = snapshot_download(repo_id=MODEL_REPO, cache_dir=None, local_dir=local_dir, token=os.environ.get("HF_TOKEN"))
            possible_model = os.path.join(snapshot_dir, MODEL_FILE)
            possible_vocab = os.path.join(snapshot_dir, VOCAB_FILE)
            if os.path.exists(possible_model):
                model_path = possible_model
            if os.path.exists(possible_vocab):
                vocab_path = possible_vocab
            print(f"Snapshot downloaded to {snapshot_dir}")
        except Exception as e:
            print("snapshot_download failed:", e)
    if not model_path or not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found after download attempts: {model_path}")
    if not vocab_path or not os.path.exists(vocab_path):
        raise FileNotFoundError(f"Vocab file not found after download attempts: {vocab_path}")
    print(f"Loading model from: {model_path}")
    loaded_model = load_model(DiT, MODEL_CFG, model_path, vocab_file=vocab_path)
    return loaded_model

print("Preloading model...")
try:
    ensure_model()
    print("Model preloaded.")
except Exception as e:
    print(f"Model preload failed: {e}")

print("Loading RUAccent...")
accentizer = RUAccent()
accentizer.load(omograph_model_size='turbo3.1', use_dictionary=True, tiny_mode=False)
print("RUAccent loaded.")

print("Loading vocoder...")
vocoder = load_vocoder()
print("Vocoder loaded.")

def process_text_with_accent(text, accentizer):
    """Add stress marks with RUAccent unless the text already contains manual '+' accents."""
    if not text or not text.strip():
        return text
    if '+' in text:
        return text
    else:
        return accentizer.process_all(text)

def process_texts_only(ref_text, gen_text):
    processed_ref_text = process_text_with_accent(ref_text, accentizer)
    processed_gen_text = process_text_with_accent(gen_text, accentizer)
    return processed_ref_text, processed_gen_text

def synthesize(
    ref_audio,
    ref_text,
    gen_text,
    remove_silence,
    seed,
    cross_fade_duration=0.15,
    nfe_step=32,
    speed=1.0,
):
    if not ref_audio:
        gr.Warning("Please provide reference audio.")
        return None, None, ref_text, gen_text
    if seed is None or seed < 0 or seed > 2**31 - 1:
        seed = np.random.randint(0, 2**31 - 1)
    torch.manual_seed(int(seed))
    if not gen_text or not gen_text.strip():
        gr.Warning("Please enter text to generate.")
        return None, None, ref_text, gen_text
    if not ref_text or not ref_text.strip():
        gr.Warning("Please provide reference text.")
        return None, None, ref_text, gen_text
    processed_ref_text = process_text_with_accent(ref_text, accentizer)
    processed_gen_text = process_text_with_accent(gen_text, accentizer)
    try:
        model = ensure_model()
    except Exception as e:
        gr.Warning(f"Failed to load model: {e}")
        return None, None, processed_ref_text, processed_gen_text
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    try:
        # Keep the model on CPU between requests; move it to the GPU only for the duration of a call.
        if device.type == "cuda":
            try:
                model.to(device)
                vocoder.to(device)
            except Exception as e:
                print("Warning: failed to move model/vocoder to cuda:", e)
        try:
            ref_audio_proc, processed_ref_text_final = preprocess_ref_audio_text(
                ref_audio,
                processed_ref_text,
                show_info=gr.Info
            )
        except Exception as e:
            gr.Warning(f"Preprocess failed: {e}")
            traceback.print_exc()
            return None, None, processed_ref_text, processed_gen_text
        try:
            final_wave, final_sample_rate, combined_spectrogram = infer_process(
                ref_audio_proc,
                processed_ref_text_final,
                processed_gen_text,
                model,
                vocoder,
                cross_fade_duration=cross_fade_duration,
                nfe_step=nfe_step,
                speed=speed,
                show_info=gr.Info,
                progress=gr.Progress(),
            )
        except Exception as e:
            gr.Warning(f"Infer failed: {e}")
            traceback.print_exc()
            return None, None, processed_ref_text, processed_gen_text
        if remove_silence:
            try:
                # Reserve a temporary wav path, write the generated audio there, trim silence and reload it.
                with tempfile.NamedTemporaryFile(suffix=".wav", **tempfile_kwargs) as f:
                    temp_path = f.name
                sf.write(temp_path, final_wave, final_sample_rate)
                remove_silence_for_generated_wav(temp_path)
                final_wave_tensor, _ = torchaudio.load(temp_path)
                final_wave = final_wave_tensor.squeeze().cpu().numpy()
            except Exception as e:
                print("Remove silence failed:", e)
        try:
            with tempfile.NamedTemporaryFile(suffix=".png", **tempfile_kwargs) as tmp_spectrogram:
                spectrogram_path = tmp_spectrogram.name
            save_spectrogram(combined_spectrogram, spectrogram_path)
        except Exception as e:
            print("Save spectrogram failed:", e)
            spectrogram_path = None
        return (final_sample_rate, final_wave), spectrogram_path, processed_ref_text_final, processed_gen_text
    finally:
        if device.type == "cuda":
            try:
                model.to("cpu")
                vocoder.to("cpu")
                torch.cuda.empty_cache()
                gc.collect()
            except Exception as e:
                print("Warning during cuda cleanup:", e)

with gr.Blocks(title="ESpeech-TTS") as app:
    gr.Markdown("# ESpeech-TTS")
    # Tip (Russian): put '+' before the stressed vowel to set stress manually, e.g. 'прив+ет'.
    gr.Markdown("💡 **Совет:** Добавьте символ '+' для ударения (например, 'прив+ет')")
    # Tip (Russian): the reference audio should be no longer than 12 seconds.
    gr.Markdown("❌ **Совет:** Референс должен быть не более 12 секунд")
    with gr.Row():
        with gr.Column():
            ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
            ref_text_input = gr.Textbox(
                label="Reference Text",
                lines=2,
                placeholder="Text corresponding to reference audio"
            )
        with gr.Column():
            gen_text_input = gr.Textbox(
                label="Text to Generate",
                lines=5,
                max_lines=20,
                placeholder="Enter text to synthesize..."
            )
    process_text_btn = gr.Button("✏️ Process Text (Add Accents)", variant="secondary")
    with gr.Accordion("Advanced Settings", open=False):
        with gr.Row():
            seed_input = gr.Number(label="Seed (-1 for random)", value=-1, precision=0)
            remove_silence = gr.Checkbox(label="Remove Silences", value=False)
        with gr.Row():
            speed_slider = gr.Slider(label="Speed", minimum=0.3, maximum=2.0, value=1.0, step=0.1)
            nfe_slider = gr.Slider(label="NFE Steps", minimum=4, maximum=64, value=48, step=2)
            cross_fade_slider = gr.Slider(label="Cross-Fade Duration (s)", minimum=0.0, maximum=1.0, value=0.15, step=0.01)
    generate_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
    with gr.Row():
        audio_output = gr.Audio(label="Generated Audio", type="numpy")
        spectrogram_output = gr.Image(label="Spectrogram", type="filepath")
    process_text_btn.click(
        process_texts_only,
        inputs=[ref_text_input, gen_text_input],
        outputs=[ref_text_input, gen_text_input]
    )
    generate_btn.click(
        synthesize,
        inputs=[
            ref_audio_input,
            ref_text_input,
            gen_text_input,
            remove_silence,
            seed_input,
            cross_fade_slider,
            nfe_slider,
            speed_slider,
        ],
        outputs=[audio_output, spectrogram_output, ref_text_input, gen_text_input]
    )

if __name__ == "__main__":
    app.launch()
```
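
If you only need to synthesize audio from a script rather than through the web UI, the same pieces can be wired together directly. Below is a minimal sketch under the assumption that the `f5_tts` helpers accept the same arguments they are given in the app above; `ref.wav`, its transcript, and `out.wav` are placeholders for your own files:

```py
# Headless inference sketch: same checkpoint, vocoder and helpers as the Gradio app above.
import soundfile as sf
from huggingface_hub import hf_hub_download
from ruaccent import RUAccent
from f5_tts.infer.utils_infer import infer_process, load_model, load_vocoder, preprocess_ref_audio_text
from f5_tts.model import DiT

model_path = hf_hub_download("ESpeech/ESpeech-TTS-1_RL-V2", "espeech_tts_rlv2.pt")
vocab_path = hf_hub_download("ESpeech/ESpeech-TTS-1_RL-V2", "vocab.txt")
model = load_model(
    DiT,
    dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
    model_path,
    vocab_file=vocab_path,
)
vocoder = load_vocoder()

accentizer = RUAccent()
accentizer.load(omograph_model_size='turbo3.1', use_dictionary=True, tiny_mode=False)

ref_text = accentizer.process_all("Текст, который звучит в ref.wav")  # transcript of the reference clip
gen_text = accentizer.process_all("Текст, который нужно озвучить")    # text to synthesize

# Preprocess the reference clip and transcript, run inference, and save the result.
ref_audio, ref_text = preprocess_ref_audio_text("ref.wav", ref_text, show_info=print)
wave, sample_rate, _ = infer_process(
    ref_audio, ref_text, gen_text, model, vocoder,
    nfe_step=48, speed=1.0, show_info=print,
)
sf.write("out.wav", wave, sample_rate)
```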