lez-tts / app.py
alialek's picture
add pauses and controls
cb1ad55
raw
history blame
3.45 kB
import tempfile
import numpy as np
from scipy.io.wavfile import write
import gradio as gr
from transformers import VitsTokenizer, VitsModel, set_seed, pipeline
from numToLez import numToLez
# Load your fine-tuned model
model_name = "leks-forever/vits_lez_tts" # Replace with your Hugging Face model name
# Module-level tokenizer and model: loaded once when this module is imported
# and reused by every call to tts_function.
tokenizer = VitsTokenizer.from_pretrained(model_name)
model = VitsModel.from_pretrained(model_name)
# High-level "text-to-speech" pipeline created from the same checkpoint name
# (this is a second load of the checkpoint in addition to `model` above).
tts_pipeline = pipeline("text-to-speech", model=model_name)
# Punctuation characters that tts_function replaces with a space when its
# `add_pauses` flag is set. Note that '.' appears in both strings.
new_sentence = '!.?'
in_sentence = ',-.:;'
def canonize_lez(text):
    """Return *text* with palochka look-alikes normalized to 'Ӏ'.

    Whenever one of the abruptive letters (к, п, т, ц, ч in either case) is
    immediately followed by a character commonly typed in place of the
    palochka ('1', 'l', 'i', 'I', '|' or a palochka variant), that following
    character is replaced with the canonical 'Ӏ'. All other text is returned
    unchanged.
    """
    abruptive_letters = ['к', 'К', 'п', 'П', 'т', 'Т', 'ц', 'Ц', 'ч', 'Ч']
    palochka_variants = ['1', 'l', 'i', 'I', '|', 'ӏ', 'Ӏ', 'ӏ']
    canonical_mark = 'Ӏ'

    # Precompute every (typed pair -> canonical pair) substitution, in the
    # same letter-major order the replacements are applied.
    substitutions = [
        (letter + variant, letter + canonical_mark)
        for letter in abruptive_letters
        for variant in palochka_variants
    ]

    normalized = text
    for typed_pair, canonical_pair in substitutions:
        normalized = normalized.replace(typed_pair, canonical_pair)
    return normalized
def tts_function(input_text, speaking_rate, noise_scale, add_pauses):
    """Synthesize speech for *input_text* and return the path of a WAV file.

    Args:
        input_text: Text to speak. It is passed through ``canonize_lez``
            before tokenization.
        speaking_rate: Value assigned to ``model.speaking_rate``.
        noise_scale: Value assigned to ``model.noise_scale``.
        add_pauses: When truthy, every character listed in ``new_sentence``
            and ``in_sentence`` is replaced with a space before tokenization.

    Returns:
        Path of a newly created temporary ``.wav`` file. The file is created
        with ``delete=False``, so it is not removed by this function.
    """
    # Imported here because the file does not import torch at module level.
    import torch

    fixed_text = canonize_lez(input_text)
    if add_pauses:
        for symb in new_sentence:
            fixed_text = fixed_text.replace(symb, ' ')
        for symb in in_sentence:
            fixed_text = fixed_text.replace(symb, ' ')

    inputs = tokenizer(text=fixed_text, return_tensors="pt")

    # Fixed seed so repeated requests with the same settings sound the same.
    set_seed(900)
    # Apply the user-selected synthesis controls. NOTE: these are attributes
    # of the shared module-level model, so they persist until the next call.
    model.speaking_rate = speaking_rate
    model.noise_scale = noise_scale

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    waveform = outputs.waveform[0].detach().cpu().float().numpy()

    # Read the sampling rate from the model config instead of running a
    # second, full synthesis through the pipeline just to obtain it.
    sampling_rate = model.config.sampling_rate

    # Reserve a unique path and close the handle before scipy reopens the
    # file by name (writing to a still-open temp file fails on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
        wav_path = tmpfile.name
    write(wav_path, rate=sampling_rate, data=waveform)
    return wav_path  # Return the filepath
# interface = gr.Interface(
# fn=tts_function,
# inputs=[
# gr.Textbox(label="Введите текст на лезгинском"),
# gr.Slider(label="Скорость речи", minimum=0, maximum=2, step=0.1, value=0.9),
# gr.Slider(label="Шум", minimum=0, maximum=5, step=0.1, value=0),
# gr.Checkbox(label="Сделать паузы длиннее", value=False),
# ],
# outputs=gr.Audio(label="Аудио"),
# title="Text-to-speech Лезги ЧIалал",
# submit_button=gr.Button("Сгенерировать"),
# flagging_mode="auto", # Enable the flagging button
# )
# Two-column Gradio UI: text and synthesis controls on the left, the
# generated audio on the right. Components render in the order they are
# created inside each context manager.
with gr.Blocks() as interface:
    gr.Markdown("### Text-to-speech Лезги ЧIалал")
    with gr.Row():
        # Left Column: Inputs
        with gr.Column():
            # Label: "Enter text in Lezgian"
            input_text = gr.Textbox(label="Введите текст на лезгинском", elem_id="custom-input")
            # Label: "Add more pauses at punctuation marks"
            add_pauses = gr.Checkbox(label="Добавить больше пауз у знаков препинания", value=False)
            # Label: "Speech rate"
            speaking_rate = gr.Slider(label="Скорость речи (speaking_rate)", minimum=0, maximum=2, step=0.1, value=0.9)
            # Label: "Noise"
            noise_scale = gr.Slider(label="Шум (noise_scale)", minimum=0, maximum=5, step=0.1, value=0)
            # Button text: "Generate"
            submit_button = gr.Button("Сгенерировать")
        # Right Column: Output
        with gr.Column():
            # Label: "Audio"
            output_audio = gr.Audio(label="Аудио")
    # Link function to button
    # The `inputs` list follows tts_function's positional parameter order
    # (input_text, speaking_rate, noise_scale, add_pauses), which differs
    # from the on-screen order of the components above.
    submit_button.click(
        fn=tts_function,
        inputs=[input_text, speaking_rate, noise_scale, add_pauses],
        outputs=output_audio,
    )
# Launch the app
interface.launch()