|
import tempfile |
|
import numpy as np |
|
from scipy.io.wavfile import write |
|
import gradio as gr |
|
from transformers import VitsTokenizer, VitsModel, set_seed, pipeline |
|
|
|
|
|
class CustomFlagging(gr.FlaggingCallback):
    """Flagging callback that prints flagged samples to stdout."""

    def __init__(self):
        super().__init__()
        # Running total of flag() calls; the base-class contract expects
        # flag() to return the number of samples flagged so far.
        self._flagged_count = 0

    def setup(self, *args, **kwargs):
        """Accept and ignore whatever setup arguments are passed in."""

    def flag(self, flag_data, flag_option=None, username=None):
        """Print the flagged sample and return the running flag count.

        Args:
            flag_data: Data of the flagged sample; printed as-is.
            flag_option: Optional label attached to the flag; printed as-is.
            username: Accepted for interface compatibility; unused.

        Returns:
            Total number of samples flagged through this instance so far.
        """
        print(f"Аудио: {flag_data}, Сообщение: {flag_option}")
        self._flagged_count += 1
        return self._flagged_count
|
|
|
|
|
|
|
# Feedback labels describing the quality of a synthesized sample.
flagging_options = [
    "Хорошая озвучка",
    "Слышен механический треск",
    "Не совпадает произношение букв",
    "Проглочены буквы",
]

# Single shared instance of the stdout-printing flagging callback.
flagging_callback = CustomFlagging()
|
|
|
|
|
|
|
# Checkpoint identifier shared by the tokenizer, the model and the pipeline.
model_name = "leks-forever/vits_lez_tts"

tokenizer = VitsTokenizer.from_pretrained(model_name)
model = VitsModel.from_pretrained(model_name)

# Build the pipeline from the objects loaded above instead of passing the
# checkpoint name again, so the weights are not loaded a second time.
# NOTE: the pipeline now shares `model`, so attribute changes made on `model`
# (e.g. speaking_rate / noise_scale) also apply to pipeline output.
tts_pipeline = pipeline("text-to-speech", model=model, tokenizer=tokenizer)
|
|
|
|
|
def tts_function(input_text):
    """Synthesize speech for ``input_text`` and return the path of a WAV file.

    Args:
        input_text: Text to synthesize.

    Returns:
        Path of a temporary ``.wav`` file containing the generated waveform.
        The file is created with ``delete=False``, so it is not removed
        automatically.

    Raises:
        gr.Error: If ``input_text`` is empty or contains only whitespace.
    """
    # Local import: torch is not imported at file level.
    import torch

    # Fail early with a readable UI message instead of an opaque model error.
    if not input_text or not input_text.strip():
        raise gr.Error("Введите текст для озвучки.")

    # Fixed seed and generation settings, applied before every forward pass.
    set_seed(900)
    model.speaking_rate = 0.9
    model.noise_scale = 0

    inputs = tokenizer(text=input_text, return_tensors="pt")

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    waveform = outputs.waveform[0].cpu().float().numpy()

    # Read the rate from the model config rather than running a second full
    # synthesis through the pipeline just to obtain this number.
    sampling_rate = model.config.sampling_rate

    # Reserve a unique file name, then close the handle before writing:
    # re-opening a still-open NamedTemporaryFile by name fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
        wav_path = tmpfile.name
    write(wav_path, rate=sampling_rate, data=waveform)

    return wav_path
|
|
|
|
|
# UI components: one text box in, one audio player out.
text_input = gr.Textbox(label="Введите текст на лезгинском")
audio_output = gr.Audio(label="Аудио")

# NOTE(review): `flagging_callback` and `flagging_options` defined earlier in
# this file are not passed to the interface — confirm whether that is intended.
interface = gr.Interface(
    fn=tts_function,
    inputs=text_input,
    outputs=audio_output,
    title="Text-to-speech Лезги ЧIалал",
    flagging_mode="auto",
)

# Start the web app.
interface.launch()
|
|