import tempfile
import numpy as np
from scipy.io.wavfile import write
import gradio as gr
from transformers import VitsTokenizer, VitsModel, set_seed
from numToLez import numToLez
import spaces

# Load the fine-tuned Lezgian VITS checkpoint from the Hugging Face Hub
model_name = "leks-forever/vits_lez_tts"
tokenizer = VitsTokenizer.from_pretrained(model_name)
model = VitsModel.from_pretrained(model_name)
model.to("cuda")


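# Punctuation groups used when "add pauses" is enabled: sentence-final marks are
# replaced with a long pause (three spaces), in-sentence marks with a short one (single space).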
new_sentence = '!.?'
in_sentence = ',-.:;'


def canonize_lez(text):
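    """Replace common stand-ins for the palochka (Ӏ) after ejective consonants with the canonical letter."""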
    for abruptive_letter in ['к', 'К', 'п', 'П', 'т', 'Т', 'ц', 'Ц', 'ч', 'Ч']:
        for abruptive_symbol in ['1', 'l', 'i', 'I', '|', 'ӏ', 'Ӏ', 'ӏ']:
            text = text.replace(abruptive_letter+abruptive_symbol, abruptive_letter+'Ӏ')
    return text

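# On Hugging Face ZeroGPU Spaces, spaces.GPU allocates a GPU for the duration of each call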
@spaces.GPU()
def tts_function(input_text, speaking_rate, noise_scale, add_pauses):
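    """Synthesize Lezgian speech for ``input_text`` and return the path of a temporary WAV file."""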
    fixed_text = canonize_lez(input_text)
    if add_pauses:
        for symb in new_sentence:
            fixed_text = fixed_text.replace(symb, '   ')

        for symb in in_sentence:
            fixed_text = fixed_text.replace(symb, ' ')

    # Tokenize the cleaned text and move the tensors to the same device as the model
    inputs = tokenizer(text=fixed_text, return_tensors="pt").to(model.device)
    # Fixed seed keeps the stochastic duration predictor reproducible
    set_seed(900)

    # VitsModel reads these attributes at inference time to control speed and variability
    model.speaking_rate = speaking_rate
    model.noise_scale = noise_scale

    sampling_rate = model.config.sampling_rate

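    # Run VITS inference and pull the generated waveform back to the CPU as float32 NumPy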
    outputs = model(**inputs)
    waveform = outputs.waveform[0]
    waveform = waveform.detach().cpu().float().numpy()

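    # Write the waveform to a temporary WAV file; Gradio serves the returned file path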
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
        write(tmpfile.name, rate=sampling_rate, data=waveform)
        return tmpfile.name  # Return the filepath



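# Gradio UI: text input, pause toggle, and synthesis sliders on the left; generated audio on the right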
with gr.Blocks() as interface:
    gr.Markdown("### Text-to-speech Лезги ЧIалал")

    with gr.Row():
        # Left Column: Inputs
        with gr.Column():
            input_text = gr.Textbox(label="Введите текст на лезгинском", elem_id="custom-input")
            add_pauses = gr.Checkbox(label="Добавить больше пауз у знаков препинания", value=False)
            speaking_rate = gr.Slider(label="Скорость речи (speaking_rate)", minimum=0, maximum=2, step=0.1, value=0.9)
            noise_scale = gr.Slider(label="Шум (noise_scale)", minimum=0, maximum=5, step=0.1, value=0)
            submit_button = gr.Button("Сгенерировать")
        
        # Right Column: Output
        with gr.Column():
            output_audio = gr.Audio(label="Аудио")

    # Link function to button
    submit_button.click(
        fn=tts_function,
        inputs=[input_text, speaking_rate, noise_scale, add_pauses],
        outputs=output_audio,
    )
# Launch the app
interface.launch()