File size: 4,123 Bytes
1337d7e
 
 
 
 
 
 
d278189
 
1337d7e
 
 
d278189
 
 
 
1337d7e
d278189
 
1337d7e
 
 
 
 
 
 
 
 
 
 
d278189
 
1337d7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d278189
1337d7e
 
d278189
1337d7e
d278189
1337d7e
 
 
 
 
 
 
 
 
 
 
 
 
 
1ea9ebb
d278189
1337d7e
 
d278189
1337d7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d278189
1337d7e
 
 
 
 
 
d278189
1337d7e
 
 
 
 
 
 
f642cbb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
import gradio as gr
import tempfile
import soundfile as sf
from models import Tokenizer, Kokoro

# Function to fetch available style vectors dynamically


def get_style_vector_choices(directory="voices"):
    """Return the names of the ``.pt`` style-vector files in *directory*.

    Args:
        directory: Folder to scan. A relative path (such as the default
            ``"voices"``) is resolved against the current working directory.

    Returns:
        A sorted list of entry names (not full paths) ending in ``.pt``.
        ``os.listdir`` yields entries in arbitrary order, so the result is
        sorted to keep the dropdown order, and the first entry used as its
        default, stable between runs and machines.

    Raises:
        OSError: Propagated from ``os.listdir`` (for example
            ``FileNotFoundError`` when *directory* does not exist).
    """
    return sorted(name for name in os.listdir(directory) if name.endswith(".pt"))


def get_onnx_models(directory="weights"):
    """Return the names of the ``.onnx`` model files in *directory*.

    Args:
        directory: Folder to scan. A relative path (such as the default
            ``"weights"``) is resolved against the current working directory.

    Returns:
        A sorted list of entry names (not full paths) ending in ``.onnx``.
        ``os.listdir`` yields entries in arbitrary order, so the result is
        sorted to keep the dropdown order, and the first entry used as its
        default, stable between runs and machines.

    Raises:
        OSError: Propagated from ``os.listdir`` (for example
            ``FileNotFoundError`` when *directory* does not exist).
    """
    return sorted(name for name in os.listdir(directory) if name.endswith(".onnx"))

# Function to perform TTS using your local model


def local_tts(
        text: str,
        model_path: str,
        style_vector: str,
        output_file_format: str = "wav",
        speed: float = 1.0
):
    """Synthesize *text* to an audio file with a local Kokoro ONNX model.

    Args:
        text: Text to convert to speech. Must contain at least one
            non-whitespace character.
        model_path: File name of the ONNX model inside the ``weights`` folder.
        style_vector: File name of the style vector inside the ``voices``
            folder.
        output_file_format: Extension given to the generated temporary file
            (e.g. ``"wav"``). NOTE(review): the audio writer presumably picks
            the container format from this extension - confirm that every
            offered format is supported by the installed soundfile backend.
        speed: Speed value forwarded to ``Kokoro.generate_audio``.

    Returns:
        Path of the temporary audio file. The file is created with
        ``delete=False``, so it is not removed automatically.

    Raises:
        gr.Error: If *text* is empty/blank, or if any step of model loading,
            inference or audio writing fails (the original exception is
            chained as the cause).
    """
    # Guard clause: also rejects None and whitespace-only input, which
    # `len(text) > 0` either crashed on or let through to inference.
    if not text or not text.strip():
        raise gr.Error("Input text cannot be empty.")

    temp_file_path = None
    try:
        tokenizer = Tokenizer()
        style_vector_path = os.path.join("voices", style_vector)
        onnx_path = os.path.join("weights", model_path)

        inference = Kokoro(onnx_path, style_vector_path, tokenizer=tokenizer, lang='en-us')

        audio, sample_rate = inference.generate_audio(text, speed=speed)

        # Only reserve a unique path here; the handle is closed on leaving the
        # `with` so the audio writer below is the sole opener of the file.
        with tempfile.NamedTemporaryFile(suffix=f".{output_file_format}", delete=False) as temp_file:
            temp_file_path = temp_file.name
        sf.write(temp_file_path, audio, sample_rate)

        return temp_file_path

    except Exception as e:
        # The temp file was created with delete=False, so remove it ourselves
        # when writing failed; otherwise every failed request leaks a file.
        if temp_file_path is not None:
            try:
                os.remove(temp_file_path)
            except OSError:
                pass  # best-effort cleanup; the inference error is what matters
        raise gr.Error(f"An error occurred during TTS inference: {str(e)}") from e


# Discover the selectable voices and ONNX models once, at start-up.
# The directories are spelled out because local_tts() resolves the chosen
# file names against these same two folders.
style_vector_choices = get_style_vector_choices(directory="voices")
onnx_models_choices = get_onnx_models(directory="weights")

# Sample texts and their pre-generated audio: (topic, text, audio file path).
sample_outputs = [
    ("Educational Note", "Machine learning models rely on large datasets and complex algorithms to identify patterns and make predictions.", "assets/edu_note.wav"),
    ("Fun Fact", "Did you know that honey never spoils? Archaeologists have found pots of honey in ancient Egyptian tombs that are over 3,000 years old and still edible!", "assets/fun_fact.wav"),
    ("Thanks", "Thank you for listening to this audio. It was generated by the Kokoro TTS model.", "assets/thanks.wav")
]

# One single-element row per sample text (one row per example, one column per
# input). Derived from sample_outputs so the two lists cannot drift apart.
example_texts = [[sample_text] for _topic, sample_text, _audio in sample_outputs]

# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("## <center> Kokoro TTS ONNX Inference | [GitHub Link](https://github.com/yakhyo/kokoro-onnx) </center>")

    # Model-specific inputs
    with gr.Row(variant="panel"):
        # Preselect the first discovered file, but fall back to no selection
        # when a folder has no matching files instead of crashing with an
        # IndexError while the UI is being built.
        model_path = gr.Dropdown(
            choices=onnx_models_choices,
            label="ONNX Model Path",
            value=onnx_models_choices[0] if onnx_models_choices else None,
        )
        style_vector = gr.Dropdown(
            choices=style_vector_choices,
            label="Style Vector",
            value=style_vector_choices[0] if style_vector_choices else None,
        )
        output_file_format = gr.Dropdown(choices=["wav", "mp3"], label="Output Format", value="wav")
        speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")

    # Text input and output
    text = gr.Textbox(
        label="Input Text",
        placeholder="Enter text to convert to speech."
    )
    btn = gr.Button("Generate Speech")
    output_audio = gr.Audio(label="Generated Audio", type="filepath")

    # Link inputs and outputs: the input order must match local_tts's
    # positional parameters (text, model_path, style_vector, format, speed).
    btn.click(
        fn=local_tts,
        inputs=[text, model_path, style_vector, output_file_format, speed],
        outputs=output_audio
    )

    # Clickable example texts that populate the input box
    gr.Examples(
        examples=example_texts,
        inputs=[text],
        label="Click an example to populate the input text"
    )

    # Read-only showcase of sample texts next to their pre-generated audio
    gr.Markdown("### Sample Texts and Audio")
    for topic, sample_text, sample_audio in sample_outputs:
        with gr.Row():
            gr.Textbox(value=sample_text, label=topic, interactive=False)
            gr.Audio(value=sample_audio, label="Example Audio", type="filepath", interactive=False)

# Bind to all network interfaces so the app is reachable from outside the
# host/container, not only via localhost.
demo.launch(server_name="0.0.0.0")