File size: 2,230 Bytes
a0a99a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
acd6515
 
a0a99a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import gradio as gr
import numpy as np
from kittentts import KittenTTS

# Build the shared KittenTTS instance once at import time; generate_speech
# reuses this module-level object for every request.
MODEL_NAME = "KittenML/kitten-tts-nano-0.1"
model = KittenTTS(MODEL_NAME)

# Voice identifiers offered in the dropdown: numbers 2 through 5, each in an
# "m" and an "f" variant ("m" listed first within every pair).
AVAILABLE_VOICES = [
    f"expr-voice-{number}-{variant}"
    for number in range(2, 6)
    for variant in ("m", "f")
]

def generate_speech(text, voice):
    """Generate speech audio for ``text`` using the module-level KittenTTS model.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input is
            rejected with a prompt message and never reaches the model.
        voice: Voice identifier forwarded to ``model.generate``.

    Returns:
        An ``(audio, status)`` pair. On success ``audio`` is a
        ``(sample_rate, samples)`` tuple holding peak-normalized samples;
        on any failure it is ``None``. ``status`` is always a human-readable
        message for the status textbox.
    """
    # Check for None before calling .strip(): this guard sits outside the
    # try block, so an AttributeError here would escape as an unhandled error.
    if text is None or not text.strip():
        return None, "Please enter some text to generate speech."

    try:
        # Coerce to an ndarray so the shape/size checks below work even if the
        # model hands back a plain sequence.
        audio = np.asarray(model.generate(text, voice=voice))

        # Report an empty result explicitly instead of letting np.max fail
        # with an opaque "zero-size array" reduction error.
        if audio.size == 0:
            return None, "❌ Error generating speech: the model returned no audio."

        # Collapse multi-dimensional output to one dimension by averaging
        # over axis 1.
        # NOTE(review): assumes a (samples, channels) layout — confirm
        # against the model's actual output shape.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        # Peak-normalize. The peak is computed once, and all-zero audio is
        # left unchanged to avoid dividing by zero.
        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = audio / peak

        # Gradio's Audio component accepts a (sample_rate, data) tuple.
        # NOTE(review): 24000 is presumably the model's output sample rate —
        # confirm against the KittenTTS documentation.
        return (24000, audio), f"✅ Successfully generated speech with voice: {voice}"

    except Exception as exc:
        # UI boundary: surface any failure as a status message rather than
        # crashing the request.
        return None, f"❌ Error generating speech: {exc}"

# Assemble the UI with gr.Interface (rather than gr.Blocks). The two input
# components feed generate_speech, and its (audio, status) result fills the
# two output components in the same order.
text_input = gr.Textbox(
    label="Enter your text",
    placeholder="Type your text here...",
    lines=3,
)
voice_input = gr.Dropdown(
    choices=AVAILABLE_VOICES,
    value=AVAILABLE_VOICES[1],
    label="Select Voice",
)
audio_output = gr.Audio(label="Generated Audio")
status_output = gr.Textbox(label="Status", interactive=False)

# Each example row is [text, voice], matching the order of the inputs.
example_inputs = [
    ["Hello! This is a demonstration of the KittenTTS model.", "expr-voice-2-f"],
    ["The quick brown fox jumps over the lazy dog.", "expr-voice-2-m"],
    ["Welcome to our high-quality text-to-speech system.", "expr-voice-3-f"],
]

demo = gr.Interface(
    fn=generate_speech,
    inputs=[text_input, voice_input],
    outputs=[audio_output, status_output],
    title="🎤 KittenTTS - High Quality Text-to-Speech",
    description="Generate natural-sounding speech from text using the KittenTTS model",
    examples=example_inputs,
)

# Start the Gradio server only when this file is run as a script, not when it
# is imported.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",  # bind address handed to Gradio
        "server_port": 7860,
        "share": True,  # forwarded to Gradio's share option
    }
    demo.launch(**launch_options)