Spaces:

Vishwas1
/

KittenTTSDemo

Runtime error

App Files Files Community

KittenTTSDemo / app.py

Vishwas1

Upload 5 files

e327671 verified 3 months ago

raw

history blame

5.96 kB

	import gradio as gr
	import soundfile as sf
	import numpy as np
	from kittentts import KittenTTS
	import os

	# Build the shared KittenTTS model object once, when the module is imported.
	model = KittenTTS("KittenML/kitten-tts-nano-0.1")

	# Voice identifiers offered in the UI: voice numbers 2 through 5, each in a
	# male ("m") and a female ("f") variant, ordered 2-m, 2-f, 3-m, 3-f, ...
	AVAILABLE_VOICES = [
	    f"expr-voice-{number}-{gender}"
	    for number in range(2, 6)
	    for gender in ("m", "f")
	]

	# Output sample rate handed to Gradio together with the samples. The value
	# follows the "Sample Rate: 24kHz" statement in this app's footer.
	# NOTE(review): confirm against the KittenTTS model documentation.
	_SAMPLE_RATE_HZ = 24_000


	def generate_speech(text, voice, progress=gr.Progress()):
	    """
	    Generate speech from text using KittenTTS.

	    Args:
	        text: Text to synthesize. ``None``, empty, or whitespace-only input
	            is rejected with a prompt message instead of calling the model.
	        voice: Voice identifier forwarded to ``model.generate``.
	        progress: Gradio progress tracker used to report the current step.

	    Returns:
	        A ``(audio, status)`` pair. On success ``audio`` is a
	        ``(sample_rate, samples)`` tuple, which is the form a
	        ``gr.Audio(type="numpy")`` output accepts, and ``status`` is a
	        success message. On rejection or failure ``audio`` is ``None`` and
	        ``status`` explains why.
	    """
	    # Guard against None as well as blank text so .strip() cannot raise.
	    if not text or not text.strip():
	        return None, "Please enter some text to generate speech."

	    try:
	        progress(0.3, desc="Loading model...")

	        progress(0.6, desc="Generating speech...")
	        samples = np.asarray(model.generate(text, voice=voice))

	        progress(0.9, desc="Processing audio...")

	        # Collapse multi-channel output to mono.
	        # NOTE(review): assumes a 2-D result is (samples, channels) - confirm.
	        if samples.ndim > 1:
	            samples = samples.mean(axis=1)

	        # Peak-normalize; compute the peak once and skip all-zero audio to
	        # avoid dividing by zero. An empty array raises here and is
	        # reported through the except branch below.
	        peak = np.max(np.abs(samples))
	        if peak > 0:
	            samples = samples / peak

	        samples = samples.astype(np.float32)

	        progress(1.0, desc="Complete!")

	        # Gradio needs the sample rate alongside the data; a bare array is
	        # not a valid value for a numpy-typed Audio output.
	        return (
	            (_SAMPLE_RATE_HZ, samples),
	            f"✅ Successfully generated speech with voice: {voice}",
	        )

	    except Exception as e:
	        # UI boundary: surface the failure in the status box instead of
	        # letting the request crash.
	        return None, f"❌ Error generating speech: {str(e)}"

	def create_demo():
	    """
	    Create the Gradio demo interface.

	    Builds a ``gr.Blocks`` layout with a text box, a voice dropdown and a
	    generate button, an information panel listing the voices, an audio
	    player with a status box, example inputs and a footer. Both the button
	    click and pressing Enter in the text box call ``generate_speech``.

	    Returns:
	        The constructed ``gr.Blocks`` object; launching it is left to the
	        caller.
	    """

	    # Custom CSS for better styling
	    css = """
	.gradio-container {
	max-width: 800px !important;
	margin: auto !important;
	}
	.main-header {
	text-align: center;
	margin-bottom: 2rem;
	}
	.voice-selector {
	margin: 1rem 0;
	}
	.output-audio {
	margin-top: 1rem;
	}
	"""

	    # Derive the voice summary from AVAILABLE_VOICES so the help text and
	    # the info panel cannot drift from the dropdown choices. The "-m" and
	    # "-f" suffixes mark male and female voices.
	    male_voices = [v for v in AVAILABLE_VOICES if v.endswith("-m")]
	    female_voices = [v for v in AVAILABLE_VOICES if v.endswith("-f")]
	    voice_summary = (
	        f"Choose from {len(AVAILABLE_VOICES)} different voices "
	        f"({len(male_voices)} male, {len(female_voices)} female)"
	    )

	    with gr.Blocks(css=css, title="KittenTTS - High Quality Text-to-Speech") as demo:

	        # Header
	        gr.HTML("""
	<div class="main-header">
	<h1>🎤 KittenTTS</h1>
	<p><em>High Quality Text-to-Speech Generation</em></p>
	<p>Generate natural-sounding speech from text using the KittenTTS model</p>
	</div>
	""")

	        with gr.Row():
	            with gr.Column(scale=2):
	                # Text input
	                text_input = gr.Textbox(
	                    label="Enter your text",
	                    placeholder="Type or paste your text here...",
	                    lines=4,
	                    max_lines=10,
	                )

	                # Voice selection
	                voice_dropdown = gr.Dropdown(
	                    choices=AVAILABLE_VOICES,
	                    value=AVAILABLE_VOICES[1],  # Default to female voice
	                    label="Select Voice",
	                    info=voice_summary,
	                )

	                # Generate button
	                generate_btn = gr.Button(
	                    "🎵 Generate Speech",
	                    variant="primary",
	                    size="lg",
	                )

	            with gr.Column(scale=1):
	                # Voice info
	                gr.HTML(f"""
	<div style="background: #f0f0f0; padding: 1rem; border-radius: 8px;">
	<h3>Available Voices:</h3>
	<ul>
	<li><strong>Male voices:</strong> {", ".join(male_voices)}</li>
	<li><strong>Female voices:</strong> {", ".join(female_voices)}</li>
	</ul>
	</div>
	""")

	        # Output section
	        with gr.Row():
	            with gr.Column():
	                # Audio output
	                audio_output = gr.Audio(
	                    label="Generated Audio",
	                    type="numpy",
	                )

	                # Status message
	                status_output = gr.Textbox(
	                    label="Status",
	                    interactive=False,
	                )

	        # Example texts
	        gr.Examples(
	            examples=[
	                ["Hello! This is a demonstration of the KittenTTS model.", "expr-voice-2-f"],
	                ["The quick brown fox jumps over the lazy dog.", "expr-voice-2-m"],
	                ["Welcome to our high-quality text-to-speech system.", "expr-voice-3-f"],
	                ["This model works without requiring a GPU.", "expr-voice-3-m"],
	            ],
	            inputs=[text_input, voice_dropdown],
	        )

	        # Footer (a plain "|" separator; the former "\|" was an invalid
	        # escape sequence that rendered a stray backslash)
	        gr.HTML("""
	<div style="text-align: center; margin-top: 2rem; padding: 1rem; background: #f9f9f9; border-radius: 8px;">
	<p><strong>KittenTTS</strong> - Powered by <a href="https://huggingface.co/KittenML/kitten-tts-nano-0.1" target="_blank">KittenML/kitten-tts-nano-0.1</a></p>
	<p>Model: KittenTTS Nano v0.1 | Sample Rate: 24kHz</p>
	</div>
	""")

	        # The button click and the Enter key in the text box share the
	        # same handler and the same input/output wiring.
	        synthesis_wiring = {
	            "fn": generate_speech,
	            "inputs": [text_input, voice_dropdown],
	            "outputs": [audio_output, status_output],
	        }
	        generate_btn.click(**synthesis_wiring)
	        text_input.submit(**synthesis_wiring)

	    return demo

	# Script entry point: build the interface and start serving it.
	if __name__ == "__main__":
	    # Listen on every network interface, port 7860, with a share link
	    # requested and debug mode off.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": True,
	        "debug": False,
	    }
	    demo = create_demo()
	    demo.launch(**launch_options)