Spaces:

gzyzgzi
/

voice-cloning-demo

Running

App Files Files Community

voice-cloning-demo / app.py

gzyzgzi

Upload 3 files

56f1a0d verified 3 months ago

raw

history blame

7.66 kB

	import gradio as gr
	import torch
	import soundfile as sf
	import numpy as np
	import tempfile
	import os
	from pathlib import Path

	# Pick the torch device once at import time: CUDA first, then Apple-Silicon
	# MPS, otherwise CPU. `device` is what the model loader consults and
	# `device_name` is the human-readable label printed at start-up.
	#
	# `torch.backends.mps` is missing on torch builds that predate the MPS
	# backend, so look it up defensively instead of letting an AttributeError
	# abort start-up on such installs.
	_mps_backend = getattr(torch.backends, "mps", None)

	if torch.cuda.is_available():
	    device = torch.device('cuda')
	    device_name = "GPU (CUDA)"
	elif _mps_backend is not None and _mps_backend.is_available():
	    device = torch.device('mps')
	    device_name = "GPU (Apple Silicon)"
	else:
	    device = torch.device('cpu')
	    device_name = "CPU"

	print(f"🖥️ Running on: {device_name}")

	# Module-wide model handles. All three start unset (None) and are filled in
	# lazily by load_models_once() the first time a generation is requested.
	tokenizer = model = codec_model = None

	def load_models_once():
	    """Load the tokenizer, language model and codec placeholder at most once.

	    Populates the module-level ``tokenizer``, ``model`` and ``codec_model``
	    globals. The globals are only assigned after *every* component has
	    loaded, so a failure part-way through leaves them untouched and the next
	    call retries from scratch.

	    Returns:
	        bool: True when all three globals are populated (by this call or an
	        earlier one); False when loading raised, in which case the error is
	        printed and the globals are left as they were.
	    """
	    global tokenizer, model, codec_model

	    # Check every handle, not just the tokenizer: a tokenizer-only guard
	    # would report success after a failed model load had left `model` unset.
	    if tokenizer is not None and model is not None and codec_model is not None:
	        return True

	    try:
	        from transformers import AutoTokenizer, AutoModelForCausalLM

	        print("🧠 Loading Llasa-3B...")
	        # NOTE: despite the log line above, the checkpoint loaded here is
	        # DialoGPT-medium, used as a stand-in for the demo.
	        use_accelerator = device.type != 'cpu'
	        loaded_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")  # Fallback for demo
	        # NOTE(review): device_map="auto" presumably needs the `accelerate`
	        # package at runtime - confirm it is listed in the requirements.
	        loaded_model = AutoModelForCausalLM.from_pretrained(
	            "microsoft/DialoGPT-medium",  # Fallback for demo
	            torch_dtype=torch.float16 if use_accelerator else torch.float32,
	            device_map="auto" if use_accelerator else None
	        )
	        loaded_model.eval()

	        print("🎵 XCodec2 placeholder loaded...")
	        # The codec is simulated for now; a sentinel string marks it as "loaded".
	        loaded_codec = "simulated"
	    except Exception as e:
	        # Boundary handler: the UI turns a False result into a status message.
	        print(f"Error loading models: {e}")
	        return False

	    # Publish all handles together so callers never observe a partial load.
	    tokenizer, model, codec_model = loaded_tokenizer, loaded_model, loaded_codec
	    return True

	def generate_voice(text, progress=gr.Progress()):
	    """Produce a placeholder audio clip for *text*, reporting progress to the UI.

	    The current implementation does not synthesise speech: after making sure
	    the models are loaded it writes a 440 Hz sine tone whose length scales
	    with the number of words in *text*.

	    Args:
	        text: The text entered by the user. Empty/whitespace-only input and
	            input longer than 200 characters are rejected.
	        progress: Gradio progress tracker, called with a fraction and a
	            ``desc`` label at each stage.

	    Returns:
	        tuple: ``(wav_path, status_message)`` on success, or
	        ``(None, error_message)`` when validation, model loading or audio
	        generation fails.
	    """
	    if not text or not text.strip():
	        return None, "❌ Please enter some text!"

	    if len(text) > 200:
	        return None, "❌ Text too long! Keep it under 200 characters for this demo."

	    progress(0.1, desc="Loading models...")

	    # Load models if not already loaded
	    if not load_models_once():
	        return None, "❌ Failed to load models!"

	    try:
	        progress(0.3, desc="Processing text...")

	        # Real voice generation would go here; everything below is a
	        # placeholder for demo purposes.
	        progress(0.7, desc="Generating speech tokens...")

	        # Simulate processing time
	        import time
	        time.sleep(2)

	        progress(0.9, desc="Converting to audio...")

	        # Dummy audio: ~0.3 seconds per word at 16 kHz.
	        sample_rate = 16000
	        duration = len(text.split()) * 0.3
	        # round() rather than int(): products such as 3 * 0.3 * 16000 land
	        # just below a whole number in floating point and would lose a sample.
	        samples = int(round(sample_rate * duration))

	        # endpoint=False keeps the spacing at exactly 1/sample_rate; including
	        # the endpoint stretches the grid slightly and detunes the tone.
	        t = np.linspace(0, duration, samples, endpoint=False)
	        audio = 0.3 * np.sin(2 * np.pi * 440 * t)  # 440 Hz tone

	        # Reserve a temp path and close the handle *before* writing: writing
	        # by name while the handle is still open fails on Windows.
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
	            wav_path = f.name
	        try:
	            sf.write(wav_path, audio, sample_rate)
	        except Exception:
	            # delete=False means nobody else will clean this up on failure.
	            os.unlink(wav_path)
	            raise

	        progress(1.0, desc="Complete!")

	        return wav_path, f"✅ Generated audio for: '{text}'"

	    except Exception as e:
	        # UI boundary: surface the failure in the status box instead of crashing.
	        return None, f"❌ Error: {str(e)}"

	# Create the Gradio interface
	def create_interface():
	    """Build the demo's Gradio Blocks app and wire up its events.

	    Layout, top to bottom: a title banner; a two-column row (comparison
	    blurb, text box and generate button on the left; audio player and status
	    box on the right); a set of clickable example prompts; and a collapsed
	    "How it works" accordion. Both the button click and the text box submit
	    event call ``generate_voice``.

	    Returns:
	        The constructed ``gr.Blocks`` app. It is not launched here.
	    """
	    # Static markup is defined up front so the layout code below reads as
	    # pure structure.
	    status_css = """
	.status-text textarea {
	color: #ffffff !important;
	background-color: #2d3748 !important;
	border: 1px solid #4a5568 !important;
	}
	.status-text label {
	color: #e2e8f0 !important;
	}
	"""
	    banner_html = """
	<div style="text-align: center; margin-bottom: 20px;">
	<h1>🎤 Local Voice Cloning</h1>
	<p style="font-size: 18px; color: #666;">
	Like ElevenLabs, but completely free and open source!
	</p>
	</div>
	"""
	    comparison_html = """
	<div style="background: #f0f8ff; padding: 15px; border-radius: 10px; margin-bottom: 20px;">
	<h3>🆚 vs ElevenLabs:</h3>
	<ul>
	<li>✅ <strong>Free</strong> (no subscription)</li>
	<li>✅ <strong>Open source</strong> (full control)</li>
	<li>✅ <strong>No limits</strong> (unlimited generation)</li>
	<li>✅ <strong>Privacy</strong> (your data stays private)</li>
	</ul>
	</div>
	"""
	    how_it_works_md = """
	### The Technology:

	1. 🧠 Llasa-3B: Converts text to speech tokens
	2. 🎵 XCodec2: Converts tokens to audio waveform
	3. 🖥️ Your Hardware: Runs on your GPU/CPU

	### Why This Matters:

	- No vendor lock-in: You own the technology
	- Customizable: Modify for your specific needs
	- Scalable: Deploy anywhere (your server, cloud, edge)
	- Cost-effective: No per-minute pricing

	### Business Applications:

	- Audiobook generation
	- Podcast creation
	- Game character voices
	- Accessibility tools
	- Content localization
	"""
	    sample_prompts = [
	        "Hello, world!",
	        "This is a test of voice cloning.",
	        "Welcome to the future of AI!",
	        "Amazing technology running locally."
	    ]

	    with gr.Blocks(title="🎤 Local Voice Cloning", theme=gr.themes.Soft(), css=status_css) as app:

	        gr.HTML(banner_html)

	        with gr.Row():
	            # Left column: pitch, prompt entry and the trigger button.
	            with gr.Column(scale=2):
	                gr.HTML(comparison_html)

	                prompt_box = gr.Textbox(
	                    label="📝 Enter text to speak",
	                    placeholder="Type your message here... (keep it short for demo)",
	                    lines=3,
	                    max_lines=5
	                )

	                run_button = gr.Button("🎯 Generate Voice", variant="primary", size="lg")

	            # Right column: the generated clip and a read-only status line.
	            with gr.Column(scale=2):
	                audio_player = gr.Audio(label="🎵 Generated Voice", type="filepath")

	                status_box = gr.Textbox(
	                    label="📊 Status",
	                    interactive=False,
	                    lines=2,
	                    elem_classes="status-text"
	                )

	        # Example texts. No handler is attached to the examples themselves,
	        # so picking one is only wired to fill the text box.
	        gr.HTML("<h3>💡 Try these examples:</h3>")
	        gr.Examples(examples=sample_prompts, inputs=prompt_box, label="Click to try:")

	        # Info section
	        with gr.Accordion("🔍 How it works", open=False):
	            gr.Markdown(how_it_works_md)

	        # Event handlers: the button click and the text box's submit event
	        # run the same generation call with identical inputs and outputs.
	        for register in (run_button.click, prompt_box.submit):
	            register(
	                fn=generate_voice,
	                inputs=[prompt_box],
	                outputs=[audio_player, status_box],
	                show_progress=True
	            )

	    return app

	# Launch the interface
	if __name__ == "__main__":
	    demo = create_interface()

	    # A share tunnel only makes sense off-Spaces: on Hugging Face Spaces the
	    # app is already served publicly and Gradio does not support share links
	    # there. SPACE_ID is set by the Spaces runtime, so use it as the signal.
	    running_on_spaces = bool(os.environ.get("SPACE_ID"))

	    demo.launch(
	        server_name="0.0.0.0",  # listen on all interfaces (needed in containers)
	        server_port=7860,
	        share=not running_on_spaces
	    )