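# Hugging Face file-viewer header (page residue, not part of the program):
#   uploader avatar: gzyzgzi's picture
#   commit: 56f1a0d (verified) - "Upload 3 files"
#   page links: raw / history / blame
#   file size: 7.66 kB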
import gradio as gr
import torch
import soundfile as sf
import numpy as np
import tempfile
import os
from pathlib import Path
# Pick the compute device (HF Spaces usually provide a GPU).
# Preference order: CUDA, then Apple-Silicon MPS, then CPU.
# `torch.backends.mps` is absent on older PyTorch builds, so it is looked up
# defensively instead of being dereferenced directly.
_mps_backend = getattr(torch.backends, "mps", None)

if torch.cuda.is_available():
    device = torch.device('cuda')
    device_name = "GPU (CUDA)"
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
    device_name = "GPU (Apple Silicon)"
else:
    device = torch.device('cpu')
    device_name = "CPU"

print(f"🖥️ Running on: {device_name}")

# Model handles shared across requests; they start empty and are filled in
# by load_models_once().
tokenizer = None
model = None
codec_model = None
def load_models_once():
    """Load the tokenizer, language model and codec into the module globals.

    Everything is loaded into local variables first and the globals are only
    assigned once every component has loaded. A failure part-way through
    therefore leaves the globals untouched, and the next call retries instead
    of reporting success with a half-initialised state.

    Returns:
        bool: True when all three components are available, False when
        loading failed (the error is printed).
    """
    global tokenizer, model, codec_model

    # Already fully loaded: nothing to do.
    if all(part is not None for part in (tokenizer, model, codec_model)):
        return True

    try:
        # Resolve placement first: it is cheap and fails fast if the device
        # was never configured.
        on_accelerator = device.type != 'cpu'

        from transformers import AutoTokenizer, AutoModelForCausalLM

        print("🧠 Loading Llasa-3B...")
        # NOTE: DialoGPT-medium is a stand-in for the demo; swap in the real
        # Llasa-3B checkpoint once its path on the HF Hub is confirmed.
        checkpoint = "microsoft/DialoGPT-medium"
        loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        loaded_model = AutoModelForCausalLM.from_pretrained(
            checkpoint,
            torch_dtype=torch.float16 if on_accelerator else torch.float32,
            device_map="auto" if on_accelerator else None,
        )
        loaded_model.eval()

        print("🎵 XCodec2 placeholder loaded...")
        # The codec is simulated for now.
        loaded_codec = "simulated"
    except Exception as e:
        print(f"Error loading models: {e}")
        return False

    # Publish all three together so callers never observe a partial load.
    tokenizer, model, codec_model = loaded_tokenizer, loaded_model, loaded_codec
    return True
def generate_voice(text, progress=gr.Progress()):
    """Produce placeholder speech audio for *text*, reporting progress.

    Real synthesis is not implemented yet: after a simulated delay the
    function writes a 440 Hz sine tone whose length is about 0.3 seconds per
    whitespace-separated word.

    Args:
        text: The text to "speak". Must be non-blank and at most 200
            characters long.
        progress: Gradio progress tracker, called with a completion fraction
            and a description at each stage.

    Returns:
        tuple: ``(wav_path, status)`` on success, or ``(None, status)`` when
        validation, model loading or generation fails. ``status`` is a
        human-readable message.
    """
    if not text or not text.strip():
        return None, "❌ Please enter some text!"

    if len(text) > 200:
        return None, "❌ Text too long! Keep it under 200 characters for this demo."

    progress(0.1, desc="Loading models...")

    # Load models if not already loaded
    if not load_models_once():
        return None, "❌ Failed to load models!"

    try:
        progress(0.3, desc="Processing text...")

        # TODO: implement the actual voice generation; everything below is a
        # placeholder for the demo.
        progress(0.7, desc="Generating speech tokens...")

        # Simulate processing time
        import time
        time.sleep(2)

        progress(0.9, desc="Converting to audio...")

        # Dummy audio for the demo (replace with real generation)
        sample_rate = 16000
        duration = len(text.split()) * 0.3  # ~0.3 seconds per word
        samples = int(sample_rate * duration)

        # Sample k sits at exactly k / sample_rate seconds, so the written
        # tone is a true 440 Hz regardless of rounding in `samples`.
        t = np.arange(samples) / sample_rate
        audio = 0.3 * np.sin(2 * np.pi * 440 * t)

        # Reserve a unique path, then close the handle before soundfile opens
        # the file by name. delete=False keeps the file for the caller.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            wav_path = f.name
        sf.write(wav_path, audio, sample_rate)

        progress(1.0, desc="Complete!")
        return wav_path, f"✅ Generated audio for: '{text}'"

    except Exception as e:
        return None, f"❌ Error: {e}"
# Create the Gradio interface
def create_interface():
    """Build the Gradio Blocks UI for the voice demo.

    Layout: a title banner, then a row with two columns (text input and
    generate button on the left, audio player and status box on the right),
    followed by example prompts and a collapsed "How it works" section.
    Both the button click and the textbox submit event run generate_voice.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks(
        title="🎤 Local Voice Cloning",
        theme=gr.themes.Soft(),
        css="""
        .status-text textarea {
            color: #ffffff !important;
            background-color: #2d3748 !important;
            border: 1px solid #4a5568 !important;
        }
        .status-text label {
            color: #e2e8f0 !important;
        }
        """
    ) as demo:

        gr.HTML("""
        <div style="text-align: center; margin-bottom: 20px;">
            <h1>🎤 Local Voice Cloning</h1>
            <p style="font-size: 18px; color: #666;">
                Like ElevenLabs, but completely free and open source!
            </p>
        </div>
        """)

        with gr.Row():
            # Left column: feature summary, text entry and the trigger button.
            with gr.Column(scale=2):
                gr.HTML("""
                <div style="background: #f0f8ff; padding: 15px; border-radius: 10px; margin-bottom: 20px;">
                    <h3>🆚 vs ElevenLabs:</h3>
                    <ul>
                        <li>✅ <strong>Free</strong> (no subscription)</li>
                        <li>✅ <strong>Open source</strong> (full control)</li>
                        <li>✅ <strong>No limits</strong> (unlimited generation)</li>
                        <li>✅ <strong>Privacy</strong> (your data stays private)</li>
                    </ul>
                </div>
                """)

                text_input = gr.Textbox(
                    label="📝 Enter text to speak",
                    placeholder="Type your message here... (keep it short for demo)",
                    lines=3,
                    max_lines=5
                )

                generate_btn = gr.Button(
                    "🎯 Generate Voice",
                    variant="primary",
                    size="lg"
                )

            # Right column: generated audio and the status message.
            with gr.Column(scale=2):
                audio_output = gr.Audio(
                    label="🎵 Generated Voice",
                    type="filepath"
                )

                status_text = gr.Textbox(
                    label="📊 Status",
                    interactive=False,
                    lines=2,
                    elem_classes="status-text"
                )

        # Example texts
        gr.HTML("<h3>💡 Try these examples:</h3>")

        examples = [
            "Hello, world!",
            "This is a test of voice cloning.",
            "Welcome to the future of AI!",
            "Amazing technology running locally."
        ]

        gr.Examples(
            examples=examples,
            inputs=text_input,
            label="Click to try:"
        )

        # Info section
        with gr.Accordion("🔍 How it works", open=False):
            gr.Markdown("""
            ### The Technology:
            1. **🧠 Llasa-3B**: Converts text to speech tokens
            2. **🎵 XCodec2**: Converts tokens to audio waveform
            3. **🖥️ Your Hardware**: Runs on your GPU/CPU
            ### Why This Matters:
            - **No vendor lock-in**: You own the technology
            - **Customizable**: Modify for your specific needs
            - **Scalable**: Deploy anywhere (your server, cloud, edge)
            - **Cost-effective**: No per-minute pricing
            ### Business Applications:
            - **Audiobook generation**
            - **Podcast creation**
            - **Game character voices**
            - **Accessibility tools**
            - **Content localization**
            """)

        # Event handlers: the button click and the textbox's own submit event
        # both run the same generation function. (No handler is attached to
        # the examples themselves.)
        for register in (generate_btn.click, text_input.submit):
            register(
                fn=generate_voice,
                inputs=[text_input],
                outputs=[audio_output, status_text],
                show_progress=True
            )

    return demo
# Script entry point: build the UI, then serve it.
if __name__ == "__main__":
    # Listen on every interface at port 7860 and request a share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    demo = create_interface()
    demo.launch(**launch_options)