import gradio as gr
import torch
import soundfile as sf
import numpy as np
import tempfile
import time
# Pick the best available device - HF Spaces only have a GPU when one is assigned
if torch.cuda.is_available():
    device = torch.device('cuda')
    device_name = "GPU (CUDA)"
elif torch.backends.mps.is_available():
    device = torch.device('mps')
    device_name = "GPU (Apple Silicon)"
else:
    device = torch.device('cpu')
    device_name = "CPU"

print(f"🖥️ Running on: {device_name}")
# Global variables for the models
tokenizer = None
model = None
codec_model = None

def load_models_once():
    """Load models once when the Space starts."""
    global tokenizer, model, codec_model
    if tokenizer is not None:
        return True

    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM

        print("🧠 Loading Llasa-3B...")
        # Use the actual model path - you'll need to check if this exists on HF Hub
        tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")  # Fallback for demo
        model = AutoModelForCausalLM.from_pretrained(
            "microsoft/DialoGPT-medium",  # Fallback for demo
            torch_dtype=torch.float16 if device.type != 'cpu' else torch.float32,
            device_map="auto" if device.type != 'cpu' else None
        )
        model.eval()

        print("🎵 XCodec2 placeholder loaded...")
        # For now, we'll simulate the codec model
        codec_model = "simulated"
        return True
    except Exception as e:
        print(f"Error loading models: {e}")
        return False
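
# --- Sketch: loading the real models ---------------------------------------
# The function above falls back to DialoGPT so the demo always starts. A real
# deployment would load the actual checkpoints instead. The repo ids and the
# XCodec2Model import below are assumptions based on the public Llasa-3B
# release (HKUSTAudio/Llasa-3B plus the `xcodec2` package) - verify them on
# the Hub before relying on this.
def load_real_models():
    """Untested sketch: load Llasa-3B and XCodec2 (repo ids/APIs assumed)."""
    global tokenizer, model, codec_model
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from xcodec2.modeling_xcodec2 import XCodec2Model  # assumed import path

    tokenizer = AutoTokenizer.from_pretrained("HKUSTAudio/Llasa-3B")
    model = AutoModelForCausalLM.from_pretrained(
        "HKUSTAudio/Llasa-3B",
        torch_dtype=torch.float16 if device.type != 'cpu' else torch.float32,
    ).to(device).eval()
    codec_model = XCodec2Model.from_pretrained("HKUSTAudio/xcodec2").to(device).eval()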
def generate_voice(text, progress=gr.Progress()):
    """Generate voice from text with progress updates."""
    if not text or not text.strip():
        return None, "❌ Please enter some text!"
    if len(text) > 200:
        return None, "❌ Text too long! Keep it under 200 characters for this demo."

    progress(0.1, desc="Loading models...")
    # Load models if not already loaded
    if not load_models_once():
        return None, "❌ Failed to load models!"

    try:
        progress(0.3, desc="Processing text...")
        # Here you'd implement the actual voice generation;
        # for this demo we synthesize a simple placeholder instead.
        progress(0.7, desc="Generating speech tokens...")
        # Simulate processing time
        time.sleep(2)

        progress(0.9, desc="Converting to audio...")
        # Create dummy audio for the demo (replace with real generation)
        sample_rate = 16000
        duration = len(text.split()) * 0.3  # ~0.3 seconds per word
        samples = int(sample_rate * duration)

        # Generate a simple tone as a placeholder
        t = np.linspace(0, duration, samples)
        audio = 0.3 * np.sin(2 * np.pi * 440 * t)  # 440 Hz tone

        # Save to a temporary file
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            sf.write(f.name, audio, sample_rate)

        progress(1.0, desc="Complete!")
        return f.name, f"✅ Generated audio for: '{text}'"
    except Exception as e:
        return None, f"❌ Error: {str(e)}"
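
# --- Sketch: real generation instead of the sine placeholder ----------------
# With the real models loaded, Llasa-3B emits discrete speech tokens (strings
# like "<|s_1234|>" in the published release) and XCodec2 decodes them into a
# 16 kHz waveform. The chat template, special-token names, and decode_code()
# call below follow the model card's example code but are assumptions here,
# not verified.
def generate_voice_real(text):
    """Untested sketch; assumes load_real_models() has populated the globals."""
    chat = [
        {"role": "user", "content": "Convert the text to speech: " + text},
        {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>"},
    ]
    input_ids = tokenizer.apply_chat_template(
        chat, tokenize=True, return_tensors="pt", continue_final_message=True
    ).to(device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_length=2048,
            eos_token_id=tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_END|>"),
            do_sample=True,
            temperature=0.8,
        )
    # Drop the prompt and the end token, then map "<|s_NNN|>" strings back to ints
    token_strs = tokenizer.batch_decode(outputs[0][input_ids.shape[1]:-1])
    codec_ids = [int(s[4:-2]) for s in token_strs if s.startswith("<|s_")]
    codes = torch.tensor(codec_ids, device=device).unsqueeze(0).unsqueeze(0)
    with torch.no_grad():
        wav = codec_model.decode_code(codes)  # assumed XCodec2 API
    return wav[0, 0, :].cpu().numpy(), 16000  # audio array + sample rate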
# Create the Gradio interface
def create_interface():
    with gr.Blocks(
        title="🎤 Local Voice Cloning",
        theme=gr.themes.Soft(),
        css="""
        .status-text textarea {
            color: #ffffff !important;
            background-color: #2d3748 !important;
            border: 1px solid #4a5568 !important;
        }
        .status-text label {
            color: #e2e8f0 !important;
        }
        """
    ) as demo:
        gr.HTML("""
        <div style="text-align: center; margin-bottom: 20px;">
            <h1>🎤 Local Voice Cloning</h1>
            <p style="font-size: 18px; color: #666;">
                Like ElevenLabs, but completely free and open source!
            </p>
        </div>
        """)
        with gr.Row():
            with gr.Column(scale=2):
                gr.HTML("""
                <div style="background: #f0f8ff; padding: 15px; border-radius: 10px; margin-bottom: 20px;">
                    <h3>🆚 vs ElevenLabs:</h3>
                    <ul>
                        <li>✅ <strong>Free</strong> (no subscription)</li>
                        <li>✅ <strong>Open source</strong> (full control)</li>
                        <li>✅ <strong>No limits</strong> (unlimited generation)</li>
                        <li>✅ <strong>Privacy</strong> (your data stays private)</li>
                    </ul>
                </div>
                """)
                text_input = gr.Textbox(
                    label="📝 Enter text to speak",
                    placeholder="Type your message here... (keep it short for demo)",
                    lines=3,
                    max_lines=5
                )
                generate_btn = gr.Button(
                    "🎯 Generate Voice",
                    variant="primary",
                    size="lg"
                )
            with gr.Column(scale=2):
                audio_output = gr.Audio(
                    label="🎵 Generated Voice",
                    type="filepath"
                )
                status_text = gr.Textbox(
                    label="📊 Status",
                    interactive=False,
                    lines=2,
                    elem_classes="status-text"
                )

        # Example texts
        gr.HTML("<h3>💡 Try these examples:</h3>")
        examples = [
            "Hello, world!",
            "This is a test of voice cloning.",
            "Welcome to the future of AI!",
            "Amazing technology running locally."
        ]
        gr.Examples(
            examples=examples,
            inputs=text_input,
            label="Click to try:"
        )
        # Info section
        with gr.Accordion("🔍 How it works", open=False):
            gr.Markdown("""
            ### The Technology:
            1. **🧠 Llasa-3B**: Converts text to speech tokens
            2. **🎵 XCodec2**: Converts tokens to an audio waveform
            3. **🖥️ Your Hardware**: Runs on your GPU/CPU

            ### Why This Matters:
            - **No vendor lock-in**: You own the technology
            - **Customizable**: Modify it for your specific needs
            - **Scalable**: Deploy anywhere (your server, cloud, edge)
            - **Cost-effective**: No per-minute pricing

            ### Business Applications:
            - **Audiobook generation**
            - **Podcast creation**
            - **Game character voices**
            - **Accessibility tools**
            - **Content localization**
            """)
        # Event handlers
        generate_btn.click(
            fn=generate_voice,
            inputs=[text_input],
            outputs=[audio_output, status_text],
            show_progress=True
        )

        # Also generate when the user presses Enter in the textbox
        text_input.submit(
            fn=generate_voice,
            inputs=[text_input],
            outputs=[audio_output, status_text],
            show_progress=True
        )

    return demo
# Launch the interface
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True  # ignored on HF Spaces; creates a public link for local runs
    )