import gradio as gr
import torch
import soundfile as sf
import numpy as np
import tempfile
import os
from pathlib import Path
# Set device - HF Spaces usually provide GPU
if torch.cuda.is_available():
    device = torch.device('cuda')
    device_name = "GPU (CUDA)"
elif torch.backends.mps.is_available():
    device = torch.device('mps')
    device_name = "GPU (Apple Silicon)"
else:
    device = torch.device('cpu')
    device_name = "CPU"

print(f"🖥️ Running on: {device_name}")
# Global variables for models
tokenizer = None
model = None
codec_model = None
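# The models are loaded lazily on the first request (see load_models_once)
# so the UI can come up before the multi-GB model weights finish downloading.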

def load_models_once():
    """Load Llasa-3B and XCodec2 models for real voice cloning."""
    global tokenizer, model, codec_model

    if tokenizer is not None:
        return True

    try:
        print("🔧 Loading Llasa-3B...")
        from transformers import AutoTokenizer, AutoModelForCausalLM

        # Load Llasa-3B from the Hugging Face Hub
        tokenizer = AutoTokenizer.from_pretrained("HKUSTAudio/Llasa-3B")
        model = AutoModelForCausalLM.from_pretrained(
            "HKUSTAudio/Llasa-3B",
            torch_dtype=torch.float16 if device.type != 'cpu' else torch.float32,
            low_cpu_mem_usage=True
        )
        if device.type != 'cpu':
            model = model.to(device)
        model.eval()
        print("✅ Llasa-3B loaded successfully!")

        print("🎵 Loading XCodec2...")
        from xcodec2.modeling_xcodec2 import XCodec2Model
        codec_model = XCodec2Model.from_pretrained("HKUSTAudio/xcodec2")
        if device.type != 'cpu':
            try:
                codec_model = codec_model.to(device)
                print("✅ XCodec2 loaded on GPU!")
            except Exception:
                print("⚠️ XCodec2 loaded on CPU (some layers not GPU compatible)")
        else:
            print("✅ XCodec2 loaded on CPU!")
        codec_model.eval()
        return True
    except Exception as e:
        print(f"❌ Error loading models: {e}")
        print("💡 Check that the transformers and xcodec2 packages are installed and the Hugging Face Hub is reachable")
        return False
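
# Pipeline overview (how the two models cooperate in generate_cloned_voice):
#   1. XCodec2 encode_code: 16 kHz waveform -> discrete codec IDs
#   2. Llasa-3B: a causal LM over text plus '<|s_N|>' speech tokens; given the
#      prompt's speech tokens as a prefix, it continues in the same voice
#   3. XCodec2 decode_code: generated codec IDs -> 16 kHz waveform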

def generate_cloned_voice(voice_sample_path, text, progress=gr.Progress()):
    """Generate speech in a cloned voice using Llasa-3B zero-shot voice cloning."""
    if not text or len(text.strip()) == 0:
        return None, "❌ Please enter some text to generate!"
    if not voice_sample_path:
        return None, "❌ Please upload a voice sample first!"
    if len(text) > 500:
        return None, "❌ Text too long! Keep it under 500 characters for best results."

    progress(0.1, desc="Loading models...")
    # Load models if not already loaded
    if not load_models_once():
        return None, "❌ Failed to load models!"
    try:
        progress(0.2, desc="Processing voice sample...")
        import librosa

        # Load and validate the voice sample
        prompt_wav, sr = sf.read(voice_sample_path)

        # Mix stereo uploads down to mono before encoding
        if prompt_wav.ndim > 1:
            prompt_wav = prompt_wav.mean(axis=1)

        # Ensure 16kHz sample rate (required by Llasa)
        if sr != 16000:
            prompt_wav = librosa.resample(prompt_wav, orig_sr=sr, target_sr=16000)
            sr = 16000

        # Convert to tensor format: (1, num_samples)
        prompt_wav = torch.from_numpy(prompt_wav).float().unsqueeze(0)

        duration = len(prompt_wav[0]) / sr
        if duration < 3:
            return None, "❌ Voice sample too short! Please upload at least 3 seconds of clear speech."
        if duration > 60:
            return None, "❌ Voice sample too long! Please keep it under 60 seconds for best results."

        progress(0.4, desc="Extracting voice characteristics...")
        # Extract speech tokens from the prompt audio using XCodec2
        with torch.no_grad():
            prompt_wav = prompt_wav.to(device)
            vq_code = codec_model.encode_code(input_waveform=prompt_wav)
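        # vq_code is a tensor of shape (batch, codebook, frames) == (1, 1, T);
        # vq_code[0, 0, :] below flattens it to the T codec IDs for the prompt.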
        progress(0.6, desc="Generating speech tokens...")

        def extract_speech_ids(speech_tokens_str):
            """Parse '<|s_123|>' token strings back into integer codec IDs."""
            speech_ids = []
            for token_str in speech_tokens_str:
                if token_str.startswith('<|s_') and token_str.endswith('|>'):
                    try:
                        num_str = token_str[4:-2]
                        num = int(num_str)
                        speech_ids.append(num)
                    except ValueError:
                        continue
            return speech_ids

        # Turn the prompt's codec IDs back into '<|s_N|>' token strings; feeding
        # them to the language model as a speech prefix is what conditions
        # generation on the uploaded voice (the zero-shot cloning recipe from
        # the HKUSTAudio/Llasa-3B model card)
        speech_ids_prefix = [f"<|s_{i}|>" for i in vq_code[0, 0, :].tolist()]

        # Create a short prompt text (this would ideally be transcribed from the audio)
        # For now, we'll use a generic prompt
        prompt_text = "Hello, this is a voice sample."

        # Combine prompt and target text for voice cloning
        input_text = prompt_text + " " + text

        # Format for Llasa-3B
        formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"
        chat = [
            {"role": "user", "content": "Convert the text to speech:" + formatted_text},
            {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>" + "".join(speech_ids_prefix)}
        ]
        input_ids = tokenizer.apply_chat_template(
            chat,
            tokenize=True,
            return_tensors='pt',
            continue_final_message=True
        )
        input_ids = input_ids.to(device)
        speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')
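        # continue_final_message=True keeps the assistant turn open, so
        # model.generate() picks up right after the speech-token prefix instead
        # of starting a fresh turn; generation then stops at the
        # '<|SPEECH_GENERATION_END|>' token via eos_token_id below.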
        progress(0.8, desc="Generating cloned speech...")
        # Generate speech tokens with voice conditioning
        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                max_new_tokens=min(len(text.split()) * 10, 500),  # Adaptive length
                eos_token_id=speech_end_id,
                do_sample=True,
                top_p=0.9,
                temperature=0.7,
                pad_token_id=tokenizer.eos_token_id,
                use_cache=True
            )

        # Extract the speech tokens, keeping the prompt prefix so the decoded
        # audio can be trimmed by the prompt's length afterwards
        generated_ids = outputs[0][input_ids.shape[1] - len(speech_ids_prefix):-1]
        speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=False)
        speech_ids = extract_speech_ids(speech_tokens)
        if not speech_ids:
            return None, "❌ Failed to generate speech tokens. Try a different voice sample or text."

        progress(0.9, desc="Converting to audio...")
        # Convert speech tokens to audio using XCodec2
        speech_tokens_tensor = torch.tensor(speech_ids).to(device).unsqueeze(0).unsqueeze(0)
        with torch.no_grad():
            gen_wav = codec_model.decode_code(speech_tokens_tensor)
            # Drop the reconstructed prompt audio, keeping only the new speech
            gen_wav = gen_wav[:, :, prompt_wav.shape[1]:]

        # Save generated audio
        audio_data = gen_wav[0, 0, :].cpu().numpy()
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            sf.write(f.name, audio_data, 16000)
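        # delete=False keeps the temp file on disk after the 'with' block exits,
        # which is required because Gradio reads the file from f.name afterwards.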
        progress(1.0, desc="Complete!")

        status_message = f"""✅ Voice cloning successful!

📊 Voice Sample Analysis:
• Duration: {duration:.1f} seconds
• Sample rate: 16kHz
• Voice characteristics extracted

🎵 Generated Speech:
• Text: "{text[:50]}{'...' if len(text) > 50 else ''}"
• Generated tokens: {len(speech_ids) - len(speech_ids_prefix)}
• Output duration: {len(audio_data)/16000:.1f} seconds

🔧 Technology:
• Model: Llasa-3B + XCodec2
• Method: Zero-shot voice cloning"""

        return f.name, status_message
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        return None, f"❌ Error during voice cloning: {str(e)}\n\n🔧 Debug info:\n{error_details[:200]}..."

# Create the Gradio interface
def create_interface():
    with gr.Blocks(
        title="🎤 Voice Cloning Studio",
        theme=gr.themes.Base(),
        css="""
        .gradio-container {
            background: #0f0f23 !important;
            color: #ffffff !important;
        }
        .dark {
            background: #0f0f23 !important;
        }
        .status-text textarea {
            color: #ffffff !important;
            background-color: #1a1a2e !important;
            border: 1px solid #16213e !important;
            font-weight: 500 !important;
        }
        .status-text label {
            color: #ffffff !important;
            font-weight: 600 !important;
        }
        .comparison-box {
            background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%) !important;
            border: 1px solid #0e3460 !important;
            border-radius: 12px;
            padding: 20px;
            margin: 15px 0;
        }
        .comparison-box h3 {
            color: #64ffda !important;
            margin-bottom: 15px;
            font-size: 1.2em;
        }
        .comparison-box ul {
            color: #ffffff !important;
            list-style: none;
            padding-left: 0;
        }
        .comparison-box li {
            color: #e0e0e0 !important;
            margin: 8px 0;
            padding-left: 20px;
            position: relative;
        }
        .comparison-box li:before {
            content: "✓";
            color: #64ffda;
            font-weight: bold;
            position: absolute;
            left: 0;
        }
        .comparison-box strong {
            color: #64ffda !important;
        }
        .step-header {
            color: #64ffda !important;
            font-size: 1.1em;
            margin: 20px 0 10px 0;
            font-weight: 600;
        }
        .main-title {
            background: linear-gradient(135deg, #64ffda 0%, #00bcd4 100%);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            background-clip: text;
            text-align: center;
            font-size: 2.5em;
            font-weight: 700;
            margin-bottom: 10px;
        }
        .subtitle {
            color: #b0b0b0;
            text-align: center;
            font-size: 1.2em;
            margin-bottom: 30px;
        }
        """
    ) as demo:
| gr.HTML(""" | |
| <div style="text-align: center; margin-bottom: 30px;"> | |
| <h1 class="main-title">π€ Voice Cloning Studio</h1> | |
| <p class="subtitle"> | |
| Advanced AI voice synthesis technology | |
| </p> | |
| </div> | |
| """) | |
| with gr.Row(): | |
| with gr.Column(scale=2): | |
| # Feature comparison | |
| gr.HTML(""" | |
| <div class="comparison-box"> | |
| <h3>π Key Features</h3> | |
| <ul> | |
| <li><strong>High-Quality Synthesis</strong> - Professional voice cloning</li> | |
| <li><strong>Fast Processing</strong> - Generate speech in seconds</li> | |
| <li><strong>Multiple Formats</strong> - Support for MP3, WAV, and more</li> | |
| <li><strong>Privacy First</strong> - Your data stays secure</li> | |
| </ul> | |
| </div> | |
| """) | |
| # Step 1: Upload voice sample | |
| gr.HTML("<h3 class='step-header'>π€ Step 1: Upload Voice Sample</h3>") | |
| voice_sample = gr.Audio( | |
| label="Upload audio file (MP3, WAV, M4A)", | |
| type="filepath", | |
| sources=["upload"] | |
| ) | |
| # Step 2: Enter text | |
| gr.HTML("<h3 class='step-header'>π Step 2: Enter Text to Synthesize</h3>") | |
| text_input = gr.Textbox( | |
| label="Text to convert to speech", | |
| placeholder="Enter the text you want to convert to speech using the uploaded voice...", | |
| lines=3, | |
| max_lines=5 | |
| ) | |
| # Step 3: Generate | |
| gr.HTML("<h3 class='step-header'>π― Step 3: Generate Speech</h3>") | |
| generate_btn = gr.Button( | |
| "π Generate Voice Clone", | |
| variant="primary", | |
| size="lg" | |
| ) | |
| with gr.Column(scale=2): | |
| # Results section | |
| gr.HTML("<h3 class='step-header'>π΅ Generated Audio</h3>") | |
| audio_output = gr.Audio( | |
| label="π΅ Synthesized Speech", | |
| type="filepath" | |
| ) | |
| status_text = gr.Textbox( | |
| label="π Processing Status", | |
| interactive=False, | |
| lines=4, | |
| elem_classes="status-text" | |
| ) | |
| # Example section | |
| gr.HTML("<h3 class='step-header'>π‘ Example Texts</h3>") | |
| examples = [ | |
| "Hello, this is a demonstration of voice cloning technology.", | |
| "Welcome to the future of artificial intelligence and speech synthesis.", | |
| "This voice was generated using advanced machine learning models.", | |
| "Experience the power of AI-driven voice generation." | |
| ] | |
| gr.Examples( | |
| examples=examples, | |
| inputs=text_input, | |
| label="Click to try:" | |
| ) | |
        # How it works section
        with gr.Accordion("🔍 How It Works", open=False):
            gr.Markdown("""
            ### The Technology
            1. **🎤 Voice Analysis**: Upload a clear audio sample (10-60 seconds recommended)
            2. **🧠 Feature Extraction**: AI analyzes vocal characteristics and patterns
            3. **📝 Text Processing**: Input text is processed and prepared for synthesis
            4. **🎵 Voice Synthesis**: Generate speech that matches the uploaded voice

            ### Best Practices
            - **Audio Quality**: Use clear, noise-free recordings
            - **Sample Length**: 10-60 seconds provides optimal results
            - **Single Speaker**: Ensure only one person is speaking
            - **Good Microphone**: Higher quality input = better output

            ### Applications
            - **Content Creation**: Audiobooks, podcasts, video narration
            - **Accessibility**: Text-to-speech for visually impaired users
            - **Entertainment**: Character voices for games and media
            - **Education**: Interactive learning content
            - **Localization**: Multi-language content with consistent voices
            """)
        # Event handlers
        generate_btn.click(
            fn=generate_cloned_voice,
            inputs=[voice_sample, text_input],
            outputs=[audio_output, status_text],
            show_progress=True
        )

        # Auto-generate on text submit
        text_input.submit(
            fn=generate_cloned_voice,
            inputs=[voice_sample, text_input],
            outputs=[audio_output, status_text],
            show_progress=True
        )

    return demo


# Launch the interface
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True
    )
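
# Rough dependency list implied by the imports above (a Space would pin these
# in requirements.txt; exact versions are an assumption, not tested pins):
#   gradio, torch, soundfile, numpy, librosa, transformers, xcodec2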