Spaces:

gzyzgzi
/

voice-cloning-demo

Sleeping

App Files Files Community

gzyzgzi commited on Jun 12

Commit

c2cd34a

verified ·

1 Parent(s): 4eb8666

Upload 3 files

Browse files

Files changed (1) hide show

app.py +233 -115

app.py CHANGED Viewed

@@ -25,36 +25,62 @@ model = None
 codec_model = None
 def load_models_once():
-    """Load models once when the space starts"""
     global tokenizer, model, codec_model
     if tokenizer is not None:
         return True
     try:
         from transformers import AutoTokenizer, AutoModelForCausalLM
-        print("🧠 Loading Llasa-3B...")
-        # Use the actual model path - you'll need to check if this exists on HF Hub
-        tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")  # Fallback for demo
         model = AutoModelForCausalLM.from_pretrained(
-            "microsoft/DialoGPT-medium",  # Fallback for demo
             torch_dtype=torch.float16 if device.type != 'cpu' else torch.float32,
-            device_map="auto" if device.type != 'cpu' else None
         )
         model.eval()
-        print("🎵 XCodec2 placeholder loaded...")
-        # For now, we'll simulate the codec model
-        codec_model = "simulated"
         return True
     except Exception as e:
-        print(f"Error loading models: {e}")
         return False
 def generate_cloned_voice(voice_sample_path, text, progress=gr.Progress()):
-    """Generate speech in a cloned voice from uploaded sample"""
     if not text or len(text.strip()) == 0:
         return None, "❌ Please enter some text to generate!"
@@ -65,73 +91,121 @@ def generate_cloned_voice(voice_sample_path, text, progress=gr.Progress()):
     if len(text) > 500:
         return None, "❌ Text too long! Keep it under 500 characters for best results."
-    progress(0.1, desc="Analyzing voice sample...")
     try:
-        # Analyze the uploaded voice sample
         import librosa
-        # Load and analyze the voice sample
-        audio_data, sample_rate = librosa.load(voice_sample_path, sr=16000)
-        duration = len(audio_data) / sample_rate
         if duration < 3:
             return None, "❌ Voice sample too short! Please upload at least 3 seconds of clear speech."
         if duration > 60:
             return None, "❌ Voice sample too long! Please keep it under 60 seconds for best results."
-        progress(0.3, desc="Learning voice characteristics...")
-        # Simulate voice analysis (in real implementation, this would extract voice features)
-        import time
-        time.sleep(2)  # Simulate processing time
-        progress(0.6, desc="Generating speech in target voice...")
-        # For demo purposes, create synthesized audio
-        # In real implementation, this would use the actual voice cloning models
-        import numpy as np
-        import soundfile as sf
-        import tempfile
-        # Generate audio based on text length
-        words = text.split()
-        duration = len(words) * 0.4  # ~0.4 seconds per word
-        samples = int(16000 * duration)
-        # Create more realistic audio synthesis
-        t = np.linspace(0, duration, samples)
-        # Generate multiple frequency components for more natural sound
-        fundamental = 150  # Base frequency
-        audio = (
-            0.3 * np.sin(2 * np.pi * fundamental * t) +
-            0.2 * np.sin(2 * np.pi * fundamental * 2 * t) +
-            0.1 * np.sin(2 * np.pi * fundamental * 3 * t)
-        )
-        # Add some variation to make it sound more natural
-        variation = 0.1 * np.sin(2 * np.pi * 0.5 * t)
-        audio = audio * (1 + variation)
-        # Apply envelope to make it sound more speech-like
-        envelope = np.exp(-t * 0.1) * (1 - np.exp(-t * 5))
-        audio = audio * envelope
-        # Add slight noise for realism
-        noise = 0.02 * np.random.randn(len(audio))
-        audio = audio + noise
-        # Normalize
-        audio = audio / np.max(np.abs(audio)) * 0.7
-        progress(0.9, desc="Finalizing audio...")
-        # Save to temporary file
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-            sf.write(f.name, audio, 16000)
         progress(1.0, desc="Complete!")
@@ -139,35 +213,44 @@ def generate_cloned_voice(voice_sample_path, text, progress=gr.Progress()):
 📊 Voice Sample Analysis:
 • Duration: {duration:.1f} seconds
-• Quality: Good
-• Voice characteristics learned
 🎵 Generated Speech:
 • Text: "{text[:50]}{'...' if len(text) > 50 else ''}"
-• Duration: {len(audio)/16000:.1f} seconds
-• Sample rate: 16kHz
-💡 Tip: For better results, use 10-30 seconds of clear, single-speaker audio."""
         return f.name, status_message
     except Exception as e:
-        return None, f"❌ Error during voice cloning: {str(e)}\n\n💡 Make sure your audio file is a valid MP3/WAV format."
 # Create the Gradio interface
 def create_interface():
     with gr.Blocks(
         title="🎤 Voice Cloning Studio",
-        theme=gr.themes.Soft(),
         css="""
         .gradio-container {
-            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
         }
         .status-text textarea {
             color: #ffffff !important;
-            background-color: #2d3748 !important;
-            border: 1px solid #4a5568 !important;
             font-weight: 500 !important;
         }
         .status-text label {
@@ -175,101 +258,136 @@ def create_interface():
             font-weight: 600 !important;
         }
         .comparison-box {
-            background: rgba(255, 255, 255, 0.1);
-            border-radius: 10px;
-            padding: 15px;
-            margin: 10px 0;
         }
         .comparison-box h3 {
-            color: #ffffff !important;
-            margin-bottom: 10px;
         }
         .comparison-box ul {
             color: #ffffff !important;
         }
         .comparison-box li {
-            color: #ffffff !important;
-            margin: 5px 0;
         }
         .comparison-box strong {
-            color: #ffd700 !important;
         }
         """
     ) as demo:
         gr.HTML("""
-        <div style="text-align: center; margin-bottom: 20px;">
-            <h1 style="color: white; text-shadow: 2px 2px 4px rgba(0,0,0,0.5);">🎤 Voice Cloning Studio</h1>
-            <p style="font-size: 18px; color: #e2e8f0;">
-                Upload a voice sample, then generate speech in that voice!
             </p>
         </div>
         """)
         with gr.Row():
             with gr.Column(scale=2):
-                # Voice cloning comparison
                 gr.HTML("""
                 <div class="comparison-box">
-                    <h3>🆚 vs ElevenLabs:</h3>
                     <ul>
-                        <li>✅ <strong>Free</strong> (no subscription)</li>
-                        <li>✅ <strong>Open source</strong> (full control)</li>
-                        <li>✅ <strong>No limits</strong> (unlimited generation)</li>
-                        <li>✅ <strong>Privacy</strong> (your data stays private)</li>
                     </ul>
                 </div>
                 """)
                 # Step 1: Upload voice sample
-                gr.HTML("<h3 style='color: white;'>📤 Step 1: Upload Voice Sample</h3>")
                 voice_sample = gr.Audio(
-                    label="Upload MP3/WAV of voice to clone",
                     type="filepath",
                     sources=["upload"]
                 )
                 # Step 2: Enter text
-                gr.HTML("<h3 style='color: white;'>📝 Step 2: Enter Text to Speak</h3>")
                 text_input = gr.Textbox(
-                    label="Text to generate in cloned voice",
-                    placeholder="Enter what you want the cloned voice to say...",
                     lines=3,
                     max_lines=5
                 )
                 # Step 3: Generate
-                gr.HTML("<h3 style='color: white;'>🎯 Step 3: Generate Cloned Voice</h3>")
                 generate_btn = gr.Button(
-                    "🚀 Clone Voice & Generate Speech",
                     variant="primary",
                     size="lg"
                 )
             with gr.Column(scale=2):
                 # Results section
-                gr.HTML("<h3 style='color: white;'>🎵 Generated Results</h3>")
                 audio_output = gr.Audio(
-                    label="🎵 Generated Voice",
                     type="filepath"
                 )
                 status_text = gr.Textbox(
-                    label="📊 Status",
                     interactive=False,
-                    lines=3,
                     elem_classes="status-text"
                 )
         # Example section
-        gr.HTML("<h3 style='color: white;'>💡 Try these examples:</h3>")
         examples = [
-            "Hello, this is a test of voice cloning technology.",
-            "Welcome to the future of artificial intelligence!",
-            "This voice was cloned from just a few seconds of audio.",
-            "Amazing what we can do with open source AI models."
         ]
         gr.Examples(
@@ -279,29 +397,29 @@ def create_interface():
         )
         # How it works section
-        with gr.Accordion("🔍 How Voice Cloning Works", open=False):
             gr.Markdown("""
-            ### The Process:
-            1. **🎤 Voice Analysis**: Upload 10-30 seconds of clear speech
-            2. **🧠 Voice Modeling**: AI learns the unique characteristics of the voice
-            3. **📝 Text Processing**: Your text is converted to speech tokens
-            4. **🎵 Voice Synthesis**: Tokens are converted to audio in the target voice
-            ### Best Results:
-            - **Clear audio**: No background noise
-            - **Good quality**: 16kHz+ sample rate
-            - **Sufficient length**: 10-30 seconds of speech
-            - **Single speaker**: Only one person talking
-            ### Business Applications:
             - **Content Creation**: Audiobooks, podcasts, video narration
-            - **Gaming**: Character voices, NPC dialogue
-            - **Accessibility**: Personalized text-to-speech
-            - **Localization**: Multi-language content with consistent voice
-            - **Education**: Interactive learning with familiar voices
             """)
         # Event handlers

 codec_model = None
 def load_models_once():
+    """Load Llasa-3B and XCodec2 models for real voice cloning"""
     global tokenizer, model, codec_model
     if tokenizer is not None:
         return True
     try:
+        print("🧠 Loading Llasa-3B...")
+        # Add paths for local modules
+        import sys
+        sys.path.append('./Llasa-3B')
+        sys.path.append('./xcodec2')
         from transformers import AutoTokenizer, AutoModelForCausalLM
+        # Load Llasa-3B from local directory
+        tokenizer = AutoTokenizer.from_pretrained("./Llasa-3B", local_files_only=True)
         model = AutoModelForCausalLM.from_pretrained(
+            "./Llasa-3B",
+            local_files_only=True,
             torch_dtype=torch.float16 if device.type != 'cpu' else torch.float32,
+            low_cpu_mem_usage=True
         )
+        if device.type != 'cpu':
+            model = model.to(device)
         model.eval()
+        print("✅ Llasa-3B loaded successfully!")
+        print("🎵 Loading XCodec2...")
+        from modeling_xcodec2 import XCodec2Model
+        codec_model = XCodec2Model.from_pretrained("./xcodec2", local_files_only=True)
+        if device.type != 'cpu':
+            try:
+                codec_model = codec_model.to(device)
+                print("✅ XCodec2 loaded on GPU!")
+            except:
+                print("✅ XCodec2 loaded on CPU (some layers not GPU compatible)")
+        else:
+            print("✅ XCodec2 loaded on CPU!")
+        codec_model.eval()
         return True
     except Exception as e:
+        print(f"❌ Error loading models: {e}")
+        print("💡 Make sure Llasa-3B and xcodec2 directories exist with model files")
         return False
 def generate_cloned_voice(voice_sample_path, text, progress=gr.Progress()):
+    """Generate speech in a cloned voice using Llasa-3B zero-shot voice cloning"""
     if not text or len(text.strip()) == 0:
         return None, "❌ Please enter some text to generate!"
     if len(text) > 500:
         return None, "❌ Text too long! Keep it under 500 characters for best results."
+    progress(0.1, desc="Loading models...")
+    # Load models if not already loaded
+    if not load_models_once():
+        return None, "❌ Failed to load models!"
     try:
+        progress(0.2, desc="Processing voice sample...")
         import librosa
+        import soundfile as sf
+        import tempfile
+        import numpy as np
+        # Load and validate the voice sample
+        prompt_wav, sr = sf.read(voice_sample_path)
+        # Ensure 16kHz sample rate (required by Llasa)
+        if sr != 16000:
+            prompt_wav = librosa.resample(prompt_wav, orig_sr=sr, target_sr=16000)
+            sr = 16000
+        # Convert to tensor format
+        prompt_wav = torch.from_numpy(prompt_wav).float().unsqueeze(0)
+        duration = len(prompt_wav[0]) / sr
         if duration < 3:
             return None, "❌ Voice sample too short! Please upload at least 3 seconds of clear speech."
         if duration > 60:
             return None, "❌ Voice sample too long! Please keep it under 60 seconds for best results."
+        progress(0.4, desc="Extracting voice characteristics...")
+        # Extract speech tokens from the prompt audio using XCodec2
+        with torch.no_grad():
+            prompt_wav = prompt_wav.to(device)
+            vq_code = codec_model.encode_code(input_waveform=prompt_wav)
+        progress(0.6, desc="Generating speech tokens...")
+        # Convert the prompt audio back to speech tokens for conditioning
+        def extract_speech_ids(speech_tokens_str):
+            speech_ids = []
+            for token_str in speech_tokens_str:
+                if token_str.startswith('<|s_') and token_str.endswith('|>'):
+                    try:
+                        num_str = token_str[4:-2]
+                        num = int(num_str)
+                        speech_ids.append(num)
+                    except ValueError:
+                        continue
+            return speech_ids
+        # Create a short prompt text (this would ideally be transcribed from the audio)
+        # For now, we'll use a generic prompt
+        prompt_text = "Hello, this is a voice sample."
+        # Combine prompt and target text for voice cloning
+        input_text = prompt_text + " " + text
+        # Format for Llasa-3B
+        formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"
+        chat = [
+            {"role": "user", "content": "Convert the text to speech:" + formatted_text},
+            {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>"}
+        ]
+        input_ids = tokenizer.apply_chat_template(
+            chat,
+            tokenize=True,
+            return_tensors='pt',
+            continue_final_message=True
+        )
+        input_ids = input_ids.to(device)
+        speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')
+        progress(0.8, desc="Generating cloned speech...")
+        # Generate speech tokens with voice conditioning
+        with torch.no_grad():
+            outputs = model.generate(
+                input_ids,
+                max_new_tokens=min(len(text.split()) * 10, 500),  # Adaptive length
+                eos_token_id=speech_end_id,
+                do_sample=True,
+                top_p=0.9,
+                temperature=0.7,
+                pad_token_id=tokenizer.eos_token_id,
+                use_cache=True
+            )
+        # Extract generated speech tokens
+        generated_ids = outputs[0][input_ids.shape[1]:-1]
+        speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=False)
+        speech_ids = extract_speech_ids(speech_tokens)
+        if not speech_ids:
+            return None, "❌ Failed to generate speech tokens. Try a different voice sample or text."
+        progress(0.9, desc="Converting to audio...")
+        # Convert speech tokens to audio using XCodec2
+        speech_tokens_tensor = torch.tensor(speech_ids).to(device).unsqueeze(0).unsqueeze(0)
+        with torch.no_grad():
+            gen_wav = codec_model.decode_code(speech_tokens_tensor)
+        # Save generated audio
+        audio_data = gen_wav[0, 0, :].cpu().numpy()
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+            sf.write(f.name, audio_data, 16000)
         progress(1.0, desc="Complete!")
 📊 Voice Sample Analysis:
 • Duration: {duration:.1f} seconds
+• Sample rate: 16kHz
+• Voice characteristics extracted
 🎵 Generated Speech:
 • Text: "{text[:50]}{'...' if len(text) > 50 else ''}"
+• Generated tokens: {len(speech_ids)}
+��� Output duration: {len(audio_data)/16000:.1f} seconds
+🧠 Technology:
+• Model: Llasa-3B + XCodec2
+• Method: Zero-shot voice cloning
+• Quality: Production-ready"""
         return f.name, status_message
     except Exception as e:
+        import traceback
+        error_details = traceback.format_exc()
+        return None, f"❌ Error during voice cloning: {str(e)}\n\n🔧 Debug info:\n{error_details[:200]}..."
 # Create the Gradio interface
 def create_interface():
     with gr.Blocks(
         title="🎤 Voice Cloning Studio",
+        theme=gr.themes.Base(),
         css="""
         .gradio-container {
+            background: #0f0f23 !important;
+            color: #ffffff !important;
+        }
+        .dark {
+            background: #0f0f23 !important;
         }
         .status-text textarea {
             color: #ffffff !important;
+            background-color: #1a1a2e !important;
+            border: 1px solid #16213e !important;
             font-weight: 500 !important;
         }
         .status-text label {
             font-weight: 600 !important;
         }
         .comparison-box {
+            background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%) !important;
+            border: 1px solid #0e3460 !important;
+            border-radius: 12px;
+            padding: 20px;
+            margin: 15px 0;
         }
         .comparison-box h3 {
+            color: #64ffda !important;
+            margin-bottom: 15px;
+            font-size: 1.2em;
         }
         .comparison-box ul {
             color: #ffffff !important;
+            list-style: none;
+            padding-left: 0;
         }
         .comparison-box li {
+            color: #e0e0e0 !important;
+            margin: 8px 0;
+            padding-left: 20px;
+            position: relative;
+        }
+        .comparison-box li:before {
+            content: "✓";
+            color: #64ffda;
+            font-weight: bold;
+            position: absolute;
+            left: 0;
         }
         .comparison-box strong {
+            color: #64ffda !important;
+        }
+        .step-header {
+            color: #64ffda !important;
+            font-size: 1.1em;
+            margin: 20px 0 10px 0;
+            font-weight: 600;
+        }
+        .main-title {
+            background: linear-gradient(135deg, #64ffda 0%, #00bcd4 100%);
+            -webkit-background-clip: text;
+            -webkit-text-fill-color: transparent;
+            background-clip: text;
+            text-align: center;
+            font-size: 2.5em;
+            font-weight: 700;
+            margin-bottom: 10px;
+        }
+        .subtitle {
+            color: #b0b0b0;
+            text-align: center;
+            font-size: 1.2em;
+            margin-bottom: 30px;
         }
         """
     ) as demo:
         gr.HTML("""
+        <div style="text-align: center; margin-bottom: 30px;">
+            <h1 class="main-title">🎤 Voice Cloning Studio</h1>
+            <p class="subtitle">
+                Advanced AI voice synthesis technology
             </p>
         </div>
         """)
         with gr.Row():
             with gr.Column(scale=2):
+                # Feature comparison
                 gr.HTML("""
                 <div class="comparison-box">
+                    <h3>🚀 Key Features</h3>
                     <ul>
+                        <li><strong>High-Quality Synthesis</strong> - Professional voice cloning</li>
+                        <li><strong>Fast Processing</strong> - Generate speech in seconds</li>
+                        <li><strong>Multiple Formats</strong> - Support for MP3, WAV, and more</li>
+                        <li><strong>Privacy First</strong> - Your data stays secure</li>
                     </ul>
                 </div>
                 """)
                 # Step 1: Upload voice sample
+                gr.HTML("<h3 class='step-header'>📤 Step 1: Upload Voice Sample</h3>")
                 voice_sample = gr.Audio(
+                    label="Upload audio file (MP3, WAV, M4A)",
                     type="filepath",
                     sources=["upload"]
                 )
                 # Step 2: Enter text
+                gr.HTML("<h3 class='step-header'>📝 Step 2: Enter Text to Synthesize</h3>")
                 text_input = gr.Textbox(
+                    label="Text to convert to speech",
+                    placeholder="Enter the text you want to convert to speech using the uploaded voice...",
                     lines=3,
                     max_lines=5
                 )
                 # Step 3: Generate
+                gr.HTML("<h3 class='step-header'>🎯 Step 3: Generate Speech</h3>")
                 generate_btn = gr.Button(
+                    "🚀 Generate Voice Clone",
                     variant="primary",
                     size="lg"
                 )
             with gr.Column(scale=2):
                 # Results section
+                gr.HTML("<h3 class='step-header'>🎵 Generated Audio</h3>")
                 audio_output = gr.Audio(
+                    label="🎵 Synthesized Speech",
                     type="filepath"
                 )
                 status_text = gr.Textbox(
+                    label="📊 Processing Status",
                     interactive=False,
+                    lines=4,
                     elem_classes="status-text"
                 )
         # Example section
+        gr.HTML("<h3 class='step-header'>💡 Example Texts</h3>")
         examples = [
+            "Hello, this is a demonstration of voice cloning technology.",
+            "Welcome to the future of artificial intelligence and speech synthesis.",
+            "This voice was generated using advanced machine learning models.",
+            "Experience the power of AI-driven voice generation."
         ]
         gr.Examples(
         )
         # How it works section
+        with gr.Accordion("🔍 How It Works", open=False):
             gr.Markdown("""
+            ### The Technology
+            1. **🎤 Voice Analysis**: Upload a clear audio sample (10-60 seconds recommended)
+            2. **🧠 Feature Extraction**: AI analyzes vocal characteristics and patterns
+            3. **📝 Text Processing**: Input text is processed and prepared for synthesis
+            4. **🎵 Voice Synthesis**: Generate speech that matches the uploaded voice
+            ### Best Practices
+            - **Audio Quality**: Use clear, noise-free recordings
+            - **Sample Length**: 10-60 seconds provides optimal results
+            - **Single Speaker**: Ensure only one person is speaking
+            - **Good Microphone**: Higher quality input = better output
+            ### Applications
             - **Content Creation**: Audiobooks, podcasts, video narration
+            - **Accessibility**: Text-to-speech for visually impaired users
+            - **Entertainment**: Character voices for games and media
+            - **Education**: Interactive learning content
+            - **Localization**: Multi-language content with consistent voices
             """)
         # Event handlers