import gradio as gr
import torch
from TTS.api import TTS
import os
import soundfile as sf
import numpy as np
from pydub import AudioSegment
import tempfile
import gc

# Set before any model is created.
# NOTE(review): presumably this lets the Coqui TTS library treat its terms of
# service as accepted instead of prompting interactively — confirm.
os.environ["COQUI_TOS_AGREED"] = "1"

# 🚀 PERFORMANCE OPTIMIZATIONS
# cuDNN is configured for speed rather than reproducibility.
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

# Device selection: use CUDA whenever torch reports it as available.
use_gpu = torch.cuda.is_available()
device = "cpu"
if use_gpu:
    device = "cuda"

print(f"[INFO] Using device: {device}")
if use_gpu:
    # Total memory of GPU 0, converted from bytes to GiB for the log line.
    print(f"[INFO] GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

# ✅ OPTIMIZED XTTS Model Initialization
# The model is loaded once at import time. If anything goes wrong the error is
# printed and `tts` is left as None so request handlers can report it.
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"


def _load_xtts():
    """Create the XTTS engine, or return None (after logging) on any failure."""
    try:
        # progress_bar=False: disable the progress bar for speed.
        engine = TTS(model_name, gpu=use_gpu, progress_bar=False)

        fully_loaded = hasattr(engine, "synthesizer") and hasattr(engine.synthesizer, "tts_model")
        if not fully_loaded:
            raise RuntimeError("XTTS model failed to load correctly.")

        # 🚀 PERFORMANCE TWEAKS
        # NOTE(review): these attribute names look like VITS-style inference
        # settings; whether the XTTS model reads them is not visible here —
        # confirm they have an effect.
        if hasattr(engine.synthesizer.tts_model, 'inference'):
            engine.synthesizer.tts_model.inference_noise_scale = 0.667
            engine.synthesizer.tts_model.inference_noise_scale_w = 0.8
            engine.synthesizer.tts_model.length_scale = 1.0

        print(f"[INFO] XTTS model loaded successfully. GPU enabled: {use_gpu}")
        return engine

    except Exception as e:
        print(f"[ERROR] Failed to initialize XTTS model: {str(e)}")
        return None


tts = _load_xtts()

# 🚀 AUDIO PREPROCESSING FOR SPEED
def preprocess_audio(audio_path, target_sr=22050, max_duration=30):
    """Prepare a reference clip so that voice cloning has less audio to process.

    The clip is read with ``soundfile``, down-mixed to mono, trimmed of
    leading/trailing near-silence (keeping 0.1 s of padding on each side),
    capped at ``max_duration`` seconds, resampled to ``target_sr`` and written
    to a new temporary WAV file.

    Args:
        audio_path: Path of the audio file to preprocess.
        target_sr: Sample rate, in Hz, of the file that is written.
        max_duration: Maximum length, in seconds, kept after trimming.

    Returns:
        Path of the newly written temporary WAV file; the caller is
        responsible for deleting it. If any step fails, a warning is printed
        and ``audio_path`` is returned unchanged (best-effort behaviour).
    """
    temp_path = None
    try:
        audio_data, sr = sf.read(audio_path)

        # Multi-channel input: average the channels into a single mono track.
        if audio_data.ndim > 1:
            audio_data = np.mean(audio_data, axis=1)

        # Simple silence trimming: anything below 1% of the peak amplitude
        # counts as silence.
        threshold = np.max(np.abs(audio_data)) * 0.01
        non_silent = np.where(np.abs(audio_data) > threshold)[0]

        if len(non_silent) > 0:
            padding = int(0.1 * sr)  # keep 0.1 s around the voiced region
            start_idx = max(0, non_silent[0] - padding)
            # Slice ends are exclusive, hence the +1 so the last non-silent
            # sample (plus the full padding) is retained.
            end_idx = min(len(audio_data), non_silent[-1] + padding + 1)
            audio_data = audio_data[start_idx:end_idx]

        # Limit duration for faster processing.
        max_samples = int(max_duration * sr)
        if len(audio_data) > max_samples:
            audio_data = audio_data[:max_samples]

        # Resample only when the source rate differs from the target rate.
        if sr != target_sr:
            from scipy.signal import resample
            audio_data = resample(audio_data, int(len(audio_data) * target_sr / sr))

        # mkstemp creates the file with a unique name in one step; the
        # deprecated mktemp only returns a name and is open to races.
        fd, temp_path = tempfile.mkstemp(suffix='.wav')
        os.close(fd)
        sf.write(temp_path, audio_data, target_sr)

        return temp_path

    except Exception as e:
        print(f"[WARNING] Audio preprocessing failed: {e}")
        # Do not leave a half-written temp file behind.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass
        return audio_path

# 🚀 OPTIMIZED TEXT PROCESSING
def optimize_text(text, max_length=500):
    """Shorten and normalize text before synthesis.

    Text longer than ``max_length`` is cut back to the leading run of
    period-terminated sentences that fits. If not even the first sentence
    fits (or the text contains no period), the text is truncated to
    ``max_length`` characters, backing up to the previous word boundary when
    the cut would fall inside a word.

    Args:
        text: The text to prepare.
        max_length: Soft limit on the number of characters kept. The closing
            period added below may push the result to ``max_length + 1``.

    Returns:
        The stripped text, guaranteed to end with ``.``, ``!`` or ``?``.
        Blank input yields ``"."``.
    """
    text = text.strip()

    if len(text) > max_length:
        # Collect whole sentences while they still fit within the limit.
        kept_sentences = []
        used = 0
        for sentence in text.split('.'):
            if used + len(sentence) > max_length:
                break
            kept_sentences.append(sentence)
            used += len(sentence) + 1  # +1 for the period that followed it
        kept = ".".join(kept_sentences).rstrip('.').strip()

        if kept:
            text = kept
        else:
            # No complete sentence fits: fall back to a hard cut instead of
            # collapsing the whole input to a lone ".".
            truncated = text[:max_length]
            cut_inside_word = not (
                truncated[-1:].isspace() or text[max_length].isspace()
            )
            if cut_inside_word:
                words = truncated.rsplit(None, 1)
                if len(words) == 2:
                    truncated = words[0]  # drop the partial trailing word
            text = truncated.rstrip()

    # Make sure the text ends with sentence-final punctuation.
    if not text.endswith(('.', '!', '?')):
        text += '.'

    return text

# ✅ OPTIMIZED clone() Function
def _remove_file_quietly(path):
    """Delete ``path`` if possible; filesystem errors are ignored (best-effort cleanup)."""
    try:
        os.remove(path)
    except OSError:
        pass


def clone(text, audio):
    """Synthesize ``text`` in the voice of the reference recording ``audio``.

    The reference clip is run through ``preprocess_audio`` and the text
    through ``optimize_text`` before the module-level ``tts`` engine writes
    the result to ``./output.wav``.

    Args:
        text: Text to speak.
        audio: Filesystem path of the reference voice recording.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is the
        generated WAV file, or ``None`` when nothing was generated, in which
        case ``status_message`` describes the problem. Errors raised during
        processing are caught and reported through the status message.
    """
    if tts is None:
        return None, "⚠ XTTS model failed to load."

    # Whitespace-only text is treated as missing; it would otherwise be
    # normalized to a lone "." and synthesized.
    blank_text = isinstance(text, str) and not text.strip()
    if not text or blank_text or not audio:
        return None, "⚠ Error: Missing text or audio input."

    import time
    start_time = time.time()
    processed_audio = None

    try:
        # ✅ Validate audio input: it must be a path to an existing file.
        if not isinstance(audio, str) or not os.path.exists(audio):
            return None, "⚠ Error: Invalid audio input format."

        # 🚀 PREPROCESSING FOR SPEED
        print("[INFO] Preprocessing audio...")
        processed_audio = preprocess_audio(audio)

        print("[INFO] Optimizing text...")
        optimized_text = optimize_text(text)
        print(f"[INFO] Text length: {len(optimized_text)} characters")

        # NOTE(review): every request writes to this same path; requests
        # handled concurrently would overwrite each other's output.
        output_path = "./output.wav"
        # Remove any result left by an earlier request so the validation
        # below cannot mistake stale audio for freshly generated output.
        _remove_file_quietly(output_path)

        # 🚀 OPTIMIZED XTTS Processing
        print("[INFO] Generating speech...")

        # Clear GPU cache before processing
        if use_gpu:
            torch.cuda.empty_cache()

        tts.tts_to_file(
            text=optimized_text,
            speaker_wav=processed_audio,
            language="en",
            file_path=output_path,
            split_sentences=True,  # Better for long texts
        )

        # ✅ Validate output
        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            return None, "⚠ Error: XTTS failed to generate audio."

        # 🚀 PERFORMANCE METRICS
        processing_time = time.time() - start_time

        # Real-time factor = processing time / duration of generated audio.
        audio_data, sr = sf.read(output_path)
        audio_duration = len(audio_data) / sr
        rtf = processing_time / audio_duration if audio_duration > 0 else 0

        print(f"[PERFORMANCE] Processing time: {processing_time:.2f}s")
        print(f"[PERFORMANCE] Audio duration: {audio_duration:.2f}s")
        print(f"[PERFORMANCE] Real-time factor: {rtf:.2f}x")

        return output_path, f"✅ Generated in {processing_time:.1f}s (RTF: {rtf:.1f}x)"

    except Exception as e:
        print(f"[ERROR] XTTS Processing Error: {str(e)}")
        return None, f"⚠ Error: {str(e)}"

    finally:
        # Runs on success, early return and error alike, so the temporary
        # reference clip is never leaked and memory is always released.
        # preprocess_audio returns the original path when it fails; never
        # delete the caller's own file.
        if processed_audio is not None and processed_audio != audio:
            _remove_file_quietly(processed_audio)
        if use_gpu:
            torch.cuda.empty_cache()
        gc.collect()

# 🚀 OPTIMIZED Gradio Interface
def create_interface():
    """Assemble the Gradio Blocks UI for the voice-cloning demo.

    Layout: the left column holds the text box, the reference-audio input and
    the Generate/Clear buttons; the right column holds the status box and the
    generated-audio player; a performance-tips note sits underneath.

    Returns:
        The configured ``gr.Blocks`` instance; launching it is left to the
        caller.
    """

    def _blank_values():
        # One value per component wired to the Clear button, in the order
        # text, reference audio, generated audio, status.
        return (None, None, None, "")

    soft_teal = gr.themes.Soft(primary_hue="teal")

    with gr.Blocks(theme=soft_teal, title="⚡ Fast Voice Clone") as demo:

        gr.Markdown("# ⚡ Optimized Voice Cloning with XTTS")
        gr.Markdown("*Faster processing with quality optimizations*")

        with gr.Row():
            # Left column: user inputs and action buttons.
            with gr.Column():
                prompt_box = gr.Textbox(
                    label="📝 Text to speak",
                    placeholder="Enter text here (max 500 chars for optimal speed)...",
                    lines=3,
                    max_lines=5,
                )

                reference_clip = gr.Audio(
                    type='filepath',
                    label='🎤 Voice reference (10-30 seconds recommended)',
                    sources=['upload', 'microphone'],
                )

                with gr.Row():
                    run_button = gr.Button("🚀 Generate Voice", variant="primary")
                    reset_button = gr.Button("🗑️ Clear", variant="secondary")

            # Right column: status message and generated audio.
            with gr.Column():
                status_box = gr.Textbox(
                    label="📊 Status",
                    interactive=False,
                    lines=2,
                )

                result_player = gr.Audio(
                    type='filepath',
                    label='🔊 Generated Audio',
                )

        # Performance tips
        gr.Markdown("""
        ### 🚀 Performance Tips:
        - Keep text under 500 characters for fastest processing
        - Use 10-30 second reference audio clips
        - GPU processing is ~5-10x faster than CPU
        - Clear audio with minimal background noise works best
        """)

        # Generate: run clone() on the text and reference clip, then show the
        # resulting audio and status message.
        run_button.click(
            fn=clone,
            inputs=[prompt_box, reference_clip],
            outputs=[result_player, status_box],
            show_progress=True,
        )

        # Clear: reset all four components.
        reset_button.click(
            fn=_blank_values,
            outputs=[prompt_box, reference_clip, result_player, status_box],
        )

    return demo

# ✅ Launch optimized interface
if __name__ == "__main__":
    # Launch settings gathered in one place: host 0.0.0.0, port 7860, no
    # share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
        "quiet": False,
    }
    iface = create_interface()
    iface.launch(**launch_options)