# Gradio demo for VyvoTTS-LFM2 multi-speaker text-to-speech (Hugging Face Spaces app)
import spaces
import torch
import gradio as gr
from snac import SNAC
from transformers import AutoModelForCausalLM, AutoTokenizer

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading SNAC model...")
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
snac_model = snac_model.to(device)

# Single model trained on all voices
MODEL_NAME = "Vyvo/VyvoTTS-LFM2-Multi-Speaker"
AVAILABLE_VOICES = [
    "Stephen_Fry", "Tighnari", "Thoma", "Shikanoin_Heizou", "Noelle",
    "Ningguang", "Nilou", "Neuvillette", "Navia", "Nahida", "Mualani",
    "Lyney", "Lynette", "Layla", "Kaveh", "Kaeya", "Furina", "Dehya",
    "Cyno", "Collei", "Beidou", "Alhaitham", "Arataki_Itto",
]

# Load the single multi-speaker model
print(f"Loading model: {MODEL_NAME}")
try:
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16)
    model = model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    print("✓ Model loaded successfully!")
except Exception as e:
    print(f"✗ Failed to load model: {e}")
    model = None
    tokenizer = None

# LFM2 special-token configuration (control tokens appended after the text vocabulary)
TOKENIZER_LENGTH = 64400
START_OF_TEXT = 1
END_OF_TEXT = 7
START_OF_SPEECH = TOKENIZER_LENGTH + 1
END_OF_SPEECH = TOKENIZER_LENGTH + 2
START_OF_HUMAN = TOKENIZER_LENGTH + 3
END_OF_HUMAN = TOKENIZER_LENGTH + 4
START_OF_AI = TOKENIZER_LENGTH + 5
END_OF_AI = TOKENIZER_LENGTH + 6
PAD_TOKEN = TOKENIZER_LENGTH + 7
AUDIO_TOKENS_START = TOKENIZER_LENGTH + 10


# Process a text prompt for LFM2 with the chosen voice
def process_prompt(prompt, voice, tokenizer, device):
    # Voice-aware prompt formatting
    if voice:
        formatted_prompt = f"[VOICE:{voice}] {prompt}"
    else:
        formatted_prompt = prompt

    input_ids = tokenizer(formatted_prompt, return_tensors="pt").input_ids
    start_token = torch.tensor([[START_OF_HUMAN]], dtype=torch.int64)
    end_tokens = torch.tensor([[END_OF_TEXT, END_OF_HUMAN]], dtype=torch.int64)
    modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)

    # No padding needed for a single input
    attention_mask = torch.ones_like(modified_input_ids)
    return modified_input_ids.to(device), attention_mask.to(device)


# Parse generated tokens into SNAC audio codes
def parse_output(generated_ids):
    token_to_find = START_OF_SPEECH
    token_to_remove = END_OF_SPEECH

    # Keep only the tokens after the last START_OF_SPEECH marker
    token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
    if len(token_indices[1]) > 0:
        last_occurrence_idx = token_indices[1][-1].item()
        cropped_tensor = generated_ids[:, last_occurrence_idx + 1:]
    else:
        cropped_tensor = generated_ids

    processed_rows = []
    for row in cropped_tensor:
        masked_row = row[row != token_to_remove]
        processed_rows.append(masked_row)

    code_lists = []
    for row in processed_rows:
        # SNAC codes come in frames of 7 tokens; trim to a whole number of frames
        row_length = row.size(0)
        new_length = (row_length // 7) * 7
        trimmed_row = row[:new_length]
        trimmed_row = [t - AUDIO_TOKENS_START for t in trimmed_row]
        code_lists.append(trimmed_row)

    return code_lists[0]  # Return just the first one for a single sample


# Redistribute the flat code list across the three SNAC codebook layers
def redistribute_codes(code_list, snac_model):
    device = next(snac_model.parameters()).device  # Get the device of the SNAC model

    # Each 7-token frame carries 1 coarse, 2 medium, and 4 fine codes, and each
    # slot is offset by a multiple of the 4096-entry codebook size. parse_output
    # already trims to a multiple of 7, so the +1 below does not change the count.
    layer_1 = []
    layer_2 = []
    layer_3 = []
    for i in range((len(code_list) + 1) // 7):
        layer_1.append(code_list[7 * i])
        layer_2.append(code_list[7 * i + 1] - 4096)
        layer_3.append(code_list[7 * i + 2] - (2 * 4096))
        layer_3.append(code_list[7 * i + 3] - (3 * 4096))
        layer_2.append(code_list[7 * i + 4] - (4 * 4096))
        layer_3.append(code_list[7 * i + 5] - (5 * 4096))
        layer_3.append(code_list[7 * i + 6] - (6 * 4096))
    # Move the tensors to the same device as the SNAC model
    codes = [
        torch.tensor(layer_1, device=device).unsqueeze(0),
        torch.tensor(layer_2, device=device).unsqueeze(0),
        torch.tensor(layer_3, device=device).unsqueeze(0),
    ]
    audio_hat = snac_model.decode(codes)
    return audio_hat.detach().squeeze().cpu().numpy()  # Always return a CPU numpy array


# Main generation function
@spaces.GPU()
def generate_speech(text, voice_choice, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
    if not text.strip():
        return None
    if model is None or tokenizer is None:
        return None

    try:
        progress(0.1, f"🔄 Processing text with {voice_choice} voice...")
        # Build the prompt with the selected voice
        input_ids, attention_mask = process_prompt(text, voice_choice, tokenizer, device)

        progress(0.3, "🎵 Generating speech tokens...")
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temperature,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                num_return_sequences=1,
                eos_token_id=END_OF_SPEECH,
            )

        progress(0.6, "🔧 Processing speech tokens...")
        code_list = parse_output(generated_ids)

        progress(0.8, "🎧 Converting to audio...")
        audio_samples = redistribute_codes(code_list, snac_model)

        progress(1.0, f"✅ Completed with {voice_choice}!")
        return (24000, audio_samples)  # SNAC operates at 24 kHz
    except Exception as e:
        print(f"Error generating speech with {voice_choice}: {e}")
        return None


# Example texts for the Genshin characters
EXAMPLE_TEXTS = [
    "Hello! I am ready to help you on your adventure in Teyvat.",
    "The wind brings new adventures and ancient secrets.",
    "Let me share the wisdom of the elements with you.",
    "Together we can explore the mysteries of this world.",
    "Every journey begins with a single step forward.",
    "The stars above guide us through the darkest nights.",
]

# Create the Gradio interface
with gr.Blocks(
    title="🎮 Genshin Voice TTS",
    theme=gr.themes.Soft(),
    css="""
    .gradio-textbox textarea {
        background-color: #6b7280 !important;
        color: white !important;
    }
    .gradio-audio {
        background-color: #6b7280 !important;
    }
    .character-category {
        margin: 10px 0;
        padding: 10px;
        border-radius: 8px;
        background-color: #f0f0f0;
    }
    """,
) as demo:
    # Header section
    gr.Markdown("""
    # 🎮 VyvoTTS-LFM2
    ### 🎭 Character Voices | 🔗 [Github](https://github.com/Vyvo-Labs/VyvoTTS) | 🤗 [HF Model](https://huggingface.co/collections/Vyvo/lfm2-tts-689eedae5353ff5b048efd55)
    """)

    gr.Markdown("""
    VyvoTTS is a text-to-speech model by the Vyvo team built on the LFM2 architecture and trained on multiple diverse open-source datasets.
    Since some datasets may contain transcription errors or quality issues, output quality can vary; higher-quality datasets typically produce better speech synthesis results.

    **Roadmap:**
    - [ ] Transformers.js support
    - [ ] Pretrained model release
    - [ ] vLLM support
    - [x] Training and inference code release
    """)

    with gr.Row():
        with gr.Column(scale=2):
            # Text input section
            text_input = gr.Textbox(
                label="📝 Text Input",
                placeholder="Enter the text you want your chosen character to say...",
                lines=6,
                max_lines=10,
            )

            # Voice selection
            voice_choice = gr.Dropdown(
                choices=AVAILABLE_VOICES,
                value="Tighnari",
                label="🎤 Character Voice",
                info=f"Choose from {len(AVAILABLE_VOICES)} available character voices",
            )

            # Advanced settings
            with gr.Accordion("⚙️ Advanced Settings", open=False):
                temperature = gr.Slider(
                    minimum=0.1, maximum=1.5, value=0.6, step=0.05,
                    label="🌡️ Temperature",
                    info="Higher values create more expressive but less stable speech",
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="🎯 Top P",
                    info="Nucleus sampling threshold value",
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
                    label="🔄 Repetition Penalty",
                    info="Higher values discourage repetitive patterns",
                )
                max_new_tokens = gr.Slider(
                    minimum=100, maximum=2000, value=1200, step=100,
                    label="📏 Maximum Length",
                    info="Maximum length of the generated audio (in tokens)",
                )

            # Action buttons
            with gr.Row():
                submit_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
                clear_btn = gr.Button("🗑️ Clear", size="lg")

        with gr.Column(scale=1):
            # Output section
            audio_output = gr.Audio(
                label="🎧 Generated Audio",
                type="numpy",
                interactive=False,
            )

    # Example texts, two per column
    gr.Markdown("### 📚 Example Texts")
    with gr.Row():
        for i in range(0, len(EXAMPLE_TEXTS), 2):
            with gr.Column():
                if i < len(EXAMPLE_TEXTS):
                    example_btn = gr.Button(
                        EXAMPLE_TEXTS[i][:50] + "..." if len(EXAMPLE_TEXTS[i]) > 50 else EXAMPLE_TEXTS[i],
                        size="sm",
                    )
                    # Bind the text as a default argument so each button keeps its own value
                    example_btn.click(fn=lambda text=EXAMPLE_TEXTS[i]: text, outputs=text_input)
                if i + 1 < len(EXAMPLE_TEXTS):
                    example_btn2 = gr.Button(
                        EXAMPLE_TEXTS[i + 1][:50] + "..." if len(EXAMPLE_TEXTS[i + 1]) > 50 else EXAMPLE_TEXTS[i + 1],
                        size="sm",
                    )
                    example_btn2.click(fn=lambda text=EXAMPLE_TEXTS[i + 1]: text, outputs=text_input)

    # Set up event handlers
    submit_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_choice, temperature, top_p, repetition_penalty, max_new_tokens],
        outputs=audio_output,
        show_progress=True,
    )

    def clear_interface():
        return "", None

    clear_btn.click(
        fn=clear_interface,
        inputs=[],
        outputs=[text_input, audio_output],
    )

# Launch the app
if __name__ == "__main__":
    demo.queue().launch(share=False, ssr_mode=False)