import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
import time
import os

# --- Configuration ---
BASE_MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
# NOW, this points to your model on the Hugging Face Hub
FINETUNED_MODEL_ID = "serhany/cineguide-qwen2.5-7b-instruct-ft"

# System prompts (same as before)
SYSTEM_PROMPT_CINEGUIDE = """You are CineGuide, a knowledgeable and friendly movie recommendation assistant. Your goal is to:
1. Provide personalized movie recommendations based on user preferences
2. Give brief, compelling rationales for why you recommend each movie
3. Ask thoughtful follow-up questions to better understand user tastes
4. Maintain an enthusiastic but not overwhelming tone about cinema

When recommending movies, always explain WHY the movie fits their preferences."""

SYSTEM_PROMPT_BASE = "You are a helpful AI assistant."

# --- Model Loading ---
_models_cache = {}

def get_model_and_tokenizer(model_id_or_path, is_local_path=False):  # Added is_local_path for flexibility
    if model_id_or_path in _models_cache:
        return _models_cache[model_id_or_path]

    print(f"Loading model: {model_id_or_path}")
    # For models from the Hub, trust_remote_code is often needed for custom architectures like Qwen.
    # For local paths, it might also be needed if they were saved with trust_remote_code=True.
    tokenizer = AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id_or_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
        # attn_implementation="flash_attention_2"  # Optional
    )
    model.eval()

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Ensure pad_token_id is also set if pad_token is set
    if hasattr(tokenizer, "pad_token_id") and tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    _models_cache[model_id_or_path] = (model, tokenizer)
    print(f"Finished loading: {model_id_or_path}")
    return model, tokenizer

print("Pre-loading models...")
model_base, tokenizer_base = None, None
model_ft, tokenizer_ft = None, None

try:
    model_base, tokenizer_base = get_model_and_tokenizer(BASE_MODEL_ID)
    print("Base model loaded.")
except Exception as e:
    print(f"Error loading base model ({BASE_MODEL_ID}): {e}")

try:
    model_ft, tokenizer_ft = get_model_and_tokenizer(FINETUNED_MODEL_ID)
    print("Fine-tuned model loaded.")
except Exception as e:
    print(f"Error loading fine-tuned model ({FINETUNED_MODEL_ID}): {e}")

print("Model pre-loading complete.")
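
# Memory budget: in bfloat16 (2 bytes/param), each 7B checkpoint needs roughly
# 15 GB, so loading both models above is ~30 GB of accelerator memory before
# activations; device_map="auto" will spill to CPU on smaller GPUs, at a speed
# cost. If memory is tight, a 4-bit load via bitsandbytes is one option -- this
# is an optional sketch, not something the app above uses:
#
#   from transformers import BitsAndBytesConfig
#   bnb_config = BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_quant_type="nf4",
#       bnb_4bit_compute_dtype=torch.bfloat16,
#   )
#   model = AutoModelForCausalLM.from_pretrained(
#       model_id_or_path, quantization_config=bnb_config, device_map="auto"
#   )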
# --- Inference Function (generate_chat_response) ---
# This function remains largely the same as in the previous app.py.
# Make sure it uses `model_base, tokenizer_base` and `model_ft, tokenizer_ft` correctly.
def generate_chat_response(message: str, chat_history: list, model_type: str):
    # ... (Keep the exact same generate_chat_response function from the previous app.py)
    if model_type == "base":
        if model_base is None or tokenizer_base is None:
            yield f"Base model ({BASE_MODEL_ID}) is not available."
            return
        model, tokenizer = model_base, tokenizer_base
        system_prompt = SYSTEM_PROMPT_BASE
    elif model_type == "finetuned":
        if model_ft is None or tokenizer_ft is None:
            yield f"Fine-tuned model ({FINETUNED_MODEL_ID}) is not available."
            return
        model, tokenizer = model_ft, tokenizer_ft
        system_prompt = SYSTEM_PROMPT_CINEGUIDE
    else:
        yield "Invalid model type."
        return

    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    for user_msg, assistant_msg in chat_history:
        if user_msg:  # Ensure user_msg is not None
            conversation.append({"role": "user", "content": user_msg})
        if assistant_msg:  # Ensure assistant_msg is not None
            conversation.append({"role": "assistant", "content": assistant_msg})
    conversation.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1800).to(model.device)

    full_response = ""
    # Make sure eos_token_id is a list if multiple EOS tokens are possible
    eos_tokens_ids = [tokenizer.eos_token_id]
    im_end_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
    if im_end_id != tokenizer.unk_token_id:  # Check if <|im_end|> is in vocab
        eos_tokens_ids.append(im_end_id)

    generated_token_ids = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.pad_token_id,  # Use pad_token_id
        eos_token_id=eos_tokens_ids
    )
    new_tokens = generated_token_ids[0, inputs['input_ids'].shape[1]:]
    response_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    response_text = response_text.replace("<|im_end|>", "").strip()

    for char in response_text:
        full_response += char
        time.sleep(0.005)
        yield full_response

def respond_base(message, chat_history):
    yield from generate_chat_response(message, chat_history, "base")

def respond_finetuned(message, chat_history):
    yield from generate_chat_response(message, chat_history, "finetuned")

# --- Gradio UI (with gr.Blocks as demo:) ---
# This part remains largely the same as the previous app.py.
# Ensure the Markdown and labels correctly reference the models being loaded.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        f"""
# 🎬 CineGuide vs. Base {BASE_MODEL_ID}
Compare the fine-tuned CineGuide movie recommender (loaded from `{FINETUNED_MODEL_ID}`) with the base {BASE_MODEL_ID} model.
Type your movie-related query below and see how each model responds!
"""
    )
    # ... (Rest of the UI definition: Rows, Columns, Chatbots, Textbox, Button, Examples)
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(f"## 🗣️ Base {BASE_MODEL_ID}")
            chatbot_base = gr.Chatbot(label="Base Model Chat", height=500, bubble_full_width=False)
            if model_base is None:
                gr.Markdown(f"⚠️ Base model ({BASE_MODEL_ID}) could not be loaded.")
        with gr.Column(scale=1):
            gr.Markdown(f"## 🤖 Fine-tuned CineGuide (from {FINETUNED_MODEL_ID})")
            chatbot_ft = gr.Chatbot(label="CineGuide Chat", height=500, bubble_full_width=False)
            if model_ft is None:
                gr.Markdown(f"⚠️ Fine-tuned model ({FINETUNED_MODEL_ID}) could not be loaded.")

    with gr.Row():
        shared_input_textbox = gr.Textbox(
            show_label=False,
            placeholder="Enter your movie query here and press Enter...",
            container=False,
            scale=7,
        )
        submit_button = gr.Button("✉️ Send", variant="primary", scale=1)

    gr.Examples(
        examples=[
            "Hi! I'm looking for something funny to watch tonight.",
            "I love dry, witty humor more than slapstick. Think more British comedy style.",
            "I'm really into complex sci-fi movies that make you think. I loved Arrival and Blade Runner 2049.",
            "I need help planning a family movie night. We have kids aged 8, 11, and 14, plus adults.",
            "I'm going through a tough breakup and need something uplifting but not cheesy romantic.",
            "I loved Parasite and want to explore more international cinema. Where should I start?",
        ],
        inputs=[shared_input_textbox],
        label="Example Prompts (click to use)"
    )
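
    # The two Chatbot components above use Gradio's tuple-style history: a list
    # of (user_message, assistant_message) pairs, which is the structure
    # generate_chat_response() iterates over, e.g.
    #   [("Hi! Any good thrillers?", "Sure -- have you seen Prisoners?")]
    # The predict helpers below append a (user_message, "") placeholder and then
    # overwrite its assistant half with each streamed chunk.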
    def base_model_predict(user_message, chat_history):
        if model_base is None:  # Add this check
            chat_history.append((user_message, f"Base model ({BASE_MODEL_ID}) is not available."))
            yield chat_history
            return
        chat_history.append((user_message, ""))
        for response_chunk in respond_base(user_message, chat_history[:-1]):
            chat_history[-1] = (user_message, response_chunk)
            yield chat_history

    def ft_model_predict(user_message, chat_history):
        if model_ft is None:  # Add this check
            chat_history.append((user_message, f"Fine-tuned model ({FINETUNED_MODEL_ID}) is not available."))
            yield chat_history
            return
        chat_history.append((user_message, ""))
        for response_chunk in respond_finetuned(user_message, chat_history[:-1]):
            chat_history[-1] = (user_message, response_chunk)
            yield chat_history

    # Event handlers
    actions = []
    if model_base is not None:
        actions.append(
            shared_input_textbox.submit(
                base_model_predict,
                [shared_input_textbox, chatbot_base],
                [chatbot_base],
                queue=True
            )
        )
        actions.append(
            submit_button.click(
                base_model_predict,
                [shared_input_textbox, chatbot_base],
                [chatbot_base],
                queue=True
            )
        )

    if model_ft is not None:
        actions.append(
            shared_input_textbox.submit(
                ft_model_predict,
                [shared_input_textbox, chatbot_ft],
                [chatbot_ft],
                queue=True
            )
        )
        actions.append(
            submit_button.click(
                ft_model_predict,
                [shared_input_textbox, chatbot_ft],
                [chatbot_ft],
                queue=True
            )
        )

    # Clear the textbox after all submits are queued. This is slightly simplified;
    # for a more robust clear, you might need to chain these events or use gr.Group.
    def clear_textbox_fn():
        return ""

    if actions:  # If any model is active
        shared_input_textbox.submit(clear_textbox_fn, [], [shared_input_textbox])
        submit_button.click(clear_textbox_fn, [], [shared_input_textbox])

# --- Launch the App ---
if __name__ == "__main__":
    demo.queue()
    demo.launch(debug=True)  # share=True for public link if running locally
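
# Rough dependency list for running this app (assumed, not pinned by the script
# itself): gradio, torch, transformers, and accelerate (needed for
# device_map="auto"). Launch locally with `python app.py`; Gradio serves on
# http://127.0.0.1:7860 by default.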