import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# ---------------- CONFIG ----------------
MODEL_NAME = "google/gemma-3-270m-it"  # ✅ instruction-tuned Gemma 3 model

SYSTEM_PROMPT_DEFAULT = (
    "You are a formal and polite AI assistant. "
    "Always respond appropriately depending on the selected explanation style."
)

MAX_NEW_TOKENS_DEFAULT = 256
TEMP_DEFAULT = 0.7
TOP_P_DEFAULT = 0.9

# ---------------- LOAD MODEL ----------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,  # ✅ safe for CPU
)

generator = pipeline(
    "text-generation",  # ✅ causal LM (not seq2seq)
    model=model,
    tokenizer=tokenizer,
    device=-1,  # ✅ force CPU
)

# ---------------- HELPERS ----------------
def format_prompt(chat_history, user_message, system_message, response_style):
    # Start with the system message
    prompt = system_message + "\n\n"

    # Add only user messages (optional: you can also add the last assistant reply if needed)
    for turn in chat_history:
        if turn["role"] == "user":
            prompt += f"{turn['content']}\n"

    # Add the new user message
    prompt += f"{user_message}\n"

    # Optionally instruct for explanation style
    if response_style == "No explanation":
        prompt += "Answer concisely with no explanation."
    elif response_style == "Short explanation":
        prompt += "Answer briefly with a one-sentence explanation."
    elif response_style == "Detailed explanation":
        prompt += "Answer in detail with reasoning and examples."

    return prompt

# ---------------- CHAT FUNCTION ----------------
def chat(user_message, chat_history, system_message, max_tokens, temperature, top_p, response_style):
    chat_history = chat_history or []

    prompt = format_prompt(chat_history, user_message, system_message, response_style)

    output = generator(
        prompt,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )[0]["generated_text"]

    # For causal LMs, the output includes the prompt → strip it
    response = output[len(prompt):].strip()

    # Save user and assistant content without labels
    chat_history.append({"role": "user", "content": user_message})
    chat_history.append({"role": "assistant", "content": response})

    return "", chat_history

# ---------------- UI ----------------
with gr.Blocks(theme=gr.themes.Soft(primary_hue="violet", secondary_hue="pink")) as demo:
    gr.Markdown("# 🧠 Gemma-3-270M Chat Assistant (CPU-safe)")

    chatbot = gr.Chatbot(type="messages", height=500, show_copy_button=True)

    with gr.Row():
        msg = gr.Textbox(label="💬 Your Message", placeholder="Type here…", scale=6)
        send_btn = gr.Button("🚀 Send", variant="primary", scale=1)
        clear_btn = gr.Button("🧹 Clear Chat", scale=1)

    with gr.Accordion("⚙️ Advanced Settings", open=False):
        system_prompt = gr.Textbox(label="System Prompt", value=SYSTEM_PROMPT_DEFAULT, lines=3)
        response_style = gr.Dropdown(
            ["No explanation", "Short explanation", "Detailed explanation"],
            value="Detailed explanation",
            label="Response Style",
        )
        temperature = gr.Slider(0.1, 1.5, value=TEMP_DEFAULT, step=0.1, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=TOP_P_DEFAULT, step=0.05, label="Top-p")
        max_tokens = gr.Slider(32, 512, value=MAX_NEW_TOKENS_DEFAULT, step=16, label="Max new tokens")

    send_btn.click(
        chat,
        [msg, chatbot, system_prompt, max_tokens, temperature, top_p, response_style],
        [msg, chatbot],
    )
    msg.submit(
        chat,
        [msg, chatbot, system_prompt, max_tokens, temperature, top_p, response_style],
        [msg, chatbot],
    )
    clear_btn.click(lambda: [], None, chatbot, queue=False)

if __name__ == "__main__":
    demo.launch()