# app.py — Corrected for Hugging Face ZeroGPU Spaces
# ---------------------------------------------------------------
# This version is adapted for the ZeroGPU environment by using
# the @spaces.GPU decorator.
# ---------------------------------------------------------------
import os
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces # 1. Import the spaces library

IS_CUDA = torch.cuda.is_available()
IS_ZEROGPU = True if os.getenv("SPACES_ZERO_GPU", None) else False
if IS_ZEROGPU:
    torch.compiler.set_stance("force_eager")
    torch.set_float32_matmul_precision("high")
    torch.backends.cuda.matmul.allow_tf32 = True

# ── Configuration ────────────────────────────────────────────────────────────
MODEL_ID = "Reubencf/gemma3-konkani"
HF_TOKEN = os.getenv("HF_TOKEN", None)

TITLE = "Konkani LLM Fine Tuned on Gemma 3"
DESCRIPTION = (
    "Version 1 of the Konkani LLM.\n"
    "This release may contain inconsistencies, but improvements will follow in future updates."
)

# ── Loading ──────────────────────────────────────────────────────────────────
print(f"[Init] Loading model pipeline for the first time: {MODEL_ID}...")
def load_model():
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
        kwargs = {"attn_implementation": "sdpa"} if IS_CUDA else {}
        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16 if IS_CUDA else torch.float32,
                                                     device_map="auto", token=HF_TOKEN, **kwargs)
        print("[Init] Model loaded successfully.")
        return model, tokenizer
    except Exception as e:
        # If model loading fails, we can't proceed.
        print(f"[Fatal] Could not load model: {e}")
        raise Exception(f"❌ Model failed to load: {e}")

model, tokenizer = load_model()

DEF_TOKENS = 256
DEF_TEMPERATURE = 0.7
DEF_TOPK = 50
DEF_TOPP = 0.95
DEF_DURATION = 10

def get_duration(message, history=[], system_message="", max_tokens=DEF_TOKENS, temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP, duration=DEF_DURATION):
    return int(duration if duration is not None else DEF_DURATION)

# ── Generation Function ──────────────────────────────────────────────────────
@spaces.GPU(duration=get_duration) # 2. Decorate the function that needs the GPU
@torch.inference_mode()
def generate_response(message, history=[], system_message="", max_tokens=DEF_TOKENS, temperature=DEF_TEMPERATURE, top_k=DEF_TOPK, top_p=DEF_TOPP, duration=DEF_DURATION):
    """
    This function is called for each user message.
    The @spaces.GPU decorator ensures a GPU is allocated when this runs.
    """
    try:
        # Format the conversation history
        conversation = []
        if system_message: conversation.append({"role": "system", "content": system_message})
        for msg in history: # https://www.gradio.app/docs/gradio/chatbot
            if not isinstance(msg, dict) or not {"role", "content"}.issubset(msg.keys()): continue
            conversation.append({"role": msg["role"], "content": msg["content"]})

        # Add the current user's message
        conversation.append({"role": "user", "content": message})

        # Apply the chat template
        inputs = tokenizer.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to(model.device)

        # Generate the response
        gen_kwargs = dict(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            #eos_token_id=tokenizer.eos_token_id,
            #num_beams=1,
            output_scores=False,
            cache_implementation="static", # https://github.com/huggingface/transformers/issues/38501
        )
        outputs = model.generate(**gen_kwargs)

        # Extract only the newly generated text
        gen_ids = outputs[0][inputs["input_ids"].shape[-1]:]
        new_response = tokenizer.decode(gen_ids, skip_special_tokens=True)

        return new_response
    except Exception as e:
        print(f"Error: {e}")
        gr.Warning(f"Error: {e}")
        return ""

# ── UI ────────────────────────────────────────────────────────────────────────
examples = [
    ["Translate From English to Devnagri Konkani: what is color?"],
    ["घरांत विजेचो वापर उणो करपाची येवजण तयार करप."],
]

demo = gr.ChatInterface(
    fn=generate_response,
    type="messages",
    title=TITLE,
    description=DESCRIPTION,
    examples=examples,
    cache_examples=True,
    theme="soft",
    additional_inputs=[
        gr.Textbox(value="", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=DEF_TOKENS, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=DEF_TEMPERATURE, step=0.1, label="Temperature"),
        gr.Slider(minimum=0, maximum=360, value=DEF_TOPK, step=1, label="Top-k"),
        gr.Slider(minimum=0.1, maximum=1.0, value=DEF_TOPP, step=0.05, label="Top-p (nucleus sampling)"),
        gr.Slider(minimum=1, maximum=360, value=DEF_DURATION, step=1, label="Duration"),
    ],
)

# ── Launch ────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("🚀 Starting Gradio app for ZeroGPU...")
    demo.queue().launch()