import os

# Disable the Xet/CAS backend (it's what's throwing the download error).
os.environ["HF_HUB_ENABLE_XET"] = "0"
# Use the robust Rust downloader for big files (requires the
# ``hf_transfer`` package).
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
# Optional but helpful: resume interrupted downloads and silence the
# symlink warning on some filesystems.
os.environ["HF_HUB_ENABLE_RESUME"] = "1"
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Hugging Face model identifier. See the model card for more details:
# https://huggingface.co/xtuner/llava-phi-3-mini-hf
MODEL_ID = "xtuner/llava-phi-3-mini-hf"

# Determine the computation device. If a CUDA-enabled GPU is available,
# use it and cast the weights to half precision to reduce memory
# consumption. Otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    TORCH_DTYPE = torch.float16
else:
    DEVICE = torch.device("cpu")
    TORCH_DTYPE = torch.float32


def load_model():
    """Load the LLaVA model and its processor.

    The model is loaded with ``trust_remote_code=True`` so that any
    custom code shipped with the model repository is registered
    correctly. ``device_map="auto"`` lets the ``accelerate`` library
    distribute the model across the available hardware (GPU/CPU)
    automatically, and ``torch_dtype`` loads the weights in half
    precision on a GPU and in full precision on a CPU.
    """
    model = LlavaForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=TORCH_DTYPE,
        device_map="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    return model, processor


# Load the model and processor at import time. Loading is expensive, so
# we only do it once. If the model fails to load (for example because of
# missing dependencies) the exception will be raised here.
MODEL, PROCESSOR = load_model()


def answer_question(image: Image.Image, question: str) -> str:
    """Generate an answer to the given question about the uploaded image.

    Parameters
    ----------
    image : PIL.Image.Image
        The user-provided image. Gradio supplies images as PIL objects,
        which the LLaVA processor accepts directly.
    question : str
        The user's question about the image.

    Returns
    -------
    str
        The answer generated by the model. If either the image or the
        question is missing, an explanatory message is returned instead.
    """
    # Basic validation: ensure both inputs are provided.
    if image is None:
        return "Please upload an image."
    if not question or not question.strip():
        return "Please enter a question about the image."

    # Build the chat prompt. The LLaVA model uses the ``<image>``
    # placeholder to indicate where the image will be inserted.
    prompt = f"USER: <image>\n{question.strip()} ASSISTANT:"

    # Tokenize the inputs. The processor handles both the image and the
    # text and returns PyTorch tensors. Move them to the model's device
    # and cast floating-point tensors (the pixel values) to the model's
    # dtype to avoid device and dtype mismatch errors.
    inputs = PROCESSOR(
        images=image,
        text=prompt,
        return_tensors="pt",
    )
    inputs = {
        k: v.to(DEVICE, dtype=TORCH_DTYPE) if v.is_floating_point() else v.to(DEVICE)
        for k, v in inputs.items()
    }

    # Generate the answer. We limit the number of new tokens to 256 to
    # avoid excessive memory usage. Feel free to adjust this value
    # depending on your hardware constraints and desired response length.
    with torch.no_grad():
        generated_ids = MODEL.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=False,
        )

    # Decode the generated ids back into text. The output will include
    # the entire conversation (e.g., ``USER: ... ASSISTANT: ...``).
    output = PROCESSOR.batch_decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )[0]

    # Extract the assistant's response by splitting on the
    # ``ASSISTANT:`` delimiter.
    if "ASSISTANT:" in output:
        answer = output.split("ASSISTANT:")[-1].strip()
    else:
        # Fallback if the delimiter is not present.
        answer = output.strip()
    return answer


def build_interface() -> gr.Interface:
    """Construct the Gradio Interface object for the app."""
    description = (
        "Upload an image and ask a question about it.\n\n"
        "This demo uses the multimodal model xtuner/llava-phi-3-mini-hf "
        "to perform visual question answering. The model pairs a CLIP "
        "vision encoder with the Phi-3-mini language model in the LLaVA "
        "architecture. Note: inference requires a GPU with sufficient "
        "memory; on a CPU, generation will be extremely slow."
    )
    iface = gr.Interface(
        fn=answer_question,
        inputs=[
            gr.Image(type="pil", label="Image"),
            gr.Textbox(
                label="Question",
                placeholder="Describe or ask something about the image",
                lines=1,
            ),
        ],
        outputs=gr.Textbox(label="Answer"),
        title="Visual Question Answering with LLaVA-Phi-3-mini",
        description=description,
        allow_flagging="never",
    )
    return iface


def main() -> None:
    """Launch the Gradio app."""
    iface = build_interface()
    # When running on Hugging Face Spaces the app automatically uses the
    # appropriate host and port. For local development you can pass
    # ``server_name="0.0.0.0"`` to make the app reachable from other
    # machines on your network.
    iface.launch()


if __name__ == "__main__":
    main()
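

# Optional smoke test (a minimal sketch, not run automatically): with the
# module imported, ``answer_question`` can be called directly from a
# Python shell, bypassing the Gradio UI. The image path "example.jpg" and
# the module name ``app`` below are placeholders for your local setup.
#
#     from PIL import Image
#     from app import answer_question
#     print(answer_question(Image.open("example.jpg"), "What is shown in this image?"))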