import os

import gradio as gr
import spaces
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info  # pip install qwen-vl-utils[decord]==0.0.8

# =============================================================================
# Qwen2.5-VL-7B-Instruct: model & processor
# =============================================================================
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# Recommended: flash-attn2 (uncomment if it is installed in your environment)
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     MODEL_ID,
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# Default load
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
)
model.eval()

# Resolution is adjusted automatically (defaults). Use min/max_pixels to cap the visual-token cost if needed.
processor = AutoProcessor.from_pretrained(MODEL_ID)
# e.g. min_pixels = 256 * 28 * 28; max_pixels = 1280 * 28 * 28
# processor = AutoProcessor.from_pretrained(MODEL_ID, min_pixels=min_pixels, max_pixels=max_pixels)


# =============================================================================
# Inference (image-only UI; the text prompt is optional)
# =============================================================================
@spaces.GPU
def qwen_vl_inference(image_path: str | None, text_input: str | None = None):
    if image_path is None:
        return "Please upload an image first."

    # Qwen's official examples pass local file paths as file:// URIs
    file_uri = f"file://{os.path.abspath(image_path)}"
    user_text = text_input.strip() if text_input else "Describe this image."

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": file_uri},
                {"type": "text", "text": user_text},
            ],
        }
    ]

    # Text/vision preprocessing
    chat_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[chat_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Move tensors to the model's device (safe even with device_map="auto")
    inputs = {k: (v.to(model.device) if isinstance(v, torch.Tensor) else v) for k, v in inputs.items()}

    # Generation
    gen_ids = model.generate(**inputs, max_new_tokens=512)

    # Strip the prompt tokens, then decode
    trimmed = [out[len(inp):] for inp, out in zip(inputs["input_ids"], gen_ids)]
    output = processor.batch_decode(
        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return output


# =============================================================================
# Gradio UI (Gradio 5)
# =============================================================================
DESCRIPTION = (
    "[Qwen2.5-VL-7B-Instruct demo](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) — "
    "upload an image and ask anything about it."
)

css = """
#output_text {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
}
"""

with gr.Blocks(css=css, theme="origin") as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Row():
        with gr.Column(scale=1):
            input_image = gr.Image(label="Upload Image", type="filepath")
            text_input = gr.Textbox(label="Question")
            submit_btn = gr.Button("Submit")
        with gr.Column(scale=1):
            output_text = gr.Textbox(label="Model Output", elem_id="output_text")

    gr.Examples(
        examples=[["example.webp", "Explain this image"]],
        inputs=[input_image, text_input],
        outputs=output_text,
        fn=qwen_vl_inference,
        cache_examples=True,
        label="Try an example",
    )

    submit_btn.click(qwen_vl_inference, [input_image, text_input], [output_text])

if __name__ == "__main__":
    demo.launch()