# Copyright 2024 Ronan Le Meillat
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Import necessary libraries
import gradio as gr
from transformers import AutoProcessor, Idefics3ForConditionalGeneration, image_utils
import torch
import spaces

# Select the compute device: CUDA when PyTorch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")  # Log the selected device

# Identifiers for the base model and for the adapter that is loaded onto it
base_model_path = "HuggingFaceM4/Idefics3-8B-Llama3"  # may be replaced by a local path
model_id = "eltorio/IDEFICS3_ROCO"

# The processor comes from the base model, not from the adapter repository
processor = AutoProcessor.from_pretrained(base_model_path, trust_remote_code=True)

# Load the base model in bfloat16, then move it to the selected device
model = Idefics3ForConditionalGeneration.from_pretrained(
    base_model_path,
    torch_dtype=torch.bfloat16,
)
model = model.to(device)

# Attach the adapter weights published under model_id
model.load_adapter(model_id, device_map="auto")

# Define a function to infer a description from an image
@spaces.GPU
def infere(image):
    """
    Generate a description of a medical image.

    Args:
    - image: The medical image to describe, exactly as delivered by the
      Gradio "image" input. NOTE(review): with the default image component
      this is presumably a NumPy array rather than a PIL Image, and it is
      None when no image was provided -- confirm against the Gradio version
      in use.

    Returns:
    - generated_texts (List[str]): The decoded model output, one string per
      generated sequence.

    Raises:
    - gr.Error: If no image was provided.
    """

    # Fail early with a message the UI can display, instead of letting the
    # processor raise an opaque error on a missing image
    if image is None:
        raise gr.Error("Please provide an image before submitting.")

    # Conversation sent to the model: a system instruction followed by a user
    # turn holding one image placeholder and the question
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a valuable medical doctor and you are looking at an image of your patient."},
            ]
        },
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What do we see in this image?"},
            ]
        },
    ]

    # Render the conversation to a prompt string ending with the generation prompt
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

    # Tokenize the prompt and preprocess the image into PyTorch tensors
    inputs = processor(text=prompt, images=[image], return_tensors="pt")

    # Move every input tensor to the device the model lives on
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate at most 100 new tokens
    generated_ids = model.generate(**inputs, max_new_tokens=100)

    # Decode the generated token IDs into text, dropping special tokens
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

    return generated_texts

# Page header: link to the model card plus the device the model runs on
title = (
    "<a href='https://huggingface.co/eltorio/IDEFICS3_ROCO'>IDEFICS3_ROCO</a>"
    f": Medical Image to Text <b>running on {device}</b>"
)

# Short description displayed with the interface
desc = (
    "This model generates a description of a medical image.<br>"
    "Note: No affiliation with original author. "
    "This is a ZeroGPU-enabled duplicate of "
    "<a href='https://huggingface.co/spaces/eltorio/IDEFICS3_ROCO'>spaces/eltorio/IDEFICS3_ROCO</a>"
    " to support accelerated inference. "
    "Please cite and refer to the original work."
)

# Device notice: a short confirmation on CUDA, a slowness warning otherwise
if device == 'cuda':
    device_desc = f"This model is running on {device} 🚀."
else:
    device_desc = (
        f"🐢 This model is running on {device} it will be very (very) slow. "
        "If you can donate some GPU time it will be usable 🐢. "
        "<a href='https://huggingface.co/eltorio/IDEFICS3_ROCO/discussions'>Please contact us.</a>"
    )

# Long-form article text for the interface; embeds the device notice
long_desc = (
    "This demo is based on the "
    "<a href='https://huggingface.co/eltorio/IDEFICS3_ROCO'>IDEFICS3_ROCO model</a>"
    ", which is a multimodal model that can generate text from images. "
    "It has been fine-tuned on "
    "<a href='https://huggingface.co/datasets/eltorio/ROCO-radiology'>eltorio/ROCO-radiology</a>"
    "&nbsp;a dataset of medical images and can generate descriptions of medical images. "
    "Try uploading an image of a medical image and see what the model generates!"
    f"<br><b>{device_desc}</b><br> 2024 - Ronan Le Meillat"
)

# Build the Gradio interface: image input, text output, backed by infere
radiotest = gr.Interface(
    fn=infere,
    inputs="image",
    outputs="text",
    title=title,
    description=desc,
    article=long_desc,
)

# Start serving the interface with the default launch options
radiotest.launch()