# Copyright 2024 Ronan Le Meillat
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Import necessary libraries
import gradio as gr
from transformers import AutoProcessor, Idefics3ForConditionalGeneration, image_utils
import torch
import spaces
# Identifiers of the adapter repository and of the base checkpoint it is applied to
model_id = "eltorio/IDEFICS3_ROCO"
base_model_path = "HuggingFaceM4/Idefics3-8B-Llama3"  # or change to local path
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")  # Log the selected device
# The processor is built from the base checkpoint
processor = AutoProcessor.from_pretrained(base_model_path, trust_remote_code=True)
# Load the base model weights in bfloat16 ...
model = Idefics3ForConditionalGeneration.from_pretrained(
    base_model_path,
    torch_dtype=torch.bfloat16,
)
# ... then move the model to the selected device
model = model.to(device)
# Attach the adapter identified by model_id on top of the base model
model.load_adapter(model_id, device_map="auto")
# Define a function to infer a description from an image
@spaces.GPU
def infere(image):
    """
    Ask the model what is visible in a medical image.

    A fixed two-turn conversation (a system instruction followed by a user
    turn holding one image slot and a question) is rendered through the
    processor's chat template, encoded together with the image, and passed
    to ``model.generate`` with at most 100 new tokens.

    Args:
    - image: The image to describe; it is handed to the processor as the
      single element of ``images``.
      NOTE(review): confirm whether the caller supplies a PIL image or an
      array; this function forwards the value unchanged.
    Returns:
    - List[str]: The strings produced by ``processor.batch_decode`` with
      special tokens skipped.
    """
    # System turn: sets the persona the model should answer as
    system_turn = {
        "role": "system",
        "content": [
            {"type": "text", "text": "You are a valuable medical doctor and you are looking at an image of your patient."},
        ],
    }
    # User turn: an image placeholder followed by the question about it
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What do we see in this image?"},
        ],
    }
    # Render the conversation to a prompt string ending with the generation prompt
    chat_prompt = processor.apply_chat_template(
        [system_turn, user_turn], add_generation_prompt=True
    )
    # Encode prompt and image together as PyTorch tensors
    encoded = processor(text=chat_prompt, images=[image], return_tensors="pt")
    # Place every encoded entry on the same device as the model
    model_inputs = {}
    for name, value in encoded.items():
        model_inputs[name] = value.to(device)
    # Generate token ids, then turn them back into text
    output_ids = model.generate(**model_inputs, max_new_tokens=100)
    return processor.batch_decode(output_ids, skip_special_tokens=True)
# Texts shown in the Gradio interface.
# `description` and `article` are rendered by Gradio as Markdown/HTML, so line
# breaks are written as explicit <br> tags. Each multi-part text is built from
# adjacent string literals inside parentheses: a plain "..." literal cannot
# span several physical lines (that is a SyntaxError).
title = f"IDEFICS3_ROCO: Medical Image to Text running on {device}"
desc = (
    "This model generates a description of a medical image.<br>"
    "Note: No affiliation with original author. This is a ZeroGPU-enabled duplicate of spaces/eltorio/IDEFICS3_ROCO to support accelerated inference. Please cite and refer to the original work."
)
# One-line status message that depends on whether a GPU is being used
device_desc = (
    f"This model is running on {device} 🚀."
    if device == 'cuda'
    else f"🐢 This model is running on {device} it will be very (very) slow. If you can donate some GPU time it will be usable 🐢. Please contact us."
)
# Long description displayed below the input/output components
long_desc = (
    "This demo is based on the IDEFICS3_ROCO model, which is a multimodal model that can generate text from images. It has been fine-tuned on eltorio/ROCO-radiology a dataset of medical images and can generate descriptions of medical images. Try uploading an image of a medical image and see what the model generates!<br>"
    f"{device_desc}<br>"
    " 2024 - Ronan Le Meillat"
)
# Build the interface: one image input, one text output, served by `infere`
radiotest = gr.Interface(
    fn=infere,
    inputs="image",
    outputs="text",
    title=title,
    description=desc,
    article=long_desc,
)
# Start the Gradio server (no public share link is requested here)
radiotest.launch()