import gradio as gr
import torch
from transformers import (
    Idefics2Processor, Idefics2ForConditionalGeneration,
    Blip2Processor, Blip2ForConditionalGeneration
)
from PIL import Image
import time
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
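
# Download the NLTK punkt tokenizer once if it is not already cached.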
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
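
# Registry of the models under comparison: checkpoint id, processor/model
# classes, and the captioning prompt each model expects.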
models = {
    "IDEFICS2": {
        "model_id": "HuggingFaceM4/idefics2-8b",
        "processor_class": Idefics2Processor,
        "model_class": Idefics2ForConditionalGeneration,
        "caption_prompt": "<image>Describe the image in detail"
    },
    "BLIP2": {
        "model_id": "Salesforce/blip2-opt-2.7b",
        "processor_class": Blip2Processor,
        "model_class": Blip2ForConditionalGeneration,
        "caption_prompt": ""
    }
}
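
# Load every processor/model pair once at startup; both checkpoints stay
# resident in memory, so each request only pays the inference cost.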
model_instances = {}
for model_name, config in models.items():
    processor = config["processor_class"].from_pretrained(config["model_id"])
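    # Weights load in full fp32 here; passing torch_dtype=torch.float16 to
    # from_pretrained would roughly halve GPU memory for these checkpoints.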
    model = config["model_class"].from_pretrained(config["model_id"]).to(device)
    model_instances[model_name] = (processor, model)
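
# Sample VQA questions; the first one seeds the UI placeholder below.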
vqa_questions = [
    "Are there people in the image?",
    "Which color predominates in the image?"
]
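
# Generic reference caption for BLEU; see the notes below on its limits.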
reference_caption = ["An image with people and various objects"]


def infer(image, model_name, task, question=None):
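    """Run captioning or VQA with the selected model and report metrics.

    Returns a 6-tuple (caption, caption_time, vqa_answer, vqa_time,
    vram_gb, bleu_score); slots that do not apply are None.
    """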
    if image is None:
        return "Please upload an image.", None, None, None, None, None

    image = Image.open(image).convert("RGB")
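    # BLIP2's ViT vision encoder operates at 224x224, so resize up front.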
if "BLIP2" in model_name: |
|
image = image.resize((224, 224)) |

    processor, model = model_instances[model_name]

    start_time = time.time()
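    # Snapshot of VRAM already allocated (mostly model weights), in GB.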
    vram = torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else 0

    if task == "captioning":
        # The registry prompt already handles the per-model difference:
        # IDEFICS2 gets an instruction, BLIP2 captions unconditionally.
        caption_text = models[model_name]["caption_prompt"]
        inputs = processor(images=image, text=caption_text, return_tensors="pt").to(device)
        output_ids = model.generate(
            **inputs,
            max_new_tokens=50,
            num_beams=5 if "BLIP2" in model_name else 1,
            no_repeat_ngram_size=2 if "BLIP2" in model_name else 0
        )
        caption = processor.decode(output_ids[0], skip_special_tokens=True)
        inference_time = time.time() - start_time

        # Smoothing avoids a zero score when the short caption shares no
        # 4-gram with the generic reference.
        bleu_score = sentence_bleu(
            [reference_caption[0].split()], caption.split(),
            smoothing_function=SmoothingFunction().method1
        ) if caption else 0.0

        return (caption, inference_time, None, None, vram, bleu_score)

    elif task == "vqa" and question:
        # blip2-opt models are prompted as "Question: ... Answer:" (per the
        # Salesforce model card); IDEFICS2 keeps the <image> token prefix.
        vqa_text = (f"Question: {question} Answer:" if "BLIP2" in model_name
                    else f"<image>Q: {question}")
        inputs = processor(images=image, text=vqa_text, return_tensors="pt").to(device)
        output_ids = model.generate(
            **inputs,
            max_new_tokens=10,
            num_beams=5 if "BLIP2" in model_name else 1,
            no_repeat_ngram_size=2 if "BLIP2" in model_name else 0
        )
        vqa_answer = processor.decode(output_ids[0], skip_special_tokens=True)
        inference_time = time.time() - start_time

        return (None, None, vqa_answer, inference_time, vram, None)

    return "Select a valid task and, for VQA, provide a question.", None, None, None, None, None


with gr.Blocks(title="MLLM Benchmark Demo") as demo:
    gr.Markdown("# Benchmark for Multimodal Models (MLLMs)")
    gr.Markdown("Upload an image, select a model and a task, and get captioning or VQA results.")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="filepath", label="Upload Image")
            model_dropdown = gr.Dropdown(choices=["IDEFICS2", "BLIP2"], label="Select Model", value="IDEFICS2")
            task_dropdown = gr.Dropdown(choices=["captioning", "vqa"], label="Select Task", value="captioning")
            question_input = gr.Textbox(label="VQA Question (only used for VQA)", placeholder=f"E.g.: {vqa_questions[0]}")
            submit_btn = gr.Button("Generate")

        with gr.Column():
            caption_output = gr.Textbox(label="Generated Caption")
            vqa_output = gr.Textbox(label="VQA Answer")
            caption_time_output = gr.Number(label="Captioning Time (s)")
            vqa_time_output = gr.Number(label="VQA Time (s)")
            vram_output = gr.Number(label="VRAM (GB)")
            bleu_output = gr.Number(label="BLEU Score")

    # Output order must match the 6-tuple returned by infer().
    submit_btn.click(
        fn=infer,
        inputs=[image_input, model_dropdown, task_dropdown, question_input],
        outputs=[caption_output, caption_time_output, vqa_output, vqa_time_output, vram_output, bleu_output]
    )

    gr.Markdown("### Notes")
    gr.Markdown("""
    - To improve inference speed, run the models locally on a more capable GPU.
    - The BLEU metric uses a generic reference and may not reflect actual quality.
    - For more details, see the [paper repository](https://huggingface.co/spaces/Pdro-ruiz/MLLM_Estado_del_Arte_Feb25/tree/main).
    """)
if __name__ == "__main__":
    demo.launch()