import gradio as gr import torch from PIL import Image from transformers import ColPaliForRetrieval, ColPaliProcessor import numpy as np model_name = "vidore/colpali-v1.3-hf" model = ColPaliForRetrieval.from_pretrained(model_name, torch_dtype=torch.float32).eval() processor = ColPaliProcessor.from_pretrained(model_name) def process_image(image): # Ensure the image is in RGB format image = image.convert('RGB') # Process the image inputs = processor(images=image, return_tensors="pt") # Generate embeddings with torch.no_grad(): outputs = model(**inputs) # Extract embeddings and convert to list embeddings = outputs.embeddings.squeeze().cpu().numpy().tolist() # Truncate the embeddings for display purposes truncated_embeddings = embeddings[:10] # Show only first 10 values # Prepare the output output = { "embedding_sample": truncated_embeddings, "embedding_length": len(embeddings), "embedding_shape": list(np.array(embeddings).shape) } return output # Create Gradio interface demo = gr.Interface( fn=process_image, inputs=gr.Image(type="pil"), outputs=gr.JSON(), title="ColPali Image Embedding Generator", description="Upload an image to generate its embedding using the ColPali model." ) # Launch the interface demo.launch()