adrish committed
Commit a5e6882 · 1 Parent(s): 1d493ce

added custom handler.py

Files changed (1):
  handler.py +109 -0
handler.py ADDED
@@ -0,0 +1,109 @@
+ import base64
+ import io
+ import os
+ from typing import Any, Dict, List
+
+ import torch
+ from PIL import Image
+ from transformers import AutoModelForImageTextToText, AutoProcessor
+
+
+ class EndpointHandler:
+     def __init__(self, model_path: str = None):
+         """
+         Initialize the endpoint handler by loading the ColPali model for
+         image-to-text generation. If no model path is provided, the model is
+         loaded from the directory containing this file, i.e. the repository
+         the endpoint is deployed from.
+         """
+         if model_path is None:
+             model_path = os.path.dirname(os.path.realpath(__file__))
+         try:
+             # Select GPU if available, otherwise fall back to CPU.
+             use_cuda = torch.cuda.is_available()
+             self.device = torch.device("cuda" if use_cuda else "cpu")
+             # Load the model with the generic image-text-to-text interface.
+             # FlashAttention 2 is requested only on GPU, since it is
+             # unsupported on CPU and requires the flash-attn package.
+             self.model = AutoModelForImageTextToText.from_pretrained(
+                 model_path,
+                 trust_remote_code=True,
+                 torch_dtype=torch.float16 if use_cuda else torch.float32,
+                 attn_implementation="flash_attention_2" if use_cuda else "eager",
+             ).to(self.device)
+             # Load the processor, which handles both image preprocessing and
+             # text tokenization.
+             self.processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+         except Exception as e:
+             raise RuntimeError(f"Error loading model or processor: {e}")
+
+     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+         """
+         Process the input data for image-to-text generation.
+         Expects a dictionary with an "inputs" key containing a list of
+         dictionaries. Each dictionary should have:
+           - "image": a base64-encoded image string.
+           - "prompt": an optional text prompt (a default is used if missing).
+         """
+         try:
+             inputs_list = data.get("inputs", [])
+             config = data.get("config", {})
+
+             if not inputs_list or not isinstance(inputs_list, list):
+                 return {"error": "Inputs should be a list of dictionaries with 'image' and optionally 'prompt' keys."}
+
+             images: List[Image.Image] = []
+             texts: List[str] = []
+
+             for item in inputs_list:
+                 image_b64 = item.get("image")
+                 if not image_b64:
+                     return {"error": "One of the input items is missing 'image' data."}
+                 try:
+                     # Decode the base64 image and convert it to RGB.
+                     image = Image.open(io.BytesIO(base64.b64decode(image_b64))).convert("RGB")
+                     images.append(image)
+                 except Exception as e:
+                     return {"error": f"Failed to decode one of the images: {e}"}
+                 # Use the provided prompt or fall back to a default prompt.
+                 prompt = item.get("prompt", "Describe the image content in detail.")
+                 texts.append(prompt)
+
+             # Process both the text and image inputs via the processor.
+             model_inputs = self.processor(
+                 text=texts,
+                 images=images,
+                 padding=True,
+                 return_tensors="pt",
+             ).to(self.device)
+
+             # Generation configuration (can be overridden by the request).
+             max_new_tokens = config.get("max_new_tokens", 1000)
+             temperature = config.get("temperature", 0.8)
+             num_return_sequences = config.get("num_return_sequences", 1)
+             do_sample = bool(config.get("do_sample", True))
+
+             # Generate outputs with the model. Temperature is only passed
+             # when sampling is enabled, since it has no effect otherwise.
+             generate_kwargs = {
+                 "max_new_tokens": max_new_tokens,
+                 "num_return_sequences": num_return_sequences,
+                 "do_sample": do_sample,
+             }
+             if do_sample:
+                 generate_kwargs["temperature"] = temperature
+             outputs = self.model.generate(**model_inputs, **generate_kwargs)
+
+             # Decode the generated tokens into human-readable text.
+             text_output = self.processor.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+
+             return {"responses": text_output}
+
+         except Exception as e:
+             return {"error": f"Unexpected error: {e}"}
+
+
+ # Instantiate the endpoint handler once at import time.
+ _service = EndpointHandler()
+
+
+ def handle(data, context):
+     """
+     Entry point for the Hugging Face dedicated inference endpoint.
+     It processes the input data and returns the model's generated responses.
+     """
+     try:
+         if data is None:
+             return {"error": "No input data received"}
+         return _service(data)
+     except Exception as e:
+         return {"error": f"Exception in handler: {e}"}
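
For reference, a minimal client sketch for exercising the deployed handler. This is an illustration rather than part of the commit: the endpoint URL, token, and image filename are placeholders, and the payload simply mirrors the "inputs"/"config" contract documented in EndpointHandler.__call__ above.

  import base64
  import requests

  # Placeholders: substitute your endpoint URL, access token, and image file.
  ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"
  HF_TOKEN = "hf_..."

  # Base64-encode a local image, matching the "image" field the handler expects.
  with open("document_page.png", "rb") as f:
      image_b64 = base64.b64encode(f.read()).decode("utf-8")

  payload = {
      "inputs": [
          {"image": image_b64, "prompt": "Describe the image content in detail."}
      ],
      "config": {"max_new_tokens": 256, "temperature": 0.7, "do_sample": True},
  }

  response = requests.post(
      ENDPOINT_URL,
      headers={
          "Authorization": f"Bearer {HF_TOKEN}",
          "Content-Type": "application/json",
      },
      json=payload,
  )
  # Expect {"responses": [...]} on success or {"error": "..."} on failure.
  print(response.json())

The same payload dictionary can also be passed to handle(payload, None) in a local Python session to smoke-test the handler before deploying it.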