Visual Document Retrieval
Transformers
Safetensors
ColPali
English
pretraining
adrish committed on
Commit 16843db · 1 Parent(s): a5e6882

Updated handler.py to use ColPaliProcessor, ColPaliForRetrieval
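For context, the handler's request schema (described in the `__call__` docstring in the diff below) is a JSON body with an "inputs" list of base64-encoded images, each with an optional prompt. A hypothetical client payload, with an illustrative file name and the handler's default prompt:

import base64

# Read an illustrative document image and base64-encode it for the endpoint.
with open("page.png", "rb") as f:
    encoded = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "inputs": [
        {"image": encoded, "prompt": "Describe the image content in detail."}
    ]
}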

Files changed (1)
  1. handler.py +23 -35
handler.py CHANGED
@@ -3,40 +3,40 @@ import io
 import os
 from PIL import Image
 import torch
-from transformers import AutoProcessor, AutoModelForImageTextToText
+from transformers import ColPaliProcessor, ColPaliForRetrieval
 from typing import Dict, Any, List
 
 class EndpointHandler:
     def __init__(self, model_path: str = None):
         """
-        Initialize the endpoint handler by loading the ColPali model for image-to-text generation.
-        If no model path is provided, it defaults to 'vidore/colpali-v1.3-hf' on Hugging Face.
+        Initialize the endpoint handler using the ColPali retrieval model.
+        If no model path is provided, it defaults to 'vidore/colpali-v1.3-hf'.
         """
         if model_path is None:
-            model_path = os.path.dirname(os.path.realpath(__file__))
+            model_path = "vidore/colpali-v1.3-hf"
         try:
-            # Select GPU if available, otherwise fall back to CPU.
             self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-            # Load the model with the generic ImageTextToText interface.
-            self.model = AutoModelForImageTextToText.from_pretrained(
+            # Use the specialized ColPaliForRetrieval class.
+            self.model = ColPaliForRetrieval.from_pretrained(
                 model_path,
                 device_map="cuda" if torch.cuda.is_available() else "cpu",
                 trust_remote_code=True,
                 torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                _attn_implementation="flash_attention_2"
             ).to(self.device)
-            # Load the processor which handles both image preprocessing and text tokenization.
-            self.processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+            # Use the specialized ColPaliProcessor.
+            self.processor = ColPaliProcessor.from_pretrained(model_path, trust_remote_code=True)
         except Exception as e:
             raise RuntimeError(f"Error loading model or processor: {e}")
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
         """
-        Process the input data for image-to-text generation.
+        Process the input data, run inference using the ColPali retrieval model,
+        and return the outputs.
+
         Expects a dictionary with an "inputs" key containing a list of dictionaries.
         Each dictionary should have:
           - "image": a base64-encoded image string.
-          - "prompt": (optional) a text prompt (a default prompt is used if missing).
+          - "prompt": (optional) a text prompt (default is used if missing).
         """
         try:
             inputs_list = data.get("inputs", [])
@@ -53,42 +53,30 @@ class EndpointHandler:
                 if not image_b64:
                     return {"error": "One of the input items is missing 'image' data."}
                 try:
-                    # Decode base64 image and convert to RGB.
+                    # Decode the base64-encoded image and convert to RGB.
                     image = Image.open(io.BytesIO(base64.b64decode(image_b64))).convert("RGB")
                     images.append(image)
                 except Exception as e:
                     return {"error": f"Failed to decode one of the images: {e}"}
-                # Use the provided prompt or fall back to a default prompt.
+                # Use the provided prompt or a default prompt.
                 prompt = item.get("prompt", "Describe the image content in detail.")
                 texts.append(prompt)
 
-            # Process both text and image inputs via the processor.
+            # Prepare inputs with the ColPali processor.
             model_inputs = self.processor(
-                text=texts,
                 images=images,
+                text=texts,
                 padding=True,
                 return_tensors="pt",
             ).to(self.device)
 
-            # Generation configuration (can be overridden by the request).
-            max_new_tokens = config.get("max_new_tokens", 1000)
-            temperature = config.get("temperature", 0.8)
-            num_return_sequences = config.get("num_return_sequences", 1)
-            do_sample = bool(config.get("do_sample", True))
-
-            # Generate outputs using the model.
-            outputs = self.model.generate(
-                **model_inputs,
-                temperature=temperature,
-                max_new_tokens=max_new_tokens,
-                num_return_sequences=num_return_sequences,
-                do_sample=do_sample,
-            )
-
-            # Decode the generated tokens into human-readable text.
-            text_output = self.processor.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+            # For retrieval, we call the model directly rather than using generate().
+            outputs = self.model(**model_inputs)
+            # Assuming that the model returns logits or retrieval scores,
+            # we extract and convert them to lists.
+            retrieval_scores = outputs.logits.tolist() if hasattr(outputs, "logits") else outputs
 
-            return {"responses": text_output}
+            return {"responses": retrieval_scores}
 
         except Exception as e:
             return {"error": f"Unexpected error: {e}"}
@@ -99,7 +87,7 @@ _service = EndpointHandler()
 def handle(data, context):
     """
     Entry point for the Hugging Face dedicated inference endpoint.
-    It processes the input data and returns the model's generated responses.
+    Processes the input data and returns the model's outputs.
     """
     try:
         if data is None:
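A note on the new retrieval path: in the documented transformers ColPali API, images and text queries are encoded in separate processor calls, and ColPaliForRetrieval returns a multi-vector embeddings field that is scored with processor.score_retrieval(), rather than a logits field. A minimal sketch of that documented flow, assuming the vidore/colpali-v1.3-hf checkpoint (the placeholder image and query are illustrative only):

import torch
from PIL import Image
from transformers import ColPaliForRetrieval, ColPaliProcessor

model_name = "vidore/colpali-v1.3-hf"
model = ColPaliForRetrieval.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="cuda" if torch.cuda.is_available() else "cpu",
).eval()
processor = ColPaliProcessor.from_pretrained(model_name)

images = [Image.new("RGB", (448, 448), "white")]  # placeholder page image
queries = ["Describe the image content in detail."]

# Images and queries go through separate processor calls.
batch_images = processor(images=images, return_tensors="pt").to(model.device)
batch_queries = processor(text=queries, return_tensors="pt").to(model.device)

with torch.no_grad():
    image_embeddings = model(**batch_images).embeddings   # multi-vector page embeddings
    query_embeddings = model(**batch_queries).embeddings  # multi-vector query embeddings

# Late-interaction (MaxSim) scoring: one row per query, one column per image.
scores = processor.score_retrieval(query_embeddings, image_embeddings)

Under that assumption, the handler's single processor(images=..., text=...) call would be split in two, and the response would carry the score matrix (or the raw embeddings) instead of outputs.logits.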