amaye15 committed
Commit · 3f75720
Parent(s): b74cd7b
Feat - Model, Requirements & Handler
- .gitignore +2 -0
- config.json +26 -0
- download.py +25 -0
- handler.py +131 -0
- model.safetensors +3 -0
- preprocessor_config.json +27 -0
- requirements.txt +27 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
+*env*
+*.DS_Store*
config.json
ADDED
@@ -0,0 +1,26 @@
+{
+  "_name_or_path": "apple/aimv2-large-patch14-native",
+  "architectures": [
+    "AIMv2Model"
+  ],
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "apple/aimv2-large-patch14-native--configuration_aimv2.AIMv2Config",
+    "AutoModel": "apple/aimv2-large-patch14-native--modeling_aimv2.AIMv2Model",
+    "FlaxAutoModel": "apple/aimv2-large-patch14-native--modeling_flax_aimv2.FlaxAIMv2Model"
+  },
+  "hidden_size": 1024,
+  "intermediate_size": 2816,
+  "model_type": "aimv2",
+  "num_attention_heads": 8,
+  "num_channels": 3,
+  "num_hidden_layers": 24,
+  "num_queries": 256,
+  "patch_size": 14,
+  "projection_dropout": 0.0,
+  "qkv_bias": false,
+  "rms_norm_eps": 1e-05,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.1",
+  "use_bias": false
+}
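The config pins the AIMv2 backbone shape (24 transformer layers, hidden size 1024, 14-pixel patches) and routes AutoConfig/AutoModel through the remote code shipped with apple/aimv2-large-patch14-native. A minimal sketch of inspecting it, assuming the repo files (or the snapshot pulled by download.py below) sit in the working directory ".":

# Sketch: read the committed config and confirm the embedding width the
# handler will later return (hidden_size). Assumes the files above are in ".".
from transformers import AutoConfig

config = AutoConfig.from_pretrained(".", trust_remote_code=True)
print(config.model_type)                             # "aimv2"
print(config.hidden_size)                            # 1024 -> length of each embedding vector
print(config.num_hidden_layers, config.patch_size)   # 24, 14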
download.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# import requests
|
2 |
+
|
3 |
+
# from PIL import Image
|
4 |
+
from transformers import AutoImageProcessor, AutoModel
|
5 |
+
|
6 |
+
# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
7 |
+
# image = Image.open(requests.get(url, stream=True).raw)
|
8 |
+
|
9 |
+
PATH = "."
|
10 |
+
MODEL_NAME = "apple/aimv2-large-patch14-native"
|
11 |
+
|
12 |
+
processor = AutoImageProcessor.from_pretrained(MODEL_NAME)
|
13 |
+
|
14 |
+
|
15 |
+
processor.save_pretrained(PATH)
|
16 |
+
|
17 |
+
model = AutoModel.from_pretrained(
|
18 |
+
MODEL_NAME,
|
19 |
+
trust_remote_code=True,
|
20 |
+
)
|
21 |
+
|
22 |
+
model.save_pretrained(PATH)
|
23 |
+
|
24 |
+
# inputs = processor(images=image, return_tensors="pt")
|
25 |
+
# outputs = model(**inputs)
|
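download.py only snapshots the processor and weights into the repo root; the commented-out lines hint at a quick sanity check. A hedged sketch of that check, assuming a local test image at "example.jpg" (a placeholder path, not part of the repo):

# Sketch: verify the locally saved snapshot loads and produces embeddings.
# "example.jpg" is a hypothetical file used only for illustration.
from PIL import Image
from transformers import AutoImageProcessor, AutoModel

processor = AutoImageProcessor.from_pretrained(".")
model = AutoModel.from_pretrained(".", trust_remote_code=True)

image = Image.open("example.jpg").convert("RGB")
inputs = processor(images=image, return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, num_patches, hidden_size=1024)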
handler.py
ADDED
@@ -0,0 +1,131 @@
+import torch
+from typing import Dict, Any, List
+from PIL import Image
+import base64
+from io import BytesIO
+import logging
+from transformers import AutoImageProcessor, AutoModel
+import os
+from dataclasses import dataclass
+
+
+# Define a dataclass for the results
+@dataclass
+class ImageEncodingResult:
+    image_encoded: List[List[float]]  # Full encoded embeddings
+    image_encoded_average: List[float]  # Average of the embeddings
+
+
+class EndpointHandler:
+    """
+    A handler class for processing images and generating embeddings using a pre-trained model.
+    Attributes:
+        processor: The pre-trained image processor.
+        model: The pre-trained model for generating embeddings.
+        device: The device (CPU or CUDA) used to run model inference.
+    """
+
+    def __init__(self):
+        """
+        Initializes the EndpointHandler with the model and processor from the current directory.
+        """
+        # Initialize logging
+        logging.basicConfig(level=logging.INFO)
+        self.logger = logging.getLogger(__name__)
+
+        # Determine the device (CPU or CUDA)
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.logger.info(f"Using device: {self.device}")
+
+        # Load the model and processor from the current directory
+        self.logger.info("Loading model and processor from the current directory.")
+        try:
+            self.processor = AutoImageProcessor.from_pretrained(os.getcwd())
+            self.model = AutoModel.from_pretrained(
+                os.getcwd(), trust_remote_code=True
+            ).to(self.device)
+            self.logger.info("Model and processor loaded successfully.")
+        except Exception as e:
+            self.logger.error(f"Failed to load model or processor: {e}")
+            raise
+
+    def _resize_image_if_large(
+        self, image: Image.Image, max_size: int = 1080
+    ) -> Image.Image:
+        """
+        Resizes an image if its dimensions exceed the specified maximum size.
+        Args:
+            image (Image.Image): Input image.
+            max_size (int): Maximum size for the image dimensions.
+        Returns:
+            Image.Image: Resized image.
+        """
+        width, height = image.size
+        if width > max_size or height > max_size:
+            scale = max_size / max(width, height)
+            new_width = int(width * scale)
+            new_height = int(height * scale)
+            image = image.resize((new_width, new_height), resample=Image.BILINEAR)
+        return image
+
+    def _encode_image(self, image: Image.Image) -> ImageEncodingResult:
+        """
+        Encodes an image into embeddings using the model.
+        Args:
+            image (Image.Image): Input image.
+        Returns:
+            ImageEncodingResult: Dataclass containing the encoded embeddings and their average.
+        """
+        try:
+            # Resize the image if necessary
+            image = self._resize_image_if_large(image)
+
+            # Process the image and generate embeddings
+            inputs = self.processor(image, return_tensors="pt").to(self.device)
+            with torch.inference_mode():
+                outputs = self.model(**inputs)
+            last_hidden_state = outputs.last_hidden_state
+            image_encoded = last_hidden_state.squeeze().tolist()
+            image_encoded_average = last_hidden_state.mean(dim=1).squeeze().tolist()
+
+            return ImageEncodingResult(
+                image_encoded=image_encoded,
+                image_encoded_average=image_encoded_average,
+            )
+        except Exception as e:
+            self.logger.error(f"Error encoding image: {e}")
+            raise
+
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Processes input data containing base64-encoded images and generates embeddings.
+        Args:
+            data (Dict[str, Any]): Dictionary containing input images.
+        Returns:
+            Dict[str, Any]: Dictionary containing encoded embeddings or error messages.
+        """
+        images_data = data.get("images", [])
+
+        if not images_data:
+            return {"error": "No image data provided."}
+
+        results = []
+        for img_data in images_data:
+            if isinstance(img_data, str):
+                try:
+                    # Decode the base64-encoded image
+                    image_bytes = base64.b64decode(img_data)
+                    image = Image.open(BytesIO(image_bytes)).convert("RGB")
+
+                    # Encode the image
+                    encoded_image = self._encode_image(image)
+                    results.append(encoded_image)
+                except Exception as e:
+                    self.logger.error(f"Invalid image data: {e}")
+                    return {"error": f"Invalid image data: {e}"}
+            else:
+                self.logger.error("Images should be base64-encoded strings.")
+                return {"error": "Images should be base64-encoded strings."}
+
+        # Convert the results to a dictionary for JSON serialization
+        return {"results": [result.__dict__ for result in results]}
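The handler expects a payload of the form {"images": [<base64 string>, ...]} and returns one result per image, each carrying the full patch embeddings and their mean-pooled average. A minimal local invocation sketch, assuming the repo files are in the working directory and "cat.jpg" is a placeholder image that is not part of this commit:

# Sketch: call the handler directly, the way the inference endpoint would.
# "cat.jpg" is a hypothetical local file used only for illustration.
import base64
from handler import EndpointHandler

with open("cat.jpg", "rb") as f:
    payload = {"images": [base64.b64encode(f.read()).decode("utf-8")]}

handler = EndpointHandler()   # loads model and processor from os.getcwd()
response = handler(payload)

first = response["results"][0]
print(len(first["image_encoded_average"]))  # 1024, the hidden_size from config.json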
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6cdc4c4ea6f2a477edebb482cc36ba021409a313eabdf3e6be62eb722771e7d1
+size 1235760720
preprocessor_config.json
ADDED
@@ -0,0 +1,27 @@
+{
+  "crop_size": {
+    "height": 224,
+    "width": 224
+  },
+  "do_center_crop": false,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": false,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "CLIPImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "shortest_edge": 224
+  }
+}
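With do_resize and do_center_crop both false, this CLIPImageProcessor only converts to RGB, rescales to [0, 1], and applies CLIP-style normalization; pixel_values keep whatever resolution the input image has (the model runs at native resolution), which is why handler.py caps large inputs at 1080 px itself. A small sketch, with the 448x336 image size chosen arbitrarily for illustration:

# Sketch: with resizing and cropping disabled, the processor preserves the
# input resolution; only rescaling and normalization are applied.
from PIL import Image
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained(".")
image = Image.new("RGB", (448, 336))   # arbitrary example size (width, height)
pixel_values = processor(images=image, return_tensors="pt")["pixel_values"]
print(pixel_values.shape)              # torch.Size([1, 3, 336, 448])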
requirements.txt
ADDED
@@ -0,0 +1,27 @@
+accelerate==1.3.0
+certifi==2024.12.14
+charset-normalizer==3.4.1
+filelock==3.17.0
+fsspec==2024.12.0
+huggingface-hub==0.27.1
+idna==3.10
+Jinja2==3.1.5
+MarkupSafe==3.0.2
+mpmath==1.3.0
+networkx==3.4.2
+numpy==2.2.2
+packaging==24.2
+pillow==11.1.0
+psutil==6.1.1
+PyYAML==6.0.2
+regex==2024.11.6
+requests==2.32.3
+safetensors==0.5.2
+setuptools==75.8.0
+sympy==1.13.1
+tokenizers==0.21.0
+torch==2.5.1
+tqdm==4.67.1
+transformers==4.48.1
+typing_extensions==4.12.2
+urllib3==2.3.0