0xnewton-superlore committed
Commit · 426785e
1 Parent(s): d318463

nits throw on bad request

handler.py CHANGED (+51 -6)
@@ -1,11 +1,15 @@
 import base64
-import io
 import torch
 from typing import Dict, List, Any
+from io import BytesIO
 from transformers import CLIPProcessor, CLIPModel
 from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
 from PIL import Image
 from torch.nn.functional import cosine_similarity
+from typing import Union
+
+max_text_list_length = 30
+max_image_list_length = 20
 
 class EndpointHandler():
     def __init__(self, path: str="", image_size: int=224) -> None:
@@ -34,7 +38,7 @@ class EndpointHandler():
             data (Dict[str, Any]): A dictionary containing the following key:
                 - "inputs" (Dict[str, list]): A dictionary containing the following keys:
                     - "image_list" (List[str]): A list of base64-encoded images.
-                    - "text_list" (List[str]): A list of text strings.
+                    - "text_list" (Union[List[str], str]): A list of text strings.
 
         Returns:
             Dict[str, list]: A dictionary containing the following keys:
@@ -43,10 +47,37 @@
                 - "similarity_scores" (List[List[float]]): A list of similarity scores between image and text embeddings.
                     Empty if either "image_list" or "text_list" is empty.
         """
+        if not isinstance(data, dict):
+            raise ValueError("Expected input data to be a dict.")
+
         inputs = data.get("inputs", {})
+
+        if not isinstance(inputs, dict):
+            raise ValueError("Expected 'inputs' to be a dict.")
+
         image_list = inputs.get("image_list", []) # list of b64 images
-        text_list = inputs.get("text_list", []) # list of texts
+        text_list = inputs.get("text_list", []) # list of texts (or just plain string)
 
+        if not isinstance(image_list, list):
+            raise ValueError("Expected 'image_list' to be a list.")
+        if not isinstance(text_list, list) and not isinstance(text_list, str):
+            raise ValueError("Expected 'text_list' to be a list or string.")
+        if not all(isinstance(image, str) for image in image_list):
+            raise ValueError("Expected 'image_list' to contain only strings.")
+        if isinstance(text_list, list) and not all(isinstance(text, str) for text in text_list):
+            raise ValueError("Expected 'text_list' to contain only strings.")
+
+        # if text_list is a string, convert to list
+        if isinstance(text_list, str):
+            text_list = [text_list]
+
+        if len(image_list) > max_image_list_length:
+            raise ValueError(f"Expected 'image_list' to have a maximum length of {max_image_list_length}.")
+        if len(text_list) > max_text_list_length:
+            raise ValueError(f"Expected 'text_list' to have a maximum length of {max_text_list_length}.")
+        if not all(is_valid_base64_image(image) for image in image_list):
+            raise ValueError("Expected 'image_list' to contain only valid base64-encoded images.")
+
         image_features = self.get_image_embeddings(image_list) if len(image_list) > 0 else None
         text_features = self.get_text_embeddings(text_list) if len(text_list) > 0 else None
 
@@ -68,7 +99,7 @@
         for base64_image in base64_images:
             # Decode the base64-encoded image and convert it to an RGB image
             image_data = base64.b64decode(base64_image)
-            image = Image.open(io.BytesIO(image_data)).convert("RGB")
+            image = Image.open(BytesIO(image_data)).convert("RGB")
             preprocessed_image = self.image_transform(image).unsqueeze(0)
             preprocessed_images.append(preprocessed_image)
 
@@ -83,7 +114,7 @@
 
         return image_features
 
-    def get_text_embeddings(self, text_list: List[str]) -> torch.Tensor:
+    def get_text_embeddings(self, text_list: Union[List[str], str]) -> torch.Tensor:
        with torch.no_grad():
            # Tokenize the input text list
            input_tokens = self.processor(text_list, return_tensors="pt", padding=True, truncation=True)
@@ -93,4 +124,18 @@
            text_features = self.model.get_text_features(**input_tokens)
            return text_features
 
-
+
+
+def is_valid_base64_image(data: str) -> bool:
+    try:
+        # Decode the base64 string
+        img_data = base64.b64decode(data)
+
+        # Open the image using PIL
+        img = Image.open(BytesIO(img_data))
+
+        # Check that the image format is supported
+        img.verify()
+
+        return True
+    except:
+        return False
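
For reference, a minimal sketch of a request body that matches the docstring above and stays inside the new limits; the in-memory PNG is only a stand-in for a real image, and the field names come straight from the diff:

import base64
from io import BytesIO
from PIL import Image

# Build a tiny stand-in image and base64-encode it the way the handler expects.
buffer = BytesIO()
Image.new("RGB", (224, 224), color="red").save(buffer, format="PNG")
b64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")

payload = {
    "inputs": {
        "image_list": [b64_image],                 # at most max_image_list_length (20) entries
        "text_list": ["a red square", "a photo"],  # a list of strings, or a single string
    }
}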
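
And a sketch of the requests that now fail fast with a ValueError instead of erroring somewhere inside the model code; `handler` is a hypothetical, already-constructed EndpointHandler instance (constructing one is outside this diff), and calling it directly assumes the method shown above is __call__, as is usual for Inference Endpoints handlers:

# Each payload below trips one of the new guards (hypothetical `handler` instance).
bad_payloads = [
    "not a dict",                                # data itself must be a dict
    {"inputs": "not a dict"},                    # "inputs" must be a dict
    {"inputs": {"image_list": "abc"}},           # "image_list" must be a list
    {"inputs": {"image_list": [42]}},            # entries must be strings
    {"inputs": {"image_list": ["not base64"]}},  # entries must decode to real images
    {"inputs": {"text_list": ["hi"] * 31}},      # over max_text_list_length (30)
]
for bad in bad_payloads:
    try:
        handler(bad)
    except ValueError as err:
        print(err)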
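
The new module-level is_valid_base64_image helper can be exercised on its own, assuming handler.py is importable on the current path (importing it pulls in torch/transformers but does not construct the model):

import base64
from io import BytesIO
from PIL import Image
from handler import is_valid_base64_image

# A real (tiny) JPEG, base64-encoded.
buffer = BytesIO()
Image.new("RGB", (8, 8)).save(buffer, format="JPEG")
good = base64.b64encode(buffer.getvalue()).decode("utf-8")

print(is_valid_base64_image(good))        # True: decodes and PIL can verify it
print(is_valid_base64_image("aGVsbG8="))  # False: valid base64, but not an image
print(is_valid_base64_image("%%%"))       # False: garbage input

Note that img.verify() only checks that the image data looks intact without fully decoding the pixels, which keeps this request-time check cheap.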
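
The "similarity_scores" shape described in the docstring is easiest to see on dummy tensors. This is only an illustration of one way to get a per-image, per-text score matrix with the imported cosine_similarity, not necessarily what the unchanged part of handler.py does; the 512-dimensional width is an assumption (CLIP ViT-B/32):

import torch
from torch.nn.functional import cosine_similarity

image_features = torch.randn(2, 512)  # e.g. 2 image embeddings (512 dims assumed)
text_features = torch.randn(3, 512)   # e.g. 3 text embeddings

# One row per image, one column per text, matching List[List[float]] in the docstring.
scores = cosine_similarity(image_features.unsqueeze(1), text_features.unsqueeze(0), dim=-1)
print(scores.shape)     # torch.Size([2, 3])
print(scores.tolist())  # nested Python lists, ready to return as JSON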