KB-VQA

Sleeping

App Files Files Community

m7mdal7aj committed on May 16, 2024

Commit

b9d4498

verified ·

1 Parent(s): e72ec95

Update my_model/captioner/image_captioning.py

Browse files

Files changed (1) hide show

my_model/captioner/image_captioning.py +75 -8

my_model/captioner/image_captioning.py CHANGED Viewed

@@ -3,6 +3,7 @@ import io
 import torch
 import PIL
 from PIL import Image
 from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
 import bitsandbytes
 import accelerate
@@ -11,7 +12,31 @@ from my_model.utilities.gen_utilities import free_gpu_resources
 class ImageCaptioningModel:
-    def __init__(self):
         self.model_type = config.MODEL_TYPE
         self.processor = None
         self.model = None
@@ -29,9 +54,12 @@ class ImageCaptioningModel:
-    def load_model(self):
-        if self.load_in_4bit and self.load_in_8bit:  # check if in case both set to True by mistake.
             self.load_in_4bit = False
         if self.model_type == 'i_blip':
@@ -53,7 +81,18 @@ class ImageCaptioningModel:
             free_gpu_resources()
-    def resize_image(self, image, max_image_size=None):
         if max_image_size is None:
             max_image_size = int(os.getenv("MAX_IMAGE_SIZE", "1024"))
         h, w = image.size
@@ -67,7 +106,17 @@ class ImageCaptioningModel:
         return image
-    def generate_caption(self, image_path):
         free_gpu_resources()
         free_gpu_resources()
         if isinstance(image_path, str) or isinstance(image_path, io.IOBase):
@@ -85,12 +134,30 @@ class ImageCaptioningModel:
         free_gpu_resources()
         return caption
-    def generate_captions_for_multiple_images(self, image_paths):
         return [self.generate_caption(image_path) for image_path in image_paths]
-def get_caption(img):
     captioner = ImageCaptioningModel()
     free_gpu_resources()
     captioner.load_model()

 import torch
 import PIL
 from PIL import Image
+from typing import Optional, Union, List
 from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
 import bitsandbytes
 import accelerate
 class ImageCaptioningModel:
+    """
+    A class to handle image captioning using InstructBlip model.
+    Attributes:
+        model_type (str): Type of the model to use.
+        processor (InstructBlipProcessor or None): The processor for handling image input.
+        model (InstructBlipForConditionalGeneration or None): The loaded model.
+        prompt (str): Prompt for the model.
+        max_image_size (int): Maximum size for the input image.
+        min_length (int): Minimum length of the generated caption.
+        max_new_tokens (int): Maximum number of new tokens to generate.
+        model_path (str): Path to the pre-trained model.
+        device_map (str): Device map for model loading.
+        torch_dtype (torch.dtype): Data type for torch tensors.
+        load_in_8bit (bool): Whether to load the model in 8-bit precision.
+        load_in_4bit (bool): Whether to load the model in 4-bit precision.
+        low_cpu_mem_usage (bool): Whether to optimize for low CPU memory usage.
+        skip_special_tokens (bool): Whether to skip special tokens in the generated captions.
+    """
+    def __init__(self) -> None:
+        """
+        Initializes the ImageCaptioningModel class with configuration settings.
+        """
         self.model_type = config.MODEL_TYPE
         self.processor = None
         self.model = None
+    def load_model(self) -> None:
+        """
+        Loads the InstructBlip model and processor based on the specified configuration.
+        """
+        if self.load_in_4bit and self.load_in_8bit:  # Ensure only one of 4-bit or 8-bit precision is used.
             self.load_in_4bit = False
         if self.model_type == 'i_blip':
             free_gpu_resources()
+    def resize_image(self, image: Image.Image, max_image_size: Optional[int] = None) -> Image.Image:
+        """
+        Resizes the image to fit within the specified maximum size while maintaining aspect ratio.
+        Args:
+            image (Image.Image): The input image to resize.
+            max_image_size (Optional[int]): The maximum size for the resized image. Defaults to None.
+        Returns:
+            Image.Image: The resized image.
+        """
         if max_image_size is None:
             max_image_size = int(os.getenv("MAX_IMAGE_SIZE", "1024"))
         h, w = image.size
         return image
+    def generate_caption(self, image_path: Union[str, io.IOBase, Image.Image]) -> str:
+        """
+        Generates a caption for the given image.
+        Args:
+            image_path (Union[str, io.IOBase, Image.Image]): The path to the image, file-like object, or PIL Image.
+        Returns:
+            str: The generated caption for the image.
+        """
         free_gpu_resources()
         free_gpu_resources()
         if isinstance(image_path, str) or isinstance(image_path, io.IOBase):
         free_gpu_resources()
         return caption
+    def generate_captions_for_multiple_images(self, image_paths: List[Union[str, io.IOBase, Image.Image]]) -> List[str]:
+        """
+        Generates captions for multiple images.
+        Args:
+            image_paths (List[Union[str, io.IOBase, Image.Image]]): A list of paths to images, file-like objects, or PIL Images.
+        Returns:
+            List[str]: A list of captions for the provided images.
+        """
         return [self.generate_caption(image_path) for image_path in image_paths]
+def get_caption(img: Union[str, io.IOBase, Image.Image]) -> str:
+    """
+    Loads the captioning model and generates a caption for a single image.
+    Args:
+        img (Union[str, io.IOBase, Image.Image]): The path to the image, file-like object, or PIL Image.
+    Returns:
+        str: The generated caption for the image.
+    """
     captioner = ImageCaptioningModel()
     free_gpu_resources()
     captioner.load_model()