# utils.py
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch


class ImageCaptioningModel:
    def __init__(self, model_name="Salesforce/blip-image-captioning-base"):
        """
        Initialize the BLIP image-captioning model.

        :param model_name: Hugging Face model identifier
        """
        self.processor = BlipProcessor.from_pretrained(model_name)
        self.model = BlipForConditionalGeneration.from_pretrained(model_name)
        self.model.eval()  # inference only: disables dropout and similar training behavior

    def generate_caption(self, image_path):
        """
        Generate a caption for the given image.

        :param image_path: Path to the input image
        :return: Generated caption (string)
        """
        # BLIP expects RGB input; convert in case the file is grayscale or RGBA.
        image = Image.open(image_path).convert("RGB")
        inputs = self.processor(images=image, return_tensors="pt")
        with torch.no_grad():  # no gradients needed for generation
            output = self.model.generate(**inputs)
        caption = self.processor.tokenizer.decode(output[0], skip_special_tokens=True)
        return caption
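
# A minimal usage sketch, not part of the module's original surface: it assumes
# a local image file exists (the "example.jpg" path is hypothetical) and simply
# prints the generated caption. Model weights are downloaded on first run.
if __name__ == "__main__":
    captioner = ImageCaptioningModel()
    print(captioner.generate_caption("example.jpg"))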