import open_clip
import torch
from PIL import Image

# Load the CoCa (Contrastive Captioner) model fine-tuned for captioning on
# MSCOCO, together with its image preprocessing transform. This downloads
# pretrained weights on first use (module-level side effect, as in the
# original).
model, _, transform = open_clip.create_model_and_transforms(
    model_name="coca_ViT-L-14",
    pretrained="mscoco_finetuned_laion2B-s13B-b90k",
)


def get_captions(image):
    """Generate a text caption for *image* using the CoCa model.

    Args:
        image: a PIL ``Image`` (anything accepted by the OpenCLIP
            preprocessing ``transform``).

    Returns:
        str: the decoded caption with CoCa's special tokens stripped.
    """
    # Preprocess and add a batch dimension: (C, H, W) -> (1, C, H, W).
    im = transform(image).unsqueeze(0)
    # Inference only: no gradients, mixed precision for speed on GPU.
    with torch.no_grad(), torch.cuda.amp.autocast():
        generated = model.generate(im)
    text = open_clip.decode(generated[0])
    # BUG FIX: the original called .split("")/.replace("", "") — an empty
    # separator makes str.split raise ValueError. The special-token
    # literals were evidently stripped from the source; restore them per
    # the OpenCLIP CoCa example: keep everything before "<end_of_text>"
    # and drop the leading "<start_of_text>" marker.
    return text.split("<end_of_text>")[0].replace("<start_of_text>", "")