import open_clip
import torch
from PIL import Image

# Load CoCa (ViT-L/14) with weights fine-tuned for captioning on MS-COCO;
# the third return value is the inference-time image preprocessing transform.
model, _, transform = open_clip.create_model_and_transforms(
    model_name="coca_ViT-L-14",
    pretrained="mscoco_finetuned_laion2B-s13B-b90k",
)
model.eval()  # disable dropout etc. for deterministic inference

def get_captions(image):
    # Preprocess the PIL image and add a batch dimension: (1, 3, H, W).
    im = transform(image).unsqueeze(0)
    with torch.no_grad(), torch.cuda.amp.autocast():
        generated = model.generate(im)
    # Decode the generated token ids and strip the special start/end tokens.
    return open_clip.decode(generated[0]).split("<end_of_text>")[0].replace("<start_of_text>", "")
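
# A minimal usage sketch: open an image with PIL, convert it to RGB, and
# caption it. The filename "example.jpg" is illustrative, not from the
# original.
if __name__ == "__main__":
    image = Image.open("example.jpg").convert("RGB")
    print(get_captions(image))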