File size: 1,147 Bytes
2359bda |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
from sentence_transformers import SentenceTransformer, util, models
from PIL import ImageFile, Image
import numpy as np
import requests
###########
image = Image.open('two_dogs_in_snow.jpg')
from transformers import CLIPProcessor, CLIPModel
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
inputs = processor(texts=["a cat", "a dog"], images=[image], return_tensors="pt", padding=True)
output = model(**inputs)
#vision_outputs = model.vision_model(pixel_values=inputs['pixel_values'])
#image_embeds = model.visual_projection(vision_outputs[1])
#print(image_embeds.shape)
#exit()
#Load CLIP model
clip = models.CLIPModel()
model = SentenceTransformer(modules=[clip])
model.save('tmp-clip-model')
model = SentenceTransformer('tmp-clip-model')
#Encode an image:
img_emb = model.encode(Image.open('two_dogs_in_snow.jpg'))
#Encode text descriptions
text_emb = model.encode(['Two dogs in the snow', 'A cat on a table', 'A picture of London at night'])
#Compute cosine similarities
cos_scores = util.cos_sim(img_emb, text_emb)
print(cos_scores) |