from sentence_transformers import SentenceTransformer, util, models
from PIL import Image

# Load the example image from disk
image = Image.open('two_dogs_in_snow.jpg')
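# If the file is not present locally, it can be fetched first; the URL
# below is a placeholder, not a real asset.
# import requests
# image = Image.open(requests.get('https://example.com/two_dogs_in_snow.jpg', stream=True).raw)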

# CLIP via Hugging Face transformers
from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# The processor's keyword for captions is `text` (not `texts`)
inputs = processor(text=["a cat", "a dog"], images=[image], return_tensors="pt", padding=True)
output = model(**inputs)
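# A minimal sketch of reading the output: CLIP returns image-text
# similarity logits, and a softmax over the text axis turns them into
# per-caption probabilities.
logits_per_image = output.logits_per_image  # shape: [n_images, n_texts]
probs = logits_per_image.softmax(dim=1)
print(probs)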

# The same CLIP checkpoint can also be wrapped as a SentenceTransformer
# module; models.CLIPModel() defaults to the ViT-B/32 weights
clip = models.CLIPModel()
model = SentenceTransformer(modules=[clip])
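# Equivalent shortcut, assuming the pre-packaged checkpoint from the
# model hub is acceptable:
# model = SentenceTransformer('clip-ViT-B-32')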

# Save the wrapped model and load it back
model.save('tmp-clip-model')
model = SentenceTransformer('tmp-clip-model')

# Encode the image
img_emb = model.encode(Image.open('two_dogs_in_snow.jpg'))

# Encode text descriptions
text_emb = model.encode(['Two dogs in the snow', 'A cat on a table', 'A picture of London at night'])

# Compute cosine similarities between the image and each caption
cos_scores = util.cos_sim(img_emb, text_emb)
print(cos_scores)
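# Usage sketch: cos_scores has shape [1, 3], so the argmax over the
# captions picks the text that best matches the image.
best = cos_scores.argmax().item()
print(['Two dogs in the snow', 'A cat on a table', 'A picture of London at night'][best])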