import gradio as gr
import numpy as np
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, DetrFeatureExtractor, DetrForObjectDetection
import torch
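# Pipeline: DETR proposes candidate object regions, CLIP then scores each
# cropped region against the text query, and the best-matching crop is returned.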
feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50')
dmodel = DetrForObjectDetection.from_pretrained('facebook/detr-resnet-50')
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
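# Note: larger checkpoints (e.g. "openai/clip-vit-large-patch14" or
# "facebook/detr-resnet-101") can be swapped in for better accuracy at the
# cost of speed; the names above are the ones this app ships with.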
# Gradio I/O components (gr.inputs/gr.outputs were removed in modern Gradio)
i1 = gr.Image(type="pil", label="Input image")
i2 = gr.Textbox(label="What to crop")
o1 = gr.Image(type="pil", label="Best-matching crop")
o2 = gr.Textbox(label="CLIP score")
def extract_image(image, text, num=1):
    # Run DETR to detect candidate objects
    inputs = feature_extractor(images=image, return_tensors="pt")
    outputs = dmodel(**inputs)
    # Drop the last logit: DETR's final class is "no object"
    probas = outputs.logits.softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > 0.96
    # Rescale normalized boxes to pixel coordinates; target size is (height, width)
    # (newer transformers versions expose post_process_object_detection instead)
    outs = feature_extractor.post_process(outputs, torch.tensor(image.size[::-1]).unsqueeze(0))
    bboxes_scaled = outs[0]['boxes'][keep].detach().numpy()
    # Crop each detected region out of the original image
    im_arr = np.array(image)
    images_list = []
    for box in bboxes_scaled:
        xmin, ymin, xmax, ymax = (max(int(v), 0) for v in box)
        images_list.append(Image.fromarray(im_arr[ymin:ymax, xmin:xmax]))
    if not images_list:
        raise ValueError("No objects detected above the 0.96 confidence threshold.")
    # Score every crop against the text query with CLIP
    clip_inputs = processor(text=[text], images=images_list, return_tensors="pt", padding=True)
    output = model(**clip_inputs)
    # logits_per_text has shape (1, num_crops); softmax over the crops ranks them
    probs = output.logits_per_text.softmax(-1)[0].detach().numpy()
    # Indices of the top `num` crops, best first
    l_idx = np.argsort(probs)[::-1][:num]
    final_ims = [{'image': images_list[i], 'score': probs[i]} for i in l_idx]
    return final_ims[0]['image'], float(final_ims[0]['score'])
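# Minimal sketch of calling the function outside the Gradio UI
# (assumes a local image file; "ex3.jpg" matches the example below):
#   img = Image.open("ex3.jpg").convert("RGB")
#   crop, score = extract_image(img, "black bag")
#   crop.save("cropped.jpg")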
title = "ClipnCrop"
description = "Crop the part of an image that best matches a text query: Facebook's DETR proposes object regions and OpenAI's CLIP ranks them against the text, both via Hugging Face Transformers."
examples=[['ex3.jpg', 'black bag'],['ex2.jpg', 'man in red dress']]
article = "<p style='text-align: center'></p>"
gr.Interface(fn=extract_image, inputs=[i1, i2], outputs=[o1, o2], title=title, description=description, article=article, examples=examples).queue().launch()