import gradio as gr
import numpy as np
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, YolosImageProcessor, YolosForObjectDetection
import torch

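# YOLOS-tiny: a lightweight DETR-style object detector used to propose candidate bounding boxes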
feature_extractor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")
dmodel = YolosForObjectDetection.from_pretrained('hustvl/yolos-tiny')

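# CLIP: used to score how well each candidate crop matches the text description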
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

i1 = gr.Image(type="pil", label="Input image")
i2 = gr.Textbox(label="Description of the section to extract")
i3 = gr.Number(value=0.96, label="Detection probability threshold")
o1 = gr.Image(type="pil", label="Extracted crop")
o2 = gr.Textbox(label="Similarity score")

def extract_image(image, text, prob, num=1):
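    """Detect objects with YOLOS, crop each detection, and return the crop that
    CLIP ranks highest against `text`, together with its similarity score."""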
    
    inputs = feature_extractor(images=image, return_tensors="pt")
    outputs = dmodel(**inputs)
    
    # model predicts bounding boxes and corresponding COCO classes
    # per-query class probabilities; the last class is "no object" and is dropped
    probas = outputs.logits.softmax(-1)[0, :, :-1]
    
    # keep only the queries whose best class probability exceeds the threshold
    keep = probas.max(-1).values > prob
    # rescale boxes to the original image size; `post_process` is deprecated in newer
    # versions of transformers in favor of `post_process_object_detection`
    outs = feature_extractor.post_process(outputs, torch.tensor(image.size[::-1]).unsqueeze(0))
    bboxes_scaled = outs[0]['boxes'][keep].detach().numpy()
    labels = outs[0]['labels'][keep].detach().numpy()
    scores = outs[0]['scores'][keep].detach().numpy()
    
    images_list = []
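    # crop every kept detection out of the original image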
    im_arr = np.array(image)
    for box in bboxes_scaled:
      xmin, ymin, xmax, ymax = (int(v) for v in box)
      roi_im = Image.fromarray(im_arr[ymin:ymax, xmin:xmax])
      images_list.append(roi_im)
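
    # Added guard: if nothing passed the detection threshold, return early rather
    # than letting the CLIP processor fail on an empty list of images.
    if len(images_list) == 0:
      return None, "No objects detected above the threshold"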
    
    # CLIP: score the text prompt against every crop and keep the `num` best matches
    clip_inputs = processor(text=[text], images=images_list, return_tensors="pt", padding=True)
    clip_outputs = model(**clip_inputs)
    logits_per_text = clip_outputs.logits_per_text  # shape: (1, num_crops)
    probs = logits_per_text.softmax(-1)
    l_idx = np.argsort(probs[-1].detach().numpy())[::-1][0:num]
    
    # collect the selected crops with their scores and return the best match
    crop_scores = probs[-1].detach().numpy()
    final_ims = [{'image': images_list[i], 'score': crop_scores[i]} for i in l_idx]
    final_ims = sorted(final_ims, key=lambda item: item['score'], reverse=True)
    return final_ims[0]['image'], final_ims[0]['score']

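# Gradio interface: metadata, examples, and launch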
title = "ClipnCrop"
description = "<p style= 'color:white'>Extract sections of images from your image by using OpenAI's CLIP and Facebooks Detr implemented on HuggingFace Transformers, if the similarity score is not so much, then please consider the prediction to be void.</p>" 
examples=[['ex3.jpg', 'black bag', 0.96],['ex2.jpg', 'man in red dress', 0.85]]
article = "<p style= 'color:white; text-align:center;'><a href='https://github.com/Vishnunkumar/clipcrop' target='_blank'>clipcrop</a></p>"
gr.Interface(fn=extract_image, inputs=[i1, i2, i3], outputs=[o1, o2], title=title, description=description, article=article, examples=examples).launch()