import streamlit as st
import numpy as np
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, YolosImageProcessor, YolosForObjectDetection
import torch

st.title("CLIP & CROP")
st.markdown("**Extract the section of an image that matches a text description, using OpenAI's CLIP together with a YOLOS object detector from Hugging Face Transformers. If the similarity score is low, treat the prediction as unreliable.**")

IMAGE_INPUT = st.file_uploader(type=["jpg", "png"], label="Input image")
TEXT_INPUT = st.text_input(label="Description of the section to extract")
NUMBER_INPUT = st.number_input(value=0.96, min_value=0.0, max_value=1.0, label="Detection probability threshold (0-1)")


with st.spinner("Models are loading"):
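    # YOLOS-tiny proposes candidate object regions; CLIP ViT-B/16 ranks the crops
    # against the text query. Wrapping this loading step in a @st.cache_resource
    # function would avoid reloading the weights on every Streamlit rerun.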
    feature_extractor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")
    dmodel = YolosForObjectDetection.from_pretrained('hustvl/yolos-tiny')

    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
 
SUBMIT_BUTTON = st.button("SUBMIT")

def extract_image(image, text, prob, num=1):
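    """Detect candidate regions in `image`, crop them, score each crop against
    `text` with CLIP, and return the best-matching crop and its CLIP probability."""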
    
    inputs = feature_extractor(images=image, return_tensors="pt")
    outputs = dmodel(**inputs)
    
    # model predicts bounding boxes and corresponding COCO classes
    # rescale the predicted boxes to the original image size (target size is
    # (height, width)) and keep detections whose best class probability exceeds
    # the threshold; the trailing "no object" class is dropped internally
    outs = feature_extractor.post_process_object_detection(
        outputs, threshold=prob, target_sizes=torch.tensor([image.size[::-1]])
    )
    bboxes_scaled = outs[0]['boxes'].detach().numpy()
    labels = outs[0]['labels'].detach().numpy()
    scores = outs[0]['scores'].detach().numpy()
    
    images_list = []
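    # crop each detected bounding box out of the original image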
    for i,j in enumerate(bboxes_scaled):
      
      xmin = int(j[0])
      ymin = int(j[1])
      xmax = int(j[2])
      ymax = int(j[3])
    
      im_arr = np.array(image)
      roi = im_arr[ymin:ymax, xmin:xmax]
      roi_im = Image.fromarray(roi)
    
      images_list.append(roi_im)
    
    # nothing cleared the detection threshold
    if not images_list:
        return None, None
    
    # score every crop against the text query with CLIP
    clip_inputs = processor(text=[text], images=images_list, return_tensors="pt", padding=True)
    output = model(**clip_inputs)
    logits_per_text = output.logits_per_text  # shape: (1, number of crops)
    probs = logits_per_text.softmax(-1)
    # indices of the `num` highest-scoring crops, best first
    l_idx = np.argsort(probs[-1].detach().numpy())[::-1][0:num]
    
    final_ims = []
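    # keep only the crops ranked in the top `num` by CLIP score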
    for i,j in enumerate(images_list):
      json_dict = {}
      if i in l_idx:
        json_dict['image'] = images_list[i]
        json_dict['score'] = probs[-1].detach().numpy()[i]
    
        final_ims.append(json_dict)
    
    fi = sorted(final_ims, key=lambda item: item.get("score"), reverse=True)
    return fi[0]['image'], fi[0]['score']

if SUBMIT_BUTTON and IMAGE_INPUT is not None:
    # the uploader returns a file-like object; convert it to a PIL image first
    image = Image.open(IMAGE_INPUT).convert("RGB")
    imageOutput, scoreOutput = extract_image(image, TEXT_INPUT, NUMBER_INPUT)
    if imageOutput is None:
        st.warning("No objects were detected above the threshold.")
    else:
        st.image(imageOutput, caption="Cropped Image")
        st.markdown("*Confidence Score:*")
        st.success(scoreOutput)