# CLIPnCROP / app.py
# Hugging Face Space by vishnun -- commit ec563f5 ("Update app.py"), 2.88 kB
import streamlit as st
import numpy as np
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, YolosImageProcessor, YolosForObjectDetection
import torch
# ---------------------------------------------------------------------------
# Page header, model loading and input widgets.
# ---------------------------------------------------------------------------
st.title("CLIP & CROP")
st.markdown("**Extract sections of images from your image by using OpenAI's CLIP and Facebooks Detr implemented on HuggingFace Transformers, if the similarity score is not so much, then please consider the prediction to be void.**")

# NOTE(review): Streamlit re-executes this script on every widget
# interaction, so these four `from_pretrained` calls run again each time.
# Consider wrapping them in a cached loader if the installed Streamlit
# version provides one.
with st.spinner("Models are loading"):
    # YOLOS proposes candidate bounding boxes ...
    feature_extractor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")
    dmodel = YolosForObjectDetection.from_pretrained('hustvl/yolos-tiny')
    # ... and CLIP scores each cropped box against the text description.
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

# `type` is the list of accepted file extensions. The previous value "pil"
# only allowed files literally named *.pil, which blocked normal images.
IMAGE_INPUT = st.file_uploader(type=["png", "jpg", "jpeg"], label="Input image")
TEXT_INPUT = st.text_input(label="Description for section to extracted")
# The threshold is compared against a softmax probability, so keep it in [0, 1].
NUMBER_INPUT = st.number_input(value=0.96, min_value=0.0, max_value=1.0, label="Threshold percentage score")
SUBMIT_BUTTON = st.button("SUBMIT")
def extract_image(image, text, prob, num=1):
    """Crop the detected region of ``image`` that best matches ``text``.

    The YOLOS detector proposes bounding boxes; every box whose best class
    probability is above ``prob`` is cropped out of the image, and CLIP then
    ranks the crops against ``text``.

    Args:
        image: A ``PIL.Image.Image``, or anything ``PIL.Image.open`` accepts
            (a path or a binary file-like object such as an uploaded file).
        text: Description of the section to extract.
        prob: Detection-confidence threshold; boxes scoring at or below it
            are discarded.
        num: Number of top-ranked crops to consider. Only the best one is
            returned, so any value >= 1 gives the same result; values below
            1 are treated as 1.

    Returns:
        A ``(crop, score)`` tuple: the best-matching crop as a PIL image and
        its CLIP probability (softmax over all candidate crops).

    Raises:
        ValueError: If no usable detection exceeds ``prob``.
    """
    if not isinstance(image, Image.Image):
        image = Image.open(image)
    # Force 3-channel input so RGBA / palette / greyscale uploads also work.
    image = image.convert("RGB")
    width, height = image.size

    detector_inputs = feature_extractor(images=image, return_tensors="pt")
    with torch.no_grad():  # inference only, no autograd graph needed
        detector_outputs = dmodel(**detector_inputs)

    # Drops the "no object" class, keeps boxes whose best class score is
    # above `prob`, and rescales them to pixel (xmin, ymin, xmax, ymax).
    detections = feature_extractor.post_process_object_detection(
        detector_outputs,
        threshold=prob,
        target_sizes=[(height, width)],
    )[0]

    pixels = np.array(image)  # converted once, not once per box
    images_list = []
    for box in detections["boxes"].tolist():
        xmin, ymin, xmax, ymax = (int(coord) for coord in box)
        # Clamp to the image: a negative start index would otherwise be
        # read by numpy as "count from the end" and yield a wrong crop.
        xmin, ymin = max(xmin, 0), max(ymin, 0)
        xmax, ymax = min(xmax, width), min(ymax, height)
        if xmax <= xmin or ymax <= ymin:
            continue  # degenerate box, nothing to crop
        images_list.append(Image.fromarray(pixels[ymin:ymax, xmin:xmax]))

    if not images_list:
        raise ValueError(
            "No objects were detected above the given threshold; "
            "try lowering the threshold."
        )

    clip_inputs = processor(
        text=[text],
        images=images_list,
        return_tensors="pt",
        padding=True,
        truncation=True,  # over-long descriptions would otherwise fail in CLIP
    )
    with torch.no_grad():
        clip_outputs = model(**clip_inputs)

    # One row per text, one column per crop: softmax across the crops.
    # With a single crop the score is therefore always 1.0.
    scores = clip_outputs.logits_per_text.softmax(-1)[0].numpy()

    ranked = np.argsort(scores)[::-1][:max(num, 1)]
    best = int(ranked[0])
    return images_list[best], scores[best]
# Run the detection + ranking pipeline when the user presses SUBMIT.
if SUBMIT_BUTTON:
    if IMAGE_INPUT is None:
        st.warning("Please upload an image first.")
    elif not TEXT_INPUT.strip():
        st.warning("Please enter a description of the section to extract.")
    else:
        # The uploader returns a file-like object, not an image: decode it.
        source_image = Image.open(IMAGE_INPUT).convert("RGB")
        try:
            # The function is named `extract_image`; `extract` was undefined.
            imageOutput, scoreOutput = extract_image(source_image, TEXT_INPUT, NUMBER_INPUT)
        except ValueError as err:
            st.warning(f"Could not extract a section: {err}")
        else:
            st.image(imageOutput, caption="Cropped Image")
            st.markdown("*Confidence Score:*")
            st.success(scoreOutput)