import streamlit as st
import numpy as np
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, YolosImageProcessor, YolosForObjectDetection
import torch
st.title("CLIP & CROP")
st.markdown("**Extract sections of your image using OpenAI's CLIP and YOLOS (a DETR-style detector), implemented with Hugging Face Transformers. If the similarity score is low, consider the prediction void.**")

IMAGE_INPUT = st.file_uploader(type=["jpg", "png"], label="Input image")
TEXT_INPUT = st.text_input(label="Description of the section to be extracted")
NUMBER_INPUT = st.number_input(value=0.96, label="Threshold probability score")

with st.spinner("Models are loading"):
    feature_extractor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")
    dmodel = YolosForObjectDetection.from_pretrained("hustvl/yolos-tiny")
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

SUBMIT_BUTTON = st.button("SUBMIT")

def extract_image(image, text, prob, num=1):
    # Detect candidate objects with YOLOS.
    inputs = feature_extractor(images=image, return_tensors="pt")
    outputs = dmodel(**inputs)

    # The model predicts bounding boxes and the corresponding COCO classes;
    # drop the "no object" class before thresholding.
    probas = outputs.logits.softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > prob

    # Rescale the kept boxes to the original image size.
    outs = feature_extractor.post_process(outputs, torch.tensor(image.size[::-1]).unsqueeze(0))
    bboxes_scaled = outs[0]['boxes'][keep].detach().numpy()
    labels = outs[0]['labels'][keep].detach().numpy()
    scores = outs[0]['scores'][keep].detach().numpy()

    # Crop each detected region out of the original image.
    images_list = []
    im_arr = np.array(image)
    for box in bboxes_scaled:
        xmin, ymin, xmax, ymax = (int(v) for v in box)
        roi = im_arr[ymin:ymax, xmin:xmax]
        images_list.append(Image.fromarray(roi))

    # Score every crop against the text query with CLIP.
    inpu = processor(text=[text], images=images_list, return_tensors="pt", padding=True)
    output = model(**inpu)
    logits_per_text = output.logits_per_text
    probs = logits_per_text.softmax(-1)
    l_idx = np.argsort(probs[-1].detach().numpy())[::-1][0:num]

    # Keep the `num` best crops together with their similarity scores.
    final_ims = []
    for i in l_idx:
        final_ims.append({'image': images_list[i], 'score': probs[-1].detach().numpy()[i]})

    fi = sorted(final_ims, key=lambda item: item.get("score"), reverse=True)
    return fi[0]['image'], fi[0]['score']
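
# A minimal sketch of calling extract_image() outside Streamlit — "photo.jpg" and
# the query text below are hypothetical examples, not part of the original app:
#   img = Image.open("photo.jpg").convert("RGB")
#   crop, score = extract_image(img, "a red car", prob=0.96)
#   crop.save("crop.jpg")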

if SUBMIT_BUTTON:
    # st.file_uploader returns a file-like object; open it as a PIL image first.
    pil_image = Image.open(IMAGE_INPUT).convert("RGB")
    imageOutput, scoreOutput = extract_image(pil_image, TEXT_INPUT, NUMBER_INPUT)
    st.image(imageOutput, caption="Cropped Image")
    st.markdown("*Confidence Score:*")
    st.success(scoreOutput)
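
# To try the app locally (assuming this file is saved as app.py — the filename is an
# assumption — with streamlit, torch, transformers, and Pillow installed):
#   streamlit run app.py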