import torch
import cv2
import gradio as gr
import numpy as np
from transformers import OwlViTProcessor, OwlViTForObjectDetection
# Use GPU if available
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32").to(device)
model.eval()
processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
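
# query_image runs one zero-shot detection pass: OWL-ViT scores its predicted
# boxes against each comma-separated text query, and the highest-confidence
# matches are drawn on the image.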
def query_image(img, text_queries, max_results):
    # Accept several comma-separated queries, e.g. "snout,tail"
    text_queries = text_queries.split(",")
    target_sizes = torch.Tensor([img.shape[:2]])
    inputs = processor(text=text_queries, images=img, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Post-processing runs on CPU tensors
    outputs.logits = outputs.logits.cpu()
    outputs.pred_boxes = outputs.pred_boxes.cpu()
    results = processor.post_process(outputs=outputs, target_sizes=target_sizes)
    boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
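    # `labels` are indices into text_queries; `boxes` come back from
    # post_process already rescaled to pixel (x0, y0, x1, y1) corners.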
    # Sort detections by confidence score, highest first
    detections = sorted(
        ({"score": score.item(), "box": box, "label": label}
         for box, score, label in zip(boxes, scores, labels)),
        key=lambda d: d["score"],
        reverse=True,
    )

    font = cv2.FONT_HERSHEY_SIMPLEX

    # Keep the top-10 scores to report the confidence distribution
    score_dist = [round(d["score"], 2) for d in detections[:10]]
    # Draw boxes and captions for the top `max_results` detections
    for detection in detections[:max_results]:
        box = [int(i) for i in detection["box"].tolist()]
        score = detection["score"]
        label = int(detection["label"])
        print("label:", label, "score:", score)
        img = cv2.rectangle(img, tuple(box[:2]), tuple(box[2:]), (255, 0, 0), 1)
        # Put the caption below the box unless it would run off the image
        if box[3] + 25 > img.shape[0]:
            y = box[3] - 10
        else:
            y = box[3] + 25
        img = cv2.putText(
            img, f"({round(score, 2)}):{text_queries[label]}", (box[0], y), font, .5, (255, 0, 0), 1, cv2.LINE_AA
        )

    return img, f"Top {len(score_dist)} score confidences: {score_dist}"
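
# A quick way to exercise query_image outside Gradio (a hypothetical local
# test, not run by the Space):
#
#     image = cv2.cvtColor(cv2.imread("assets/calf.png"), cv2.COLOR_BGR2RGB)
#     annotated, summary = query_image(image, "snout,tail", max_results=2)
#     print(summary)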
description = """
<div style=\"font-size:18px; color: #2f2f2f; text-align: center\">
<i>This app is a tweaked variation of <a href="https://huggingface.co/spaces/adirik/OWL-ViT">Alara Dirik's OWL-ViT demo</a></i>
</div>
<div style=\"font-size:18px; color: #2f2f2f; text-align: left\">
<b>Use cases of this model</b>
<br/>1) Given an image with an object, detect it. <i>(e.g. Where is Waldo? app)</i>
<br/>2) Given an image with multiple instances of an object, detect them. <i>(e.g. labeling tool assistance for bounding box annotation)</i>
<br/>3) Find an object within an image using either text or an image as input. <i>(e.g. an Image Search app - this would require pruning candidates using a threshold and the score distribution in the output. Searching with an input image could be useful for finding things that are hard to describe in text, like a machine part)</i>
<br/><div style=\"font-size:16px; color: #3f3f3f; text-align: left\">
<br/>Links to apps/notebooks of other SOTA models for open vocabulary object detection or zero-shot object detection
<br/>a) <a href="https://huggingface.co/spaces/CVPR/regionclip-demo">RegionCLIP</a>
<br/>b) <a href="https://colab.research.google.com/drive/19LBqQg0cS36rTLL_TaXZ7Ka9KJGkxiSe?usp=sharing">Colab notebook for Object-Centric-OVD</a>
</div>
<br/><div style=\"font-size:16px; color: #4f4f4f; text-align: left\"><b>Note: Inference time depends on input image size. Typically images with dimensions less than 500px has response time under 5 secs on CPU.</b><br/><i> While most examples showcased illustrate model capabilities, some illustrate model's limitations - such as finding globe,bird cage,teapot etc.Also, the model appears to have text region detection and limited text recognition capabilities</i></div>
<div style=\"font-size:14px; color: #6f6f6f; text-align: left\"><i>Images below are from <a href="https://en.wikipedia.org/wiki/Hidden_object_game">Wikipedia</a>, <a href="http://images.cocodataset.org/val2017/000000133819.jpg">COCO</a> and <a href="http://host.robots.ox.ac.uk/pascal/VOC/voc2012/">PASCAL VOC 2012</a> datasets </i></div>
"""
demo = gr.Interface(
    query_image,
    inputs=[gr.Image(), "text", gr.Slider(1, 10, value=1)],
    outputs=["image", "text"],
    title="Where is Waldo? <i>(implemented with OWL-ViT)</i>",
    description=description,
    examples=[
        ["assets/Hidden_object_game_scaled.png", "bicycle", 1],
        ["assets/Hidden_object_game_scaled.png", "laptop", 1],
        ["assets/Hidden_object_game_scaled.png", "abacus", 1],
        ["assets/Hidden_object_game_scaled.png", "frog", 1],
        ["assets/Hidden_object_game_scaled.png", "bird cage", 2],
        ["assets/Hidden_object_game_scaled.png", "globe", 2],
        ["assets/Hidden_object_game_scaled.png", "teapot", 3],
        ["assets/bus_ovd.jpg", "license plate", 1],
        ["assets/bus_ovd.jpg", "sign saying ARRIVA", 1],
        ["assets/bus_ovd.jpg", "sign saying ARRIVAL", 1],
        ["assets/bus_ovd.jpg", "crossing push button", 1],
        ["assets/bus_ovd.jpg", "building on mountain", 2],
        ["assets/bus_ovd.jpg", "road marking", 3],
        ["assets/bus_ovd.jpg", "mirror", 1],
        ["assets/bus_ovd.jpg", "traffic camera", 1],
        ["assets/bus_ovd.jpg", "red bus,blue bus", 2],
        ["assets/calf.png", "snout,tail", 1],
        ["assets/calf.png", "hoof", 4],
        ["assets/calf.png", "ear", 2],
        ["assets/calf.png", "tag", 1],
        ["assets/calf.png", "hay", 1],
        ["assets/calf.png", "barbed wire", 1],
        ["assets/calf.png", "grass", 1],
        ["assets/calf.png", "can", 2],
        ["assets/road_signs.png", "STOP", 1],
        ["assets/road_signs.png", "STOP sign", 1],
        ["assets/road_signs.png", "arrow", 1],
        ["assets/road_signs.png", "ROAD", 1],
        ["assets/road_signs.png", "triangle", 1],
    ],
)
# server_name and server_port are launch() options, not Interface arguments
demo.launch(server_name="0.0.0.0", server_port=80)