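# "Where is Waldo?" Gradio demo: zero-shot object detection with OWL-ViT.
# The user supplies an image plus comma-separated text queries; the app draws the
# highest-scoring boxes and reports the top score distribution.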
import torch
import cv2
import gradio as gr
import numpy as np
from transformers import OwlViTProcessor, OwlViTForObjectDetection
from collections import OrderedDict


# Use GPU if available
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32").to(device)
model.eval()
processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")


def query_image(img, text_queries, max_results):
    # Split the comma-separated text box input into individual detection queries.
    text_queries = text_queries.split(",")

    target_sizes = torch.Tensor([img.shape[:2]])
    inputs = processor(text=text_queries, images=img, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)
    
    outputs.logits = outputs.logits.cpu()
    outputs.pred_boxes = outputs.pred_boxes.cpu() 
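    # post_process rescales the normalized predictions to the original image size and
    # returns, for each image, its boxes, scores and label indices.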
    results = processor.post_process(outputs=outputs, target_sizes=target_sizes)
    boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
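    # Collect every detection, then rank by confidence (highest first) so the
    # drawing loop below only needs the first few entries.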
    results_dict = {}
    count = 0
    for box, score, label in zip(boxes, scores, labels):
        results_dict[count] = {"score":score.tolist(),"box":box,"label":label}
        count += 1
    sorted_results_dict = OrderedDict(sorted(results_dict.items(),key=lambda item: item[1]["score"],reverse=True))

    
    font = cv2.FONT_HERSHEY_SIMPLEX

    # Collect the top-10 confidence scores (not the dict keys) for the text output.
    score_dist = []
    count = 0
    for key in sorted_results_dict:
        score_dist.append(round(sorted_results_dict[key]["score"], 2))
        count += 1
        if count == 10:
            break
    

    # Draw the top `max_results` detections, highest confidence first.
    result_count = 0
    for key in sorted_results_dict:
        box = sorted_results_dict[key]["box"]
        label = sorted_results_dict[key]["label"]
        score = sorted_results_dict[key]["score"]
        box = [int(i) for i in box.tolist()]

        print("label:", label, "score:", score)
        img = cv2.rectangle(img, box[:2], box[2:], (255, 0, 0), 1)
        # Put the caption below the box unless it would fall off the bottom of the image.
        if box[3] + 25 > img.shape[0]:
            y = box[3] - 10
        else:
            y = box[3] + 25

        rounded_score = round(score, 2)
        img = cv2.putText(
            img, f"({rounded_score}):{text_queries[label]}", (box[0], y), font, .5, (255, 0, 0), 1, cv2.LINE_AA
        )
        result_count += 1
        if result_count >= max_results:
            break
    return (img, f"Top {count} score confidences: {score_dist}")


description = """
<div style=\"font-size:18px; color: #2f2f2f; text-align: center\">
<i>This app is a tweaked variation of <a href="https://huggingface.co/spaces/adirik/OWL-ViT">Alara Dirik's OWL-ViT demo</a>
</i></div>
<div style=\"font-size:18px; color: #2f2f2f; text-align: left\">
<b>Use cases of this model</b>
<br/>1) Given an image with an object, detect it. <i>(e.g. Where is Waldo? app)</i>
<br/>2) Given an image with multiple instances of an object, detect them. <i>(e.g. labeling tool assistance for bounding box annotation)</i>
<br/>3) Find an object within an image using either text or image as input. <i>(e.g. Image Search app - this would require pruning candidates using a threshold and using the score distribution in the output. Searching with an input image could be useful for things that are hard to describe in text, like a machine part)</i>
<br/><div style=\"font-size:16px; color: #3f3f3f; text-align: left\">
<br/>Links to apps/notebooks of other SOTA models for open vocabulary object detection or zero-shot object detection
<br/>a) <a href="https://huggingface.co/spaces/CVPR/regionclip-demo">RegionCLIP</a>
<br/>b) <a href="https://colab.research.google.com/drive/19LBqQg0cS36rTLL_TaXZ7Ka9KJGkxiSe?usp=sharing">Colab notebook for Object-Centric-OVD</a>
</div>
<br/><div style=\"font-size:16px; color: #4f4f4f; text-align: left\"><b>Note: Inference time depends on input image size. Typically, images with dimensions under 500px have response times under 5 seconds on CPU.</b><br/><i>While most of the examples showcase model capabilities, some illustrate the model's limitations, such as finding a globe, bird cage, or teapot. The model also appears to have text region detection and limited text recognition capabilities.</i></div>
<div style=\"font-size:14px; color: #6f6f6f; text-align: left\"><i>Images below are from&nbsp;&nbsp;<a href="https://en.wikipedia.org/wiki/Hidden_object_game">Wikipedia</a>,&nbsp;<a href="http://images.cocodataset.org/val2017/000000133819.jpg">COCO</a> and <a href="http://host.robots.ox.ac.uk/pascal/VOC/voc2012/">PASCAL VOC 2012</a>&nbsp;datasets </i></div>
"""
demo = gr.Interface(
    query_image,
    inputs=[gr.Image(), "text", gr.Slider(1, 10, value=1, step=1, label="Max results")],
    outputs=["image", "text"],
    title="Where is Waldo? <i>(implemented with OWL-ViT)</i>",
    description=description,
    examples=[
        ["assets/Hidden_object_game_scaled.png", "bicycle", 1], 
        ["assets/Hidden_object_game_scaled.png", "laptop", 1], 
        ["assets/Hidden_object_game_scaled.png", "abacus", 1], 
        ["assets/Hidden_object_game_scaled.png", "frog", 1], 
        ["assets/Hidden_object_game_scaled.png", "bird cage", 2], 
        ["assets/Hidden_object_game_scaled.png", "globe", 2], 
        ["assets/Hidden_object_game_scaled.png", "teapot", 3], 
        ["assets/bus_ovd.jpg", "license plate", 1], 
        ["assets/bus_ovd.jpg", "sign saying ARRIVA", 1], 
        ["assets/bus_ovd.jpg", "sign saying ARRIVAL", 1], 
        ["assets/bus_ovd.jpg", "crossing push button", 1], 
        ["assets/bus_ovd.jpg", "building on moutain", 2], 
        ["assets/bus_ovd.jpg", "road marking", 3], 
        ["assets/bus_ovd.jpg", "mirror", 1], 
        ["assets/bus_ovd.jpg", "traffic camera", 1], 
        ["assets/bus_ovd.jpg", "red bus,blue bus", 2], 
        ["assets/calf.png", "snout,tail", 1], 
        ["assets/calf.png", "hoof", 4], 
        ["assets/calf.png", "ear", 2], 
        ["assets/calf.png", "tag", 1], 
        ["assets/calf.png", "hay", 1], 
        ["assets/calf.png", "barbed wire", 1], 
        ["assets/calf.png", "grass", 1], 
        ["assets/calf.png", "can", 2], 
        ["assets/road_signs.png", "STOP", 1], 
        ["assets/road_signs.png", "STOP sign", 1], 
        ["assets/road_signs.png", "arrow", 1], 
        ["assets/road_signs.png", "ROAD", 1], 
        ["assets/road_signs.png", "triangle", 1], 
    ],
)
# Server host/port belong to launch(), not the Interface constructor.
demo.launch(server_name="0.0.0.0", server_port=80)