OWL-ViT

Runtime error

App Files Files Community

kellyxiaowei committed on Jun 26, 2023

Commit

1bcb7e6

1 Parent(s): f1725e1

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -29

app.py CHANGED Viewed

@@ -1,77 +1,79 @@
-import torch
 import cv2
 import gradio as gr
 import numpy as np
 from transformers import OwlViTProcessor, OwlViTForObjectDetection
-# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# Load the OWL-ViT detector from the "google/owlvit-large-patch14" checkpoint,
-# move it to the chosen device and switch it to evaluation mode.
-model = OwlViTForObjectDetection.from_pretrained("google/owlvit-large-patch14")
-model = model.to(device)
 model.eval()
-# The processor is loaded from the same checkpoint as the model.
 processor = OwlViTProcessor.from_pretrained("google/owlvit-large-patch14")
-def query_image(img, text_queries, score_threshold):
-    """Run OWL-ViT on ``img`` and draw each detection scoring at least ``score_threshold``.
-
-    Args:
-        img: Input image array; its first two dimensions are used as the
-            target size when the predicted boxes are rescaled.
-        text_queries: Comma-separated text descriptions of the objects to find.
-        score_threshold: Minimum score a detection needs in order to be drawn.
-
-    Returns:
-        The image with a rectangle and the matching query text drawn for every
-        kept detection.
-
-    Raises:
-        ValueError: If ``text_queries`` contains no non-blank query.
-    """
-    # Split into individual queries, dropping blank entries and the spaces
-    # people naturally type after each comma.
-    text_queries = [query.strip() for query in text_queries.split(",") if query.strip()]
-    if not text_queries:
-        raise ValueError("At least one text query is required")
     target_sizes = torch.Tensor([img.shape[:2]])
-    inputs = processor(text=text_queries, images=img, return_tensors="pt").to(device)
     with torch.no_grad():
-        outputs = model(**inputs)
-    # target_sizes is a CPU tensor, so bring the raw predictions back to the
-    # CPU before post-processing combines them.
     outputs.logits = outputs.logits.cpu()
-    outputs.pred_boxes = outputs.pred_boxes.cpu()
     results = processor.post_process(outputs=outputs, target_sizes=target_sizes)
     boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
     font = cv2.FONT_HERSHEY_SIMPLEX
-    img_height = img.shape[0]
     for box, score, label in zip(boxes, scores, labels):
         box = [int(i) for i in box.tolist()]
         if score >= score_threshold:
             img = cv2.rectangle(img, box[:2], box[2:], (255,0,0), 5)
-            # Put the label just below the box, or just above its bottom edge
-            # when it would otherwise run past the bottom of this image (use
-            # the real height rather than assuming a 768-pixel canvas).
-            y = box[3] - 10 if box[3] + 25 > img_height else box[3] + 25
             img = cv2.putText(
                 img, text_queries[label], (box[0], y), font, 1, (255,0,0), 2, cv2.LINE_AA
             )
     return img
 description = """
-Gradio demo for <a href="https://huggingface.co/docs/transformers/main/en/model_doc/owlvit">OWL-ViT</a>,
-introduced in <a href="https://arxiv.org/abs/2205.06230">Simple Open-Vocabulary Object Detection
-with Vision Transformers</a>.
-\n\nYou can use OWL-ViT to query images with text descriptions of any object.
-To use it, simply upload an image and enter comma separated text descriptions of objects you want to query the image for. You
-can also use the score threshold slider to set a threshold to filter out low probability predictions.
-\n\nOWL-ViT is trained on text templates,
-hence you can get better predictions by querying the image with text templates used in training the original model: *"photo of a star-spangled banner"*,
-*"image of a shoe"*. Refer to the <a href="https://arxiv.org/abs/2103.00020">CLIP</a> paper to see the full list of text templates used to augment the training data.
-\n\n<a href="https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb">Colab demo</a>
 """
-# Wire query_image into a Gradio interface: an image input, a text box for
-# the comma-separated queries and a 0-1 score-threshold slider defaulting to
-# 0.1; the annotated image is the only output.
 demo = gr.Interface(
-    fn=query_image,
-    inputs=[gr.Image(), "text", gr.Slider(0, 1, value=0.1)],
     outputs="image",
     title="Zero-Shot Object Detection with OWL-ViT",
     description=description,
     examples=[
-        ["assets/astronaut.png", "human face, rocket, star-spangled banner, nasa badge", 0.11],
-        ["assets/coffee.png", "coffee mug, spoon, plate", 0.1],
-        ["assets/butterflies.jpeg", "orange butterfly", 0.3],
     ],
 )
-# Start serving the app.
 demo.launch()

+import torch
 import cv2
 import gradio as gr
 import numpy as np
 from transformers import OwlViTProcessor, OwlViTForObjectDetection
+import requests
+# Use the GPU when one is available, otherwise fall back to the CPU.
 if torch.cuda.is_available():
     device = torch.device("cuda")
 else:
     device = torch.device("cpu")
+# Load the OWL-ViT model from the pretrained "google/owlvit-large-patch14"
+# checkpoint and place it on the selected device.
 model = OwlViTForObjectDetection.from_pretrained("google/owlvit-large-patch14").to(device)
 model.eval()
+# Load the processor from the same pretrained checkpoint.
 processor = OwlViTProcessor.from_pretrained("google/owlvit-large-patch14")
+# Handle one request: an image URL, the text queries and a score threshold.
+def query_image(img_url, text_queries, score_threshold):
+    """Download the image at ``img_url``, run OWL-ViT on it and draw the detections.
+
+    Args:
+        img_url: URL of the image to download.
+        text_queries: Comma-separated text descriptions of the objects to find.
+        score_threshold: Minimum score a detection needs in order to be drawn.
+
+    Returns:
+        The downloaded image as an RGB array with a rectangle and the matching
+        query text drawn for every kept detection.
+
+    Raises:
+        requests.RequestException: If the download fails, times out or the
+            server answers with an HTTP error status.
+        ValueError: If the response cannot be decoded as an image, or if
+            ``text_queries`` contains no non-blank query.
+    """
+    # NOTE(review): img_url is user-supplied and fetched server-side; consider
+    # restricting the allowed hosts before exposing this publicly.
+    # The timeout keeps a stalled server from blocking the request forever.
+    response = requests.get(img_url, timeout=10)
+    response.raise_for_status()
+    arr = np.frombuffer(response.content, dtype=np.uint8)
+    # Always decode to 3 channels: loading "unchanged" can produce grayscale
+    # or 4-channel arrays that the rest of the pipeline does not expect.
+    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
+    if img is None:
+        raise ValueError(f"Could not decode an image from {img_url!r}")
+    # OpenCV decodes to BGR, while the processor and the Gradio image output
+    # both work with RGB.
+    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+    # Split into individual queries, dropping blank entries and the spaces
+    # people naturally type after each comma.
+    text_queries = [query.strip() for query in text_queries.split(",") if query.strip()]
+    if not text_queries:
+        raise ValueError("At least one text query is required")
     target_sizes = torch.Tensor([img.shape[:2]])
+    # Build the model inputs with the processor.
+    inputs = processor(text=text_queries, images=img, return_tensors="pt").to(device)
     with torch.no_grad():
+        outputs = model(**inputs)
+    # Move the outputs to the CPU, where target_sizes lives.
     outputs.logits = outputs.logits.cpu()
+    outputs.pred_boxes = outputs.pred_boxes.cpu()
+    # Post-process the raw predictions with the processor.
     results = processor.post_process(outputs=outputs, target_sizes=target_sizes)
     boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
     font = cv2.FONT_HERSHEY_SIMPLEX
+    img_height = img.shape[0]
+    # Draw the bounding boxes and their labels on the image.
     for box, score, label in zip(boxes, scores, labels):
         box = [int(i) for i in box.tolist()]
         if score >= score_threshold:
             img = cv2.rectangle(img, box[:2], box[2:], (255,0,0), 5)
+            # Put the label just below the box, or just above its bottom edge
+            # when it would otherwise run past the bottom of this image (use
+            # the real height rather than assuming a 768-pixel canvas).
+            y = box[3] - 10 if box[3] + 25 > img_height else box[3] + 25
             img = cv2.putText(
                 img, text_queries[label], (box[0], y), font, 1, (255,0,0), 2, cv2.LINE_AA
             )
     return img
 description = """
+Gradio demo for OWL-ViT.
+You can use OWL-ViT to query images with text descriptions of any object.
+To use it, simply provide an image URL and enter comma separated text descriptions of objects you want to query the image for.
+You can also use the score threshold slider to set a threshold to filter out low probability predictions.
 """
+# Create the Gradio interface around query_image.
 demo = gr.Interface(
+    fn=query_image,
+    # The first text box takes the image URL (not an uploaded image), the
+    # second the comma-separated queries; the slider is the score threshold.
+    inputs=["text", "text", gr.Slider(0, 1, value=0.1)],
     outputs="image",
     title="Zero-Shot Object Detection with OWL-ViT",
     description=description,
     examples=[
+        # NOTE(review): these example.com URLs look like placeholders - confirm
+        # they point at real images before relying on the examples.
+        ["https://example.com/path/to/image.png", "human face, rocket, star-spangled banner, nasa badge", 0.11],
+        ["https://example.com/path/to/another/image.png", "coffee mug, spoon, plate", 0.1],
     ],
 )
+# Start serving the app.
 demo.launch()