Spaces:

omlab
/

VLM-R1-OVD

Running on Zero

App Files Files Community

qq-hzlh committed on Mar 21

Commit

03418d9

verified ·

1 Parent(s): 2fcecb0

Upload 3 files

Browse files

Files changed (3) hide show

README.md +3 -3
app.py +152 -0
requirements.txt +6 -0

README.md CHANGED Viewed

@@ -1,10 +1,10 @@
 ---
 title: VLM R1 OVD
 emoji: 👁
-colorFrom: blue
-colorTo: blue
 sdk: gradio
-sdk_version: 5.22.0
 app_file: app.py
 pinned: false
 license: mit

 ---
 title: VLM R1 OVD
 emoji: 👁
+colorFrom: yellow
+colorTo: purple
 sdk: gradio
+sdk_version: 5.0.1
 app_file: app.py
 pinned: false
 license: mit

app.py ADDED Viewed

	@@ -0,0 +1,152 @@

+import re
+import torch
+import json_repair
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from PIL import Image, ImageDraw
+def draw_bbox(image, annotation):
+    x1, y1, x2, y2 = annotation["bbox_2d"]
+    label = annotation["label"]
+    draw = ImageDraw.Draw(image)
+    # 绘制边界框
+    draw.rectangle((x1, y1, x2, y2), outline="red", width=5)
+    # 绘制标签文本
+    font_size = 20
+    text_position = (x1, y1 - font_size - 5) if y1 > font_size + 5 else (x1, y2 + 5)
+    try:
+        draw.text(text_position, label, fill="red", font_size = font_size)
+    except Exception as e:
+        print(f"文本绘制错误: {e}")
+        # 如果默认绘制失败，使用简单的方式绘制文本
+        draw.text(text_position, label, fill="red")
+    return image
+def draw_bboxes(image, annotations):
+    """绘制多个边界框和标签"""
+    result_image = image.copy()
+    for annotation in annotations:
+        result_image = draw_bbox(result_image, annotation)
+    return result_image
+def extract_bbox_answer(content):
+    # Extract content between <answer> and </answer> if present
+    answer_matches = re.findall(r'<answer>(.*?)</answer>', content, re.DOTALL)
+    if answer_matches:
+        # Use the last match
+        text = answer_matches[-1]
+    else:
+        text = content
+    # 使用json_repair修复JSON
+    try:
+        data = json_repair.loads(text)
+        if isinstance(data, list) and len(data) > 0:
+            return data
+        else:
+            return []
+    except Exception as e:
+        print(f"JSON解析错误: {e}")
+        return []
+import spaces
+@spaces.GPU
+def process_image_and_text(image, text):
+    """Process image and text input, return thinking process and bbox"""
+    question = f"Please carefully check the image and detect the following objects: [{text}]. "
+    question = question + "First thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>. Please carefully check the image and detect the following objects: [\"equestrian rider's helmet\"]. Output the bbox coordinates of detected objects in <answer></answer>. The bbox coordinates in Markdown format should be: \n```json\n[{\"bbox_2d\": [x1, y1, x2, y2], \"label\": \"object name\"}]\n```\n If no targets are detected in the image, simply respond with \"None\"."
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    inputs = processor(
+        text=[text],
+        images=image,
+        return_tensors="pt",
+        padding=True,
+        padding_side="left",
+        add_special_tokens=False,
+    )
+    inputs = inputs.to("cuda")
+    with torch.no_grad():
+        generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=1024, do_sample=False)
+        generated_ids_trimmed = [
+            out_ids[len(inputs.input_ids[0]):] for out_ids in generated_ids
+        ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True
+    )[0]
+    print("output_text: ", output_text)
+    # Extract thinking process
+    think_match = re.search(r'<think>(.*?)</think>', output_text, re.DOTALL)
+    thinking_process = think_match.group(1).strip() if think_match else "No thinking process found"
+    answer_match = re.search(r'<answer>(.*?)</answer>', output_text, re.DOTALL)
+    answer_output = answer_match.group(1).strip() if answer_match else "No answer extracted"
+    # Get bbox and draw
+    bbox = extract_bbox_answer(output_text)
+    # Draw bbox on the image
+    result_image = image.copy()
+    result_image = draw_bboxes(result_image, bbox)
+    return thinking_process, answer_output,result_image
+if __name__ == "__main__":
+    import gradio as gr
+    model_path = "omlab/VLM-R1-Qwen2.5VL-3B-Math-0305"
+    # device = "cuda" if torch.cuda.is_available() else "cpu"
+    device = "cuda"
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
+    model.to(device)
+    processor = AutoProcessor.from_pretrained(model_path)
+    def gradio_interface(image, text):
+        thinking, output,result_image = process_image_and_text(image, text)
+        return thinking, output, result_image
+    demo = gr.Interface(
+        fn=gradio_interface,
+        inputs=[
+            gr.Image(type="pil", label="Input Image"),
+            gr.Textbox(label="Description Text")
+        ],
+        outputs=[
+            gr.Textbox(label="Thinking Process"),
+            gr.Textbox(label="Response"),
+            gr.Image(type="pil", label="Result with Bbox")
+        ],
+        title="Open-Vocabulary Object Detection Demo",
+        description="Upload an image and input description text, the system will return the thinking process and region annotation. \n\nOur GitHub: [VLM-R1](https://github.com/om-ai-lab/VLM-R1/tree/main)",
+        examples=[
+            ["examples/image1.jpg", "person"],
+            ["examples/image2.jpg", "drink, fruit"],
+            ["examples/image3.png", "keyboard, white cup, laptop"],
+        ],
+        cache_examples=False,
+        examples_per_page=10
+    )
+    demo.launch(server_name="0.0.0.0", server_port=7861, share=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+torch>=2.0.0
+git+https://github.com/huggingface/transformers
+Pillow>=10.0.0
+httpx[socks]
+accelerate>=0.26.0
+json_repair>=0.1.0