Upload 3 files
- app.py +71 -0
- main.py +164 -0
- requirements.txt +6 -0
app.py
ADDED
@@ -0,0 +1,71 @@
import gradio as gr
from PIL import Image
import base64
import io

from main import segmenter  # Import the shared segmenter instance


def process_image(image: Image.Image, objects_text: str) -> dict:
    """Process an image and return results as a JSON-serializable dict."""
    try:
        # Parse the dot-separated object list, e.g. "cat. dog. chair"
        objects = [obj.strip() for obj in objects_text.split('.') if obj.strip()]

        # Use the segmenter to process the image
        results = segmenter.segment_objects(image, objects)

        # TODO: draw masks/boxes on the image; for now, return the original image
        buffered = io.BytesIO()
        image.save(buffered, format="PNG")
        img_str = base64.b64encode(buffered.getvalue()).decode()

        # Format results for the response
        return {
            "success": True,
            "message": f"Processed image with objects: {objects}",
            "image": img_str,
            "results": [
                {
                    "label": r.label,
                    "confidence": float(r.confidence),
                    "bounding_box": r.bounding_box,
                }
                for r in results
            ],
        }
    except Exception as e:
        return {
            "success": False,
            "message": str(e),
            "image": None,
            "results": [],
        }


# Create Gradio interface with API mode enabled
demo = gr.Interface(
    fn=process_image,
    inputs=[
        gr.Image(type="pil", label="Input Image"),
        gr.Textbox(label="Objects (separate with dots)", placeholder="cat. dog. chair"),
    ],
    outputs=gr.JSON(label="API Response"),
    title="Zero Shot Segmentation",
    description="Upload an image and specify objects to detect.",
    allow_flagging="never",
    examples=[
        ["path/to/example.jpg", "cat. dog"]  # placeholder path; point at a real image in the repo
    ],
)

# Enable request queuing so the API endpoint can handle concurrent calls
demo.queue()

if __name__ == "__main__":
    demo.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_api=True,
    )
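Once the Space is up, the JSON endpoint can also be called from another process. Below is a minimal sketch using gradio_client; the localhost URL, the default "/predict" endpoint name, and test.jpg are assumptions, not part of the committed code.

from gradio_client import Client, handle_file

# Assumed: the app is running locally on the default port, with the
# default gr.Interface endpoint name; "test.jpg" is a placeholder path.
client = Client("http://localhost:7860")
response = client.predict(
    handle_file("test.jpg"),  # Input Image
    "cat. dog",               # Objects (separate with dots)
    api_name="/predict",
)
print(response["message"])
for r in response["results"]:
    print(r["label"], r["confidence"], r["bounding_box"])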
main.py
ADDED
@@ -0,0 +1,164 @@
import torch
import numpy as np
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
from ultralytics import YOLO
from typing import List, Union, Optional
from dataclasses import dataclass


@dataclass
class SegmentationResult:
    """Data class to store segmentation results"""
    label: str
    confidence: float
    mask: np.ndarray
    bounding_box: List[int]


class ObjectSegmenter:
    """A class for zero-shot object detection and segmentation"""

    def __init__(self, device: Optional[str] = None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        torch.cuda.empty_cache()
        self._init_models()

    def _init_models(self):
        """Initialize the Grounding DINO and YOLO models"""
        # Grounding DINO setup (zero-shot, text-prompted detection)
        self.dino_processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny")
        self.dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(
            "IDEA-Research/grounding-dino-tiny"
        ).to(self.device).eval()

        # YOLO setup (instance segmentation masks)
        self.yolo_model = YOLO('yolov8n-seg.pt')

    def segment_objects(
        self,
        image: Union[Image.Image, np.ndarray, str],
        objects: Union[str, List[str]],
        box_threshold: float = 0.4,
        text_threshold: float = 0.3,
    ) -> List[SegmentationResult]:
        """Segment the specified objects in the image"""
        # Prepare image
        if isinstance(image, str):
            image = Image.open(image)
        elif isinstance(image, np.ndarray):
            image = Image.fromarray(image)

        if image.mode != 'RGB':
            image = image.convert('RGB')

        # Prepare text prompt: Grounding DINO expects a dot-separated
        # phrase list ending with a dot, e.g. "cat. dog."
        if isinstance(objects, list):
            text_prompt = ". ".join(objects)
        else:
            text_prompt = objects
        if not text_prompt.endswith('.'):
            text_prompt += '.'

        # Get DINO detections
        dino_results = self._get_dino_detections(
            image, text_prompt, box_threshold, text_threshold
        )

        # Get YOLO segmentation; retina_masks=True returns masks at the
        # original image resolution so they line up with the DINO boxes
        yolo_results = self.yolo_model(image, verbose=False, retina_masks=True)[0]

        # Match detections with segmentations
        return self._process_results(dino_results, yolo_results)

    @torch.no_grad()
    def _get_dino_detections(
        self,
        image: Image.Image,
        text_prompt: str,
        box_threshold: float,
        text_threshold: float,
    ) -> dict:
        """Get object detections from Grounding DINO"""
        inputs = self.dino_processor(
            images=image,
            text=text_prompt,
            return_tensors="pt"
        ).to(self.device)

        outputs = self.dino_model(**inputs)
        results = self.dino_processor.post_process_grounded_object_detection(
            outputs,
            inputs.input_ids,
            box_threshold=box_threshold,
            text_threshold=text_threshold,
            target_sizes=[image.size[::-1]]  # PIL size is (w, h); targets are (h, w)
        )[0]

        return results

    def _process_results(
        self,
        dino_results: dict,
        yolo_results,
    ) -> List[SegmentationResult]:
        """Match detections with segmentations and create result objects"""
        segmentation_results = []

        for box, score, label in zip(
            dino_results["boxes"],
            dino_results["scores"],
            dino_results["labels"],
        ):
            box = [int(x) for x in box.tolist()]

            # Find the best matching YOLO mask
            best_mask = self._find_best_mask(box, yolo_results)

            if best_mask is not None:
                result = SegmentationResult(
                    label=label,
                    confidence=float(score),
                    mask=best_mask,
                    bounding_box=box,
                )
                segmentation_results.append(result)

        return segmentation_results

    def _find_best_mask(self, box: List[int], yolo_results) -> Optional[np.ndarray]:
        """Find the best matching YOLO mask for a given bounding box"""
        # masks is None when YOLO detects no instances
        if yolo_results.masks is None or len(yolo_results.masks) == 0:
            return None

        best_iou = 0.0
        best_mask = None

        for mask in yolo_results.masks.data:
            mask_np = mask.cpu().numpy()
            y_indices, x_indices = np.where(mask_np > 0)
            if len(y_indices) == 0:
                continue

            # Tight bounding box around the mask pixels
            mask_box = [
                int(x_indices.min()),
                int(y_indices.min()),
                int(x_indices.max()),
                int(y_indices.max()),
            ]

            iou = self._calculate_iou(box, mask_box)
            if iou > best_iou:
                best_iou = iou
                best_mask = mask_np

        return best_mask

    @staticmethod
    def _calculate_iou(box1: List[int], box2: List[int]) -> float:
        """Calculate Intersection over Union between two (x1, y1, x2, y2) boxes"""
        intersection = max(0, min(box1[2], box2[2]) - max(box1[0], box2[0])) * \
                       max(0, min(box1[3], box2[3]) - max(box1[1], box2[1]))
        box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
        box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
        union = box1_area + box2_area - intersection
        return intersection / union if union > 0 else 0.0


# Initialize a shared segmenter instance (imported by app.py)
segmenter = ObjectSegmenter()
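main.py can also be driven directly, without the Gradio front end. A minimal usage sketch (photo.jpg is a placeholder path):

from main import segmenter

# "photo.jpg" is a placeholder; segment_objects accepts a path,
# a PIL image, or a numpy array.
results = segmenter.segment_objects("photo.jpg", ["cat", "dog"])
for r in results:
    print(f"{r.label}: {r.confidence:.2f} at {r.bounding_box}")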
requirements.txt
ADDED
@@ -0,0 +1,6 @@
gradio==4.44.1
Pillow
numpy
torch
transformers
ultralytics
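To reproduce the environment outside of Spaces, a standard pip workflow should suffice:

pip install -r requirements.txt
python app.py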