Spaces:

reab5555
/

Owlv2-Video-Object-Detection

Paused

App Files Files Community

reab5555 committed on Jul 24, 2024

Commit

c9f1714

verified ·

1 Parent(s): 5dde850

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -37

app.py CHANGED Viewed

@@ -11,10 +11,19 @@ import tempfile
 import shutil
 # Check if CUDA is available, otherwise use CPU
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
 processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16")
-model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16").to(device)
 def process_video(video_path, target, progress=gr.Progress()):
     if video_path is None:
@@ -37,6 +46,19 @@ def process_video(video_path, target, progress=gr.Progress()):
     temp_dir = tempfile.mkdtemp()
     frame_paths = []
     for i, time in enumerate(progress.tqdm(np.arange(0, video_duration, frame_duration))):
         frame_number = int(time * original_fps)
         cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
@@ -48,41 +70,52 @@ def process_video(video_path, target, progress=gr.Progress()):
         img_resized = cv2.resize(img, (640, 360))
         pil_img = Image.fromarray(cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB))
-        # Process single image
-        inputs = processor(text=[target], images=pil_img, return_tensors="pt", padding=True).to(device)
-        outputs = model(**inputs)
-        target_sizes = torch.Tensor([pil_img.size[::-1]])
-        results = processor.post_process_object_detection(outputs, target_sizes=target_sizes)
-        draw = ImageDraw.Draw(pil_img)
-        max_score = 0
-        try:
-            font = ImageFont.truetype("arial.ttf", 20)
-        except IOError:
-            font = ImageFont.load_default()
-        boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
-        for box, score, label in zip(boxes, scores, labels):
-            if score.item() >= 0.5:
-                box = [round(i, 2) for i in box.tolist()]
-                object_label = target
-                confidence = round(score.item(), 3)
-                annotation = f"{object_label}: {confidence}"
-                draw.rectangle(box, outline="red", width=2)
-                text_position = (box[0], box[1] - 20)
-                draw.text(text_position, annotation, fill="white", font=font)
-                max_score = max(max_score, confidence)
-        # Save frame to disk
-        frame_path = os.path.join(temp_dir, f"frame_{i:04d}.png")
-        pil_img.save(frame_path)
-        frame_paths.append(frame_path)
-        frame_scores.append(max_score)
         # Clear GPU cache every 10 frames
         if i % 10 == 0:

 import shutil
 # Check if CUDA is available, otherwise use CPU
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16")
+model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16")
+# Try to move model to GPU and use half precision
+try:
+    model = model.to(device).half()
+except RuntimeError:
+    print("GPU out of memory, using CPU instead")
+    device = torch.device("cpu")
+    model = model.to(device)
 def process_video(video_path, target, progress=gr.Progress()):
     if video_path is None:
     temp_dir = tempfile.mkdtemp()
     frame_paths = []
+    # Try to use GPU with half precision, fall back to CPU if out of memory
+    try:
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        model.to(device).half()  # Convert model to half precision
+    except RuntimeError:
+        print("GPU out of memory, falling back to CPU")
+        device = torch.device("cpu")
+        model.to(device)
+    batch_size = 4  # Process 4 frames at a time
+    batch_frames = []
+    batch_indices = []
     for i, time in enumerate(progress.tqdm(np.arange(0, video_duration, frame_duration))):
         frame_number = int(time * original_fps)
         cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
         img_resized = cv2.resize(img, (640, 360))
         pil_img = Image.fromarray(cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB))
+        batch_frames.append(pil_img)
+        batch_indices.append(i)
+        if len(batch_frames) == batch_size or i == int(video_duration / frame_duration) - 1:
+            # Process batch
+            inputs = processor(text=[target] * len(batch_frames), images=batch_frames, return_tensors="pt", padding=True).to(device)
+            with torch.no_grad():
+                outputs = model(**inputs)
+            target_sizes = torch.Tensor([pil_img.size[::-1] for _ in batch_frames]).to(device)
+            results = processor.post_process_object_detection(outputs, target_sizes=target_sizes)
+            for idx, (pil_img, result) in enumerate(zip(batch_frames, results)):
+                draw = ImageDraw.Draw(pil_img)
+                max_score = 0
+                try:
+                    font = ImageFont.truetype("arial.ttf", 20)
+                except IOError:
+                    font = ImageFont.load_default()
+                boxes, scores, labels = result["boxes"], result["scores"], result["labels"]
+                for box, score, label in zip(boxes, scores, labels):
+                    if score.item() >= 0.5:
+                        box = [round(i, 2) for i in box.tolist()]
+                        object_label = target
+                        confidence = round(score.item(), 3)
+                        annotation = f"{object_label}: {confidence}"
+                        draw.rectangle(box, outline="red", width=2)
+                        text_position = (box[0], box[1] - 20)
+                        draw.text(text_position, annotation, fill="white", font=font)
+                        max_score = max(max_score, confidence)
+                # Save frame to disk
+                frame_path = os.path.join(temp_dir, f"frame_{batch_indices[idx]:04d}.png")
+                pil_img.save(frame_path)
+                frame_paths.append(frame_path)
+                frame_scores.append(max_score)
+            # Clear batch
+            batch_frames = []
+            batch_indices = []
         # Clear GPU cache every 10 frames
         if i % 10 == 0: