SAM2-Video-Predictor

Running on T4

App Files Files Community

fffiloni committed on Aug 2, 2024

Commit

0479145

verified ·

1 Parent(s): 2ed6668

Update app.py

Browse files

Files changed (1) hide show

app.py +150 -49

app.py CHANGED Viewed

@@ -7,12 +7,53 @@ import numpy as np
 import cv2
 import matplotlib.pyplot as plt
 from PIL import Image, ImageFilter
-from sam2.build_sam import build_sam2
-from sam2.sam2_image_predictor import SAM2ImagePredictor
 def preprocess_image(image):
     return image, gr.State([]), gr.State([]), image
 def get_point(point_type, tracking_points, trackings_input_label, first_frame_path, evt: gr.SelectData):
     print(f"You selected {evt.value} at {evt.index} from {evt.target}")
@@ -56,27 +97,23 @@ if torch.cuda.get_device_properties(0).major >= 8:
     torch.backends.cuda.matmul.allow_tf32 = True
     torch.backends.cudnn.allow_tf32 = True
-def show_mask(mask, ax, random_color=False, borders = True):
     if random_color:
         color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
     else:
-        color = np.array([30/255, 144/255, 255/255, 0.6])
     h, w = mask.shape[-2:]
-    mask = mask.astype(np.uint8)
-    mask_image =  mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
-    if borders:
-        import cv2
-        contours, _ = cv2.findContours(mask,cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
-        # Try to smooth contours
-        contours = [cv2.approxPolyDP(contour, epsilon=0.01, closed=True) for contour in contours]
-        mask_image = cv2.drawContours(mask_image, contours, -1, (1, 1, 1, 0.5), thickness=2)
     ax.imshow(mask_image)
-def show_points(coords, labels, ax, marker_size=375):
     pos_points = coords[labels==1]
     neg_points = coords[labels==0]
     ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
-    ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
 def show_box(box, ax):
     x0, y0 = box[0], box[1]
@@ -130,10 +167,12 @@ def show_masks(image, masks, scores, point_coords=None, box_coords=None, input_l
     return combined_images, mask_images
-def sam_process(input_image, checkpoint, tracking_points, trackings_input_label):
-    image = Image.open(input_image)
-    image = np.array(image.convert("RGB"))
     if checkpoint == "tiny":
         sam2_checkpoint = "./checkpoints/sam2_hiera_tiny.pt"
         model_cfg = "sam2_hiera_t.yaml"
@@ -147,56 +186,118 @@ def sam_process(input_image, checkpoint, tracking_points, trackings_input_label)
         sam2_checkpoint = "./checkpoints/sam2_hiera_large.pt"
         model_cfg = "sam2_hiera_l.yaml"
-    sam2_model = build_sam2(model_cfg, sam2_checkpoint, device="cuda")
-    predictor = SAM2ImagePredictor(sam2_model)
-    predictor.set_image(image)
-    input_point = np.array(tracking_points.value)
-    input_label = np.array(trackings_input_label.value)
-    print(predictor._features["image_embed"].shape, predictor._features["image_embed"][-1].shape)
-    masks, scores, logits = predictor.predict(
-        point_coords=input_point,
-        point_labels=input_label,
-        multimask_output=False,
-    )
-    sorted_ind = np.argsort(scores)[::-1]
-    masks = masks[sorted_ind]
-    scores = scores[sorted_ind]
-    logits = logits[sorted_ind]
-    print(masks.shape)
-    results, mask_results = show_masks(image, masks, scores, point_coords=input_point, input_labels=input_label, borders=True)
-    print(results)
-    return results[0], mask_results[0]
 with gr.Blocks() as demo:
     first_frame_path = gr.State()
     tracking_points = gr.State([])
     trackings_input_label = gr.State([])
     with gr.Column():
-        gr.Markdown("# SAM2 Image Predictor")
-        gr.Markdown("This is a simple demo for image segmentation with SAM2.")
         gr.Markdown("""Instructions:
-        1. Upload your image
-        2. With 'include' point type selected, Click on the object to mask
         3. Switch to 'exclude' point type if you want to specify an area to avoid
         4. Submit !
         """)
         with gr.Row():
             with gr.Column():
-                input_image = gr.Image(label="input image", interactive=False, type="filepath", visible=False)
                 points_map = gr.Image(
                     label="points map",
                     type="filepath",
                     interactive=True
                 )
                 with gr.Row():
                     point_type = gr.Radio(label="point type", choices=["include", "exclude"], value="include")
                     clear_points_btn = gr.Button("Clear Points")
@@ -204,19 +305,19 @@ with gr.Blocks() as demo:
                 submit_btn = gr.Button("Submit")
             with gr.Column():
                 output_result = gr.Image()
-                output_result_mask = gr.Image()
     clear_points_btn.click(
         fn = preprocess_image,
-        inputs = input_image,
         outputs = [first_frame_path, tracking_points, trackings_input_label, points_map],
         queue=False
     )
-    points_map.upload(
-        fn = preprocess_image,
-        inputs = [points_map],
-        outputs = [first_frame_path, tracking_points, trackings_input_label, input_image],
         queue = False
     )
@@ -229,8 +330,8 @@ with gr.Blocks() as demo:
     submit_btn.click(
         fn = sam_process,
-        inputs = [input_image, checkpoint, tracking_points, trackings_input_label],
-        outputs = [output_result, output_result_mask]
     )
 demo.launch(show_api=False, show_error=True)

 import cv2
 import matplotlib.pyplot as plt
 from PIL import Image, ImageFilter
+from sam2.build_sam import build_sam2_video_predictor
 def preprocess_image(image):
+    """Echo `image` back together with two fresh, empty `gr.State` lists.
+
+    Returns the 4-tuple `(image, gr.State([]), gr.State([]), image)`. The
+    "Clear Points" button routes these values to the first-frame path, the
+    tracked points, their labels and the points map.
+    """
+    empty_points = gr.State([])
+    empty_labels = gr.State([])
+    return image, empty_points, empty_labels, image
+def preprocess_video_in(video_path):
+    """Split a video into numbered JPEG frames and return the first frame's path.
+
+    Frames are written to a new `frames_<timestamp>` directory as
+    `00000.jpg`, `00001.jpg`, ... in decoding order.
+
+    Args:
+        video_path: path of the video file to decode with OpenCV.
+
+    Returns:
+        The 5-tuple `(first_frame, gr.State([]), gr.State([]), first_frame,
+        first_frame)`, where `first_frame` is the path of frame 0. The arity
+        matches the five outputs of the `video_in.upload` handler.
+        `first_frame` is None when the video cannot be opened or contains
+        no frame.
+    """
+    # Function-scope imports: the visible module-level imports do not include
+    # these two names, and importing them here is harmless if they are.
+    import os
+    from datetime import datetime
+
+    # Generate an ID based on the current date and time (one-second resolution)
+    unique_id = datetime.now().strftime('%Y%m%d%H%M%S')
+    output_dir = f'frames_{unique_id}'
+
+    first_frame = None
+    cap = cv2.VideoCapture(video_path)
+    try:
+        if not cap.isOpened():
+            # Keep the best-effort behaviour (log, do not raise); the common
+            # return below still yields one value per wired output.
+            print("Error: Could not open video.")
+        else:
+            # Only create the directory once we know there is something to write
+            os.makedirs(output_dir, exist_ok=True)
+            frame_number = 0
+            while True:
+                ret, frame = cap.read()
+                if not ret:
+                    break
+                # Format the frame filename as '00000.jpg'
+                frame_filename = os.path.join(output_dir, f'{frame_number:05d}.jpg')
+                cv2.imwrite(frame_filename, frame)
+                # Remember the first frame: it is what the UI displays
+                if frame_number == 0:
+                    first_frame = frame_filename
+                frame_number += 1
+    finally:
+        # Release the capture on every path, including errors while decoding
+        cap.release()
+
+    return first_frame, gr.State([]), gr.State([]), first_frame, first_frame
 def get_point(point_type, tracking_points, trackings_input_label, first_frame_path, evt: gr.SelectData):
     print(f"You selected {evt.value} at {evt.index} from {evt.target}")
     torch.backends.cuda.matmul.allow_tf32 = True
     torch.backends.cudnn.allow_tf32 = True
+def show_mask(mask, ax, obj_id=None, random_color=False):
+    """Draw `mask` on `ax` as a tinted RGBA overlay with alpha 0.6.
+
+    With `random_color` the RGB part is drawn from `np.random.random`;
+    otherwise it comes from matplotlib's "tab10" colormap, indexed by
+    `obj_id` (index 0 when `obj_id` is None).
+    """
+    if not random_color:
+        palette = plt.get_cmap("tab10")
+        palette_idx = obj_id if obj_id is not None else 0
+        rgba = np.array([*palette(palette_idx)[:3], 0.6])
+    else:
+        rgba = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
+    # The last two mask dimensions are taken as (height, width)
+    height, width = mask.shape[-2:]
+    overlay = mask.reshape(height, width, 1) * rgba.reshape(1, 1, -1)
+    ax.imshow(overlay)
+def show_points(coords, labels, ax, marker_size=200):
+    """Scatter the prompt points on `ax` as white-edged stars.
+
+    Points whose label equals 1 are drawn in green, points whose label equals 0
+    in red. Column 0 of `coords` is used as x and column 1 as y.
+    """
+    # Select both groups up front, then draw positives before negatives
+    groups = [(coords[labels==1], 'green'), (coords[labels==0], 'red')]
+    for selected, marker_color in groups:
+        ax.scatter(selected[:, 0], selected[:, 1], color=marker_color, marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
 def show_box(box, ax):
     x0, y0 = box[0], box[1]
     return combined_images, mask_images
+def sam_process(input_first_frame_image, checkpoint, tracking_points, trackings_input_label):
+    """Segment the clicked object on frame 0 and save an annotated image of that frame.
+
+    Builds the SAM2 video predictor for the selected checkpoint, passes the
+    user's clicks to it as prompts on frame 0, draws the clicks and the
+    predicted mask over that frame with matplotlib and saves the figure.
+
+    Args:
+        input_first_frame_image: first-frame path supplied by the UI; this
+            function does not read it yet.
+        checkpoint: model size selector (e.g. "tiny").
+        tracking_points: state holding the clicked points; its `.value` is
+            converted to a float32 array.
+        trackings_input_label: state holding one label per point (1 = positive
+            click, 0 = negative click); its `.value` is converted to int32.
+
+    Returns:
+        The path of the saved figure, "output_frame.jpg".
+    """
+    # 1. We need to preprocess the video and store frames in the right directory
+    # — Remember to use a unique ID for the folder
+    # Load model accordingly to user's choice
     if checkpoint == "tiny":
         sam2_checkpoint = "./checkpoints/sam2_hiera_tiny.pt"
         model_cfg = "sam2_hiera_t.yaml"
         sam2_checkpoint = "./checkpoints/sam2_hiera_large.pt"
         model_cfg = "sam2_hiera_l.yaml"
+    predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint)
+    # `video_dir` a directory of JPEG frames with filenames like `<frame_index>.jpg`
+    # NOTE(review): hard-coded sample directory — the frames extracted from the
+    # uploaded video are not used here yet; confirm this is intended.
+    video_dir = "./videos/bedroom"
+    # scan all the JPEG frame names in this directory
+    frame_names = [
+        p for p in os.listdir(video_dir)
+        if os.path.splitext(p)[-1] in [".jpg", ".jpeg", ".JPG", ".JPEG"]
+    ]
+    # Sort numerically by frame index rather than lexicographically
+    frame_names.sort(key=lambda p: int(os.path.splitext(p)[0]))
+    inference_state = predictor.init_state(video_path=video_dir)
+    # segment and track one object
+    predictor.reset_state(inference_state) # if any previous tracking, reset
+    # Add new point
+    ann_frame_idx = 0  # the frame index we interact with
+    ann_obj_id = 1  # give a unique id to each object we interact with (it can be any integers)
+    # Use the points clicked by the user as prompts
+    points = np.array(tracking_points.value, dtype=np.float32)
+    # for labels, `1` means positive click and `0` means negative click
+    labels = np.array(trackings_input_label.value, np.int32)
+    _, out_obj_ids, out_mask_logits = predictor.add_new_points(
+        inference_state=inference_state,
+        frame_idx=ann_frame_idx,
+        obj_id=ann_obj_id,
+        points=points,
+        labels=labels,
+    )
+    # Create the plot
+    plt.figure(figsize=(12, 8))
+    plt.title(f"frame {ann_frame_idx}")
+    plt.imshow(Image.open(os.path.join(video_dir, frame_names[ann_frame_idx])))
+    show_points(points, labels, plt.gca())
+    # Logits above 0 are treated as foreground
+    show_mask((out_mask_logits[0] > 0.0).cpu().numpy(), plt.gca(), obj_id=out_obj_ids[0])
+    # Save the plot as a JPG file
+    output_filename = "output_frame.jpg"
+    plt.savefig(output_filename, format='jpg')
+    plt.close()
+    # Propagation through the rest of the video is disabled for now: the block
+    # below is a bare string literal, so none of it runs.
+    """
+    #### PROPAGATION ####
+    # Define a directory to save the JPEG images
+    frames_output_dir = "frames_output_images"
+    os.makedirs(frames_output_dir, exist_ok=True)
+    # Initialize a list to store file paths of saved images
+    jpeg_images = []
+    # run propagation throughout the video and collect the results in a dict
+    video_segments = {}  # video_segments contains the per-frame segmentation results
+    for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state):
+        video_segments[out_frame_idx] = {
+            out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
+            for i, out_obj_id in enumerate(out_obj_ids)
+        }
+    # render the segmentation results every few frames
+    vis_frame_stride = 15
+    plt.close("all")
+    for out_frame_idx in range(0, len(frame_names), vis_frame_stride):
+        plt.figure(figsize=(6, 4))
+        plt.title(f"frame {out_frame_idx}")
+        plt.imshow(Image.open(os.path.join(video_dir, frame_names[out_frame_idx])))
+        for out_obj_id, out_mask in video_segments[out_frame_idx].items():
+            show_mask(out_mask, plt.gca(), obj_id=out_obj_id)
+        # Define the output filename and save the figure as a JPEG file
+        output_filename = os.path.join(frames_output_dir, f"frame_{out_frame_idx}.jpg")
+        plt.savefig(output_filename, format='jpg')
+        # Append the file path to the list
+        jpeg_images.append(output_filename)
+        # Close the plot
+        plt.close()
+    """
+    # Return the path of the annotated first frame
+    return output_filename
 with gr.Blocks() as demo:
+    # Per-session state: first-frame path plus the clicked points and their labels
     first_frame_path = gr.State()
     tracking_points = gr.State([])
     trackings_input_label = gr.State([])
     with gr.Column():
+        gr.Markdown("# SAM2 Video Predictor")
+        gr.Markdown("This is a simple demo for video segmentation with SAM2.")
         gr.Markdown("""Instructions:
+        1. Upload your video
+        2. With 'include' point type selected, Click on the object to mask on first frame
         3. Switch to 'exclude' point type if you want to specify an area to avoid
         4. Submit !
         """)
         with gr.Row():
             with gr.Column():
+                # Hidden component that keeps the first-frame path
+                input_first_frame_image = gr.Image(label="input image", interactive=False, type="filepath", visible=False)
                 points_map = gr.Image(
                     label="points map",
                     type="filepath",
                     interactive=True
                 )
+                video_in = gr.Video(label="Video IN")
                 with gr.Row():
                     point_type = gr.Radio(label="point type", choices=["include", "exclude"], value="include")
                     clear_points_btn = gr.Button("Clear Points")
                 submit_btn = gr.Button("Submit")
             with gr.Column():
                 output_result = gr.Image()
+                # output_result_mask = gr.Image()
+    # "Clear Points" resets the point/label states and reloads the first frame into the points map
     clear_points_btn.click(
         fn = preprocess_image,
+        inputs = input_first_frame_image,
         outputs = [first_frame_path, tracking_points, trackings_input_label, points_map],
         queue=False
     )
+    # On upload, extract the frames; the first frame goes to both the hidden
+    # image and the points map. The last output must be `points_map`, the
+    # component defined above.
+    video_in.upload(
+        fn = preprocess_video_in,
+        inputs = [video_in],
+        outputs = [first_frame_path, tracking_points, trackings_input_label, input_first_frame_image, points_map],
         queue = False
     )
     submit_btn.click(
         fn = sam_process,
+        inputs = [input_first_frame_image, checkpoint, tracking_points, trackings_input_label],
+        outputs = [output_result]
     )
 demo.launch(show_api=False, show_error=True)