EdgeTAM

Runtime error

App Files Files Community

bla committed on May 5

Commit

0b34400

verified ·

1 Parent(s): 5bc3a57

Update app.py

Browse files

Files changed (1) hide show

app.py +261 -506

app.py CHANGED Viewed

@@ -7,27 +7,24 @@
 import copy
 import os
 from datetime import datetime
-import gradio as gr
-# Removed GPU-specific environment variable setting
-# os.environ["TORCH_CUDNN_SDPA_ENABLED"] = "0,1,2,3,4,5,6,7"
 import tempfile
 import cv2
 import matplotlib.pyplot as plt
 import numpy as np
-# Removed spaces decorator import for CPU-only demo
-# import spaces # Removed spaces import
 import torch
 from moviepy.editor import ImageSequenceClip
 from PIL import Image
 from sam2.build_sam import build_sam2_video_predictor
 # Description
-title = "<center><strong><font size='8'>EdgeTAM<font></strong> <a href='https://github.com/facebookresearch/EdgeTAM'><font size='6'>[GitHub]</font></a> </center>"
 description_p = """# Instructions
                 <ol>
@@ -38,535 +35,314 @@ description_p = """# Instructions
                 </ol>
               """
-# examples - Keep examples, they are input files
 examples = [
     ["examples/01_dog.mp4"],
     ["examples/02_cups.mp4"],
     ["examples/03_blocks.mp4"],
     ["examples/04_coffee.mp4"],
     ["examples/05_default_juggle.mp4"],
-    ["examples/01_breakdancer.mp4"],
-    ["examples/02_hummingbird.mp4"],
-    ["examples/03_skateboarder.mp4"],
-    ["examples/04_octopus.mp4"],
-    ["examples/05_landing_dog_soccer.mp4"],
-    ["examples/06_pingpong.mp4"],
-    ["examples/07_snowboarder.mp4"],
-    ["examples/08_driving.mp4"],
-    ["examples/09_birdcartoon.mp4"],
-    ["examples/10_cloth_magic.mp4"],
-    ["examples/11_polevault.mp4"],
-    ["examples/12_hideandseek.mp4"],
-    ["examples/13_butterfly.mp4"],
-    ["examples/14_social_dog_training.mp4"],
-    ["examples/15_cricket.mp4"],
-    ["examples/16_robotarm.mp4"],
-    ["examples/17_childrendancing.mp4"],
-    ["examples/18_threedogs.mp4"],
-    ["examples/19_cyclist.mp4"],
-    ["examples/20_doughkneading.mp4"],
-    ["examples/21_biker.mp4"],
-    ["examples/22_dogskateboarder.mp4"],
-    ["examples/23_racecar.mp4"],
-    ["examples/24_clownfish.mp4"],
 ]
 OBJ_ID = 0
 sam2_checkpoint = "checkpoints/edgetam.pt"
 model_cfg = "edgetam.yaml"
-# Ensure predictor is explicitly built for CPU
 predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cpu")
-# Removed .to("cuda") - predictor is already on CPU from build_sam2_video_predictor
-# predictor.to("cuda")
 print("predictor loaded on CPU")
-# Removed CUDA specific autocast and backend settings
-# torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
-# if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8:
-#     torch.backends.cuda.matmul.allow_tf32 = True
-#     torch.backends.cudnn.allow_tf32 = True
-# elif not torch.cuda.is_available():
-#     print("Warning: CUDA not available. Running on CPU.")
 def get_video_fps(video_path):
-    """Gets the frames per second of a video file."""
-    if video_path is None or not os.path.exists(video_path):
-         print(f"Warning: Video file not found at {video_path}")
-         return None
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened():
-        print(f"Error: Could not open video file {video_path}.")
-        return None
     fps = cap.get(cv2.CAP_PROP_FPS)
-    cap.release() # Release the capture object
     return fps
 def reset(session_state):
-    """Resets the UI and session state."""
-    print("Resetting demo.")
     session_state["input_points"] = []
     session_state["input_labels"] = []
-    # Reset the predictor state if it exists
     if session_state["inference_state"] is not None:
-        try:
-            # Assuming predictor.reset_state handles clearing current masks/features
-            predictor.reset_state(session_state["inference_state"])
-             # Explicitly delete or re-init the state object if a full reset is intended
-             # This depends on how predictor.reset_state works. Setting to None is safest for a full reset.
-            session_state["inference_state"] = None
-        except Exception as e:
-             print(f"Error resetting predictor state: {e}")
-             session_state["inference_state"] = None # Force-clear on error
     session_state["first_frame"] = None
     session_state["all_frames"] = None
-    session_state["inference_state"] = None # Ensure state is None after a full reset
-    # Also reset video path if stored
-    session_state["video_path"] = None
-    # Resetting UI components and disabling buttons
     return (
-        None, # video_in (clears the video player)
-        gr.update(open=True), # video_in_drawer (opens accordion)
-        None, # points_map (clears the image)
-        None, # output_image (clears the image)
-        gr.update(value=None, visible=False), # output_video (hides and clears)
-        gr.update(interactive=False), # propagate_btn disabled
-        gr.update(interactive=False), # clear_points_btn disabled
-        gr.update(interactive=False), # reset_btn disabled
-        session_state, # return updated session state
     )
 def clear_points(session_state):
-    """Clears selected points and resets segmentation on the first frame."""
-    print("Clearing points.")
     session_state["input_points"] = []
     session_state["input_labels"] = []
-    # Reset the predictor state to clear internal masks/features
-    # This typically doesn't remove the video context, just the mask predictions
-    if session_state["inference_state"] is not None:
-        try:
-            # Assuming reset_state handles clearing current masks/features
-            predictor.reset_state(session_state["inference_state"])
-            print("Predictor state reset for clearing points.")
-            # If you need to re-initialize the state for the *same* video after clearing points,
-            # you might need to call predictor.init_state again here, using the stored video_path.
-            # Since we are on CPU, device="cpu" is implicit now.
-            if session_state["video_path"] is not None:
-                 session_state["inference_state"] = predictor.init_state(video_path=session_state["video_path"])
-                 print("Predictor state re-initialized after clearing points.")
-            else:
-                 print("Warning: Could not re-initialize state after clear_points (video_path missing).")
-                 session_state["inference_state"] = None # Ensure state is None if video_path is gone
-        except Exception as e:
-             print(f"Error resetting predictor state during clear_points: {e}")
-             # If reset fails, this might leave old masks. Force-clear state on error.
-             session_state["inference_state"] = None
-    # Return the original first frame image for points_map and clear the output_image
-    first_frame_img = session_state["first_frame"] if session_state["first_frame"] is not None else None
     return (
-        first_frame_img, # points_map shows original first frame (no points yet)
-        None, # output_image cleared (no mask)
-        gr.update(value=None, visible=False), # output_video hidden
-        session_state, # return updated session state
     )
-# Removed @spaces.GPU decorator
 def preprocess_video_in(video_path, session_state):
-    """Loads video frames and initializes the predictor state."""
-    print(f"Processing video: {video_path}")
-    if video_path is None or not os.path.exists(video_path):
-        print("No video path provided or file not found.")
-        # Reset state and UI elements if input is invalid
-        # Need to return updates for the buttons as well
         return (
-            gr.update(open=True), None, None, gr.update(value=None, visible=False),
-            gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False),
-             { # Reset session state
-                "first_frame": None, "all_frames": None, "input_points": [],
-                "input_labels": [], "inference_state": None, "video_path": None,
-            }
         )
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened():
-        print(f"Error: Could not open video file {video_path}.")
         return (
-            gr.update(open=True), None, None, gr.update(value=None, visible=False),
-            gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False),
-            { # Reset session state
-                "first_frame": None, "all_frames": None, "input_points": [],
-                "input_labels": [], "inference_state": None, "video_path": None,
-            }
         )
     first_frame = None
     all_frames = []
     while True:
         ret, frame = cap.read()
         if not ret:
             break
-        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        all_frames.append(frame)
-        if first_frame is None:
-            first_frame = frame
     cap.release()
-    if not all_frames:
-        print(f"Error: No frames read from video file {video_path}.")
-        return (
-            gr.update(open=True), None, None, gr.update(value=None, visible=False),
-            gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False),
-            { # Reset session state
-                "first_frame": None, "all_frames": None, "input_points": [],
-                "input_labels": [], "inference_state": None, "video_path": None,
-            }
-        )
-    # Update session state with frames and path
-    session_state["first_frame"] = copy.deepcopy(first_frame) # Store a copy
     session_state["all_frames"] = all_frames
-    session_state["video_path"] = video_path # Store video path
     session_state["input_points"] = []
     session_state["input_labels"] = []
-    # Initialize state WITHOUT the device argument (uses predictor's device, which is CPU)
-    session_state["inference_state"] = predictor.init_state(video_path=video_path)
-    print("Video loaded and predictor state initialized on CPU.")
-    # Enable buttons after successful load
     return [
         gr.update(open=False),  # video_in_drawer
-        first_frame,  # points_map (shows first frame)
-        None,  # output_image (cleared initially)
-        gr.update(value=None, visible=False),  # output_video (hidden initially)
-        gr.update(interactive=True), # propagate_btn enabled
-        gr.update(interactive=True), # clear_points_btn enabled
-        gr.update(interactive=True), # reset_btn enabled
-        session_state, # session_state
     ]
-# Removed @spaces.GPU decorator
 def segment_with_points(
     point_type,
     session_state,
     evt: gr.SelectData,
 ):
-    """Adds a point prompt and performs segmentation on the first frame."""
-    # Ensure we have state and first frame
-    if session_state["first_frame"] is None or session_state["inference_state"] is None:
-         print("Error: Cannot segment. No video loaded or inference state missing.")
-         # Return current states to avoid errors, without changing UI much
-         return (
-             session_state.get("first_frame"), # points_map (show first frame if exists)
-             None, # output_image (keep cleared)
-             session_state,
-         )
-    # evt.index is the (x, y) coordinate tuple
-    click_coords = evt.index
-    print(f"Clicked at: {click_coords} ({point_type})")
-    session_state["input_points"].append(click_coords)
     if point_type == "include":
         session_state["input_labels"].append(1)
     elif point_type == "exclude":
         session_state["input_labels"].append(0)
-    # Get the first frame as a PIL image for drawing
-    first_frame_pil = Image.fromarray(session_state["first_frame"]).convert("RGBA")
-    w, h = first_frame_pil.size
-    # Define the circle radius
-    fraction = 0.01
-    radius = max(2, int(fraction * min(w, h))) # Ensure minimum radius of 2
-    # Create a transparent layer to draw points
-    transparent_layer_points = np.zeros((h, w, 4), dtype=np.uint8)
-    # Draw points on the transparent layer
     for index, track in enumerate(session_state["input_points"]):
-        # Ensure coordinates are integers for cv2.circle
-        point_coords = (int(track[0]), int(track[1]))
-        # Ensure color is RGBA (0-255)
         if session_state["input_labels"][index] == 1:
-            cv2.circle(transparent_layer_points, point_coords, radius, (0, 255, 0, 255), -1) # Green for include
         else:
-            cv2.circle(transparent_layer_points, point_coords, radius, (255, 0, 0, 255), -1) # Red for exclude
-    # Convert the transparent layer back to an image and composite onto the first frame
-    transparent_layer_points_pil = Image.fromarray(transparent_layer_points, "RGBA")
-    # Combine the first frame image with the points layer for the points_map output
-    # points_map shows the first frame *with the points you added*.
-    selected_point_map_img = Image.alpha_composite(
-        first_frame_pil.copy(), transparent_layer_points_pil
     )
-    # Prepare points and labels as tensors on the correct device (CPU in this version)
     points = np.array(session_state["input_points"], dtype=np.float32)
     labels = np.array(session_state["input_labels"], np.int32)
-    # Ensure tensors are on the correct device (CPU)
-    device = next(predictor.parameters()).device # Get the device the model is on (should be "cpu")
-    points_tensor = torch.tensor(points, dtype=torch.float32, device=device).unsqueeze(0) # Add batch dim
-    labels_tensor = torch.tensor(labels, dtype=torch.int32, device=device).unsqueeze(0) # Add batch dim
-    first_frame_output_img = None # Initialize output mask image as None in case of error
-    try:
-        # Note: predictor.add_new_points modifies the internal inference_state
-        _, _, out_mask_logits = predictor.add_new_points(
-            inference_state=session_state["inference_state"],
-            frame_idx=0, # Always segment on the first frame initially
-            obj_id=OBJ_ID,
-            points=points_tensor,
-            labels=labels_tensor,
-        )
-        # Process logits: detach from graph, move to CPU, apply threshold
-        # out_mask_logits is a list of tensors [tensor([batch_size, H, W])] for the requested obj_id
-        # Access the result for the first object (index 0) and the first item in batch (index 0)
-        mask_tensor = (out_mask_logits[0][0].detach().cpu() > 0.0) # Move to CPU before converting to numpy
-        mask_numpy = mask_tensor.numpy() # Convert to numpy
-        # Get the mask image (RGBA)
-        mask_image_pil = show_mask(mask_numpy, obj_id=OBJ_ID) # show_mask returns RGBA PIL Image
-        # Composite the mask onto the first frame for the output_image
-        # output_image shows the first frame *with the segmentation mask result*.
-        first_frame_output_img = Image.alpha_composite(first_frame_pil.copy(), mask_image_pil)
-    except Exception as e:
-        print(f"Error during segmentation on first frame: {e}")
-        # On error, first_frame_output_img remains None
-    # Removed CUDA cache clearing call
-    # if torch.cuda.is_available():
-    #     torch.cuda.empty_cache()
-    return selected_point_map_img, first_frame_output_img, session_state
 def show_mask(mask, obj_id=None, random_color=False, convert_to_image=True):
-    """Helper function to visualize a mask."""
-    # Ensure mask is a numpy array (and boolean)
-    if isinstance(mask, torch.Tensor):
-         mask = mask.detach().cpu().numpy() # Ensure it's on CPU and converted to numpy
-    # Convert potential float/int mask to boolean mask
-    mask = mask.astype(bool)
     if random_color:
-        color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0) # RGBA with 0.6 alpha
     else:
         cmap = plt.get_cmap("tab10")
-        cmap_idx = 0 if obj_id is None else obj_id % 10 # Use modulo 10 for tab10 colors
-        color = np.array([*cmap(cmap_idx)[:3], 0.6]) # RGBA with 0.6 alpha
-    # Ensure mask has H, W dimensions
-    if mask.ndim == 3:
-        mask = mask.squeeze() # Remove singular dimensions like (H, W, 1)
-    if mask.ndim != 2:
-        print(f"Warning: show_mask received mask with shape {mask.shape}. Expected 2D.")
-        # Create an empty transparent image if mask shape is unexpected
-        h, w = mask.shape[:2] if mask.ndim >= 2 else (100, 100) # Use actual shape if possible, otherwise default
-        if convert_to_image:
-             return Image.fromarray(np.zeros((h, w, 4), dtype=np.uint8), "RGBA")
-        else:
-             return np.zeros((h, w, 4), dtype=np.uint8)
-    h, w = mask.shape
-    # Create an RGBA image from the mask and color
-    # Apply color where mask is True
-    # Need to reshape color to be broadcastable [1, 1, 4]
-    colored_mask = np.zeros((h, w, 4), dtype=np.float32) # Start with fully transparent black
-    # Apply the color only where the mask is True.
-    # This directly creates the colored overlay with transparency.
-    colored_mask[mask] = color
-    # Convert to uint8 [0-255]
-    colored_mask_uint8 = (colored_mask * 255).astype(np.uint8)
     if convert_to_image:
-        mask_img = Image.fromarray(colored_mask_uint8, "RGBA")
-        return mask_img
-    else:
-        return colored_mask_uint8
-# Removed @spaces.GPU decorator
 def propagate_to_all(
-    video_in, # Keep video_in path as in original
     session_state,
 ):
-    """Runs mask propagation through the video and generates the output video."""
-    print("Starting propagation...")
-    # Ensure state is ready
-    # Using session_state.get("video_path") is safer than video_in directly
-    current_video_path = session_state.get("video_path")
     if (
-        len(session_state["input_points"]) == 0 # Need at least one point
-        or session_state["all_frames"] is None
         or session_state["inference_state"] is None
-        or current_video_path is None # Ensure we have the original video path
     ):
-        print("Error: Cannot propagate. No points selected, video not loaded, or inference state missing.")
-        return (
-            gr.update(value=None, visible=False), # Hide output video on error
-            session_state,
-        )
-    # run propagation throughout the video and collect the results
-    video_segments = {}
-    try:
-        # This loop performs the core tracking prediction frame by frame
-        for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(
-            session_state["inference_state"]
-        ):
-            # Process logits: detach from graph, move to CPU, convert to numpy boolean mask
-             # Ensure tensor is on CPU before converting to numpy
-             video_segments[out_frame_idx] = {
-                 # out_mask_logits is a list of tensors (one per object tracked in this frame)
-                 # Each tensor is [batch_size, H, W]. Batch size is 1 here.
-                 # Access the result for the first object (index i) and the first item in batch (index 0)
-                 out_obj_id: (out_mask_logits[i][0].detach().cpu() > 0.0).numpy()
-                 for i, out_obj_id in enumerate(out_obj_ids)
-             }
-             # Optional: print progress
-             # print(f"Processed frame {out_frame_idx+1}/{len(session_state['all_frames'])}")
-        print("Propagation finished.")
-    except Exception as e:
-        print(f"Error during propagation: {e}")
         return (
-            gr.update(value=None, visible=False), # Hide output video on error
             session_state,
         )
     output_frames = []
-    # Iterate through all original frames to generate output video
-    total_frames = len(session_state["all_frames"])
-    for out_frame_idx in range(total_frames):
-        original_frame_rgb = session_state["all_frames"][out_frame_idx]
-        # Convert original frame to RGBA for compositing
-        transparent_background = Image.fromarray(original_frame_rgb).convert("RGBA")
-        # Check if we have a mask for this frame and object ID
-        if out_frame_idx in video_segments and OBJ_ID in video_segments[out_frame_idx]:
-            current_mask_numpy = video_segments[out_frame_idx][OBJ_ID]
-            # Get the mask image (RGBA)
-            mask_image_pil = show_mask(current_mask_numpy, obj_id=OBJ_ID)
-            # Composite the mask onto the frame
-            output_frame_img_rgba = Image.alpha_composite(transparent_background, mask_image_pil)
-            # Convert back to numpy RGB (moviepy needs RGB or RGBA)
-            output_frame_np = np.array(output_frame_img_rgba.convert("RGB"))
-        else:
-             # If no mask for this frame/object, just use the original frame (converted to RGB)
-             # Note: all_frames are already RGB numpy arrays, so just use them directly.
-             # print(f"Warning: No mask found for frame {out_frame_idx} and object {OBJ_ID}. Using original frame.")
-             output_frame_np = original_frame_rgb # Already RGB numpy array
-        output_frames.append(output_frame_np)
-    # Removed CUDA cache clearing call
-    # if torch.cuda.is_available():
-    #     torch.cuda.empty_cache()
-    # Define output path in a temporary directory
-    unique_id = datetime.now().strftime("%Y%m%d%H%M%S%f") # Use microseconds for more uniqueness
-    final_vid_filename = f"output_video_{unique_id}.mp4"
-    final_vid_output_path = os.path.join(tempfile.gettempdir(), final_vid_filename)
-    print(f"Output video path: {final_vid_output_path}")
     # Create a video clip from the image sequence
-    # Get original FPS from the stored video path
-    original_fps = get_video_fps(current_video_path)
-    fps = original_fps if original_fps is not None and original_fps > 0 else 30 # Default to 30 if detection fails or is zero
-    print(f"Creating output video with FPS: {fps}")
-    # Check if there are frames to process
-    if not output_frames:
-         print("No output frames generated.")
-         return (
-            gr.update(value=None, visible=False), # Hide output video
-            session_state,
-         )
-    # Create ImageSequenceClip from the list of numpy arrays
-    try:
-        clip = ImageSequenceClip(output_frames, fps=fps)
-    except Exception as e:
-        print(f"Error creating ImageSequenceClip: {e}")
-        return (
-            gr.update(value=None, visible=False), # Hide output video on error
-            session_state,
-        )
-    # Write the result to a file. Use 'libx264' codec for broad compatibility.
-    # Added CPU optimization parameters for moviepy write
-    try:
-        print(f"Writing video file with codec='libx264', fps={fps}, preset='medium', threads='auto'")
-        clip.write_videofile(
-            final_vid_output_path,
-            codec="libx264",
-            fps=fps, # Ensure correct FPS is used during writing
-            preset="medium", # CPU optimization: 'fast', 'faster', 'veryfast' are options for speed vs size
-            threads="auto", # CPU optimization: Use multiple cores
-            logger=None # Suppress moviepy output
-        )
-        print("Video writing complete.")
-        # Return the path and make the video player visible
-        return (
-            gr.update(value=final_vid_output_path, visible=True),
-            session_state,
-        )
-    except Exception as e:
-        print(f"Error writing video file: {e}")
-        # Clean up potentially created partial file
-        if os.path.exists(final_vid_output_path):
-             try:
-                 os.remove(final_vid_output_path)
-                 print(f"Removed partial video file: {final_vid_output_path}")
-             except Exception as clean_e:
-                 print(f"Error removing partial file: {clean_e}")
-        # Return None if writing fails
-        return (
-            gr.update(value=None, visible=False),
-            session_state,
-        )
-def update_output_video_visibility():
-    """Simply returns a Gradio update to make the output video visible."""
     return gr.update(visible=True)
 with gr.Blocks() as demo:
-    # Session state dictionary to hold video frames, points, labels, and predictor state
     session_state = gr.State(
         {
-            "first_frame": None, # numpy array (RGB)
-            "all_frames": None,  # list of numpy arrays (RGB)
-            "input_points": [],  # list of (x, y) tuples/lists
-            "input_labels": [],  # list of 1s and 0s
-            "inference_state": None, # EdgeTAM predictor state object
-            "video_path": None, # Store the input video path
         }
     )
@@ -580,7 +356,7 @@ with gr.Blocks() as demo:
                 gr.Markdown(description_p)
                 with gr.Accordion("Input Video", open=True) as video_in_drawer:
-                    video_in = gr.Video(label="Input Video", format="mp4") # Will hold the video file path
                 with gr.Row():
                     point_type = gr.Radio(
@@ -588,142 +364,121 @@ with gr.Blocks() as demo:
                         choices=["include", "exclude"],
                         value="include",
                         scale=2,
-                        interactive=True, # Make interactive
                     )
-                    # Buttons are initially disabled until a video is loaded
-                    propagate_btn = gr.Button("Track", scale=1, variant="primary", interactive=False)
-                    clear_points_btn = gr.Button("Clear Points", scale=1, interactive=False)
-                    reset_btn = gr.Button("Reset", scale=1, interactive=False)
-                # points_map is where users click to add points. Needs to be interactive.
-                # Shows the first frame with points drawn on it.
                 points_map = gr.Image(
-                    label="Click on the First Frame to Add Points", # Clearer label
-                    type="numpy",
-                    interactive=True, # <--- CHANGED TO True to enable clicking
-                    height=400, # Set a fixed height for better UI
-                    width="auto", # Let width adjust
-                    show_share_button=False,
-                    show_download_button=False,
                 )
             with gr.Column():
                 gr.Markdown("# Try some of the examples below ⬇️")
                 gr.Examples(
                     examples=examples,
-                    inputs=[video_in],
-                    examples_per_page=8,
-                    cache_examples=False, # Do not cache processed examples, as state is involved
-                )
-                # Removed extra blank lines
-                # output_image shows the segmentation mask prediction on the *first* frame
-                output_image = gr.Image(
-                    label="Segmentation Mask on First Frame", # Clearer label
-                    type="numpy",
-                    interactive=False, # Not interactive, just displays the mask
-                    height=400, # Match height of points_map
-                    width="auto", # Let width adjust
-                    show_share_button=False,
-                    show_download_button=False,
                 )
-                # output_video shows the final tracking result
-                output_video = gr.Video(visible=False, label="Tracking Result")
-    # --- Event Handlers ---
-    # When a new video file is uploaded via the file browser
-    # Added postprocess to update button interactivity based on whether video loaded
     video_in.upload(
         fn=preprocess_video_in,
-        inputs=[video_in, session_state],
         outputs=[
-            video_in_drawer, points_map, output_image, output_video,
-            propagate_btn, clear_points_btn, reset_btn, session_state,
         ],
-        queue=False, # Process immediately
     )
-    # When an example video is selected (change event)
-    # Added postprocess to update button interactivity
     video_in.change(
         fn=preprocess_video_in,
-        inputs=[video_in, session_state],
         outputs=[
-            video_in_drawer, points_map, output_image, output_video,
-            propagate_btn, clear_points_btn, reset_btn, session_state,
         ],
-        queue=False, # Process immediately
     )
-    # Triggered when a user clicks on the points_map image
     points_map.select(
         fn=segment_with_points,
         inputs=[
-            point_type,  # "include" or "exclude" radio button value
-            session_state, # Pass session state
         ],
         outputs=[
-            points_map,      # Updated image with points drawn
-            output_image,    # Updated image with first frame segmentation mask
-            session_state,   # Updated session state (points/labels added)
         ],
-        queue=False, # Process clicks immediately
     )
-    # Button to clear all selected points and reset the first frame mask
     clear_points_btn.click(
         fn=clear_points,
-        inputs=[session_state], # Pass session state
         outputs=[
-            points_map,    # points_map shows original first frame without points
-            output_image,  # output_image cleared (or shows original first frame without mask)
-            output_video,  # Hide output video
-            session_state, # Updated session state (points/labels cleared, inference state reset)
         ],
-        queue=False, # Process immediately
     )
-    # Button to reset the entire demo state and UI
     reset_btn.click(
         fn=reset,
-        inputs=[session_state], # Pass session state
         outputs=[
-            video_in, video_in_drawer, points_map, output_image, output_video,
-            propagate_btn, clear_points_btn, reset_btn, session_state,
         ],
-        queue=False, # Process immediately
     )
-    # Button to start mask propagation through the video
     propagate_btn.click(
-        fn=update_output_video_visibility, # First, make the output video player visible
         inputs=[],
-        outputs=[output_video],
-        queue=False, # Process this UI update immediately
-    ).then( # Then, run the propagation function
         fn=propagate_to_all,
         inputs=[
-            video_in,      # Get the input video path (can also get from session_state["video_path"])
-            session_state, # Pass session state (contains frames, points, inference_state, video_path)
         ],
         outputs=[
-            output_video,  # Update output video player with result
-            session_state, # Update session state
         ],
-        # CPU Optimization: Limit concurrency to 1 to prevent resource exhaustion.
-        # Queue=True ensures requests wait if another is processing.
-        concurrency_limit=1,
-        queue=True,
     )
-# Launch the Gradio demo
-demo.queue() # Enable queuing for sequential processing under concurrency limits
-print("Gradio demo starting...")
-# Removed share=True for local debugging unless you specifically need a public link
-demo.launch()
-print("Gradio demo launched.")

 import copy
 import os
 from datetime import datetime
 import tempfile
 import cv2
 import matplotlib.pyplot as plt
 import numpy as np
+import gradio as gr
 import torch
 from moviepy.editor import ImageSequenceClip
 from PIL import Image
 from sam2.build_sam import build_sam2_video_predictor
+# Remove CUDA environment variables
+if 'TORCH_CUDNN_SDPA_ENABLED' in os.environ:
+    del os.environ["TORCH_CUDNN_SDPA_ENABLED"]
 # Description
+title = "<center><strong><font size='8'>EdgeTAM CPU<font></strong> <a href='https://github.com/facebookresearch/EdgeTAM'><font size='6'>[GitHub]</font></a> </center>"
 description_p = """# Instructions
                 <ol>
                 </ol>
               """
+# examples - keeping fewer examples to reduce memory footprint
 examples = [
     ["examples/01_dog.mp4"],
     ["examples/02_cups.mp4"],
     ["examples/03_blocks.mp4"],
     ["examples/04_coffee.mp4"],
     ["examples/05_default_juggle.mp4"],
 ]
 OBJ_ID = 0
+# Initialize model on CPU
 # The predictor is built once, at import time, and is the module-level object
 # used by reset, clear_points, preprocess_video_in, segment_with_points and
 # propagate_to_all.
 # Checkpoint and config locations are relative paths.
 sam2_checkpoint = "checkpoints/edgetam.pt"
 model_cfg = "edgetam.yaml"
 # device="cpu" is passed explicitly so no GPU is requested.
 predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cpu")
 print("predictor loaded on CPU")
+# Function to get video frame rate
 def get_video_fps(video_path):
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened():
+        print("Error: Could not open video.")
+        return 30.0  # Default fallback value
     fps = cap.get(cv2.CAP_PROP_FPS)
+    cap.release()
     return fps
 def reset(session_state):
     """Forget the loaded video, the prompts and the predictor state.

     Args:
         session_state: Mutable per-session dict; cleared in place.

     Returns:
         Six values matching the ``reset_btn`` outputs: input video (cleared),
         accordion update (re-opened), point map (cleared), reference mask
         (cleared), output-video update (emptied and hidden), session state.
     """
     session_state["input_points"], session_state["input_labels"] = [], []
     tracker_state = session_state["inference_state"]
     if tracker_state is not None:
         predictor.reset_state(tracker_state)
     for key in ("first_frame", "all_frames", "inference_state"):
         session_state[key] = None
     reopened_drawer = gr.update(open=True)
     hidden_output_video = gr.update(value=None, visible=False)
     return (
         None,
         reopened_drawer,
         None,
         None,
         hidden_output_video,
         session_state,
     )
 def clear_points(session_state):
     """Discard all clicked points while keeping the loaded video.

     The predictor state is reset only when it exists and reports that
     tracking has already started.

     Args:
         session_state: Mutable per-session dict; updated in place.

     Returns:
         Four values matching the ``clear_points_btn`` outputs: the untouched
         first frame for the point map, a cleared reference mask, an
         output-video update (emptied and hidden), and the session state.
     """
     session_state["input_points"], session_state["input_labels"] = [], []
     tracker_state = session_state["inference_state"]
     tracking_started = tracker_state is not None and tracker_state.get(
         "tracking_has_started", False
     )
     if tracking_started:
         predictor.reset_state(tracker_state)
     hidden_output_video = gr.update(value=None, visible=False)
     return (
         session_state["first_frame"],
         None,
         hidden_output_video,
         session_state,
     )
 def preprocess_video_in(video_path, session_state):
     """Decode a video for prompting and initialise the predictor state.

     Frames are read with OpenCV, downscaled to at most 640 px wide and
     subsampled to roughly 300 frames. The kept frames, the stride, the scale
     factor and the original dimensions are stored in ``session_state``.

     Args:
         video_path: Path of the selected video, or ``None`` when the input
             was cleared.
         session_state: Mutable per-session dict; updated in place.

     Returns:
         Five values in the order of the ``video_in`` event outputs:
         accordion update, first frame for the point map, reference-mask
         image, output-video update, and the session state. When there is
         nothing to show (no path, unreadable file, no decodable frame) the
         accordion stays open and the other widgets are cleared.
     """

     def _nothing_to_show():
         # Shared result for every early exit; session_state is left as is.
         return (
             gr.update(open=True),  # video_in_drawer
             None,  # points_map
             None,  # output_image
             gr.update(value=None, visible=False),  # output_video
             session_state,
         )

     if video_path is None:
         return _nothing_to_show()

     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened():
         cap.release()
         print("Error: Could not open video.")
         return _nothing_to_show()

     try:
         # Query the properties while the capture is still open; they are no
         # longer available after release().
         original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
         original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

         # Downscale wide videos so CPU processing stays affordable.
         target_width = 640
         scale_factor = 1.0
         frame_width, frame_height = original_width, original_height
         if original_width > target_width:
             scale_factor = target_width / original_width
             frame_width = target_width
             frame_height = max(1, int(original_height * scale_factor))

         # Keep at most ~300 frames of long videos.
         frame_stride = 1
         if total_frames > 300:
             frame_stride = max(1, int(total_frames / 300))

         all_frames = []
         frame_number = 0
         while True:
             ret, frame = cap.read()
             if not ret:
                 break
             if frame_number % frame_stride == 0:
                 if scale_factor != 1.0:
                     frame = cv2.resize(
                         frame,
                         (frame_width, frame_height),
                         interpolation=cv2.INTER_AREA,
                     )
                 # cvtColor already returns a new array, no extra copy needed.
                 all_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
             frame_number += 1
     finally:
         cap.release()

     if not all_frames:
         print("Error: Could not read any frames from video.")
         return _nothing_to_show()

     first_frame = all_frames[0]
     session_state["first_frame"] = copy.deepcopy(first_frame)
     session_state["all_frames"] = all_frames
     session_state["frame_stride"] = frame_stride
     session_state["scale_factor"] = scale_factor
     session_state["original_dimensions"] = (original_width, original_height)
     # NOTE(review): the predictor state is built from the original file, not
     # from the resized/strided frames kept above; consumers must map between
     # the two using frame_stride and scale_factor.
     session_state["inference_state"] = predictor.init_state(video_path=video_path)
     session_state["input_points"] = []
     session_state["input_labels"] = []

     return [
         gr.update(open=False),  # video_in_drawer
         first_frame,  # points_map
         None,  # output_image
         gr.update(value=None, visible=False),  # output_video
         session_state,
     ]
 def segment_with_points(
     point_type,
     session_state,
     evt: gr.SelectData,
 ):
     """Record a clicked point and preview the resulting mask on frame 0.

     Args:
         point_type: ``"include"`` for a positive click or ``"exclude"`` for
             a negative one.
         session_state: Mutable per-session dict holding the first frame, the
             accumulated points/labels and the predictor state.
         evt: Gradio select event; ``evt.index`` is the clicked position on
             the point-map image.

     Returns:
         ``(point_map_image, mask_preview_image, session_state)``: the first
         frame with all clicks drawn, the first frame with the predicted
         mask overlaid, and the updated session state.

     Raises:
         ValueError: If ``point_type`` is neither ``"include"`` nor
             ``"exclude"``. Raised before any state is modified so points
             and labels cannot get out of step.
     """
     label_by_type = {"include": 1, "exclude": 0}
     if point_type not in label_by_type:
         raise ValueError(f"Unknown point type: {point_type!r}")

     session_state["input_points"].append(evt.index)
     print(f"TRACKING INPUT POINT: {session_state['input_points']}")
     session_state["input_labels"].append(label_by_type[point_type])
     print(f"TRACKING INPUT LABEL: {session_state['input_labels']}")

     # Open the image and get its dimensions
     transparent_background = Image.fromarray(session_state["first_frame"]).convert(
         "RGBA"
     )
     w, h = transparent_background.size

     # Circle radius is a fraction of the smaller dimension, at least 1 px.
     fraction = 0.01
     radius = max(1, int(fraction * min(w, h)))

     # Draw every click on a transparent layer: green = include, red = exclude.
     transparent_layer = np.zeros((h, w, 4), dtype=np.uint8)
     for (x, y), label in zip(
         session_state["input_points"], session_state["input_labels"]
     ):
         color = (0, 255, 0, 255) if label == 1 else (255, 0, 0, 255)
         cv2.circle(transparent_layer, (int(x), int(y)), radius, color, -1)

     transparent_layer = Image.fromarray(transparent_layer, "RGBA")
     selected_point_map = Image.alpha_composite(
         transparent_background, transparent_layer
     )

     points = np.array(session_state["input_points"], dtype=np.float32)
     # for labels, `1` means positive click and `0` means negative click
     labels = np.array(session_state["input_labels"], np.int32)

     # Clicks are made on the (possibly downscaled) preview frame, while the
     # predictor state was initialised from the original video file.
     # NOTE(review): assumes the predictor expects coordinates in the original
     # video resolution - confirm against the predictor API.
     scale_factor = session_state.get("scale_factor", 1.0) or 1.0
     if scale_factor != 1.0:
         points = points / scale_factor

     _, _, out_mask_logits = predictor.add_new_points(
         inference_state=session_state["inference_state"],
         frame_idx=0,
         obj_id=OBJ_ID,
         points=points,
         labels=labels,
     )

     mask_image = show_mask((out_mask_logits[0] > 0.0).cpu().numpy())
     # alpha_composite requires equal sizes; bring the mask to preview size.
     if mask_image.size != transparent_background.size:
         mask_image = mask_image.resize(transparent_background.size, Image.NEAREST)
     first_frame_output = Image.alpha_composite(transparent_background, mask_image)

     return selected_point_map, first_frame_output, session_state
 def show_mask(mask, obj_id=None, random_color=False, convert_to_image=True):
     """Turn a binary mask into a semi-transparent RGBA overlay.

     Args:
         mask: Array whose last two axes are height and width; it is reshaped
             to ``(h, w, 1)`` before colouring.
         obj_id: Index into the ``tab10`` colormap; ``None`` selects index 0.
         random_color: When true, use a random RGB colour instead of the
             colormap.
         convert_to_image: When true, return a PIL ``RGBA`` image; otherwise
             return the ``uint8`` array.

     Returns:
         The overlay with alpha 0.6 (before scaling to 0-255) where the mask
         is set and fully transparent elsewhere.
     """
     if random_color:
         rgba = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
     else:
         palette = plt.get_cmap("tab10")
         palette_index = obj_id if obj_id is not None else 0
         rgba = np.array([*palette(palette_index)[:3], 0.6])

     height, width = mask.shape[-2:]
     overlay = mask.reshape(height, width, 1) * rgba.reshape(1, 1, -1)
     overlay = (overlay * 255).astype(np.uint8)
     return Image.fromarray(overlay, "RGBA") if convert_to_image else overlay
 def propagate_to_all(
     video_in,
     session_state,
 ):
     """Propagate the prompted mask through the video and render the result.

     Args:
         video_in: Path of the input video (used to read its frame rate), or
             ``None`` when no video is loaded.
         session_state: Per-session dict with the cached frames, the clicked
             points, the frame stride and the predictor state.

     Returns:
         ``(output_video_update, session_state)``. The first element is
         ``None`` when there is nothing to track (no points, no video, no
         predictor state) or no frame could be rendered; otherwise it is a
         Gradio update pointing at the written ``.mp4`` file.
     """
     if (
         not session_state["input_points"]
         or video_in is None
         or session_state["inference_state"] is None
     ):
         return (
             None,
             session_state,
         )

     all_frames = session_state["all_frames"] or []
     # The cached preview frames keep only every ``frame_stride``-th frame,
     # while the predictor yields indices into the video it was initialised
     # with, so indices have to be mapped before frames and masks are paired.
     frame_stride = session_state.get("frame_stride", 1) or 1

     # Masks of the tracked object, keyed by index into ``all_frames``. Only
     # masks that have a cached frame are kept, to bound memory use.
     masks_by_frame = {}
     print("starting propagate_in_video on CPU")
     for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(
         session_state["inference_state"]
     ):
         cached_idx, remainder = divmod(out_frame_idx, frame_stride)
         if remainder or cached_idx >= len(all_frames):
             continue
         for i, out_obj_id in enumerate(out_obj_ids):
             if out_obj_id == OBJ_ID:
                 masks_by_frame[cached_idx] = (out_mask_logits[i] > 0.0).cpu().numpy()

     # Limit the output to ~100 frames.
     vis_frame_stride = max(1, len(masks_by_frame) // 100)
     output_frames = []
     for cached_idx in sorted(masks_by_frame)[::vis_frame_stride]:
         transparent_background = Image.fromarray(all_frames[cached_idx]).convert(
             "RGBA"
         )
         mask_image = show_mask(masks_by_frame[cached_idx])
         # alpha_composite requires equal sizes; the cached frame may have
         # been downscaled relative to the mask.
         if mask_image.size != transparent_background.size:
             mask_image = mask_image.resize(
                 transparent_background.size, Image.NEAREST
             )
         output_frame = Image.alpha_composite(transparent_background, mask_image)
         output_frames.append(np.array(output_frame))

     if not output_frames:
         return (
             None,
             session_state,
         )

     # Cap the frame rate at 24 and fall back to 24 if the reported rate is
     # unusable (zero, negative or NaN).
     fps = min(get_video_fps(video_in), 24)
     if not fps > 0:
         fps = 24
     clip = ImageSequenceClip(output_frames, fps=fps)

     # mkstemp guarantees a unique path even when two requests finish within
     # the same second; the timestamp is kept in the name for readability.
     unique_id = datetime.now().strftime("%Y%m%d%H%M%S")
     fd, final_vid_output_path = tempfile.mkstemp(
         prefix=f"output_video_{unique_id}_", suffix=".mp4"
     )
     os.close(fd)

     # Lower bitrate for CPU processing
     clip.write_videofile(final_vid_output_path, codec="libx264", bitrate="1000k")

     return (
         gr.update(value=final_vid_output_path),
         session_state,
     )
+def update_ui():
     return gr.update(visible=True)
 # Build the UI and connect the handlers defined above to its components.
 with gr.Blocks() as demo:
     # Per-session state passed to and returned from every handler. The keys
     # are the ones preprocess_video_in writes and the other handlers read.
     session_state = gr.State(
         {
+            "first_frame": None,
+            "all_frames": None,
+            "input_points": [],
+            "input_labels": [],
+            "inference_state": None,
+            "frame_stride": 1,
+            "scale_factor": 1.0,
+            "original_dimensions": None,
         }
     )
     # NOTE(review): the layout containers that enclose the widgets below are
     # not visible in this view, hence the jump in indentation.
                 gr.Markdown(description_p)
                 with gr.Accordion("Input Video", open=True) as video_in_drawer:
+                    video_in = gr.Video(label="Input Video", format="mp4")
                 with gr.Row():
                     # The selected value is passed to segment_with_points as point_type.
                     point_type = gr.Radio(
                         choices=["include", "exclude"],
                         value="include",
                         scale=2,
                     )
+                    propagate_btn = gr.Button("Track", scale=1, variant="primary")
+                    clear_points_btn = gr.Button("Clear Points", scale=1)
+                    reset_btn = gr.Button("Reset", scale=1)
                 points_map = gr.Image(
+                    label="Frame with Point Prompt", type="numpy", interactive=False
                 )
             with gr.Column():
                 gr.Markdown("# Try some of the examples below ⬇️")
                 gr.Examples(
                     examples=examples,
+                    inputs=[
+                        video_in,
+                    ],
+                    examples_per_page=5,
                 )
+                output_image = gr.Image(label="Reference Mask")
                 # Starts hidden; update_ui makes it visible when tracking is requested.
+                output_video = gr.Video(visible=False)
+    # When new video is uploaded
     video_in.upload(
         fn=preprocess_video_in,
+        inputs=[
+            video_in,
+            session_state,
+        ],
         outputs=[
+            video_in_drawer,  # Accordion to hide uploaded video player
+            points_map,  # Image component where we add new tracking points
+            output_image,
+            output_video,
+            session_state,
         ],
+        queue=False,
     )
     # NOTE(review): same handler and outputs as .upload above; if Gradio also
     # fires .change for an upload, preprocessing runs twice - confirm.
     video_in.change(
         fn=preprocess_video_in,
+        inputs=[
+            video_in,
+            session_state,
+        ],
         outputs=[
+            video_in_drawer,  # Accordion to hide uploaded video player
+            points_map,  # Image component where we add new tracking points
+            output_image,
+            output_video,
+            session_state,
         ],
+        queue=False,
     )
+    # triggered when we click on image to add new points
     points_map.select(
         fn=segment_with_points,
         inputs=[
+            point_type,  # "include" or "exclude"
+            session_state,
         ],
         outputs=[
+            points_map,  # updated image with points
+            output_image,
+            session_state,
         ],
+        queue=False,
     )
+    # Clear every points clicked and added to the map
     clear_points_btn.click(
         fn=clear_points,
+        inputs=session_state,
         outputs=[
+            points_map,
+            output_image,
+            output_video,
+            session_state,
         ],
+        queue=False,
     )
     # Reset returns one value per output below, in this order.
     reset_btn.click(
         fn=reset,
+        inputs=session_state,
         outputs=[
+            video_in,
+            video_in_drawer,
+            points_map,
+            output_image,
+            output_video,
+            session_state,
         ],
+        queue=False,
     )
     # Two steps: first reveal the output player, then run the propagation.
     propagate_btn.click(
+        fn=update_ui,
         inputs=[],
+        outputs=output_video,
+        queue=False,
+    ).then(
         fn=propagate_to_all,
         inputs=[
+            video_in,
+            session_state,
         ],
         outputs=[
+            output_video,
+            session_state,
         ],
+        queue=True,  # Use queue for CPU processing
     )
 # Enable the request queue, then start serving the demo.
+demo.queue()
+demo.launch()