diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..5dc8af2b1c216ae074434c644fc24a41bca9a839 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +images/background.mp4 filter=lfs diff=lfs merge=lfs -text +images/genai[[:space:]]shaolin.mp4 filter=lfs diff=lfs merge=lfs -text +images/image_annote.mp4 filter=lfs diff=lfs merge=lfs -text +images/image_aug.mp4 filter=lfs diff=lfs merge=lfs -text +images/pix_output_video[[:space:]](1).mp4 filter=lfs diff=lfs merge=lfs -text +images/redhulk.mp4 filter=lfs diff=lfs merge=lfs -text +images/with_replacement_output_video.mp4 filter=lfs diff=lfs merge=lfs -text +images/zoe.mp4 filter=lfs diff=lfs merge=lfs -text +sam_2_image_generation.ipynb filter=lfs diff=lfs merge=lfs -text diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..60498de22f69d49e03cdebac9ae5d02c33b72ab6 --- /dev/null +++ b/app.py @@ -0,0 +1,138 @@ +import streamlit as st +import base64 + +# Set the page configuration +st.set_page_config( + page_title="MetaMorph AI", + page_icon="๐ŸŒ‰", + initial_sidebar_state="expanded", + layout="wide", + menu_items={ + 'Get help': 'https://www.linkedin.com/in/gaurav-verma-4696bb106/', + 'About': "MetaMorph: Revolutionize your media with cutting-edge image and video augmentation using the META Sam-2 model for stunning visual transformations!" + } +) + +# Function to load video as base64 +def get_base64_video(video_path): + with open(video_path, 'rb') as video_file: + video_bytes = video_file.read() + return base64.b64encode(video_bytes).decode('utf-8') + +# Video file path +video_path = 'images/background.mp4' + +# Get the base64 video +video_base64 = get_base64_video(video_path) + +# Add video as background +background_video = f""" + +
+ +
+ """ +st.markdown(background_video, unsafe_allow_html=True) + +# Content goes here +with st.container(): + + # Title + html_code = """ +
+
+

+ MetaMorphix AI ๐Ÿฆโ€๐Ÿ”ฅ +

+
+
+
+
+    """
+    st.markdown(html_code, unsafe_allow_html=True)
+
+    # Additional content
+
+# Functionality for pages
+from home import home_page
+from image_augmentation import image_augmentation_page
+from video_augmentation import image_annoter
+from use_cases import use_case
+def main():
+    st.sidebar.title("Navigation")
+    page = st.sidebar.selectbox("Go to", ("Home","Use Cases", "Image Augmentation", "Video Augmentation"))
+
+    if page == "Home":
+        home_page()
+    elif page == "Use Cases":
+        use_case()
+    elif page == "Image Augmentation":
+        image_augmentation_page()
+    elif page == "Video Augmentation":
+        image_annoter()
+
+if __name__ == "__main__":
+    main()
diff --git a/home.py b/home.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a7d6b5659503bdbc0f0db30e1739cf5fcc74e70
--- /dev/null
+++ b/home.py
@@ -0,0 +1,41 @@
+import streamlit as st
+
+
+
+def home_page():
+    st.title("Welcome to MetaMorphix AI")
+    st.write("""
+    This application uses the **META SAM-2 model** to perform advanced augmentation on images and videos,
+    \n**YOLO** trained and pretrained models for Object Detection,
+    \n**Stability AI API** for Generative AI - Image-to-Image generation on a mask,
+    \nand an **Image Annotator** for building a YOLO training folder, a process similar to that of the Roboflow app.
+
+    Navigate to the desired section using the sidebar.
+
+    \nScroll down to see the tutorial.
+
+    """)
+    st.divider()
+    st.header("For Image Augmentation")
+    st.write("""1. Navigate to the Image Augmentation page & upload an image.
+    \n2. Mark coordinates on the canvas **(green for inclusive points & red for exclusive points).**
+    \n3. Select an augmentation method [Pixelate, Hue Change, Mask Replacement, Generative Img2Img] and proceed.""")
+    st.video("images/image_aug.mp4")
+
+    st.divider()
+    st.header("For Image Annotation on an Image Directory")
+    st.write("""1. Navigate to the Video Augmentation page & paste the local directory path where the training images are to be annotated.
+    \n2. Create a bounding box on the canvas.
+    \n3. 
Click on Save Annotation and navigate using the Next button""")
+    st.video("images/image_annote.mp4")
+
+    st.warning("As of now, Video Augmentation can only be performed in a Jupyter notebook due to certain limitations")
+    st.write("Go to the following link to access the notebook and use a Kaggle GPU")
+    # Define the profile link
+    profile_url = "https://www.kaggle.com/code/gauravverma069/sam-2-meta-video-augmentation-with-yolo-and-genai"
+    st.markdown(f"[Visit my Kaggle Notebook link]({profile_url})")
+
+
+
+
+
\ No newline at end of file
diff --git a/image_augmentation.py b/image_augmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..67fce7ff3c168485390bc49772d0e69f29bbfdc0
--- /dev/null
+++ b/image_augmentation.py
@@ -0,0 +1,296 @@
+import streamlit as st
+from streamlit_drawable_canvas import st_canvas
+from PIL import Image
+import numpy as np
+import matplotlib.pyplot as plt
+import image_mask_gen
+import torch
+from sam2.build_sam import build_sam2
+from sam2.sam2_image_predictor import SAM2ImagePredictor
+import os
+import io
+import warnings
+from stability_sdk import client
+import stability_sdk.interfaces.gooseai.generation.generation_pb2 as generation
+
+import streamlit as st
+import base64
+
+
+# Function to display points on the image using matplotlib
+def show_points(coords, labels, ax, marker_size=375):
+    pos_points = coords[labels == 1]
+    neg_points = coords[labels == 0]
+    ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
+    ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
+
+def remove_duplicates(coords, labels):
+    unique_coords = []
+    unique_labels = []
+    seen = set()
+
+    for coord, label in zip(coords, labels):
+        coord_tuple = tuple(coord)
+        if coord_tuple not in seen:
+            seen.add(coord_tuple)
+            unique_coords.append(coord)
+            unique_labels.append(label)
+
+    return unique_coords, unique_labels
+
+
+def image_augmentation_page():
+    pass
+    st.title("Image Augmentation")
+    st.write("Upload an image to apply augmentation techniques.")
+
+    # Initialize session state variables
+    if "inclusive_points" not in st.session_state:
+        st.session_state.inclusive_points = []
+    if "exclusive_points" not in st.session_state:
+        st.session_state.exclusive_points = []
+
+    # Upload an image
+    uploaded_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])
+
+    if uploaded_file is not None:
+        # Open the uploaded image
+        image = Image.open(uploaded_file)
+
+        # Set the maximum width for display
+        max_display_width = 700  # You can adjust this value
+
+        # Calculate the scaling factor
+        scale_factor = min(max_display_width / image.size[0], 1)
+
+        # Resize the image for display
+        display_width = int(image.size[0] * scale_factor)
+        display_height = int(image.size[1] * scale_factor)
+        resized_image = image.resize((display_width, display_height))
+
+        # Inclusive Points Phase
+        st.subheader("Select Inclusive Points (Green)")
+        canvas_inclusive = st_canvas(
+            fill_color="rgba(0, 0, 0, 0)",  # Transparent fill
+            stroke_width=1,  # Stroke width for drawing
+            stroke_color="blue",  # Color for the outline of clicks
+            background_image=resized_image,
+            update_streamlit=True,
+            height=display_height,
+            width=display_width,
+            drawing_mode="circle",  # Drawing mode to capture clicks as circles
+            point_display_radius=3,  # Radius of the circle that represents a click
+            key="canvas_inclusive"
+        )
+
+        # Process inclusive clicks
+        if 
canvas_inclusive.json_data is not None: + objects = canvas_inclusive.json_data["objects"] + new_clicks = [[(obj["left"] + obj["radius"]) / scale_factor, (obj["top"] + obj["radius"]) / scale_factor] for obj in objects] + st.session_state.inclusive_points.extend(new_clicks) + + # Plot the inclusive points on the original image using Matplotlib + fig_inclusive, ax = plt.subplots() + ax.imshow(image) + ax.axis('off') # Hide the axes + + # Prepare data for plotting + inclusive_points = np.array(st.session_state.inclusive_points) + labels_inclusive = np.array([1] * len(st.session_state.inclusive_points)) + + # Call the function to show inclusive points + if len(inclusive_points) > 0: + show_points(inclusive_points, labels_inclusive, ax) + + st.pyplot(fig_inclusive) + + # Divider + st.divider() + + # Exclusive Points Phase + st.subheader("Select Exclusive Points (Red)") + canvas_exclusive = st_canvas( + fill_color="rgba(0, 0, 0, 0)", # Transparent fill + stroke_width=1, # Stroke width for drawing + stroke_color="blue", # Color for the outline of clicks + background_image=resized_image, + update_streamlit=True, + height=display_height, + width=display_width, + drawing_mode="circle", # Drawing mode to capture clicks as circles + point_display_radius=3, # Radius of the circle that represents a click + key="canvas_exclusive" + ) + + # Process exclusive clicks + if canvas_exclusive.json_data is not None: + objects = canvas_exclusive.json_data["objects"] + new_clicks = [[(obj["left"] + obj["radius"]) / scale_factor, (obj["top"] + obj["radius"]) / scale_factor] for obj in objects] + st.session_state.exclusive_points.extend(new_clicks) + + # Plot the exclusive points on the original image using Matplotlib + fig_exclusive, ax = plt.subplots() + ax.imshow(image) + ax.axis('off') # Hide the axes + + # Prepare data for plotting + exclusive_points = np.array(st.session_state.exclusive_points) + labels_exclusive = np.array([0] * len(st.session_state.exclusive_points)) + + # Call the function to show exclusive points + if len(exclusive_points) > 0: + show_points(exclusive_points, labels_exclusive, ax) + + st.pyplot(fig_exclusive) + + # Grouping coordinates and labels + coordinates = st.session_state.inclusive_points + st.session_state.exclusive_points + labels = [1] * len(st.session_state.inclusive_points) + [0] * len(st.session_state.exclusive_points) + + # # Display grouped coordinates and labels + # st.subheader("Coordinates and Labels") + # st.write("Coordinates: ", tuple(coordinates)) + # st.write("Labels: ", labels) + + # Provide an option to clear the coordinates + if st.button("Clear All Points"): + st.session_state.inclusive_points = [] + st.session_state.exclusive_points = [] + # global unique_coordinates, unique_labels + unique_coordinates, unique_labels = remove_duplicates(coordinates, labels) + + st.write("Unique Coordinates:", tuple(unique_coordinates)) + st.write("Unique Labels:", tuple(unique_labels)) + + # image_mask_gen.show_masks(image, masks, scores, point_coords=input_point, input_labels=input_label) + sam2_checkpoint = "sam2_hiera_base_plus.pt" + model_cfg = "sam2_hiera_b+.yaml" + + sam2_model = build_sam2(model_cfg, sam2_checkpoint, device="cpu") + + predictor = SAM2ImagePredictor(sam2_model) + + image = image + predictor.set_image(image) + + input_point = np.array(unique_coordinates) + input_label = np.array(unique_labels) + + masks, scores, logits = predictor.predict( + point_coords=input_point, + point_labels=input_label, + multimask_output=True, + ) + sorted_ind = 
np.argsort(scores)[::-1] + masks = masks[sorted_ind] + scores = scores[sorted_ind] + logits = logits[sorted_ind] + + mask_input = logits[np.argmax(scores), :, :] + + masks, scores, _ = predictor.predict( + point_coords=input_point, + point_labels=input_label, + mask_input=mask_input[None, :, :], + multimask_output=False, + ) + image_mask_gen.show_masks(image, masks, scores, point_coords=input_point, input_labels=input_label) + + + # Get masked images + original_image = Image.open(uploaded_file) + # st.image(original_image, caption='Original Image', use_column_width=True) + + with st.container(border=True):# Display masked images + col1, col2 = st.columns(2) + with col1: + mask_images = image_mask_gen.show_masks_1(original_image, masks, scores) + for idx, (img, score) in enumerate(mask_images): + st.image(img, caption=f'Mask {idx+1}, Score: {score:.3f}', use_column_width=True) + with col2: + inverse_mask_images = image_mask_gen.show_inverse_masks(original_image, masks, scores) + for idx, (img, score) in enumerate(inverse_mask_images): + st.image(img, caption=f'Inverse Mask {idx+1}, Score: {score:.3f}', use_column_width=True) + + if st.checkbox("Proceed to Image Augmentation"): + + image_aug_select = st.sidebar.selectbox("Select Augmentation for Mask",["Pixelate","Hue Change","Mask Replacement","Generative Img2Img"]) + if image_aug_select == "Pixelate": + + if st.sidebar.toggle("Proceed to Pixelate Mask"): + pixelation_level = st.slider("Select Pixelation Level", min_value=5, max_value=50, value=10) + combined_image = image_mask_gen.combine_pixelated_mask(original_image, masks[0], pixelation_level) + st.image(combined_image, caption="Combined Pixelated Image", use_column_width=True) + elif image_aug_select == "Hue Change": + + if st.sidebar.toggle("Proceed to Hue Change"): + # Hue shift slider + hue_shift = st.slider("Select Hue Shift", min_value=-180, max_value=180, value=0) + # Apply hue change and show the result + combined_image = image_mask_gen.combine_hue_changed_mask(original_image, masks[0], hue_shift) # Assuming single mask + st.image(combined_image, caption="Combined Hue Changed Image", use_column_width=True) + elif image_aug_select == "Mask Replacement": + + if st.sidebar.toggle("Proceed to replace Mask"): + replacement_file = st.file_uploader("Upload the replacement image", type=["png", "jpg", "jpeg"]) + if replacement_file is not None: + replacement_image = Image.open(replacement_file) #.convert("RGBA") + combined_image = image_mask_gen.combine_mask_replaced_image(original_image, replacement_image, masks[0]) # Assuming single mask + st.image(combined_image, caption="Masked Area Replaced Image", use_column_width=True) + elif image_aug_select == "Generative Img2Img": + + msk_img = None + mask_images_x = image_mask_gen.show_masks_1(original_image, masks, scores) + for idx, (img, score) in enumerate(mask_images_x): + msk_img = img + # st.image(img, caption=f'Mask {idx+1}, Score: {score:.3f}', use_column_width=True) + + rgb_image = msk_img.convert("RGB") + # st.image(rgb_image) + resized_image = image_mask_gen.resize_image(rgb_image) + # st.image(resized_image, caption=f"Resized size: {resized_image.size[0]}x{resized_image.size[1]}", use_column_width=True) + width, height = resized_image.size + + # User input for the prompt and API key + prompt = st.text_input("Enter your prompt:", "A Beautiful day, in the style reference of starry night by vincent van gogh") + api_key = st.text_input("Enter your Stability AI API key:") + + if prompt and api_key: + # Set up our connection to the API. 
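+            # Descriptive note on the block below: it sets up the Stability AI SDK client with the
+            # user-supplied key and, once "Proceed to Generate Image" is toggled, runs an
+            # image-to-image generation that uses the masked image as the init image. Roughly,
+            # start_schedule controls how strongly the init image is altered (0.0 keeps it,
+            # 1.0 ignores it) and cfg_scale controls how closely the output follows the prompt.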
+ os.environ['STABILITY_KEY'] = api_key + stability_api = client.StabilityInference( + key=os.environ['STABILITY_KEY'], # API Key reference. + verbose=True, # Print debug messages. + engine="stable-diffusion-xl-1024-v1-0", # Set the engine to use for generation. + ) + style_preset_selector = st.sidebar.selectbox("Select Style Preset",["3d-model", "analog-film", "anime", "cinematic", "comic-book", "digital-art", "enhance", "fantasy-art", "isometric", "line-art", "low-poly", "modeling-compound", "neon-punk", + "origami", "photographic", "pixel-art", "tile-texture"],index = 5) + if st.sidebar.toggle("Proceed to Generate Image"): + # Set up our initial generation parameters. + answers2 = stability_api.generate( + prompt=prompt, + init_image=resized_image, # Assign our uploaded image as our Initial Image for transformation. + start_schedule=0.6, + steps=250, + cfg_scale=10.0, + width=width, + height=height, + sampler=generation.SAMPLER_K_DPMPP_SDE, + style_preset=style_preset_selector + ) + + # Process the response from the API + for resp in answers2: + for artifact in resp.artifacts: + if artifact.finish_reason == generation.FILTER: + warnings.warn( + "Your request activated the API's safety filters and could not be processed." + "Please modify the prompt and try again.") + if artifact.type == generation.ARTIFACT_IMAGE: + img2 = Image.open(io.BytesIO(artifact.binary)) + # Display the generated image + st.image(img2, caption="Generated Image", use_column_width=True) + + # Combine the generated image with the original image using the mask + combined_img = image_mask_gen.combine_mask_and_inverse_gen(original_image, img2, masks[0]) + st.image(combined_img, caption="Combined Image", use_column_width=True) \ No newline at end of file diff --git a/image_mask_gen.py b/image_mask_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..d27685a98c80152396c9e6a7633cff89f2d6f81c --- /dev/null +++ b/image_mask_gen.py @@ -0,0 +1,285 @@ +import streamlit as st +import cv2 +import numpy as np +from PIL import Image + +def apply_mask(image_cv, mask, color=(0, 255, 0), alpha=0.5): + """ Apply a mask to an image with given color and alpha blend """ + mask_bgr = np.zeros_like(image_cv) + mask_bgr[mask > 0] = color + return cv2.addWeighted(image_cv, 1 - alpha, mask_bgr, alpha, 0) + +def draw_points(image_cv, points, labels): + """ Draw points on the image with different colors based on labels """ + for coord, label in zip(points, labels): + color = (0, 255, 0) if label == 1 else (255, 0, 0) # Green for inclusive, Red for exclusive + cv2.circle(image_cv, tuple(map(int, coord)), 5, color, -1) + return image_cv + +def draw_boxes(image_cv, boxes): + """ Draw boxes on the image """ + for box in boxes: + x, y, w, h = map(int, box) + cv2.rectangle(image_cv, (x, y), (x + w, y + h), (255, 0, 0), 2) # Red boxes + return image_cv + +def show_masks(image, masks, scores, point_coords=None, box_coords=None, input_labels=None, borders=True): + image_cv = np.array(image.convert("RGB"))[..., ::-1] # Convert PIL image to BGR format for OpenCV + + for i, (mask, score) in enumerate(zip(masks, scores)): + image_with_mask = apply_mask(image_cv, mask) + + if point_coords is not None: + assert input_labels is not None + image_with_mask = draw_points(image_with_mask, point_coords, input_labels) + + if box_coords is not None: + image_with_mask = draw_boxes(image_with_mask, box_coords) + + # Convert back to RGB and then to PIL for Streamlit + image_with_mask = cv2.cvtColor(image_with_mask, cv2.COLOR_BGR2RGB) + image_pil = 
Image.fromarray(image_with_mask) + + # Display the final image with all overlays + st.image(image_pil, caption=f"Mask {i+1}, Score: {score:.3f}", use_column_width=True) + + +def apply_mask_to_image(image, mask): + # Ensure the image is a NumPy array in BGR format + if isinstance(image, Image.Image): + image = np.array(image) + image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + + # Create an alpha channel based on the mask + alpha_channel = (mask * 255).astype(np.uint8) + + # Create an image with the mask applied only on masked areas + masked_image = np.zeros((image.shape[0], image.shape[1], 4), dtype=np.uint8) + for c in range(3): # Apply the mask only to the RGB channels + masked_image[..., c] = image[..., c] * mask + + # Add the alpha channel to make areas outside the mask transparent + masked_image[..., 3] = alpha_channel + + return masked_image + +def show_masks_1(image, masks, scores): + mask_images = [] + for i, (mask, score) in enumerate(zip(masks, scores)): + # Apply the mask to the image + masked_image = apply_mask_to_image(image, mask) + + # Convert the masked image to PIL format for Streamlit + pil_image = Image.fromarray(cv2.cvtColor(masked_image, cv2.COLOR_BGRA2RGBA)) + mask_images.append((pil_image, score)) + + return mask_images + + +def apply_inverse_mask_to_image(image, mask): + # Ensure the image is a NumPy array in BGR format + if isinstance(image, Image.Image): + image = np.array(image) + image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + + # Create an alpha channel that is transparent inside the mask and opaque outside + alpha_channel = (1 - mask) * 255 + + # Create an image with the mask applied to the inverse areas + inverse_masked_image = np.zeros((image.shape[0], image.shape[1], 4), dtype=np.uint8) + for c in range(3): # Apply the inverse mask to RGB channels + inverse_masked_image[..., c] = image[..., c] * (1 - mask) + + # Add the alpha channel to make areas inside the mask transparent + inverse_masked_image[..., 3] = alpha_channel.astype(np.uint8) + + return inverse_masked_image + +def show_inverse_masks(image, masks, scores): + mask_images = [] + for i, (mask, score) in enumerate(zip(masks, scores)): + # Apply the inverse mask to the image + inverse_masked_image = apply_inverse_mask_to_image(image, mask) + + # Convert the masked image to PIL format for Streamlit + pil_image = Image.fromarray(cv2.cvtColor(inverse_masked_image, cv2.COLOR_BGRA2RGBA)) + mask_images.append((pil_image, score)) + + return mask_images + +import streamlit as st +import cv2 +import numpy as np +from PIL import Image + +def combine_mask_and_inverse(image, mask): + + # Ensure the image is a NumPy array in BGR format + if isinstance(image, Image.Image): + image = np.array(image) + image = cv2.cvtColor(image, cv2.COLOR_RGBA2BGR) + + # Apply the mask to get the masked region (in original color) + masked_region = cv2.bitwise_and(image, image, mask=mask.astype(np.uint8)) + + # Apply the inverse mask to get the inverse-masked region (in original color) + inverse_mask = 1 - mask + inverse_masked_region = cv2.bitwise_and(image, image, mask=inverse_mask.astype(np.uint8)) + + # Combine both masked and inverse-masked regions + combined_image = cv2.add(masked_region, inverse_masked_region) + + # Convert to RGBA format for transparency + combined_image_rgba = cv2.cvtColor(combined_image, cv2.COLOR_BGR2RGBA) + + return combined_image_rgba + +def show_combined_masks(image, masks, scores): + + mask_images = [] + for i, (mask, score) in enumerate(zip(masks, scores)): + # Combine masked and inverse masked areas 
+ combined_image = combine_mask_and_inverse(image, mask) + + # Convert the combined image to PIL format for Streamlit + pil_image = Image.fromarray(combined_image) + mask_images.append((pil_image, score)) + + return mask_images + + +def pixelate_area(image, mask, pixelation_level): + """ + Apply pixelation to the masked area of an image. + """ + pixelated_image = image.copy() + h, w, _ = image.shape + + for y in range(0, h, pixelation_level): + for x in range(0, w, pixelation_level): + block = (slice(y, min(y + pixelation_level, h)), slice(x, min(x + pixelation_level, w))) + if np.any(mask[block]): + mean_color = image[block].mean(axis=(0, 1)).astype(int) + pixelated_image[block] = mean_color + + return pixelated_image + +def combine_pixelated_mask(image, mask, pixelation_level=10): + """ + Combine the pixelated masked areas with the original image. + """ + image_np = np.array(image) + mask_np = np.array(mask) + + pixelated_mask = pixelate_area(image_np, mask_np, pixelation_level) + combined_image = Image.fromarray(pixelated_mask) + return combined_image + + +def change_hue(image, mask, hue_shift): + + # Convert the image from RGB to HSV + hsv_image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB) + hsv_image = cv2.cvtColor(hsv_image, cv2.COLOR_RGB2HSV) + + # Apply the hue shift to the masked area + hsv_image[..., 0] = (hsv_image[..., 0] + hue_shift) % 180 + + # Convert back to RGB format + rgb_image = cv2.cvtColor(hsv_image, cv2.COLOR_HSV2RGB) + + # Combine the hue-changed area with the original image using the mask + hue_changed_image = np.array(image).copy() + hue_changed_image[mask] = np.concatenate((rgb_image[mask], hue_changed_image[mask][..., 3:]), axis=-1) + + return hue_changed_image + +def combine_hue_changed_mask(image, mask, hue_shift): + + image_np = np.array(image) + mask_np = np.array(mask).astype(bool) + + hue_changed_area = change_hue(image_np, mask_np, hue_shift) + combined_image = Image.fromarray(hue_changed_area) + + return combined_image + +def replace_masked_area(original_image, replacement_image, mask): + # Ensure the replacement image is the same size as the original image + replacement_image = cv2.resize(replacement_image, (original_image.shape[1], original_image.shape[0])) + + # Create a copy of the original image + replaced_image = original_image.copy() + + # Replace the masked area with the corresponding area from the replacement image + replaced_image[mask] = replacement_image[mask] + + return replaced_image + +def combine_mask_replaced_image(original_image, replacement_image, mask): + + # Convert images to NumPy arrays + original_np = np.array(original_image) + replacement_np = np.array(replacement_image) + mask_np = np.array(mask).astype(bool) + + # Replace the masked area + replaced_area = replace_masked_area(original_np, replacement_np, mask_np) + combined_image = Image.fromarray(replaced_area) + + return combined_image + +import streamlit as st +from PIL import Image + +def resize_image(image, max_size=1024): + # Get the current width and height of the image + width, height = image.size + + # Calculate the scaling factor + if width > height: + scaling_factor = max_size / width + else: + scaling_factor = max_size / height + + # Only resize if the image is larger than the max_size + if scaling_factor < 1: + # Calculate new dimensions + new_width = int(width * scaling_factor) + new_height = int(height * scaling_factor) + + # Resize the image + image_resized = image.resize((new_width, new_height)) + return image_resized + else: + # Return the original image if it's 
already within the size limits + return image + + +def combine_mask_and_inverse_gen(original_img, generated_img, mask): + # Ensure images are in RGBA mode + original_img = original_img.convert("RGBA") + generated_img = generated_img.convert("RGBA") + + # Resize the generated image to match the original image size + generated_img = generated_img.resize(original_img.size) + + # Convert images to arrays + orig_array = np.array(original_img) + gen_array = np.array(generated_img) + + # Resize the mask to match the original image size + mask = Image.fromarray((mask * 255).astype(np.uint8)) # Convert mask to image for resizing + mask = mask.resize(original_img.size, Image.NEAREST) # Resize the mask + bool_mask = np.array(mask).astype(bool) + + # Ensure the mask has the correct shape (H, W, 1) + if bool_mask.ndim == 2: + bool_mask = bool_mask[:, :, np.newaxis] + + # Combine images using the mask + combined_array = np.where(bool_mask, gen_array, orig_array) + + # Convert combined array back to image + combined_img = Image.fromarray(combined_array, "RGBA") + return combined_img diff --git a/images/background.mp4 b/images/background.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a3722fb2078e439a8c1febb9029de0add332ed76 --- /dev/null +++ b/images/background.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92ca11934ec6540cf3fb0d5225aff2742683ce986f6269852ed18a751fb76a54 +size 28245879 diff --git a/images/genai shaolin.mp4 b/images/genai shaolin.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..bad2525aa0675f99e8c32a2393e482a8fe1bda59 --- /dev/null +++ b/images/genai shaolin.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:699d86e4be50ca525808198a816a14fdb584bfc3bcaff61afa755c368ed8fb82 +size 1060558 diff --git a/images/image_annote.mp4 b/images/image_annote.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..d1d233a668965d4616026deccd33e0117ed2ca59 --- /dev/null +++ b/images/image_annote.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6acf56d66dbb2fa3fc2f4f0ba9e4591282f41cf0829c4a81686e89010963a66f +size 30740936 diff --git a/images/image_aug.mp4 b/images/image_aug.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..f964388ab2640e98d7624ea0333420b6df557dcf --- /dev/null +++ b/images/image_aug.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02edb90435a0024388ec09a6c6a28cf7e670e42de0da5792e29592460c4f44dd +size 70042465 diff --git a/images/pix_output_video (1).mp4 b/images/pix_output_video (1).mp4 new file mode 100644 index 0000000000000000000000000000000000000000..faa0017ebe94a90fd73306089eb32c6b0104becd --- /dev/null +++ b/images/pix_output_video (1).mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d68ee725423da70f72c91ee747f566364c1298303e3e0c2d2c863f0a0b4e01a +size 2042041 diff --git a/images/redhulk.mp4 b/images/redhulk.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..73034b3e936905c9c94e47165a989ca2a80187c2 --- /dev/null +++ b/images/redhulk.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d98e9dae8d17acfec3baad33cc7f6445309e9eaf270ced2284b93d17eb42666f +size 2452133 diff --git a/images/with_replacement_output_video.mp4 b/images/with_replacement_output_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..17b216134cf9f56121c15f6b6ac7735e45cdc577 --- /dev/null +++ 
b/images/with_replacement_output_video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70884dd88bb7935ff2d492df82a1940487dc8f2bb0194547b7236f43b009faa9 +size 8324371 diff --git a/images/zoe.mp4 b/images/zoe.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..c4dbf28de8a9d19d73916b909352ca76c33e6ad6 --- /dev/null +++ b/images/zoe.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e24090d2db21ddd34892666e9b1eb907bd1bf3cfe5516c268b8b522180a6eb16 +size 2368843 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..bca84ed6136409b99e60007ecd33ced53333df7d --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +torch>=2.3.1 +torchvision>=0.18.1 +numpy>=1.24.4 +tqdm>=4.66.1 +hydra-core>=1.3.2 +iopath>=0.1.10 +pillow>=9.4.0 +streamlit-drawable-canvas>=0.9.3 +opencv-python>=4.10.0.84 +stability-sdk>=0.8.6 \ No newline at end of file diff --git a/sam-2-meta-video-augmentation-with-yolo-and-genai.ipynb b/sam-2-meta-video-augmentation-with-yolo-and-genai.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..81ab0a44f65e53b0f445596287ff25429389a20c --- /dev/null +++ b/sam-2-meta-video-augmentation-with-yolo-and-genai.ipynb @@ -0,0 +1 @@ +{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.14","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[],"dockerImageVersionId":30762,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# Video Augmentation using META SAM-2 Model with YOLO model and Stability AI","metadata":{}},{"cell_type":"markdown","source":"### Importing Images with Annoted text file for Yolov8n Model Training","metadata":{}},{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### upload your image directory with .txt annoted file in the format required by yolo model for training, with video on which model has to predict.\n\n### incase if wants to use pre_trained YOLO model, jump to section of pretrained model., or incase want to manually put coordinates on a frame jump to section of video segmenting.","metadata":{}},{"cell_type":"markdown","source":"### Installing Required Libraries","metadata":{}},{"cell_type":"code","source":"!pip install ultralytics opencv-python\n!pip install -U ipywidgets","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Yolov8n Model training ","metadata":{}},{"cell_type":"markdown","source":"## Yaml file creation and model training\n","metadata":{}},{"cell_type":"code","source":"from ultralytics import YOLO\nimport cv2\nimport matplotlib.pyplot as plt\n\n# Load YOLOv8 model configuration (e.g., YOLOv8 nano model)\nmodel = YOLO('yolov8n.yaml')\n\n# Create a dataset.yaml file for YOLOv8 training\ndataset_yaml_content = \"\"\"\ntrain: \"/kaggle/input/yolov-train-data/Bottle\"\nval: \"/kaggle/input/yolov-train-data/Bottle\"\nnc: 1 # Number of classes (1 in this case)\nnames: ['bottle']\n\"\"\"\n\n# Save the dataset.yaml file\nwith open('dataset.yaml', 'w') as f:\n f.write(dataset_yaml_content)\n\n \n\n# Train the model with the specified dataset and parameters\nmodel.train(\n data='dataset.yaml', # Path to the dataset.yaml file\n epochs=100, # Increase epochs for better results with small datasets\n imgsz=1024, # Use the resized image dimensions\n batch=1, # Set batch size to 4 due to limited data\n patience=50, # Early stopping if no improvement\n lr0=0.0001, # Start with a lower learning rate\n augment=True, # Enable data augmentation\n# weights='yolov8n.pt' # Start training with pre-trained weights (optional)\n)\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Note: You may have to enter wandb.ai api if using Kaggle","metadata":{}},{"cell_type":"markdown","source":"## prediction on an Image","metadata":{}},{"cell_type":"code","source":"# Load a test image\nimg = cv2.imread('/kaggle/input/yolov-train-data/Bottle/IMG202408142240012.jpg')\n\n# Predict\nresults = model.predict(img)\n\n# Alternatively, you can use matplotlib to display the results\nplt.imshow(results[0].plot()) # `plot` returns an image with bounding boxes drawn\nplt.axis('off')\nplt.show()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Predicting on Video & detecting the First Frame, and its center coordinates","metadata":{}},{"cell_type":"code","source":"# Process the video\nvideo_path = 
'/kaggle/input/yolov-train-data/VID202408142242002.mp4'\ncap = cv2.VideoCapture(video_path)\n\nx_center=0\ny_center=0\nframe_number = 0\nobject_detected = False\n\nwhile cap.isOpened():\n ret, frame = cap.read()\n if not ret:\n break\n\n frame_number += 1\n\n # Run YOLOv8 detection\n results = model(frame)\n\n for r in results:\n if r.boxes: # Check if any object is detected\n for box in r.boxes:\n # Get the bounding box coordinates\n x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()\n\n # Calculate the center coordinates\n x_center = int((x1 + x2) / 2)\n y_center = int((y1 + y2) / 2)\n \n # Print the first frame number and center coordinates\n print(f\"First detection at frame: {frame_number}\")\n print(f\"Center coordinates: (x={x_center}, y={y_center})\")\n\n object_detected = True\n break\n\n if object_detected:\n break\n\ncap.release()\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(\"x_center:\",x_center)\nprint(\"y_center:\",y_center)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Using Yolov8s pretrained model for direct detection and getting the frame","metadata":{}},{"cell_type":"markdown","source":"#### just mention class name and it will return frame no. and coordinates","metadata":{}},{"cell_type":"code","source":"# Load the YOLOv8s model\nmodel = YOLO('yolov8s.pt') # Make sure the model is trained on the \"bottle\" class\n\n# Process the video\nvideo_path = '/kaggle/input/yolov-train-data/VID202408142242002.mp4'\ncap = cv2.VideoCapture(video_path)\n\nx_center = 0\ny_center = 0\nframe_number = 0\nobject_detected = False\nconfidence_threshold = 0.8 # Set the confidence threshold\n\nwhile cap.isOpened():\n ret, frame = cap.read()\n if not ret:\n break\n\n frame_number += 1\n\n # Run YOLOv8 detection\n results = model(frame)\n\n for r in results:\n for box in r.boxes:\n # Get the class label for the detected object\n cls = int(box.cls[0].cpu().numpy())\n class_name = model.names[cls]\n\n # Check if the detected object is a \"bottle\" and has confidence > 0.8\n confidence = box.conf[0].cpu().numpy()\n if class_name == 'bottle' and confidence > confidence_threshold:\n # Get the bounding box coordinates\n x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()\n\n # Calculate the center coordinates\n x_center = int((x1 + x2) / 2)\n y_center = int((y1 + y2) / 2)\n \n # Print the first frame number and center coordinates\n print(f\"First bottle detection at frame: {frame_number}\")\n print(f\"Center coordinates: (x={x_center}, y={y_center}) with confidence {confidence:.2f}\")\n\n object_detected = True\n break # Exit the loop after the first detection\n\n if object_detected:\n break # Exit the main loop after the first detection\n\ncap.release()\n\n# If no bottle was detected with confidence > 0.8\nif not object_detected:\n print(\"No requested Object detected in the video with confidence greater than 0.8.\")\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(\"x_center:\",x_center)\nprint(\"y_center:\",y_center)\nprint(\"Frame No.:\",frame_number)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"#### clearing GPU cache","metadata":{}},{"cell_type":"code","source":"import torch\ntorch.cuda.empty_cache()\nprint(\"Done\")","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Video segmenting","metadata":{}},{"cell_type":"markdown","source":"### 
importing SAM-2 model (may take a while to download)","metadata":{}},{"cell_type":"code","source":"!git clone https://github.com/facebookresearch/segment-anything-2.git\n%cd /kaggle/working/segment-anything-2\n%pip install -e .\n%cd /kaggle/working/segment-anything-2/checkpoints\n!bash /kaggle/working/segment-anything-2/checkpoints/download_ckpts.sh\n%cd /kaggle/working/segment-anything-2","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import numpy as np\nimport torch\nimport matplotlib.pyplot as plt\nfrom PIL import Image","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# use bfloat16 for the entire notebook\ntorch.autocast(device_type=\"cuda\", dtype=torch.float16).__enter__()\n\nif torch.cuda.get_device_properties(0).major >= 8:\n # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)\n torch.backends.cuda.matmul.allow_tf32 = True\n torch.backends.cudnn.allow_tf32 = True","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## video to frames","metadata":{}},{"cell_type":"code","source":"import cv2\nimport os\nimport shutil\n\ndef video_to_frames(video_path, output_folder):\n # Ensure the output folder is clean\n if os.path.exists(output_folder):\n shutil.rmtree(output_folder)\n os.makedirs(output_folder)\n \n # Open the video file\n video_capture = cv2.VideoCapture(video_path)\n \n frame_count = 0\n success = True\n\n while success:\n success, frame = video_capture.read()\n if success:\n # Save the frame with a consistent naming convention\n frame_filename = os.path.join(output_folder, f\"{frame_count:05d}.jpg\")\n cv2.imwrite(frame_filename, frame)\n frame_count += 1\n\n video_capture.release()\n print(f\"Extracted {frame_count} frames to {output_folder}\")\n return frame_count\n\n# Example usage\nvideo_path = \"/kaggle/input/shaolin-soccer/Untitled video - Made with Clipchamp.mp4\"\noutput_folder = \"/kaggle/working/output_frames\"\ntotal_frames = video_to_frames(video_path, output_folder)\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## reordering Frames to video propagation\n","metadata":{}},{"cell_type":"code","source":"frame_number =0 ","metadata":{"execution":{"iopub.status.busy":"2024-08-23T05:45:01.624801Z","iopub.execute_input":"2024-08-23T05:45:01.625582Z","iopub.status.idle":"2024-08-23T05:45:01.636025Z","shell.execute_reply.started":"2024-08-23T05:45:01.625533Z","shell.execute_reply":"2024-08-23T05:45:01.634951Z"},"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"markdown","source":"### (replace it with **frame_number** if using YOLO model)\n\n#### frame_number = frame_number","metadata":{}},{"cell_type":"code","source":"import os\nimport shutil\n\ndef reorder_frames(video_dir, ann_frame_idx, output_dir):\n # Ensure the output directory is clean\n if os.path.exists(output_dir):\n shutil.rmtree(output_dir)\n os.makedirs(output_dir)\n \n # Get and sort the list of frame filenames\n frame_names = [\n p for p in os.listdir(video_dir)\n if os.path.splitext(p)[-1] in [\".jpg\", \".jpeg\", \".JPG\", \".JPEG\"]\n ]\n frame_names.sort(key=lambda p: int(os.path.splitext(p)[0]))\n \n total_frames = len(frame_names)\n \n # Copy and reorder the frames to the new directory\n for i in range(total_frames):\n if i >= ann_frame_idx:\n new_idx = i - ann_frame_idx\n else:\n new_idx = total_frames - ann_frame_idx + i\n 
old_path = os.path.join(video_dir, frame_names[i])\n new_path = os.path.join(output_dir, f\"{new_idx:05d}.jpg\")\n shutil.copy2(old_path, new_path)\n \n print(f\"Frames reordered and copied to {output_dir} successfully.\")\n return len(os.listdir(output_dir))\n\n# Example usage\nreordered_dir = \"/kaggle/working/reordered_frames\"\nann_frame_idx = frame_number # Frame index to start as 0\nreordered_count = reorder_frames(output_folder, ann_frame_idx, reordered_dir)\n\n# Verify total frame consistency\nif total_frames == reordered_count:\n print(\"Frame count matches after reordering.\")\nelse:\n print(f\"Frame count mismatch! Extracted: {total_frames}, Reordered: {reordered_count}\")\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Importing Model and creating predictor","metadata":{}},{"cell_type":"code","source":"from sam2.build_sam import build_sam2_video_predictor\n\nsam2_checkpoint = \"/kaggle/working/segment-anything-2/checkpoints/sam2_hiera_base_plus.pt\"\nmodel_cfg = \"sam2_hiera_b+.yaml\"\n\npredictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## checking image where object is detected","metadata":{}},{"cell_type":"code","source":"frame_no = frame_number\n\ndef show_mask(mask, ax, obj_id=None, random_color=False):\n if random_color:\n color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)\n else:\n cmap = plt.get_cmap(\"tab10\")\n cmap_idx = 0 if obj_id is None else obj_id\n color = np.array([*cmap(cmap_idx)[:3], 0.6])\n h, w = mask.shape[-2:]\n mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)\n ax.imshow(mask_image)\n\n\ndef show_points(coords, labels, ax, marker_size=200):\n pos_points = coords[labels==1]\n neg_points = coords[labels==0]\n ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)\n ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)\n \n# `video_dir` a directory of JPEG frames with filenames like `.jpg`\nvideo_dir = \"/kaggle/working/reordered_frames\"\n\n# scan all the JPEG frame names in this directory\nframe_names = [\n p for p in os.listdir(video_dir)\n if os.path.splitext(p)[-1] in [\".jpg\", \".jpeg\", \".JPG\", \".JPEG\"]\n]\nframe_names.sort(key=lambda p: int(os.path.splitext(p)[0]))\n\n# take a look the first video frame\nframe_idx = frame_no\nplt.figure(figsize=(12, 8))\nplt.title(f\"frame {frame_idx}\")\nplt.imshow(Image.open(os.path.join(video_dir, frame_names[frame_idx])))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"inference_state = predictor.init_state(video_path=video_dir)\npredictor.reset_state(inference_state)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Masking the image object where object is detected in frame with coordinates","metadata":{}},{"cell_type":"code","source":"x_center= 1050\ny_center = 650","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### in case using Yolo model replace,\n\n### x_center =x_center\n### y_center =y_center","metadata":{}},{"cell_type":"code","source":"ann_frame_idx = 0 # the frame index we interact with\nann_obj_id = 1 # give a unique id to each object we interact with (it can be any integers)\nx = x_center\ny = 
y_center\n\npoints = np.array([[x,y]], dtype=np.float32)\nlabels = np.array([1], np.int32)\n_, out_obj_ids, out_mask_logits = predictor.add_new_points(\n inference_state=inference_state,\n frame_idx=ann_frame_idx,\n obj_id=ann_obj_id,\n points=points,\n labels=labels,\n)\n\nplt.figure(figsize=(12, 8))\nplt.title(f\"frame {ann_frame_idx}\")\nplt.imshow(Image.open(os.path.join(video_dir, frame_names[ann_frame_idx])))\nshow_points(points, labels, plt.gca())\nshow_mask((out_mask_logits[0] > 0.0).cpu().numpy(), plt.gca(), obj_id=out_obj_ids[0])","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Note: provide additional points if object not detected properly\n\n### in the format\n#### points = np.array([[x,y],[x1,y1],[x2,y2]], dtype=np.float32)\n#### labels = np.array([1,1,1], np.int32)\n\n#### in labels 1 indicate inclusive and 0 excluding point","metadata":{}},{"cell_type":"code","source":"def count_files_in_folder(folder_path):\n \"\"\"\n Count the number of files in a given folder.\n \n Args:\n - folder_path (str): Path to the folder.\n \n Returns:\n - int: Number of files in the folder.\n \"\"\"\n return len([f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))])\n\n# Example usage\nfolder_path = \"/kaggle/working/reordered_frames\" # Replace with your actual folder path\nnum_files = count_files_in_folder(folder_path)\nprint(f\"Number of files in the folder: {num_files}\")\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Mask generation\n### Propagating into Video with reordered Frames","metadata":{}},{"cell_type":"markdown","source":"### if Addition points are provided also change them in below code","metadata":{}},{"cell_type":"code","source":"import os\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom PIL import Image\nimport shutil # Importing shutil to remove directories\n\ndef apply_mask_to_image(frame, mask):\n \"\"\"\n Apply a mask to an image frame, setting non-mask areas to zero.\n \"\"\"\n h, w, _ = frame.shape\n mask_resized = np.resize(mask, (h, w)) # Resize mask to match frame dimensions\n mask_3d = np.repeat(mask_resized[:, :, np.newaxis], 3, axis=2) # Expand mask dimensions for RGB channels\n masked_frame = frame * mask_3d # Apply the mask to the frame\n return masked_frame\n\ndef show_mask(mask, ax, obj_id=None, random_color=False):\n if random_color:\n color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)\n else:\n cmap = plt.get_cmap(\"tab10\")\n cmap_idx = 0 if obj_id is None else obj_id\n color = np.array([*cmap(cmap_idx)[:3], 0.6])\n h, w = mask.shape[-2:]\n mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)\n ax.imshow(mask_image)\n\ndef show_points(coords, labels, ax, marker_size=200):\n pos_points = coords[labels == 1]\n neg_points = coords[labels == 0]\n ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)\n ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)\n\n# `video_dir` a directory of JPEG frames with filenames like `.jpg`\nvideo_dir = \"/kaggle/working/reordered_frames\"\n\n# Scan all the JPEG frame names in this directory\nframe_names = [\n p for p in os.listdir(video_dir)\n if os.path.splitext(p)[-1] in [\".jpg\", \".jpeg\", \".JPG\", \".JPEG\"]\n]\nframe_names.sort(key=lambda p: int(os.path.splitext(p)[0]))\n\n# Initialize predictor and 
inference state\ninference_state = predictor.init_state(video_path=video_dir)\n\n# Reset the predictor state\npredictor.reset_state(inference_state)\n\n# Frame and object IDs\nann_frame_idx = 0 # frames are reordered\nann_obj_id = 1 # Give a unique ID to each object we interact with (can be any integer)\n\n# Add a 2nd positive click at (x, y) = (250, 220) to refine the mask\npoints = np.array([[x,y]], dtype=np.float32)\nlabels = np.array([1], np.int32) # 1 means positive click, 0 means negative click\n_, out_obj_ids, out_mask_logits = predictor.add_new_points(\n inference_state=inference_state,\n frame_idx=ann_frame_idx,\n obj_id=ann_obj_id,\n points=points,\n labels=labels,\n)\n\n# Run propagation throughout the video and collect the results in a dict\nvideo_segments = {} # video_segments contains the per-frame segmentation results\nfor out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state):\n video_segments[out_frame_idx] = {\n out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()\n for i, out_obj_id in enumerate(out_obj_ids)\n }\n\n# Create an output directory for images\noutput_dir = '/kaggle/working/mask_segmentation_images'\nif not os.path.exists(output_dir):\n os.makedirs(output_dir)\nelse:\n # If the directory exists, clear its kaggle/workings\n for filename in os.listdir(output_dir):\n file_path = os.path.join(output_dir, filename)\n try:\n if os.path.isfile(file_path) or os.path.islink(file_path):\n os.unlink(file_path)\n elif os.path.isdir(file_path):\n shutil.rmtree(file_path)\n except Exception as e:\n print(f\"Failed to delete {file_path}. Reason: {e}\")\n\n# Render and save masked images every few frames\nvis_frame_stride = 1\nplt.close(\"all\")\nfor out_frame_idx in range(0, len(frame_names), vis_frame_stride):\n frame = np.array(Image.open(os.path.join(video_dir, frame_names[out_frame_idx])))\n masked_frame = frame.copy() # Create a copy of the frame for modification\n for out_obj_id, out_mask in video_segments[out_frame_idx].items():\n masked_frame = apply_mask_to_image(masked_frame, out_mask)\n\n # Convert masked frame to Image object for saving\n masked_image = Image.fromarray(masked_frame.astype('uint8'))\n masked_image.save(os.path.join(output_dir, f'frame_{out_frame_idx}.png'))\n\n # Optional: Display the masked frame\n# plt.figure(figsize=(6, 4))\n# plt.title(f\"frame {out_frame_idx}\")\n# plt.imshow(masked_frame)\n# plt.show()\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### we can also display the masked frame(s) by un-commenting the last 4 rows","metadata":{}},{"cell_type":"markdown","source":"## restore Original order of the video frames\n\n### this will restore the original order of the frames","metadata":{}},{"cell_type":"code","source":"import os\nimport shutil\n\ndef restore_original_order(video_dir, ann_frame_idx, output_dir):\n \"\"\"\n Restore the original order of frames from a directory and save them into a new directory.\n \n Args:\n - video_dir (str): Directory containing the reordered frames.\n - ann_frame_idx (int): The frame index used to start the reordering.\n - output_dir (str): Directory to save the restored frames.\n \"\"\"\n # Ensure the output directory is clean\n if os.path.exists(output_dir):\n shutil.rmtree(output_dir)\n os.makedirs(output_dir)\n \n # Get a list of all frame filenames in the original directory\n frame_names = [\n p for p in os.listdir(video_dir)\n if p.endswith(\".png\") and p.startswith(\"frame_\")\n ]\n \n # Ensure frames are 
sorted numerically by extracting the number from the filename\n frame_names.sort(key=lambda p: int(p.split('_')[-1].split('.')[0]))\n\n # Calculate total number of frames\n total_frames = len(frame_names)\n\n # Calculate the original frame indices\n original_indices = {}\n for i in range(total_frames):\n if i < (total_frames - ann_frame_idx):\n original_idx = i + ann_frame_idx\n else:\n original_idx = i - (total_frames - ann_frame_idx)\n original_indices[frame_names[i]] = f\"frame_{original_idx:03d}.png\"\n \n # Copy and rename the files into the new directory\n for old_name, new_name in original_indices.items():\n old_path = os.path.join(video_dir, old_name)\n new_path = os.path.join(output_dir, new_name)\n shutil.copy2(old_path, new_path)\n \n print(f\"Frames restored to original order and saved to {output_dir} successfully.\")\n\n# Example usage\nvideo_dir = \"/kaggle/working/mask_segmentation_images\" # Replace with your original frames directory\nann_frame_idx = 0 # The frame index used to start the reordering\noutput_dir = \"/kaggle/working/restored_frames\" # Replace with your desired output folder path\nrestore_original_order(video_dir, ann_frame_idx, output_dir)\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## converting mask Frames back to video","metadata":{}},{"cell_type":"code","source":"import cv2\nimport os\n\ndef frames_to_video(frames_folder, output_video_path, fps=30):\n # Check if the output video file already exists and delete it\n if os.path.exists(output_video_path):\n try:\n os.remove(output_video_path)\n print(f\"Existing file {output_video_path} removed.\")\n except Exception as e:\n print(f\"Failed to remove {output_video_path}. Reason: {e}\")\n return\n\n # Get a list of frame files and sort them by name\n frame_files = [f for f in os.listdir(frames_folder) if f.endswith('.png')]\n frame_files.sort(key=lambda f: int(f.split('_')[-1].split('.')[0])) # Sort by frame number\n\n # Check if there are any frames to process\n if not frame_files:\n print(\"No frames found in the specified folder.\")\n return\n\n # Read the first frame to get the dimensions\n first_frame_path = os.path.join(frames_folder, frame_files[0])\n first_frame = cv2.imread(first_frame_path)\n if first_frame is None:\n print(f\"Failed to read the first frame at {first_frame_path}\")\n return\n height, width, _ = first_frame.shape\n\n # Initialize the video writer\n fourcc = cv2.VideoWriter_fourcc(*'mp4v') # Codec for mp4 format\n video_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))\n\n # Write each frame to the video\n for frame_file in frame_files:\n frame_path = os.path.join(frames_folder, frame_file)\n frame = cv2.imread(frame_path)\n if frame is None:\n print(f\"Failed to read frame at {frame_path}\")\n continue\n video_writer.write(frame)\n\n # Release the video writer\n video_writer.release()\n print(f\"Video saved to {output_video_path}\")\n\n# Example usage\nframes_folder = r'/kaggle/working/restored_frames' # Replace with the folder containing your frames\noutput_video_path = r\"/kaggle/working/mask_output_video.mp4\" # Desired output video file path\n\nframes_to_video(frames_folder, output_video_path, fps=30)\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Inverse Mask Generation","metadata":{}},{"cell_type":"markdown","source":"### similarly in case of additional points make changes here also","metadata":{}},{"cell_type":"code","source":"def 
clear_output_directory(directory):\n \"\"\"\n Remove all files in the given directory.\n \"\"\"\n if os.path.exists(directory):\n for file in os.listdir(directory):\n file_path = os.path.join(directory, file)\n try:\n if os.path.isfile(file_path):\n os.unlink(file_path)\n except Exception as e:\n print(f\"Failed to delete {file_path}. Reason: {e}\")\n\ndef apply_inverse_mask_to_image(frame, mask):\n \"\"\"\n Apply the inverse of a mask to an image frame, setting mask areas to zero.\n \"\"\"\n h, w, _ = frame.shape\n mask_resized = np.resize(mask, (h, w)) # Resize mask to match frame dimensions\n inverse_mask = 1 - mask_resized # Invert the mask\n mask_3d = np.repeat(inverse_mask[:, :, np.newaxis], 3, axis=2) # Expand mask dimensions for RGB channels\n masked_frame = frame * mask_3d # Apply the inverse mask to the frame\n return masked_frame\n\ndef save_masked_image(masked_frame, out_frame_idx, output_dir):\n \"\"\"\n Save the masked image to the output directory.\n \"\"\"\n # Convert masked frame to Image object for saving\n masked_image = Image.fromarray(masked_frame.astype('uint8'))\n masked_image.save(os.path.join(output_dir, f'frame_{out_frame_idx}.png'))\n\ndef show_mask(mask, ax, obj_id=None, random_color=False):\n if random_color:\n color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)\n else:\n cmap = plt.get_cmap(\"tab10\")\n cmap_idx = 0 if obj_id is None else obj_id\n color = np.array([*cmap(cmap_idx)[:3], 0.6])\n h, w = mask.shape[-2:]\n mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)\n ax.imshow(mask_image)\n\ndef show_points(coords, labels, ax, marker_size=200):\n pos_points = coords[labels == 1]\n neg_points = coords[labels == 0]\n ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)\n ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)\n\n# `video_dir` a directory of JPEG frames with filenames like `.jpg`\nvideo_dir = \"/kaggle/working/reordered_frames\"\n\n# Scan all the JPEG frame names in this directory\nframe_names = [\n p for p in os.listdir(video_dir)\n if os.path.splitext(p)[-1] in [\".jpg\", \".jpeg\", \".JPG\", \".JPEG\"]\n]\nframe_names.sort(key=lambda p: int(os.path.splitext(p)[0]))\n\n# Initialize predictor and inference state\ninference_state = predictor.init_state(video_path=video_dir)\n\n# Reset the predictor state\npredictor.reset_state(inference_state)\n\n# Frame and object IDs\nann_frame_idx = 0 # The frame index we interact with\nann_obj_id = 1 # Give a unique ID to each object we interact with (can be any integer)\n\n# Add a 2nd positive click at (x, y) = (250, 220) to refine the mask\npoints = np.array([[x,y]], dtype=np.float32)\nlabels = np.array([1], np.int32) # 1 means positive click, 0 means negative click\n_, out_obj_ids, out_mask_logits = predictor.add_new_points(\n inference_state=inference_state,\n frame_idx=ann_frame_idx,\n obj_id=ann_obj_id,\n points=points,\n labels=labels,\n)\n\n# Run propagation throughout the video and collect the results in a dict\nvideo_segments = {} # video_segments contains the per-frame segmentation results\nfor out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state):\n video_segments[out_frame_idx] = {\n out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()\n for i, out_obj_id in enumerate(out_obj_ids)\n }\n\n# Create an output directory for images\noutput_dir = 
'/kaggle/working/inverse_segmentation_images'\nos.makedirs(output_dir, exist_ok=True)\n\n# Clear the output directory\nclear_output_directory(output_dir)\n\n# Render and save inverse masked images every few frames\nvis_frame_stride = 1\nplt.close(\"all\")\nfor out_frame_idx in range(0, len(frame_names), vis_frame_stride):\n frame = np.array(Image.open(os.path.join(video_dir, frame_names[out_frame_idx])))\n masked_frame = frame.copy() # Create a copy of the frame for modification\n for out_obj_id, out_mask in video_segments[out_frame_idx].items():\n masked_frame = apply_inverse_mask_to_image(masked_frame, out_mask)\n\n # Save the inverse masked frame\n save_masked_image(masked_frame, out_frame_idx, output_dir)\n\n # Optional: Display the inverse masked frame\n # plt.figure(figsize=(6, 4))\n # plt.title(f\"frame {out_frame_idx}\")\n # plt.imshow(masked_frame)\n # plt.show()\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## restoring to original frames of inverse mask","metadata":{}},{"cell_type":"code","source":"video_dir = \"/kaggle/working/inverse_segmentation_images\" # Replace with your original frames directory\nann_frame_idx = 0 # The frame index used to start the reordering\noutput_dir = \"/kaggle/working/inverse_restored_frames\" # Replace with your desired output folder path\nrestore_original_order(video_dir, ann_frame_idx, output_dir)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## converting inverse mask frames to video","metadata":{}},{"cell_type":"code","source":"frames_folder = r'/kaggle/working/inverse_restored_frames' # Replace with the folder containing your frames\noutput_video_path = r\"/kaggle/working/inverse_mask_output_video.mp4\" # Desired output video file path\n\nframes_to_video(frames_folder, output_video_path, fps=30)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Video mask Pixelation","metadata":{}},{"cell_type":"code","source":"def pixelate_area(image, mask, pixelation_level):\n \"\"\"\n Apply pixelation to the masked area of an image.\n\n Parameters:\n - image: NumPy array of the image to be pixelated.\n - mask: Boolean NumPy array indicating the masked area.\n - pixelation_level: Int, the size of the blocks used for pixelation.\n \"\"\"\n # Create a copy of the image to modify\n pixelated_image = image.copy()\n\n # Get image dimensions\n h, w, _ = image.shape\n\n # Loop through the masked area and apply pixelation\n for y in range(0, h, pixelation_level):\n for x in range(0, w, pixelation_level):\n # Define the block area\n block = (slice(y, min(y + pixelation_level, h)), slice(x, min(x + pixelation_level, w)))\n\n # Check if the block is within the masked area\n if np.any(mask[block]):\n # Compute the mean color of the block\n mean_color = image[block].mean(axis=(0, 1)).astype(int)\n\n # Apply the mean color to the block\n pixelated_image[block] = mean_color\n\n return pixelated_image\n\ndef combine_pixelated_mask(masked_image_path, inverse_masked_image_path, save_path, pixelation_level=10):\n \"\"\"\n Combine the pixelated masked areas from the masked image with the inverse-masked image.\n\n Parameters:\n - masked_image_path: String, path to the masked image.\n - inverse_masked_image_path: String, path to the inverse-masked image.\n - save_path: String, path where the combined image will be saved.\n - pixelation_level: Int, the size of the blocks used for pixelation.\n \"\"\"\n # Open images\n 
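# Caveat: the mask is recovered below by treating every non-black pixel of the masked image as part of the mask (np.any(masked_array[..., :3] > 0)), so any pure-black pixels inside the segmented object are dropped from the mask.\n 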
masked_image = Image.open(masked_image_path).convert(\"RGBA\")\n inverse_masked_image = Image.open(inverse_masked_image_path).convert(\"RGBA\")\n\n # Ensure images are the same size by resizing the inverse image\n if masked_image.size != inverse_masked_image.size:\n inverse_masked_image = inverse_masked_image.resize(masked_image.size)\n\n # Convert images to numpy arrays\n masked_array = np.array(masked_image)\n inverse_masked_array = np.array(inverse_masked_image)\n\n # Create a mask where the original mask was applied (non-zero areas in any color channel)\n mask = np.any(masked_array[..., :3] > 0, axis=-1)\n\n # Pixelate the masked area\n pixelated_mask = pixelate_area(masked_array, mask, pixelation_level)\n\n # Replace inverse-masked image values with pixelated masked image values where mask is true\n combined_array = inverse_masked_array.copy()\n combined_array[mask] = pixelated_mask[mask]\n\n # Convert back to image\n combined_image = Image.fromarray(combined_array)\n\n # Save the combined image\n combined_image.save(save_path)\n print(f\"Combined image saved as {save_path}\")\n\n# # Display the combined image\n# plt.imshow(combined_image)\n# plt.axis('off')\n# plt.show()\n\n# Directory paths\nmasked_images_dir = \"/kaggle/working/restored_frames\"\ninverse_images_dir = \"/kaggle/working/inverse_restored_frames\"\noutput_dir = \"/kaggle/working/pixelated_combined_images\"\n\n# Ensure the output directory exists\nos.makedirs(output_dir, exist_ok=True)\n\n# Get and sort the list of image files\nimage_files = [f for f in os.listdir(masked_images_dir) if f.startswith(\"frame_\") and f.endswith(\".png\")]\nimage_files.sort(key=lambda f: int(f.split('_')[-1].split('.')[0]))\n\n# Iterate over the sorted files\nfor image_name in image_files:\n masked_image_path = os.path.join(masked_images_dir, image_name)\n inverse_image_path = os.path.join(inverse_images_dir, image_name)\n save_path = os.path.join(output_dir, f\"pixelated_combined_{image_name}\")\n\n # Check if the corresponding inverse image exists before combining\n if os.path.exists(inverse_image_path):\n combine_pixelated_mask(masked_image_path, inverse_image_path, save_path, pixelation_level=20)\n else:\n print(f\"Warning: Missing inverse file for {image_name}. 
Skipping combination.\")\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## converting pixelated frames to video","metadata":{}},{"cell_type":"code","source":"def frames_to_video(frames_folder, output_video_path, fps=30):\n # Get a list of frame files and sort them by name\n frame_files = [f for f in os.listdir(frames_folder) if f.endswith('.png')]\n\n # Sort by frame number, assuming the filename format is \"frame_.png\"\n frame_files.sort(key=lambda f: int(f.split('_')[-1].split('.')[0]))\n\n if not frame_files:\n print(\"No frame files found in the specified directory.\")\n return\n\n # Read the first frame to get the dimensions\n first_frame_path = os.path.join(frames_folder, frame_files[0])\n first_frame = cv2.imread(first_frame_path)\n if first_frame is None:\n print(f\"Error reading the first frame: {first_frame_path}\")\n return\n\n height, width, _ = first_frame.shape\n\n # Initialize the video writer\n fourcc = cv2.VideoWriter_fourcc(*'mp4v') # Codec for mp4 format\n video_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))\n\n # Write each frame to the video\n for frame_file in frame_files:\n frame_path = os.path.join(frames_folder, frame_file)\n frame = cv2.imread(frame_path)\n if frame is not None:\n video_writer.write(frame)\n else:\n print(f\"Error reading frame: {frame_path}\")\n\n # Release the video writer\n video_writer.release()\n print(f\"Video saved to {output_video_path}\")\n\n# Example usage\nframes_folder = '/kaggle/working/pixelated_combined_images' # Replace with the folder containing your frames\noutput_video_path = \"/kaggle/working/pixelated_combined_images_output_video.mp4\" # Desired output video file path\n\nframes_to_video(frames_folder, output_video_path, fps=30)\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## side by side video of original with pixelated video.","metadata":{}},{"cell_type":"code","source":"from PIL import Image\nimport os\nimport subprocess\nimport shutil\n\n# Directories for the input frames and output combined frames (switched)\ndir1 = '/kaggle/working/output_frames' # Formerly dir2\ndir2 = '/kaggle/working/pixelated_combined_images' # Formerly dir1\noutput_dir = '/kaggle/working/combined_frames_pix'\nvideo_output = '/kaggle/working/pix_output_video.mp4'\n\n# Ensure the output directory exists and is empty\nif os.path.exists(output_dir):\n shutil.rmtree(output_dir) # Remove the directory and its contents\nos.makedirs(output_dir) # Recreate the empty directory\n\n# Remove the previous video if it exists\nif os.path.exists(video_output):\n os.remove(video_output)\n\n# Get sorted lists of the frames\nframes1 = sorted([f for f in os.listdir(dir1) if f.endswith('.jpg')])\nframes2 = sorted([f for f in os.listdir(dir2) if f.endswith('.png')])\n\n# Iterate over both directories and combine images\nfor idx, (f1, f2) in enumerate(zip(frames1, frames2), start=1):\n img1 = Image.open(os.path.join(dir1, f1))\n img2 = Image.open(os.path.join(dir2, f2))\n \n # Assuming both images have the same height, concatenate side by side\n combined_img = Image.new('RGB', (img1.width + img2.width, img1.height))\n combined_img.paste(img1, (0, 0))\n combined_img.paste(img2, (img1.width, 0))\n \n # Save combined image with a sequential name like combined_frame_001.png\n combined_img.save(os.path.join(output_dir, f\"combined_frame_{idx:03d}.png\"))\n\nprint(f\"Frames combined and saved in 
{output_dir}\")\n\n# List the files in the output directory to verify they exist\nprint(\"Files in output directory:\", os.listdir(output_dir))\n\n# Convert the combined frames into a video using ffmpeg\nsubprocess.run([\n 'ffmpeg', '-framerate', '30', '-i', \n f'{output_dir}/combined_frame_%03d.png', '-c:v', \n 'libx264', '-pix_fmt', 'yuv420p', video_output\n])\n\nprint(f\"Video saved as {video_output}\")\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Masked area Hue change in video","metadata":{}},{"cell_type":"code","source":"import matplotlib.colors as mcolors\n\ndef change_hue(image, mask, hue_shift):\n \"\"\"\n Change the hue of the masked area in an image.\n\n Parameters:\n - image: NumPy array of the image to be modified (in RGB).\n - mask: Boolean NumPy array indicating the masked area.\n - hue_shift: Float, amount to shift the hue (0 to 1 for a complete cycle).\n \"\"\"\n # Convert the image to float in the range [0, 1]\n float_image = image.astype('float32') / 255.0\n\n # Convert to HSV\n hsv_image = mcolors.rgb_to_hsv(float_image)\n\n # Change the hue in the masked area\n hsv_image[..., 0][mask] = (hsv_image[..., 0][mask] + hue_shift) % 1.0\n\n # Convert back to RGB\n modified_float_image = mcolors.hsv_to_rgb(hsv_image)\n\n # Scale back to [0, 255]\n modified_image = (modified_float_image * 255).astype('uint8')\n\n return modified_image\n\ndef combine_hue_modified_mask(masked_image_path, inverse_masked_image_path, save_path, hue_shift=0.1):\n \"\"\"\n Combine the hue-modified masked areas from the masked image with the inverse-masked image.\n\n Parameters:\n - masked_image_path: String, path to the masked image.\n - inverse_masked_image_path: String, path to the inverse-masked image.\n - save_path: String, path where the combined image will be saved.\n - hue_shift: Float, amount to shift the hue (0 to 1 for a complete cycle).\n \"\"\"\n # Open images\n masked_image = Image.open(masked_image_path).convert(\"RGBA\")\n inverse_masked_image = Image.open(inverse_masked_image_path).convert(\"RGBA\")\n\n # Ensure images are the same size by resizing the inverse image\n if masked_image.size != inverse_masked_image.size:\n inverse_masked_image = inverse_masked_image.resize(masked_image.size)\n\n # Convert images to numpy arrays\n masked_array = np.array(masked_image)\n inverse_masked_array = np.array(inverse_masked_image)\n\n # Create a mask where the original mask was applied (non-zero areas in any color channel)\n mask = np.any(masked_array[..., :3] > 0, axis=-1)\n\n # Change the hue of the masked area\n hue_modified_mask = change_hue(masked_array[..., :3], mask, hue_shift)\n\n # Replace inverse-masked image values with hue-modified masked image values where mask is true\n combined_array = inverse_masked_array.copy()\n combined_array[mask] = np.dstack((hue_modified_mask, masked_array[..., 3]))[mask] # Preserve alpha channel\n\n # Convert back to image\n combined_image = Image.fromarray(combined_array)\n\n # Save the combined image\n combined_image.save(save_path)\n print(f\"Combined image saved as {save_path}\")\n\n# # Display the combined image\n# plt.imshow(combined_image)\n# plt.axis('off')\n# plt.show()\n\n# Directory paths\nmasked_images_dir = \"/kaggle/working/restored_frames\"\ninverse_images_dir = \"/kaggle/working/inverse_restored_frames\"\noutput_dir = \"/kaggle/working/hue_combined_images\"\n\n# Ensure the output directory exists\nos.makedirs(output_dir, exist_ok=True)\n\n# Get and sort the list of image 
files\nimage_files = [f for f in os.listdir(masked_images_dir) if f.startswith(\"frame_\") and f.endswith(\".png\")]\nimage_files.sort(key=lambda f: int(f.split('_')[-1].split('.')[0]))\n\n# Iterate over the sorted files\nfor image_name in image_files:\n masked_image_path = os.path.join(masked_images_dir, image_name)\n inverse_image_path = os.path.join(inverse_images_dir, image_name)\n save_path = os.path.join(output_dir, f\"hue_modified_combined_{image_name}\")\n\n # Check if the corresponding inverse image exists before combining\n if os.path.exists(inverse_image_path):\n combine_hue_modified_mask(masked_image_path, inverse_image_path, save_path, hue_shift=0.25)\n else:\n print(f\"Warning: Missing inverse file for {image_name}. Skipping combination.\")\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## converting hue-changed frames back to video","metadata":{}},{"cell_type":"code","source":"# Example usage\nframes_folder = '/kaggle/working/hue_combined_images' # Replace with the folder containing your frames\noutput_video_path = \"/kaggle/working/hue_combined_images_output_video.mp4\" # Desired output video file path\n\nframes_to_video(frames_folder, output_video_path, fps=30)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## side by side video of original with Hue video.","metadata":{}},{"cell_type":"code","source":"from PIL import Image\nimport os\nimport subprocess\nimport shutil\n\n# Directories for the input frames and output combined frames (switched)\ndir1 = '/kaggle/working/output_frames' # Formerly dir2\ndir2 = '/kaggle/working/hue_combined_images' # Formerly dir1\noutput_dir = '/kaggle/working/hue_with_og_combined_frames'\nvideo_output = '/kaggle/working/hue_with_og_output_video.mp4'\n\n# Ensure the output directory exists and is empty\nif os.path.exists(output_dir):\n shutil.rmtree(output_dir) # Remove the directory and its contents\nos.makedirs(output_dir) # Recreate the empty directory\n\n# Remove the previous video if it exists\nif os.path.exists(video_output):\n os.remove(video_output)\n\n# Get sorted lists of the frames\nframes1 = sorted([f for f in os.listdir(dir1) if f.endswith('.jpg')])\nframes2 = sorted([f for f in os.listdir(dir2) if f.endswith('.png')])\n\n# Iterate over both directories and combine images\nfor idx, (f1, f2) in enumerate(zip(frames1, frames2), start=1):\n img1 = Image.open(os.path.join(dir1, f1))\n img2 = Image.open(os.path.join(dir2, f2))\n \n # Assuming both images have the same height, concatenate side by side\n combined_img = Image.new('RGB', (img1.width + img2.width, img1.height))\n combined_img.paste(img1, (0, 0))\n combined_img.paste(img2, (img1.width, 0))\n \n # Save combined image with a sequential name like combined_frame_001.png\n combined_img.save(os.path.join(output_dir, f\"combined_frame_{idx:03d}.png\"))\n\nprint(f\"Frames combined and saved in {output_dir}\")\n\n# List the files in the output directory to verify they exist\nprint(\"Files in output directory:\", os.listdir(output_dir))\n\n# Convert the combined frames into a video using ffmpeg\nsubprocess.run([\n 'ffmpeg', '-framerate', '30', '-i', \n f'{output_dir}/combined_frame_%03d.png', '-c:v', \n 'libx264', '-pix_fmt', 'yuv420p', video_output\n])\n\nprint(f\"Video saved as 
{video_output}\")\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Mask replacement with another video","metadata":{}},{"cell_type":"markdown","source":"### replacement video Link required","metadata":{}},{"cell_type":"code","source":"import os\nimport numpy as np\nfrom PIL import Image\nimport cv2\n\ndef replace_area_with_frames(image, mask, replacement_frames, frame_idx):\n \"\"\"\n Replace the masked area of an image with a different video frame.\n\n Parameters:\n - image: NumPy array of the image to modify.\n - mask: Boolean NumPy array indicating the masked area.\n - replacement_frames: List of NumPy arrays, each representing a video frame to use as a replacement.\n - frame_idx: Int, the index of the current frame in the replacement sequence.\n \"\"\"\n # Create a copy of the image to modify\n modified_image = image.copy()\n\n # Get the replacement frame, use the last one if index exceeds available frames\n replacement_frame = replacement_frames[min(frame_idx, len(replacement_frames) - 1)]\n\n # Resize the replacement frame to match the image size\n replacement_frame_resized = cv2.resize(replacement_frame, (image.shape[1], image.shape[0]))\n\n # Replace the masked area with the replacement frame\n modified_image[mask] = replacement_frame_resized[mask]\n\n return modified_image\n\ndef combine_mask_with_frames(masked_image_path, inverse_masked_image_path, replacement_frames, save_path, frame_idx):\n \"\"\"\n Combine the masked areas from the masked image with the inverse-masked image, using video frames to fill the masked area.\n\n Parameters:\n - masked_image_path: String, path to the masked image.\n - inverse_masked_image_path: String, path to the inverse-masked image.\n - replacement_frames: List of NumPy arrays, each representing a video frame to use as a replacement.\n - save_path: String, path where the combined image will be saved.\n - frame_idx: Int, the index of the current frame in the replacement sequence.\n \"\"\"\n # Open images\n masked_image = Image.open(masked_image_path).convert(\"RGBA\")\n inverse_masked_image = Image.open(inverse_masked_image_path).convert(\"RGBA\")\n\n # Ensure images are the same size by resizing the inverse image\n if masked_image.size != inverse_masked_image.size:\n inverse_masked_image = inverse_masked_image.resize(masked_image.size)\n\n # Convert images to numpy arrays\n masked_array = np.array(masked_image)\n inverse_masked_array = np.array(inverse_masked_image)\n\n # Create a mask where the original mask was applied (non-zero areas in any color channel)\n mask = np.any(masked_array[..., :3] > 0, axis=-1)\n\n # Replace the masked area with frames from the video\n replaced_area = replace_area_with_frames(masked_array, mask, replacement_frames, frame_idx)\n\n # Replace inverse-masked image values with the replaced area image values where mask is true\n combined_array = inverse_masked_array.copy()\n combined_array[mask] = replaced_area[mask]\n\n # Convert back to image\n combined_image = Image.fromarray(combined_array)\n\n # Save the combined image\n combined_image.save(save_path)\n print(f\"Combined image saved as {save_path}\")\n\n# Directory paths\nmasked_images_dir = \"/kaggle/working/restored_frames\"\ninverse_images_dir = \"/kaggle/working/inverse_restored_frames\"\noutput_dir = \"/kaggle/working/mask_replaced_combined_images\"\nreplacement_video_path = \"/kaggle/input/viedo-with-replacementy/Untitled 
video - Made with Clipchamp (1).mp4\" # input replacement video link\n\n# Ensure the output directory exists\nos.makedirs(output_dir, exist_ok=True)\n\n# Load the replacement video frames\nreplacement_frames = []\ncap = cv2.VideoCapture(replacement_video_path)\nwhile cap.isOpened():\n ret, frame = cap.read()\n if not ret:\n break\n replacement_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGBA))\ncap.release()\n\n# Get and sort the list of image files\nimage_files = [f for f in os.listdir(masked_images_dir) if f.startswith(\"frame_\") and f.endswith(\".png\")]\nimage_files.sort(key=lambda f: int(f.split('_')[-1].split('.')[0]))\n\n# Iterate over the sorted files\nfor frame_idx, image_name in enumerate(image_files):\n masked_image_path = os.path.join(masked_images_dir, image_name)\n inverse_image_path = os.path.join(inverse_images_dir, image_name)\n save_path = os.path.join(output_dir, f\"frame_combined_{image_name}\")\n\n # Check if the corresponding inverse image exists before combining\n if os.path.exists(inverse_image_path):\n combine_mask_with_frames(masked_image_path, inverse_image_path, replacement_frames, save_path, frame_idx)\n else:\n print(f\"Warning: Missing inverse file for {image_name}. Skipping combination.\")\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### replaced mask to video ","metadata":{}},{"cell_type":"code","source":"# Example usage\nframes_folder = '/kaggle/working/mask_replaced_combined_images' # Replace with the folder containing your frames\noutput_video_path = \"/kaggle/working/mask_replaced_combined_images_output_video.mp4\" # Desired output video file path\n\nframes_to_video(frames_folder, output_video_path, fps=30)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## side by side video of original with mask replaced video.","metadata":{}},{"cell_type":"code","source":"from PIL import Image\nimport os\nimport subprocess\nimport shutil\n\n# Directories for the input frames and output combined frames (switched)\ndir1 = '/kaggle/working/output_frames' \ndir2 = '/kaggle/working/mask_replaced_combined_images' \noutput_dir = '/kaggle/working/mask_replacement_with_orginal_combined_frames'\nvideo_output = '/kaggle/working/mask_replacement_with_orginal_output_video.mp4'\n\n# Ensure the output directory exists and is empty\nif os.path.exists(output_dir):\n shutil.rmtree(output_dir) # Remove the directory and its contents\nos.makedirs(output_dir) # Recreate the empty directory\n\n# Remove the previous video if it exists\nif os.path.exists(video_output):\n os.remove(video_output)\n\n# Get sorted lists of the frames\nframes1 = sorted([f for f in os.listdir(dir1) if f.endswith('.jpg')])\nframes2 = sorted([f for f in os.listdir(dir2) if f.endswith('.png')])\n\n# Iterate over both directories and combine images\nfor idx, (f1, f2) in enumerate(zip(frames1, frames2), start=1):\n img1 = Image.open(os.path.join(dir1, f1))\n img2 = Image.open(os.path.join(dir2, f2))\n \n # Assuming both images have the same height, concatenate side by side\n combined_img = Image.new('RGB', (img1.width + img2.width, img1.height))\n combined_img.paste(img1, (0, 0))\n combined_img.paste(img2, (img1.width, 0))\n \n # Save combined image with a sequential name like combined_frame_001.png\n combined_img.save(os.path.join(output_dir, f\"combined_frame_{idx:03d}.png\"))\n\nprint(f\"Frames combined and saved in {output_dir}\")\n\n# List the files in the output directory to verify they 
exist\nprint(\"Files in output directory:\", os.listdir(output_dir))\n\n# Convert the combined frames into a video using ffmpeg\nsubprocess.run([\n 'ffmpeg', '-framerate', '30', '-i', \n f'{output_dir}/combined_frame_%03d.png', '-c:v', \n 'libx264', '-pix_fmt', 'yuv420p', video_output\n])\n\nprint(f\"Video saved as {video_output}\")\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Masked area glow effect in video","metadata":{}},{"cell_type":"code","source":"from PIL import Image, ImageFilter\n\ndef apply_blur_to_masked_area(image, mask, blur_radius=10):\n \"\"\"\n Apply a blur effect to the masked area of an image.\n\n Parameters:\n - image: PIL Image object of the original image.\n - mask: Boolean NumPy array indicating the masked area.\n - blur_radius: Integer, the radius of the Gaussian blur for the blur effect.\n \"\"\"\n # Convert image to numpy array\n image_array = np.array(image)\n\n # Create a mask image\n mask_image = Image.fromarray((mask * 255).astype('uint8'), mode='L')\n\n # Apply a Gaussian blur to the mask image\n blurred_mask_image = mask_image.filter(ImageFilter.GaussianBlur(radius=blur_radius))\n\n # Convert the blurred mask to RGB\n blurred_mask_image = blurred_mask_image.convert('RGB')\n blurred_mask_array = np.array(blurred_mask_image)\n\n # Create an image with the same dimensions as the original image\n blurred_area = np.zeros_like(image_array[..., :3])\n blurred_area[mask] = blurred_mask_array[mask]\n\n # Combine the blurred area with the original image\n combined_array = np.where(blurred_area > 0, blurred_area, image_array[..., :3])\n combined_image = Image.fromarray(np.uint8(combined_array))\n\n # Preserve the alpha channel from the original image\n alpha_channel = image_array[..., 3]\n combined_image = Image.fromarray(np.dstack((combined_array, alpha_channel)))\n\n return combined_image\n\ndef combine_and_apply_blur(masked_image_path, inverse_masked_image_path, save_path, blur_radius):\n \"\"\"\n Apply a blur effect to the masked image and save the result.\n\n Parameters:\n - masked_image_path: String, path to the masked image (used to extract the mask).\n - inverse_masked_image_path: String, path to the inverse-masked image.\n - save_path: String, path where the final image will be saved.\n - blur_radius: Integer, the radius of the Gaussian blur for the blur effect.\n \"\"\"\n # Open inverse-masked image\n inverse_masked_image = Image.open(inverse_masked_image_path).convert(\"RGBA\")\n\n # Extract the mask from the masked image\n masked_image = Image.open(masked_image_path).convert(\"L\")\n mask = np.array(masked_image) > 0\n\n # Apply blur effect to the masked area\n blurred_image = apply_blur_to_masked_area(inverse_masked_image, mask, blur_radius)\n\n # Save the final image\n blurred_image.save(save_path)\n print(f\"Final image with blur effect saved as {save_path}\")\n\n# # Display the final image\n# plt.imshow(blurred_image)\n# plt.axis('off')\n# plt.show()\n\n# Directory paths\nmasked_images_dir = \"/kaggle/working/restored_frames\"\ninverse_images_dir = \"/kaggle/working/inverse_restored_frames\"\noutput_dir = \"/kaggle/working/blur_combined_images\"\n\n# Ensure the output directory exists\nos.makedirs(output_dir, exist_ok=True)\n\n# Get and sort the list of image files\nimage_files = [f for f in os.listdir(masked_images_dir) if f.startswith(\"frame_\") and f.endswith(\".png\")]\nimage_files.sort(key=lambda f: int(f.split('_')[-1].split('.')[0]))\n\n# Define blur radius\nblur_radius = 10\n\n# Iterate 
over the sorted files\nfor image_name in image_files:\n masked_image_path = os.path.join(masked_images_dir, image_name)\n inverse_image_path = os.path.join(inverse_images_dir, image_name)\n save_path = os.path.join(output_dir, f\"blur_combined_{image_name}\")\n\n # Check if the corresponding inverse image exists before combining\n if os.path.exists(inverse_image_path):\n combine_and_apply_blur(masked_image_path, inverse_image_path, save_path, blur_radius)\n else:\n print(f\"Warning: Missing inverse file for {image_name}. Skipping combination.\")\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### converting glow effect frames into video ","metadata":{}},{"cell_type":"code","source":"# Example usage\nframes_folder = '/kaggle/working/blur_combined_images' # Replace with the folder containing your frames\noutput_video_path = \"/kaggle/working/blur_combined_images_output_video.mp4\" # Desired output video file path\n\nframes_to_video(frames_folder, output_video_path, fps=30)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Generative AI propagation in video","metadata":{}},{"cell_type":"code","source":"!pip install stability-sdk","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Single API mask video generation","metadata":{}},{"cell_type":"markdown","source":"### The single API key code uses fewer Stability AI credits and can generate up to ~110 frames with 25 credits at the configuration given in the code below.\n\n### To generate an API key, sign up on the Stability AI platform (a new account gets 25 free credits), copy the API key and paste it into the code below","metadata":{}},{"cell_type":"markdown","source":"#### Note: Due to the high no. 
of frames quality is significantly poor for single API key","metadata":{}},{"cell_type":"code","source":"import os\nimport io\nimport warnings\nfrom PIL import Image\nfrom stability_sdk import client\nimport stability_sdk.interfaces.gooseai.generation.generation_pb2 as generation\n\n# Our Host URL should not be prepended with \"https\" nor should it have a trailing slash.\nos.environ['STABILITY_HOST'] = 'grpc.stability.ai:443'\n\n# Sign up for an account at the following link to get an API Key.\n# https://platform.stability.ai/\n\n# Click on the following link once you have created an account to be taken to your API Key.\n# https://platform.stability.ai/account/keys\n\n# Paste your API Key below.\n\nos.environ['STABILITY_KEY'] = 'sk-23mieeVXXXXXXXXXAegcZW3DZpGIz0M5'","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Set up our connection to the API.\nstability_api = client.StabilityInference(\n key=os.environ['STABILITY_KEY'], # API Key reference.\n verbose=True, # Print debug messages.\n engine=\"stable-diffusion-xl-1024-v1-0\", # Set the engine to use for generation.\n # Check out the following link for a list of available engines: https://platform.stability.ai/docs/features/api-parameters#engine\n)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import os\nimport io\nimport warnings\nfrom PIL import Image\nimport matplotlib.pyplot as plt\n\ndef clear_output_directory(directory):\n \"\"\"\n Remove all files in the given directory.\n \"\"\"\n if os.path.exists(directory):\n for file in os.listdir(directory):\n file_path = os.path.join(directory, file)\n try:\n if os.path.isfile(file_path):\n os.unlink(file_path)\n except Exception as e:\n print(f\"Failed to delete {file_path}. 
Reason: {e}\")\n\ndef resize_image(image_path, output_path, max_size=1024):\n \"\"\"\n Resize an image if it exceeds the max_size dimension.\n \"\"\"\n # Open the image\n image = Image.open(image_path)\n\n # Get the current width and height of the image\n width, height = image.size\n\n # Calculate the scaling factor\n if width > height:\n scaling_factor = max_size / width\n else:\n scaling_factor = max_size / height\n\n # Only resize if the image is larger than the max_size\n if scaling_factor < 1:\n # Calculate new dimensions\n new_width = int(width * scaling_factor)\n new_height = int(height * scaling_factor)\n\n # Resize the image\n image_resized = image.resize((new_width, new_height))\n\n # Save the resized image\n image_resized.save(output_path)\n print(f\"Image resized to {new_width}x{new_height} and saved as {output_path}\")\n else:\n # Save the original image without resizing\n image.save(output_path)\n print(f\"Image is already within the size limits and saved as {output_path}\")\n\ndef generate_image_from_masked(input_image_path, output_image_path):\n \"\"\"\n Generate a new image from a masked image using an image-to-image model.\n \"\"\"\n # Open and possibly resize the image\n resized_image_path = '/kaggle/working/temp_resized_image.jpg'\n resize_image(input_image_path, resized_image_path)\n\n # Open the resized image\n img = Image.open(resized_image_path)\n\n # Get the dimensions of the image\n width, height = img.size\n\n # Set up our initial generation parameters.\n answers = stability_api.generate(\n prompt=\"bottle with glowing effect holding magical potion, alphonse mucha and simon stalenhag style\",\n seed = 69696969,\n init_image=img, # Assign our previously generated img as our Initial Image for transformation.\n start_schedule=0.6, # Set the strength of our prompt in relation to our initial image.\n steps=30, # Amount of inference steps performed on image generation. 
Defaults to 30.\n cfg_scale=10.0, # Influences how strongly your generation is guided to match your prompt.\n width=width, # Generation width\n height=height, # Generation height\n sampler=generation.SAMPLER_DDIM, # Sampler type\n style_preset=\"comic-book\" # Style preset\n )\n\n # Process the response and save the image\n for resp in answers:\n for artifact in resp.artifacts:\n if artifact.finish_reason == generation.FILTER:\n warnings.warn(\n \"Your request activated the API's safety filters and could not be processed.\"\n \"Please modify the prompt and try again.\")\n if artifact.type == generation.ARTIFACT_IMAGE:\n img2 = Image.open(io.BytesIO(artifact.binary))\n img2.save(output_image_path)\n print(f\"Generated image saved as {output_image_path}\")\n\n# Directory paths\nmasked_images_dir = '/kaggle/working/restored_frames'\noutput_gen_dir = '/kaggle/working/mask_gen'\nos.makedirs(output_gen_dir, exist_ok=True)\n\n# Clear the output directory\nclear_output_directory(output_gen_dir)\n\n# Iterate over each masked image and apply image-to-image generation\nfor masked_image_name in os.listdir(masked_images_dir):\n masked_image_path = os.path.join(masked_images_dir, masked_image_name)\n output_image_path = os.path.join(output_gen_dir, f\"gen_{masked_image_name}\")\n\n # Generate new image from the masked image\n generate_image_from_masked(masked_image_path, output_image_path)\n\n # Optional: Display the generated image\n out_img = Image.open(output_image_path)\n plt.imshow(out_img)\n plt.title(f\"Generated from {masked_image_name}\")\n plt.show()\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Example usage\nframes_folder = '/kaggle/working/mask_gen' # Replace with the folder containing your frames\noutput_video_path = \"/kaggle/working/mask_gen_output_video.mp4\" # Desired output video file path\n\nframes_to_video(frames_folder, output_video_path, fps=30)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from PIL import Image\nimport numpy as np\nimport os\nimport matplotlib.pyplot as plt\n\ndef combine_masked_regions(masked_image_path, inverse_masked_image_path, save_path):\n \"\"\"\n Combine the original mask areas from the masked image with the inverse-masked image.\n\n Parameters:\n - masked_image_path: String, path to the masked image.\n - inverse_masked_image_path: String, path to the inverse-masked image.\n - save_path: String, path where the combined image will be saved.\n \"\"\"\n # Open images\n masked_image = Image.open(masked_image_path).convert(\"RGBA\")\n inverse_masked_image = Image.open(inverse_masked_image_path).convert(\"RGBA\")\n\n # Ensure images are the same size by resizing the inverse image\n if masked_image.size != inverse_masked_image.size:\n inverse_masked_image = inverse_masked_image.resize(masked_image.size)\n\n # Convert images to numpy arrays\n masked_array = np.array(masked_image)\n inverse_masked_array = np.array(inverse_masked_image)\n\n # Create a mask where the original mask was applied (non-zero areas in any color channel)\n mask = np.any(masked_array[..., :3] > 30, axis=-1)\n\n # Replace inverse-masked image values with masked image values where mask is true\n combined_array = inverse_masked_array.copy()\n combined_array[mask] = masked_array[mask]\n\n # Convert back to image\n combined_image = Image.fromarray(combined_array)\n\n # Save the combined image\n combined_image.save(save_path)\n print(f\"Combined image saved as {save_path}\")\n\n# # Display the 
combined image\n# plt.imshow(combined_image)\n# plt.axis('off')\n# plt.show()\n\n# Define directory paths\nmasked_images_dir = \"/kaggle/working/mask_gen\"\ninverse_images_dir = \"/kaggle/working/inverse_restored_frames\"\noutput_dir = \"/kaggle/working/Generative_combined_images\"\n\n# Ensure the output directory exists\nos.makedirs(output_dir, exist_ok=True)\n\n# Get lists of files in the masked directory\nmasked_images = sorted(os.listdir(masked_images_dir))\n\n# Process files with matching names based on pattern\nfor masked_image_name in masked_images:\n if masked_image_name.startswith(\"gen_frame_\") and masked_image_name.endswith(\".png\"):\n # Extract the index number from the masked image name\n index = masked_image_name[len(\"gen_frame_\"):-len(\".png\")]\n\n # Generate the corresponding inverse image name\n inverse_image_name = f\"frame_{index}.png\"\n\n masked_image_path = os.path.join(masked_images_dir, masked_image_name)\n inverse_image_path = os.path.join(inverse_images_dir, inverse_image_name)\n save_path = os.path.join(output_dir, f\"combined_frame_{index}.png\")\n\n # Check if both files exist before combining\n if os.path.exists(masked_image_path) and os.path.exists(inverse_image_path):\n combine_masked_regions(masked_image_path, inverse_image_path, save_path)\n else:\n print(f\"Warning: Missing files for frame {index}. Skipping combination.\")\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### frames to video ","metadata":{}},{"cell_type":"code","source":"# Example usage\nframes_folder = '/kaggle/working/Generative_combined_images' # Replace with the folder containing your frames\noutput_video_path = \"/kaggle/working/Generative_combined_output_video.mp4\" # Desired output video file path\n\nframes_to_video(frames_folder, output_video_path, fps=30)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## generating using multiple APIs","metadata":{}},{"cell_type":"markdown","source":"### using Multiple keys with better output of image to image generation, the below code can generate ~ 50 frames per 25 credits or 1 free new signup. 
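\n\n### The cell below rotates through the keys in api_keys every 50 frames (api_index = (i // 50) % len(api_keys)); replace the placeholder key strings with your own keys and adjust the 50-frame interval to match your remaining credits.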
","metadata":{}},{"cell_type":"code","source":"import os\nimport io\nimport warnings\nfrom PIL import Image\nimport matplotlib.pyplot as plt\nfrom stability_sdk import client\nimport stability_sdk.interfaces.gooseai.generation.generation_pb2 as generation\n\n# List of API keys\napi_keys = [\n 'sk-3GPp1EOphrXXXXXXXXXX3dmwrbji1iPK3',\n 'sk-6TygJFuBfiQWc7XXXXXXXXXXXqj8aMncmLYrYqpwE1Lv'\n # Add more API keys here\n]\n\n# Directory paths\nmasked_images_dir = '/kaggle/working/restored_frames'\noutput_gen_dir = '/kaggle/working/HD_mask_gen'\n\nos.makedirs(output_gen_dir, exist_ok=True)\n\ndef initialize_stability_api(api_key):\n \"\"\"\n Initialize the Stability API client with the given API key.\n \"\"\"\n return client.StabilityInference(\n key=api_key, # API Key reference.\n verbose=True, # Print debug messages.\n engine=\"stable-diffusion-xl-1024-v1-0\", # Set the engine to use for generation.\n )\n\ndef resize_image(image_path, output_path, max_size=1024):\n \"\"\"\n Resize an image if it exceeds the max_size dimension.\n \"\"\"\n # Open the image\n image = Image.open(image_path)\n\n # Get the current width and height of the image\n width, height = image.size\n\n # Calculate the scaling factor\n if width > height:\n scaling_factor = max_size / width\n else:\n scaling_factor = max_size / height\n\n # Only resize if the image is larger than the max_size\n if scaling_factor < 1:\n # Calculate new dimensions\n new_width = int(width * scaling_factor)\n new_height = int(height * scaling_factor)\n\n # Resize the image\n image_resized = image.resize((new_width, new_height))\n\n # Save the resized image\n image_resized.save(output_path)\n print(f\"Image resized to {new_width}x{new_height} and saved as {output_path}\")\n else:\n # Save the original image without resizing\n image.save(output_path)\n print(f\"Image is already within the size limits and saved as {output_path}\")\n\ndef generate_image_from_masked(api, input_image_path, output_image_path):\n \"\"\"\n Generate a new image from a masked image using an image-to-image model.\n \"\"\"\n # Open and possibly resize the image\n resized_image_path = '/kaggle/working/temp_resized_image.jpg'\n resize_image(input_image_path, resized_image_path)\n\n # Open the resized image\n img = Image.open(resized_image_path)\n\n # Get the dimensions of the image\n width, height = img.size\n\n # Set up our initial generation parameters.\n answers = api.generate(\n prompt=\"soccer ball covered in flames,blazing fireball,eldenring fireball,flames, shiny golden\",\n init_image=img, # Assign our previously generated img as our Initial Image for transformation.\n seed = 69696969,\n start_schedule=0.6, # Set the strength of our prompt in relation to our initial image.\n steps=65, # Amount of inference steps performed on image generation. 
Defaults to 30.\n cfg_scale=10.0, # Influences how strongly your generation is guided to match your prompt.\n width=width, # Generation width\n height=height, # Generation height\n sampler=generation.SAMPLER_K_DPMPP_SDE, # Sampler type\n style_preset=\"fantasy-art\" # Style preset\n )\n\n # Process the response and save the image\n for resp in answers:\n for artifact in resp.artifacts:\n if artifact.finish_reason == generation.FILTER:\n warnings.warn(\n \"Your request activated the API's safety filters and could not be processed.\"\n \"Please modify the prompt and try again.\")\n if artifact.type == generation.ARTIFACT_IMAGE:\n img2 = Image.open(io.BytesIO(artifact.binary))\n img2.save(output_image_path)\n print(f\"Generated image saved as {output_image_path}\")\n\n# Initialize the first Stability API client\nstability_api = initialize_stability_api(api_keys[0])\n\n# Iterate over each masked image and apply image-to-image generation\nfor i, masked_image_name in enumerate(os.listdir(masked_images_dir)):\n # Change API key every 50 frames\n if i > 0 and i % 50 == 0:\n api_index = (i // 50) % len(api_keys) # Calculate the API key index\n stability_api = initialize_stability_api(api_keys[api_index])\n\n masked_image_path = os.path.join(masked_images_dir, masked_image_name)\n output_image_path = os.path.join(output_gen_dir, f\"gen_{masked_image_name}\")\n\n # Generate new image from the masked image\n generate_image_from_masked(stability_api, masked_image_path, output_image_path)\n\n # Optional: Display the generated image\n out_img = Image.open(output_image_path)\n plt.imshow(out_img)\n plt.title(f\"Generated from {masked_image_name}\")\n plt.show()\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Example usage\nframes_folder = '/kaggle/working/HD_mask_gen' # Replace with the folder containing your frames\noutput_video_path = \"/kaggle/working/HD_mask_gen_output_video.mp4\" # Desired output video file path\n\nframes_to_video(frames_folder, output_video_path, fps=30)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from PIL import Image\nimport numpy as np\nimport os\nimport matplotlib.pyplot as plt\n\ndef combine_masked_regions(masked_image_path, inverse_masked_image_path, save_path):\n \"\"\"\n Combine the original mask areas from the masked image with the inverse-masked image.\n\n Parameters:\n - masked_image_path: String, path to the masked image.\n - inverse_masked_image_path: String, path to the inverse-masked image.\n - save_path: String, path where the combined image will be saved.\n \"\"\"\n # Open images\n masked_image = Image.open(masked_image_path).convert(\"RGBA\")\n inverse_masked_image = Image.open(inverse_masked_image_path).convert(\"RGBA\")\n\n # Ensure images are the same size by resizing the inverse image\n if masked_image.size != inverse_masked_image.size:\n inverse_masked_image = inverse_masked_image.resize(masked_image.size)\n\n # Convert images to numpy arrays\n masked_array = np.array(masked_image)\n inverse_masked_array = np.array(inverse_masked_image)\n\n # Create a mask where the original mask was applied (non-zero areas in any color channel)\n mask = np.any(masked_array[..., :3] > 30, axis=-1)\n\n # Replace inverse-masked image values with masked image values where mask is true\n combined_array = inverse_masked_array.copy()\n combined_array[mask] = masked_array[mask]\n\n # Convert back to image\n combined_image = Image.fromarray(combined_array)\n\n # Save the combined 
image\n combined_image.save(save_path)\n print(f\"Combined image saved as {save_path}\")\n\n# # Display the combined image\n# plt.imshow(combined_image)\n# plt.axis('off')\n# plt.show()\n\n# Define directory paths\nmasked_images_dir = \"/kaggle/working/HD_mask_gen\"\ninverse_images_dir = \"/kaggle/working/inverse_restored_frames\"\noutput_dir = \"/kaggle/working/HD_Generative_combined_images\"\n\n# Ensure the output directory exists\nos.makedirs(output_dir, exist_ok=True)\n\n# Get lists of files in the masked directory\nmasked_images = sorted(os.listdir(masked_images_dir))\n\n# Process files with matching names based on pattern\nfor masked_image_name in masked_images:\n if masked_image_name.startswith(\"gen_frame_\") and masked_image_name.endswith(\".png\"):\n # Extract the index number from the masked image name\n index = masked_image_name[len(\"gen_frame_\"):-len(\".png\")]\n\n # Generate the corresponding inverse image name\n inverse_image_name = f\"frame_{index}.png\"\n\n masked_image_path = os.path.join(masked_images_dir, masked_image_name)\n inverse_image_path = os.path.join(inverse_images_dir, inverse_image_name)\n save_path = os.path.join(output_dir, f\"combined_frame_{index}.png\")\n\n # Check if both files exist before combining\n if os.path.exists(masked_image_path) and os.path.exists(inverse_image_path):\n combine_masked_regions(masked_image_path, inverse_image_path, save_path)\n else:\n print(f\"Warning: Missing files for frame {index}. Skipping combination.\")\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Example usage\nframes_folder = '/kaggle/working/HD_Generative_combined_images' # Replace with the folder containing your frames\noutput_video_path = \"/kaggle/working/HD_Generative_combined_output_video.mp4\" # Desired output video file path\n\nframes_to_video(frames_folder, output_video_path, fps=30)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## side by side video of original with Img2Img generated video.","metadata":{}},{"cell_type":"code","source":"from PIL import Image\nimport os\nimport subprocess\nimport shutil\n\n# Directories for the input frames and output combined frames (switched)\ndir1 = '/kaggle/working/output_frames' # Formerly dir2\ndir2 = '/kaggle/working/HD_Generative_combined_images' # Formerly dir1\noutput_dir = '/kaggle/working/genai_with_replacement_combined_frames'\nvideo_output = '/kaggle/working/genai_with_replacement_output_video.mp4'\n\n# Ensure the output directory exists and is empty\nif os.path.exists(output_dir):\n shutil.rmtree(output_dir) # Remove the directory and its contents\nos.makedirs(output_dir) # Recreate the empty directory\n\n# Remove the previous video if it exists\nif os.path.exists(video_output):\n os.remove(video_output)\n\n# Get sorted lists of the frames\nframes1 = sorted([f for f in os.listdir(dir1) if f.endswith('.jpg')])\nframes2 = sorted([f for f in os.listdir(dir2) if f.endswith('.png')])\n\n# Iterate over both directories and combine images\nfor idx, (f1, f2) in enumerate(zip(frames1, frames2), start=1):\n img1 = Image.open(os.path.join(dir1, f1))\n img2 = Image.open(os.path.join(dir2, f2))\n \n # Resize the larger image to match the height of the smaller one while maintaining the aspect ratio\n if img1.height > img2.height:\n img1 = img1.resize((int(img1.width * (img2.height / img1.height)), img2.height), Image.LANCZOS)\n elif img2.height > img1.height:\n img2 = img2.resize((int(img2.width * (img1.height / 
img2.height)), img1.height), Image.LANCZOS)\n \n # Combine images side by side\n combined_img = Image.new('RGB', (img1.width + img2.width, img1.height))\n combined_img.paste(img1, (0, 0))\n combined_img.paste(img2, (img1.width, 0))\n \n # Save combined image with a sequential name like combined_frame_001.png\n combined_img.save(os.path.join(output_dir, f\"combined_frame_{idx:03d}.png\"))\n\nprint(f\"Frames combined and saved in {output_dir}\")\n\n# List the files in the output directory to verify they exist\nprint(\"Files in output directory:\", os.listdir(output_dir))\n\n# Convert the combined frames into a video using ffmpeg\nsubprocess.run([\n 'ffmpeg', '-framerate', '30', '-i', \n f'{output_dir}/combined_frame_%03d.png', '-c:v', \n 'libx264', '-pix_fmt', 'yuv420p', video_output\n])\n\nprint(f\"Video saved as {video_output}\")\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Thank you!!!","metadata":{}},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]} \ No newline at end of file diff --git a/sam2/__init__.py b/sam2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1825bd8dbd155fbcd74b28aa7b159a349554fed3 --- /dev/null +++ b/sam2/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from hydra import initialize_config_module + +initialize_config_module("sam2_configs", version_base="1.2") diff --git a/sam2/__pycache__/__init__.cpython-312.pyc b/sam2/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2abde0733a28d4534e95c40b96545c53e0cadb25 Binary files /dev/null and b/sam2/__pycache__/__init__.cpython-312.pyc differ diff --git a/sam2/__pycache__/build_sam.cpython-312.pyc b/sam2/__pycache__/build_sam.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32059214283b3bf44b8782b64f6935e70f86e959 Binary files /dev/null and b/sam2/__pycache__/build_sam.cpython-312.pyc differ diff --git a/sam2/__pycache__/sam2_image_predictor.cpython-312.pyc b/sam2/__pycache__/sam2_image_predictor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20e962c3006887ea625651bb5018c64504fdb213 Binary files /dev/null and b/sam2/__pycache__/sam2_image_predictor.cpython-312.pyc differ diff --git a/sam2/__pycache__/sam2_video_predictor.cpython-312.pyc b/sam2/__pycache__/sam2_video_predictor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1da7c6996e1021c15e41ab3691e90575051c1a41 Binary files /dev/null and b/sam2/__pycache__/sam2_video_predictor.cpython-312.pyc differ diff --git a/sam2/automatic_mask_generator.py b/sam2/automatic_mask_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..2dcfd8ee088e1bcff08c6bebeb3ad70210c30943 --- /dev/null +++ b/sam2/automatic_mask_generator.py @@ -0,0 +1,434 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +# Adapted from https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/automatic_mask_generator.py +from typing import Any, Dict, List, Optional, Tuple + +import numpy as np +import torch +from torchvision.ops.boxes import batched_nms, box_area # type: ignore + +from sam2.modeling.sam2_base import SAM2Base +from sam2.sam2_image_predictor import SAM2ImagePredictor +from sam2.utils.amg import ( + area_from_rle, + batch_iterator, + batched_mask_to_box, + box_xyxy_to_xywh, + build_all_layer_point_grids, + calculate_stability_score, + coco_encode_rle, + generate_crop_boxes, + is_box_near_crop_edge, + mask_to_rle_pytorch, + MaskData, + remove_small_regions, + rle_to_mask, + uncrop_boxes_xyxy, + uncrop_masks, + uncrop_points, +) + + +class SAM2AutomaticMaskGenerator: + def __init__( + self, + model: SAM2Base, + points_per_side: Optional[int] = 32, + points_per_batch: int = 64, + pred_iou_thresh: float = 0.8, + stability_score_thresh: float = 0.95, + stability_score_offset: float = 1.0, + mask_threshold: float = 0.0, + box_nms_thresh: float = 0.7, + crop_n_layers: int = 0, + crop_nms_thresh: float = 0.7, + crop_overlap_ratio: float = 512 / 1500, + crop_n_points_downscale_factor: int = 1, + point_grids: Optional[List[np.ndarray]] = None, + min_mask_region_area: int = 0, + output_mode: str = "binary_mask", + use_m2m: bool = False, + multimask_output: bool = True, + ) -> None: + """ + Using a SAM 2 model, generates masks for the entire image. + Generates a grid of point prompts over the image, then filters + low quality and duplicate masks. The default settings are chosen + for SAM 2 with a HieraL backbone. + + Arguments: + model (Sam): The SAM 2 model to use for mask prediction. + points_per_side (int or None): The number of points to be sampled + along one side of the image. The total number of points is + points_per_side**2. If None, 'point_grids' must provide explicit + point sampling. + points_per_batch (int): Sets the number of points run simultaneously + by the model. Higher numbers may be faster but use more GPU memory. + pred_iou_thresh (float): A filtering threshold in [0,1], using the + model's predicted mask quality. + stability_score_thresh (float): A filtering threshold in [0,1], using + the stability of the mask under changes to the cutoff used to binarize + the model's mask predictions. + stability_score_offset (float): The amount to shift the cutoff when + calculated the stability score. + mask_threshold (float): Threshold for binarizing the mask logits + box_nms_thresh (float): The box IoU cutoff used by non-maximal + suppression to filter duplicate masks. + crop_n_layers (int): If >0, mask prediction will be run again on + crops of the image. Sets the number of layers to run, where each + layer has 2**i_layer number of image crops. + crop_nms_thresh (float): The box IoU cutoff used by non-maximal + suppression to filter duplicate masks between different crops. + crop_overlap_ratio (float): Sets the degree to which crops overlap. + In the first crop layer, crops will overlap by this fraction of + the image length. Later layers with more crops scale down this overlap. + crop_n_points_downscale_factor (int): The number of points-per-side + sampled in layer n is scaled down by crop_n_points_downscale_factor**n. + point_grids (list(np.ndarray) or None): A list over explicit grids + of points used for sampling, normalized to [0,1]. The nth grid in the + list is used in the nth crop layer. Exclusive with points_per_side. 
+ min_mask_region_area (int): If >0, postprocessing will be applied + to remove disconnected regions and holes in masks with area smaller + than min_mask_region_area. Requires opencv. + output_mode (str): The form masks are returned in. Can be 'binary_mask', + 'uncompressed_rle', or 'coco_rle'. 'coco_rle' requires pycocotools. + For large resolutions, 'binary_mask' may consume large amounts of + memory. + use_m2m (bool): Whether to add a one step refinement using previous mask predictions. + multimask_output (bool): Whether to output multimask at each point of the grid. + """ + + assert (points_per_side is None) != ( + point_grids is None + ), "Exactly one of points_per_side or point_grid must be provided." + if points_per_side is not None: + self.point_grids = build_all_layer_point_grids( + points_per_side, + crop_n_layers, + crop_n_points_downscale_factor, + ) + elif point_grids is not None: + self.point_grids = point_grids + else: + raise ValueError("Can't have both points_per_side and point_grid be None.") + + assert output_mode in [ + "binary_mask", + "uncompressed_rle", + "coco_rle", + ], f"Unknown output_mode {output_mode}." + if output_mode == "coco_rle": + try: + from pycocotools import mask as mask_utils # type: ignore # noqa: F401 + except ImportError as e: + print("Please install pycocotools") + raise e + + self.predictor = SAM2ImagePredictor( + model, + max_hole_area=min_mask_region_area, + max_sprinkle_area=min_mask_region_area, + ) + self.points_per_batch = points_per_batch + self.pred_iou_thresh = pred_iou_thresh + self.stability_score_thresh = stability_score_thresh + self.stability_score_offset = stability_score_offset + self.mask_threshold = mask_threshold + self.box_nms_thresh = box_nms_thresh + self.crop_n_layers = crop_n_layers + self.crop_nms_thresh = crop_nms_thresh + self.crop_overlap_ratio = crop_overlap_ratio + self.crop_n_points_downscale_factor = crop_n_points_downscale_factor + self.min_mask_region_area = min_mask_region_area + self.output_mode = output_mode + self.use_m2m = use_m2m + self.multimask_output = multimask_output + + @torch.no_grad() + def generate(self, image: np.ndarray) -> List[Dict[str, Any]]: + """ + Generates masks for the given image. + + Arguments: + image (np.ndarray): The image to generate masks for, in HWC uint8 format. + + Returns: + list(dict(str, any)): A list over records for masks. Each record is + a dict containing the following keys: + segmentation (dict(str, any) or np.ndarray): The mask. If + output_mode='binary_mask', is an array of shape HW. Otherwise, + is a dictionary containing the RLE. + bbox (list(float)): The box around the mask, in XYWH format. + area (int): The area in pixels of the mask. + predicted_iou (float): The model's own prediction of the mask's + quality. This is filtered by the pred_iou_thresh parameter. + point_coords (list(list(float))): The point coordinates input + to the model to generate this mask. + stability_score (float): A measure of the mask's quality. This + is filtered on using the stability_score_thresh parameter. + crop_box (list(float)): The crop of the image used to generate + the mask, given in XYWH format. 
+ """ + + # Generate masks + mask_data = self._generate_masks(image) + + # Encode masks + if self.output_mode == "coco_rle": + mask_data["segmentations"] = [ + coco_encode_rle(rle) for rle in mask_data["rles"] + ] + elif self.output_mode == "binary_mask": + mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]] + else: + mask_data["segmentations"] = mask_data["rles"] + + # Write mask records + curr_anns = [] + for idx in range(len(mask_data["segmentations"])): + ann = { + "segmentation": mask_data["segmentations"][idx], + "area": area_from_rle(mask_data["rles"][idx]), + "bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(), + "predicted_iou": mask_data["iou_preds"][idx].item(), + "point_coords": [mask_data["points"][idx].tolist()], + "stability_score": mask_data["stability_score"][idx].item(), + "crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(), + } + curr_anns.append(ann) + + return curr_anns + + def _generate_masks(self, image: np.ndarray) -> MaskData: + orig_size = image.shape[:2] + crop_boxes, layer_idxs = generate_crop_boxes( + orig_size, self.crop_n_layers, self.crop_overlap_ratio + ) + + # Iterate over image crops + data = MaskData() + for crop_box, layer_idx in zip(crop_boxes, layer_idxs): + crop_data = self._process_crop(image, crop_box, layer_idx, orig_size) + data.cat(crop_data) + + # Remove duplicate masks between crops + if len(crop_boxes) > 1: + # Prefer masks from smaller crops + scores = 1 / box_area(data["crop_boxes"]) + scores = scores.to(data["boxes"].device) + keep_by_nms = batched_nms( + data["boxes"].float(), + scores, + torch.zeros_like(data["boxes"][:, 0]), # categories + iou_threshold=self.crop_nms_thresh, + ) + data.filter(keep_by_nms) + data.to_numpy() + return data + + def _process_crop( + self, + image: np.ndarray, + crop_box: List[int], + crop_layer_idx: int, + orig_size: Tuple[int, ...], + ) -> MaskData: + # Crop the image and calculate embeddings + x0, y0, x1, y1 = crop_box + cropped_im = image[y0:y1, x0:x1, :] + cropped_im_size = cropped_im.shape[:2] + self.predictor.set_image(cropped_im) + + # Get points for this crop + points_scale = np.array(cropped_im_size)[None, ::-1] + points_for_image = self.point_grids[crop_layer_idx] * points_scale + + # Generate masks for this crop in batches + data = MaskData() + for (points,) in batch_iterator(self.points_per_batch, points_for_image): + batch_data = self._process_batch( + points, cropped_im_size, crop_box, orig_size, normalize=True + ) + data.cat(batch_data) + del batch_data + self.predictor.reset_predictor() + + # Remove duplicates within this crop. 
+ keep_by_nms = batched_nms( + data["boxes"].float(), + data["iou_preds"], + torch.zeros_like(data["boxes"][:, 0]), # categories + iou_threshold=self.box_nms_thresh, + ) + data.filter(keep_by_nms) + + # Return to the original image frame + data["boxes"] = uncrop_boxes_xyxy(data["boxes"], crop_box) + data["points"] = uncrop_points(data["points"], crop_box) + data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(data["rles"]))]) + + return data + + def _process_batch( + self, + points: np.ndarray, + im_size: Tuple[int, ...], + crop_box: List[int], + orig_size: Tuple[int, ...], + normalize=False, + ) -> MaskData: + orig_h, orig_w = orig_size + + # Run model on this batch + points = torch.as_tensor(points, device=self.predictor.device) + in_points = self.predictor._transforms.transform_coords( + points, normalize=normalize, orig_hw=im_size + ) + in_labels = torch.ones( + in_points.shape[0], dtype=torch.int, device=in_points.device + ) + masks, iou_preds, low_res_masks = self.predictor._predict( + in_points[:, None, :], + in_labels[:, None], + multimask_output=self.multimask_output, + return_logits=True, + ) + + # Serialize predictions and store in MaskData + data = MaskData( + masks=masks.flatten(0, 1), + iou_preds=iou_preds.flatten(0, 1), + points=points.repeat_interleave(masks.shape[1], dim=0), + low_res_masks=low_res_masks.flatten(0, 1), + ) + del masks + + if not self.use_m2m: + # Filter by predicted IoU + if self.pred_iou_thresh > 0.0: + keep_mask = data["iou_preds"] > self.pred_iou_thresh + data.filter(keep_mask) + + # Calculate and filter by stability score + data["stability_score"] = calculate_stability_score( + data["masks"], self.mask_threshold, self.stability_score_offset + ) + if self.stability_score_thresh > 0.0: + keep_mask = data["stability_score"] >= self.stability_score_thresh + data.filter(keep_mask) + else: + # One step refinement using previous mask predictions + in_points = self.predictor._transforms.transform_coords( + data["points"], normalize=normalize, orig_hw=im_size + ) + labels = torch.ones( + in_points.shape[0], dtype=torch.int, device=in_points.device + ) + masks, ious = self.refine_with_m2m( + in_points, labels, data["low_res_masks"], self.points_per_batch + ) + data["masks"] = masks.squeeze(1) + data["iou_preds"] = ious.squeeze(1) + + if self.pred_iou_thresh > 0.0: + keep_mask = data["iou_preds"] > self.pred_iou_thresh + data.filter(keep_mask) + + data["stability_score"] = calculate_stability_score( + data["masks"], self.mask_threshold, self.stability_score_offset + ) + if self.stability_score_thresh > 0.0: + keep_mask = data["stability_score"] >= self.stability_score_thresh + data.filter(keep_mask) + + # Threshold masks and calculate boxes + data["masks"] = data["masks"] > self.mask_threshold + data["boxes"] = batched_mask_to_box(data["masks"]) + + # Filter boxes that touch crop boundaries + keep_mask = ~is_box_near_crop_edge( + data["boxes"], crop_box, [0, 0, orig_w, orig_h] + ) + if not torch.all(keep_mask): + data.filter(keep_mask) + + # Compress to RLE + data["masks"] = uncrop_masks(data["masks"], crop_box, orig_h, orig_w) + data["rles"] = mask_to_rle_pytorch(data["masks"]) + del data["masks"] + + return data + + @staticmethod + def postprocess_small_regions( + mask_data: MaskData, min_area: int, nms_thresh: float + ) -> MaskData: + """ + Removes small disconnected regions and holes in masks, then reruns + box NMS to remove any new duplicates. + + Edits mask_data in place. + + Requires open-cv as a dependency. 
+ """ + if len(mask_data["rles"]) == 0: + return mask_data + + # Filter small disconnected regions and holes + new_masks = [] + scores = [] + for rle in mask_data["rles"]: + mask = rle_to_mask(rle) + + mask, changed = remove_small_regions(mask, min_area, mode="holes") + unchanged = not changed + mask, changed = remove_small_regions(mask, min_area, mode="islands") + unchanged = unchanged and not changed + + new_masks.append(torch.as_tensor(mask).unsqueeze(0)) + # Give score=0 to changed masks and score=1 to unchanged masks + # so NMS will prefer ones that didn't need postprocessing + scores.append(float(unchanged)) + + # Recalculate boxes and remove any new duplicates + masks = torch.cat(new_masks, dim=0) + boxes = batched_mask_to_box(masks) + keep_by_nms = batched_nms( + boxes.float(), + torch.as_tensor(scores), + torch.zeros_like(boxes[:, 0]), # categories + iou_threshold=nms_thresh, + ) + + # Only recalculate RLEs for masks that have changed + for i_mask in keep_by_nms: + if scores[i_mask] == 0.0: + mask_torch = masks[i_mask].unsqueeze(0) + mask_data["rles"][i_mask] = mask_to_rle_pytorch(mask_torch)[0] + mask_data["boxes"][i_mask] = boxes[i_mask] # update res directly + mask_data.filter(keep_by_nms) + + return mask_data + + def refine_with_m2m(self, points, point_labels, low_res_masks, points_per_batch): + new_masks = [] + new_iou_preds = [] + + for cur_points, cur_point_labels, low_res_mask in batch_iterator( + points_per_batch, points, point_labels, low_res_masks + ): + best_masks, best_iou_preds, _ = self.predictor._predict( + cur_points[:, None, :], + cur_point_labels[:, None], + mask_input=low_res_mask[:, None, :], + multimask_output=False, + return_logits=True, + ) + new_masks.append(best_masks) + new_iou_preds.append(best_iou_preds) + masks = torch.cat(new_masks, dim=0) + return masks, torch.cat(new_iou_preds, dim=0) diff --git a/sam2/build_sam.py b/sam2/build_sam.py new file mode 100644 index 0000000000000000000000000000000000000000..50f6c91cf442394573d2eaccb0d7112d6995f684 --- /dev/null +++ b/sam2/build_sam.py @@ -0,0 +1,89 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging + +import torch +from hydra import compose +from hydra.utils import instantiate +from omegaconf import OmegaConf + + +def build_sam2( + config_file, + ckpt_path=None, + device="cuda", + mode="eval", + hydra_overrides_extra=[], + apply_postprocessing=True, +): + + if apply_postprocessing: + hydra_overrides_extra = hydra_overrides_extra.copy() + hydra_overrides_extra += [ + # dynamically fall back to multi-mask if the single mask is not stable + "++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true", + "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05", + "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98", + ] + # Read config and init model + cfg = compose(config_name=config_file, overrides=hydra_overrides_extra) + OmegaConf.resolve(cfg) + model = instantiate(cfg.model, _recursive_=True) + _load_checkpoint(model, ckpt_path) + model = model.to(device) + if mode == "eval": + model.eval() + return model + + +def build_sam2_video_predictor( + config_file, + ckpt_path=None, + device="cuda", + mode="eval", + hydra_overrides_extra=[], + apply_postprocessing=True, +): + hydra_overrides = [ + "++model._target_=sam2.sam2_video_predictor.SAM2VideoPredictor", + ] + if apply_postprocessing: + hydra_overrides_extra = hydra_overrides_extra.copy() + hydra_overrides_extra += [ + # dynamically fall back to multi-mask if the single mask is not stable + "++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true", + "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05", + "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98", + # the sigmoid mask logits on interacted frames with clicks in the memory encoder so that the encoded masks are exactly as what users see from clicking + "++model.binarize_mask_from_pts_for_mem_enc=true", + # fill small holes in the low-res masks up to `fill_hole_area` (before resizing them to the original video resolution) + "++model.fill_hole_area=8", + ] + hydra_overrides.extend(hydra_overrides_extra) + + # Read config and init model + cfg = compose(config_name=config_file, overrides=hydra_overrides) + OmegaConf.resolve(cfg) + model = instantiate(cfg.model, _recursive_=True) + _load_checkpoint(model, ckpt_path) + model = model.to(device) + if mode == "eval": + model.eval() + return model + + +def _load_checkpoint(model, ckpt_path): + if ckpt_path is not None: + sd = torch.load(ckpt_path, map_location="cpu")["model"] + missing_keys, unexpected_keys = model.load_state_dict(sd) + if missing_keys: + logging.error(missing_keys) + raise RuntimeError() + if unexpected_keys: + logging.error(unexpected_keys) + raise RuntimeError() + logging.info("Loaded checkpoint sucessfully") diff --git a/sam2/csrc/connected_components.cu b/sam2/csrc/connected_components.cu new file mode 100644 index 0000000000000000000000000000000000000000..6e3fbee0eba762c7198ace220660ceadd13d7402 --- /dev/null +++ b/sam2/csrc/connected_components.cu @@ -0,0 +1,289 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. + +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. + +// adapted from https://github.com/zsef123/Connected_components_PyTorch +// with license found in the LICENSE_cctorch file in the root directory. 
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <torch/extension.h>
+#include <torch/script.h>
+#include <vector>
+
+// 2d
+#define BLOCK_ROWS 16
+#define BLOCK_COLS 16
+
+namespace cc2d {
+
+template <typename T>
+__device__ __forceinline__ unsigned char hasBit(T bitmap, unsigned char pos) {
+  return (bitmap >> pos) & 1;
+}
+
+// Follow parent pointers to the root of n's component.
+__device__ int32_t find(const int32_t* s_buf, int32_t n) {
+  while (s_buf[n] != n)
+    n = s_buf[n];
+  return n;
+}
+
+__device__ int32_t find_n_compress(int32_t* s_buf, int32_t n) {
+  const int32_t id = n;
+  while (s_buf[n] != n) {
+    n = s_buf[n];
+    s_buf[id] = n;
+  }
+  return n;
+}
+
+// Lock-free union of the components containing a and b via atomicMin.
+__device__ void union_(int32_t* s_buf, int32_t a, int32_t b) {
+  bool done;
+  do {
+    a = find(s_buf, a);
+    b = find(s_buf, b);
+
+    if (a < b) {
+      int32_t old = atomicMin(s_buf + b, a);
+      done = (old == b);
+      b = old;
+    } else if (b < a) {
+      int32_t old = atomicMin(s_buf + a, b);
+      done = (old == a);
+      a = old;
+    } else
+      done = true;
+
+  } while (!done);
+}
+
+__global__ void
+init_labeling(int32_t* label, const uint32_t W, const uint32_t H) {
+  const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+  const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+  const uint32_t idx = row * W + col;
+
+  if (row < H && col < W)
+    label[idx] = idx;
+}
+
+__global__ void
+merge(uint8_t* img, int32_t* label, const uint32_t W, const uint32_t H) {
+  const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+  const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+  const uint32_t idx = row * W + col;
+
+  if (row >= H || col >= W)
+    return;
+
+  uint32_t P = 0;
+
+  if (img[idx])
+    P |= 0x777;
+  if (row + 1 < H && img[idx + W])
+    P |= 0x777 << 4;
+  if (col + 1 < W && img[idx + 1])
+    P |= 0x777 << 1;
+
+  if (col == 0)
+    P &= 0xEEEE;
+  if (col + 1 >= W)
+    P &= 0x3333;
+  else if (col + 2 >= W)
+    P &= 0x7777;
+
+  if (row == 0)
+    P &= 0xFFF0;
+  if (row + 1 >= H)
+    P &= 0xFF;
+
+  if (P > 0) {
+    // If bit 0 is set, check the top-left neighbour pixel and merge with the
+    // top-left block
+    if (hasBit(P, 0) && img[idx - W - 1]) {
+      union_(label, idx, idx - 2 * W - 2); // top left block
+    }
+
+    if ((hasBit(P, 1) && img[idx - W]) || (hasBit(P, 2) && img[idx - W + 1]))
+      union_(label, idx, idx - 2 * W); // top bottom block
+
+    if (hasBit(P, 3) && img[idx + 2 - W])
+      union_(label, idx, idx - 2 * W + 2); // top right block
+
+    if ((hasBit(P, 4) && img[idx - 1]) || (hasBit(P, 8) && img[idx + W - 1]))
+      union_(label, idx, idx - 2); // just left block
+  }
+}
+
+__global__ void compression(int32_t* label, const int32_t W, const int32_t H) {
+  const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+  const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+  const uint32_t idx = row * W + col;
+
+  if (row < H && col < W)
+    find_n_compress(label, idx);
+}
+
+__global__ void final_labeling(
+    const uint8_t* img,
+    int32_t* label,
+    const int32_t W,
+    const int32_t H) {
+  const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+  const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+  const uint32_t idx = row * W + col;
+
+  if (row >= H || col >= W)
+    return;
+
+  int32_t y = label[idx] + 1;
+
+  if (img[idx])
+    label[idx] = y;
+  else
+    label[idx] = 0;
+
+  if (col + 1 < W) {
+    if (img[idx + 1])
+      label[idx + 1] = y;
+    else
+      label[idx + 1] = 0;
+
+    if (row + 1 < H) {
+      if (img[idx + W + 1])
+        label[idx + W + 1] = y;
+      else
+        label[idx + W + 1] = 0;
+    }
+  }
+
+  if (row + 1 < H) {
+    if (img[idx + W])
+      label[idx + W] = y;
+    else
+      label[idx + W] = 0;
+  }
+}
+
+__global__ void init_counting(
+    const int32_t* label,
+    int32_t* count_init,
+    const int32_t W,
+    const int32_t H) {
+  const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y);
+  const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x);
+  const uint32_t idx = row * W + col;
+
+  if (row >= H || col >= W)
+    return;
+
+  int32_t y = label[idx];
+  if (y > 0) {
+    int32_t count_idx = y - 1;
+    atomicAdd(count_init + count_idx, 1);
+  }
+}
+
+__global__ void final_counting(
+    const int32_t* label,
+    const int32_t* count_init,
+    int32_t* count_final,
+    const int32_t W,
+    const int32_t H) {
+  const uint32_t row = (blockIdx.y * blockDim.y + threadIdx.y);
+  const uint32_t col = (blockIdx.x * blockDim.x + threadIdx.x);
+  const uint32_t idx = row * W + col;
+
+  if (row >= H || col >= W)
+    return;
+
+  int32_t y = label[idx];
+  if (y > 0) {
+    int32_t count_idx = y - 1;
+    count_final[idx] = count_init[count_idx];
+  } else {
+    count_final[idx] = 0;
+  }
+}
+
+} // namespace cc2d
+
+std::vector<torch::Tensor> get_connected_componnets(
+    const torch::Tensor& inputs) {
+  AT_ASSERTM(inputs.is_cuda(), "inputs must be a CUDA tensor");
+  AT_ASSERTM(inputs.ndimension() == 4, "inputs must be [N, 1, H, W] shape");
+  AT_ASSERTM(
+      inputs.scalar_type() == torch::kUInt8, "inputs must be a uint8 type");
+
+  const uint32_t N = inputs.size(0);
+  const uint32_t C = inputs.size(1);
+  const uint32_t H = inputs.size(2);
+  const uint32_t W = inputs.size(3);
+
+  AT_ASSERTM(C == 1, "inputs must be [N, 1, H, W] shape");
+  AT_ASSERTM((H % 2) == 0, "height must be an even number");
+  AT_ASSERTM((W % 2) == 0, "width must be an even number");
+
+  // label must be uint32_t
+  auto label_options =
+      torch::TensorOptions().dtype(torch::kInt32).device(inputs.device());
+  torch::Tensor labels = torch::zeros({N, C, H, W}, label_options);
+  torch::Tensor counts_init = torch::zeros({N, C, H, W}, label_options);
+  torch::Tensor counts_final = torch::zeros({N, C, H, W}, label_options);
+
+  dim3 grid = dim3(
+      ((W + 1) / 2 + BLOCK_COLS - 1) / BLOCK_COLS,
+      ((H + 1) / 2 + BLOCK_ROWS - 1) / BLOCK_ROWS);
+  dim3 block = dim3(BLOCK_COLS, BLOCK_ROWS);
+  dim3 grid_count =
+      dim3((W + BLOCK_COLS) / BLOCK_COLS, (H + BLOCK_ROWS) / BLOCK_ROWS);
+  dim3 block_count = dim3(BLOCK_COLS, BLOCK_ROWS);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  for (int n = 0; n < N; n++) {
+    uint32_t offset = n * H * W;
+
+    cc2d::init_labeling<<<grid, block, 0, stream>>>(
+        labels.data_ptr<int32_t>() + offset, W, H);
+    cc2d::merge<<<grid, block, 0, stream>>>(
+        inputs.data_ptr<uint8_t>() + offset,
+        labels.data_ptr<int32_t>() + offset,
+        W,
+        H);
+    cc2d::compression<<<grid, block, 0, stream>>>(
+        labels.data_ptr<int32_t>() + offset, W, H);
+    cc2d::final_labeling<<<grid, block, 0, stream>>>(
+        inputs.data_ptr<uint8_t>() + offset,
+        labels.data_ptr<int32_t>() + offset,
+        W,
+        H);
+
+    // get the counting of each pixel
+    cc2d::init_counting<<<grid_count, block_count, 0, stream>>>(
+        labels.data_ptr<int32_t>() + offset,
+        counts_init.data_ptr<int32_t>() + offset,
+        W,
+        H);
+    cc2d::final_counting<<<grid_count, block_count, 0, stream>>>(
+        labels.data_ptr<int32_t>() + offset,
+        counts_init.data_ptr<int32_t>() + offset,
+        counts_final.data_ptr<int32_t>() + offset,
+        W,
+        H);
+  }
+
+  // returned values are [labels, counts]
+  std::vector<torch::Tensor> outputs;
+  outputs.push_back(labels);
+  outputs.push_back(counts_final);
+  return outputs;
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def(
+      "get_connected_componnets",
+      &get_connected_componnets,
+      "get_connected_componnets");
+}
diff --git a/sam2/modeling/__init__.py b/sam2/modeling/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4547e070da2f3ddc5bf2f466cb2242e6135c7dc3
--- /dev/null
+++ b/sam2/modeling/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+ +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. diff --git a/sam2/modeling/__pycache__/__init__.cpython-312.pyc b/sam2/modeling/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b2520acd7aecca0c5820c75fdc1cf13d1e2c08d1 Binary files /dev/null and b/sam2/modeling/__pycache__/__init__.cpython-312.pyc differ diff --git a/sam2/modeling/__pycache__/memory_attention.cpython-312.pyc b/sam2/modeling/__pycache__/memory_attention.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1775f8f23a7ef920aedd32315dcbd974af40c52e Binary files /dev/null and b/sam2/modeling/__pycache__/memory_attention.cpython-312.pyc differ diff --git a/sam2/modeling/__pycache__/memory_encoder.cpython-312.pyc b/sam2/modeling/__pycache__/memory_encoder.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9323ff15f80afe6ae192bde57758ae24dd9322f7 Binary files /dev/null and b/sam2/modeling/__pycache__/memory_encoder.cpython-312.pyc differ diff --git a/sam2/modeling/__pycache__/position_encoding.cpython-312.pyc b/sam2/modeling/__pycache__/position_encoding.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..44a06fbab65c0724d95a247128e4b177d0e7b258 Binary files /dev/null and b/sam2/modeling/__pycache__/position_encoding.cpython-312.pyc differ diff --git a/sam2/modeling/__pycache__/sam2_base.cpython-312.pyc b/sam2/modeling/__pycache__/sam2_base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a5d0b26441e991a3243b8cdb06b1c30b4066001 Binary files /dev/null and b/sam2/modeling/__pycache__/sam2_base.cpython-312.pyc differ diff --git a/sam2/modeling/__pycache__/sam2_utils.cpython-312.pyc b/sam2/modeling/__pycache__/sam2_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93e1d831d9986d6b6f6782ae5c27a9ad122a2959 Binary files /dev/null and b/sam2/modeling/__pycache__/sam2_utils.cpython-312.pyc differ diff --git a/sam2/modeling/backbones/__init__.py b/sam2/modeling/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4547e070da2f3ddc5bf2f466cb2242e6135c7dc3 --- /dev/null +++ b/sam2/modeling/backbones/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
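Note: taken together, build_sam2 (sam2/build_sam.py above) and SAM2AutomaticMaskGenerator form the whole-image mask generation path described in the generate() docstring. A minimal usage sketch follows; the config name, checkpoint path, and the sam2.automatic_mask_generator import path are placeholders assumed from the upstream SAM 2 layout, not shown in this diff.

import numpy as np
from sam2.build_sam import build_sam2
from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator  # assumed module path

# Placeholder config/checkpoint names; substitute the ones shipped with your SAM 2 checkpoints.
model = build_sam2("sam2_hiera_l.yaml", ckpt_path="checkpoints/sam2_hiera_large.pt", device="cuda")
mask_generator = SAM2AutomaticMaskGenerator(model, points_per_side=32, pred_iou_thresh=0.8)

image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder for a real HWC uint8 image
masks = mask_generator.generate(image)
# Each record holds 'segmentation', 'bbox' (XYWH), 'area', 'predicted_iou',
# 'point_coords', 'stability_score' and 'crop_box', as documented in generate() above.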
diff --git a/sam2/modeling/backbones/__pycache__/__init__.cpython-312.pyc b/sam2/modeling/backbones/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e9a9cb8fad15e10ce5ae3507951974850d81c78e Binary files /dev/null and b/sam2/modeling/backbones/__pycache__/__init__.cpython-312.pyc differ diff --git a/sam2/modeling/backbones/__pycache__/hieradet.cpython-312.pyc b/sam2/modeling/backbones/__pycache__/hieradet.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b5593446a6642be9a88c742a417497369a1fc3a4 Binary files /dev/null and b/sam2/modeling/backbones/__pycache__/hieradet.cpython-312.pyc differ diff --git a/sam2/modeling/backbones/__pycache__/image_encoder.cpython-312.pyc b/sam2/modeling/backbones/__pycache__/image_encoder.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..02620a1bc46cf4271f1756cad995f453ddd17b98 Binary files /dev/null and b/sam2/modeling/backbones/__pycache__/image_encoder.cpython-312.pyc differ diff --git a/sam2/modeling/backbones/__pycache__/utils.cpython-312.pyc b/sam2/modeling/backbones/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6595b877a2a42c36c3c0aaecc6e475f5addf8115 Binary files /dev/null and b/sam2/modeling/backbones/__pycache__/utils.cpython-312.pyc differ diff --git a/sam2/modeling/backbones/hieradet.py b/sam2/modeling/backbones/hieradet.py new file mode 100644 index 0000000000000000000000000000000000000000..f8dea37b8dbc6cd7660e27faa6a855c1c926adbe --- /dev/null +++ b/sam2/modeling/backbones/hieradet.py @@ -0,0 +1,295 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +from functools import partial +from typing import List, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from sam2.modeling.backbones.utils import ( + PatchEmbed, + window_partition, + window_unpartition, +) + +from sam2.modeling.sam2_utils import DropPath, MLP + + +def do_pool(x: torch.Tensor, pool: nn.Module, norm: nn.Module = None) -> torch.Tensor: + if pool is None: + return x + # (B, H, W, C) -> (B, C, H, W) + x = x.permute(0, 3, 1, 2) + x = pool(x) + # (B, C, H', W') -> (B, H', W', C) + x = x.permute(0, 2, 3, 1) + if norm: + x = norm(x) + + return x + + +class MultiScaleAttention(nn.Module): + def __init__( + self, + dim: int, + dim_out: int, + num_heads: int, + q_pool: nn.Module = None, + ): + super().__init__() + + self.dim = dim + self.dim_out = dim_out + + self.num_heads = num_heads + head_dim = dim_out // num_heads + self.scale = head_dim**-0.5 + + self.q_pool = q_pool + self.qkv = nn.Linear(dim, dim_out * 3) + self.proj = nn.Linear(dim_out, dim_out) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, H, W, _ = x.shape + # qkv with shape (B, H * W, 3, nHead, C) + qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1) + # q, k, v with shape (B, H * W, nheads, C) + q, k, v = torch.unbind(qkv, 2) + + # Q pooling (for downsample at stage changes) + if self.q_pool: + q = do_pool(q.reshape(B, H, W, -1), self.q_pool) + H, W = q.shape[1:3] # downsampled shape + q = q.reshape(B, H * W, self.num_heads, -1) + + # Torch's SDPA expects [B, nheads, H*W, C] so we transpose + x = F.scaled_dot_product_attention( + q.transpose(1, 2), + k.transpose(1, 2), + v.transpose(1, 2), + ) + # Transpose back + x = x.transpose(1, 2) + x = x.reshape(B, H, W, -1) + + x = self.proj(x) + + return x + + +class MultiScaleBlock(nn.Module): + def __init__( + self, + dim: int, + dim_out: int, + num_heads: int, + mlp_ratio: float = 4.0, + drop_path: float = 0.0, + norm_layer: Union[nn.Module, str] = "LayerNorm", + q_stride: Tuple[int, int] = None, + act_layer: nn.Module = nn.GELU, + window_size: int = 0, + ): + super().__init__() + + if isinstance(norm_layer, str): + norm_layer = partial(getattr(nn, norm_layer), eps=1e-6) + + self.dim = dim + self.dim_out = dim_out + self.norm1 = norm_layer(dim) + + self.window_size = window_size + + self.pool, self.q_stride = None, q_stride + if self.q_stride: + self.pool = nn.MaxPool2d( + kernel_size=q_stride, stride=q_stride, ceil_mode=False + ) + + self.attn = MultiScaleAttention( + dim, + dim_out, + num_heads=num_heads, + q_pool=self.pool, + ) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = norm_layer(dim_out) + self.mlp = MLP( + dim_out, + int(dim_out * mlp_ratio), + dim_out, + num_layers=2, + activation=act_layer, + ) + + if dim != dim_out: + self.proj = nn.Linear(dim, dim_out) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + shortcut = x # B, H, W, C + x = self.norm1(x) + + # Skip connection + if self.dim != self.dim_out: + shortcut = do_pool(self.proj(x), self.pool) + + # Window partition + window_size = self.window_size + if window_size > 0: + H, W = x.shape[1], x.shape[2] + x, pad_hw = window_partition(x, window_size) + + # Window Attention + Q Pooling (if stage change) + x = self.attn(x) + if self.q_stride: + # Shapes have changed due to Q pooling + window_size = self.window_size // self.q_stride[0] + H, W = shortcut.shape[1:3] + + pad_h = (window_size - H % window_size) % window_size + pad_w = (window_size - W % window_size) % window_size + pad_hw = (H + pad_h, W 
+ pad_w) + + # Reverse window partition + if self.window_size > 0: + x = window_unpartition(x, window_size, pad_hw, (H, W)) + + x = shortcut + self.drop_path(x) + # MLP + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class Hiera(nn.Module): + """ + Reference: https://arxiv.org/abs/2306.00989 + """ + + def __init__( + self, + embed_dim: int = 96, # initial embed dim + num_heads: int = 1, # initial number of heads + drop_path_rate: float = 0.0, # stochastic depth + q_pool: int = 3, # number of q_pool stages + q_stride: Tuple[int, int] = (2, 2), # downsample stride bet. stages + stages: Tuple[int, ...] = (2, 3, 16, 3), # blocks per stage + dim_mul: float = 2.0, # dim_mul factor at stage shift + head_mul: float = 2.0, # head_mul factor at stage shift + window_pos_embed_bkg_spatial_size: Tuple[int, int] = (14, 14), + # window size per stage, when not using global att. + window_spec: Tuple[int, ...] = ( + 8, + 4, + 14, + 7, + ), + # global attn in these blocks + global_att_blocks: Tuple[int, ...] = ( + 12, + 16, + 20, + ), + return_interm_layers=True, # return feats from every stage + ): + super().__init__() + + assert len(stages) == len(window_spec) + self.window_spec = window_spec + + depth = sum(stages) + self.q_stride = q_stride + self.stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)] + assert 0 <= q_pool <= len(self.stage_ends[:-1]) + self.q_pool_blocks = [x + 1 for x in self.stage_ends[:-1]][:q_pool] + self.return_interm_layers = return_interm_layers + + self.patch_embed = PatchEmbed( + embed_dim=embed_dim, + ) + # Which blocks have global att? + self.global_att_blocks = global_att_blocks + + # Windowed positional embedding (https://arxiv.org/abs/2311.05613) + self.window_pos_embed_bkg_spatial_size = window_pos_embed_bkg_spatial_size + self.pos_embed = nn.Parameter( + torch.zeros(1, embed_dim, *self.window_pos_embed_bkg_spatial_size) + ) + self.pos_embed_window = nn.Parameter( + torch.zeros(1, embed_dim, self.window_spec[0], self.window_spec[0]) + ) + + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + + cur_stage = 1 + self.blocks = nn.ModuleList() + + for i in range(depth): + dim_out = embed_dim + # lags by a block, so first block of + # next stage uses an initial window size + # of previous stage and final window size of current stage + window_size = self.window_spec[cur_stage - 1] + + if self.global_att_blocks is not None: + window_size = 0 if i in self.global_att_blocks else window_size + + if i - 1 in self.stage_ends: + dim_out = int(embed_dim * dim_mul) + num_heads = int(num_heads * head_mul) + cur_stage += 1 + + block = MultiScaleBlock( + dim=embed_dim, + dim_out=dim_out, + num_heads=num_heads, + drop_path=dpr[i], + q_stride=self.q_stride if i in self.q_pool_blocks else None, + window_size=window_size, + ) + + embed_dim = dim_out + self.blocks.append(block) + + self.channel_list = ( + [self.blocks[i].dim_out for i in self.stage_ends[::-1]] + if return_interm_layers + else [self.blocks[-1].dim_out] + ) + + def _get_pos_embed(self, hw: Tuple[int, int]) -> torch.Tensor: + h, w = hw + window_embed = self.pos_embed_window + pos_embed = F.interpolate(self.pos_embed, size=(h, w), mode="bicubic") + pos_embed = pos_embed + window_embed.tile( + [x // y for x, y in zip(pos_embed.shape, window_embed.shape)] + ) + pos_embed = pos_embed.permute(0, 2, 3, 1) + return pos_embed + + def forward(self, x: torch.Tensor) -> List[torch.Tensor]: + x = self.patch_embed(x) + # x: (B, H, W, C) + + # Add pos embed + x 
= x + self._get_pos_embed(x.shape[1:3]) + + outputs = [] + for i, blk in enumerate(self.blocks): + x = blk(x) + if (i == self.stage_ends[-1]) or ( + i in self.stage_ends and self.return_interm_layers + ): + feats = x.permute(0, 3, 1, 2) + outputs.append(feats) + + return outputs diff --git a/sam2/modeling/backbones/image_encoder.py b/sam2/modeling/backbones/image_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..5625fd33ab4b49904ef6056c42f26afd2ad8f1ad --- /dev/null +++ b/sam2/modeling/backbones/image_encoder.py @@ -0,0 +1,133 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import List, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ImageEncoder(nn.Module): + def __init__( + self, + trunk: nn.Module, + neck: nn.Module, + scalp: int = 0, + ): + super().__init__() + self.trunk = trunk + self.neck = neck + self.scalp = scalp + assert ( + self.trunk.channel_list == self.neck.backbone_channel_list + ), f"Channel dims of trunk and neck do not match. Trunk: {self.trunk.channel_list}, neck: {self.neck.backbone_channel_list}" + + def forward(self, sample: torch.Tensor): + # Forward through backbone + features, pos = self.neck(self.trunk(sample)) + if self.scalp > 0: + # Discard the lowest resolution features + features, pos = features[: -self.scalp], pos[: -self.scalp] + + src = features[-1] + output = { + "vision_features": src, + "vision_pos_enc": pos, + "backbone_fpn": features, + } + return output + + +class FpnNeck(nn.Module): + """ + A modified variant of Feature Pyramid Network (FPN) neck + (we remove output conv and also do bicubic interpolation similar to ViT + pos embed interpolation) + """ + + def __init__( + self, + position_encoding: nn.Module, + d_model: int, + backbone_channel_list: List[int], + kernel_size: int = 1, + stride: int = 1, + padding: int = 0, + fpn_interp_model: str = "bilinear", + fuse_type: str = "sum", + fpn_top_down_levels: Optional[List[int]] = None, + ): + """Initialize the neck + :param trunk: the backbone + :param position_encoding: the positional encoding to use + :param d_model: the dimension of the model + :param neck_norm: the normalization to use + """ + super().__init__() + self.position_encoding = position_encoding + self.convs = nn.ModuleList() + self.backbone_channel_list = backbone_channel_list + for dim in backbone_channel_list: + current = nn.Sequential() + current.add_module( + "conv", + nn.Conv2d( + in_channels=dim, + out_channels=d_model, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ), + ) + + self.convs.append(current) + self.fpn_interp_model = fpn_interp_model + assert fuse_type in ["sum", "avg"] + self.fuse_type = fuse_type + + # levels to have top-down features in its outputs + # e.g. if fpn_top_down_levels is [2, 3], then only outputs of level 2 and 3 + # have top-down propagation, while outputs of level 0 and level 1 have only + # lateral features from the same backbone level. 
+ if fpn_top_down_levels is None: + # default is to have top-down features on all levels + fpn_top_down_levels = range(len(self.convs)) + self.fpn_top_down_levels = list(fpn_top_down_levels) + + def forward(self, xs: List[torch.Tensor]): + + out = [None] * len(self.convs) + pos = [None] * len(self.convs) + assert len(xs) == len(self.convs) + # fpn forward pass + # see https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/fpn.py + prev_features = None + # forward in top-down order (from low to high resolution) + n = len(self.convs) - 1 + for i in range(n, -1, -1): + x = xs[i] + lateral_features = self.convs[n - i](x) + if i in self.fpn_top_down_levels and prev_features is not None: + top_down_features = F.interpolate( + prev_features.to(dtype=torch.float32), + scale_factor=2.0, + mode=self.fpn_interp_model, + align_corners=( + None if self.fpn_interp_model == "nearest" else False + ), + antialias=False, + ) + prev_features = lateral_features + top_down_features + if self.fuse_type == "avg": + prev_features /= 2 + else: + prev_features = lateral_features + x_out = prev_features + out[i] = x_out + pos[i] = self.position_encoding(x_out).to(x_out.dtype) + + return out, pos diff --git a/sam2/modeling/backbones/utils.py b/sam2/modeling/backbones/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b7807b275256f83a83e5d1baa6c045ad6c124807 --- /dev/null +++ b/sam2/modeling/backbones/utils.py @@ -0,0 +1,95 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +"""Some utilities for backbones, in particular for windowing""" + +from typing import Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def window_partition(x, window_size): + """ + Partition into non-overlapping windows with padding if needed. + Args: + x (tensor): input tokens with [B, H, W, C]. + window_size (int): window size. + Returns: + windows: windows after partition with [B * num_windows, window_size, window_size, C]. + (Hp, Wp): padded height and width before partition + """ + B, H, W, C = x.shape + + pad_h = (window_size - H % window_size) % window_size + pad_w = (window_size - W % window_size) % window_size + if pad_h > 0 or pad_w > 0: + x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h)) + Hp, Wp = H + pad_h, W + pad_w + + x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C) + windows = ( + x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + ) + return windows, (Hp, Wp) + + +def window_unpartition(windows, window_size, pad_hw, hw): + """ + Window unpartition into original sequences and removing padding. + Args: + x (tensor): input tokens with [B * num_windows, window_size, window_size, C]. + window_size (int): window size. + pad_hw (Tuple): padded height and width (Hp, Wp). + hw (Tuple): original height and width (H, W) before padding. + Returns: + x: unpartitioned sequences with [B, H, W, C]. + """ + Hp, Wp = pad_hw + H, W = hw + B = windows.shape[0] // (Hp * Wp // window_size // window_size) + x = windows.view( + B, Hp // window_size, Wp // window_size, window_size, window_size, -1 + ) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1) + + if Hp > H or Wp > W: + x = x[:, :H, :W, :].contiguous() + return x + + +class PatchEmbed(nn.Module): + """ + Image to Patch Embedding. 
+ """ + + def __init__( + self, + kernel_size: Tuple[int, ...] = (7, 7), + stride: Tuple[int, ...] = (4, 4), + padding: Tuple[int, ...] = (3, 3), + in_chans: int = 3, + embed_dim: int = 768, + ): + """ + Args: + kernel_size (Tuple): kernel size of the projection layer. + stride (Tuple): stride of the projection layer. + padding (Tuple): padding size of the projection layer. + in_chans (int): Number of input image channels. + embed_dim (int): embed_dim (int): Patch embedding dimension. + """ + super().__init__() + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.proj(x) + # B C H W -> B H W C + x = x.permute(0, 2, 3, 1) + return x diff --git a/sam2/modeling/memory_attention.py b/sam2/modeling/memory_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..9c4adb5cf0a335f7013835dd31e3863b6b04e738 --- /dev/null +++ b/sam2/modeling/memory_attention.py @@ -0,0 +1,169 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Optional + +import torch +from torch import nn, Tensor + +from sam2.modeling.sam.transformer import RoPEAttention + +from sam2.modeling.sam2_utils import get_activation_fn, get_clones + + +class MemoryAttentionLayer(nn.Module): + + def __init__( + self, + activation: str, + cross_attention: nn.Module, + d_model: int, + dim_feedforward: int, + dropout: float, + pos_enc_at_attn: bool, + pos_enc_at_cross_attn_keys: bool, + pos_enc_at_cross_attn_queries: bool, + self_attention: nn.Module, + ): + super().__init__() + self.d_model = d_model + self.dim_feedforward = dim_feedforward + self.dropout_value = dropout + self.self_attn = self_attention + self.cross_attn_image = cross_attention + + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + self.activation_str = activation + self.activation = get_activation_fn(activation) + + # Where to add pos enc + self.pos_enc_at_attn = pos_enc_at_attn + self.pos_enc_at_cross_attn_queries = pos_enc_at_cross_attn_queries + self.pos_enc_at_cross_attn_keys = pos_enc_at_cross_attn_keys + + def _forward_sa(self, tgt, query_pos): + # Self-Attention + tgt2 = self.norm1(tgt) + q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2 + tgt2 = self.self_attn(q, k, v=tgt2) + tgt = tgt + self.dropout1(tgt2) + return tgt + + def _forward_ca(self, tgt, memory, query_pos, pos, num_k_exclude_rope=0): + kwds = {} + if num_k_exclude_rope > 0: + assert isinstance(self.cross_attn_image, RoPEAttention) + kwds = {"num_k_exclude_rope": num_k_exclude_rope} + + # Cross-Attention + tgt2 = self.norm2(tgt) + tgt2 = self.cross_attn_image( + q=tgt2 + query_pos if self.pos_enc_at_cross_attn_queries else tgt2, + k=memory + pos if self.pos_enc_at_cross_attn_keys else memory, + v=memory, + **kwds, + ) + tgt = tgt + self.dropout2(tgt2) + return tgt + + def forward( + self, + tgt, + memory, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None, + num_k_exclude_rope: int = 0, + ) -> torch.Tensor: + 
+ # Self-Attn, Cross-Attn + tgt = self._forward_sa(tgt, query_pos) + tgt = self._forward_ca(tgt, memory, query_pos, pos, num_k_exclude_rope) + # MLP + tgt2 = self.norm3(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) + tgt = tgt + self.dropout3(tgt2) + return tgt + + +class MemoryAttention(nn.Module): + def __init__( + self, + d_model: int, + pos_enc_at_input: bool, + layer: nn.Module, + num_layers: int, + batch_first: bool = True, # Do layers expect batch first input? + ): + super().__init__() + self.d_model = d_model + self.layers = get_clones(layer, num_layers) + self.num_layers = num_layers + self.norm = nn.LayerNorm(d_model) + self.pos_enc_at_input = pos_enc_at_input + self.batch_first = batch_first + + def forward( + self, + curr: torch.Tensor, # self-attention inputs + memory: torch.Tensor, # cross-attention inputs + curr_pos: Optional[Tensor] = None, # pos_enc for self-attention inputs + memory_pos: Optional[Tensor] = None, # pos_enc for cross-attention inputs + num_obj_ptr_tokens: int = 0, # number of object pointer *tokens* + ): + if isinstance(curr, list): + assert isinstance(curr_pos, list) + assert len(curr) == len(curr_pos) == 1 + curr, curr_pos = ( + curr[0], + curr_pos[0], + ) + + assert ( + curr.shape[1] == memory.shape[1] + ), "Batch size must be the same for curr and memory" + + output = curr + if self.pos_enc_at_input and curr_pos is not None: + output = output + 0.1 * curr_pos + + if self.batch_first: + # Convert to batch first + output = output.transpose(0, 1) + curr_pos = curr_pos.transpose(0, 1) + memory = memory.transpose(0, 1) + memory_pos = memory_pos.transpose(0, 1) + + for layer in self.layers: + kwds = {} + if isinstance(layer.cross_attn_image, RoPEAttention): + kwds = {"num_k_exclude_rope": num_obj_ptr_tokens} + + output = layer( + tgt=output, + memory=memory, + pos=memory_pos, + query_pos=curr_pos, + **kwds, + ) + normed_output = self.norm(output) + + if self.batch_first: + # Convert back to seq first + normed_output = normed_output.transpose(0, 1) + curr_pos = curr_pos.transpose(0, 1) + + return normed_output diff --git a/sam2/modeling/memory_encoder.py b/sam2/modeling/memory_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..83f98a2544b225f5bdb6e9a046380b8df5887a30 --- /dev/null +++ b/sam2/modeling/memory_encoder.py @@ -0,0 +1,181 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import math +from typing import Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from sam2.modeling.sam2_utils import DropPath, get_clones, LayerNorm2d + + +class MaskDownSampler(nn.Module): + """ + Progressively downsample a mask by total_stride, each time by stride. + Note that LayerNorm is applied per *token*, like in ViT. + + With each downsample (by a factor stride**2), channel capacity increases by the same factor. + In the end, we linearly project to embed_dim channels. 
+ """ + + def __init__( + self, + embed_dim=256, + kernel_size=4, + stride=4, + padding=0, + total_stride=16, + activation=nn.GELU, + ): + super().__init__() + num_layers = int(math.log2(total_stride) // math.log2(stride)) + assert stride**num_layers == total_stride + self.encoder = nn.Sequential() + mask_in_chans, mask_out_chans = 1, 1 + for _ in range(num_layers): + mask_out_chans = mask_in_chans * (stride**2) + self.encoder.append( + nn.Conv2d( + mask_in_chans, + mask_out_chans, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ) + ) + self.encoder.append(LayerNorm2d(mask_out_chans)) + self.encoder.append(activation()) + mask_in_chans = mask_out_chans + + self.encoder.append(nn.Conv2d(mask_out_chans, embed_dim, kernel_size=1)) + + def forward(self, x): + return self.encoder(x) + + +# Lightly adapted from ConvNext (https://github.com/facebookresearch/ConvNeXt) +class CXBlock(nn.Module): + r"""ConvNeXt Block. There are two equivalent implementations: + (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) + (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back + We use (2) as we find it slightly faster in PyTorch + + Args: + dim (int): Number of input channels. + drop_path (float): Stochastic depth rate. Default: 0.0 + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + """ + + def __init__( + self, + dim, + kernel_size=7, + padding=3, + drop_path=0.0, + layer_scale_init_value=1e-6, + use_dwconv=True, + ): + super().__init__() + self.dwconv = nn.Conv2d( + dim, + dim, + kernel_size=kernel_size, + padding=padding, + groups=dim if use_dwconv else 1, + ) # depthwise conv + self.norm = LayerNorm2d(dim, eps=1e-6) + self.pwconv1 = nn.Linear( + dim, 4 * dim + ) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(4 * dim, dim) + self.gamma = ( + nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True) + if layer_scale_init_value > 0 + else None + ) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + def forward(self, x): + input = x + x = self.dwconv(x) + x = self.norm(x) + x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) + + x = input + self.drop_path(x) + return x + + +class Fuser(nn.Module): + def __init__(self, layer, num_layers, dim=None, input_projection=False): + super().__init__() + self.proj = nn.Identity() + self.layers = get_clones(layer, num_layers) + + if input_projection: + assert dim is not None + self.proj = nn.Conv2d(dim, dim, kernel_size=1) + + def forward(self, x): + # normally x: (N, C, H, W) + x = self.proj(x) + for layer in self.layers: + x = layer(x) + return x + + +class MemoryEncoder(nn.Module): + def __init__( + self, + out_dim, + mask_downsampler, + fuser, + position_encoding, + in_dim=256, # in_dim of pix_feats + ): + super().__init__() + + self.mask_downsampler = mask_downsampler + + self.pix_feat_proj = nn.Conv2d(in_dim, in_dim, kernel_size=1) + self.fuser = fuser + self.position_encoding = position_encoding + self.out_proj = nn.Identity() + if out_dim != in_dim: + self.out_proj = nn.Conv2d(in_dim, out_dim, kernel_size=1) + + def forward( + self, + pix_feat: torch.Tensor, + masks: torch.Tensor, + skip_mask_sigmoid: bool = False, + ) -> Tuple[torch.Tensor, 
torch.Tensor]: + ## Process masks + # sigmoid, so that less domain shift from gt masks which are bool + if not skip_mask_sigmoid: + masks = F.sigmoid(masks) + masks = self.mask_downsampler(masks) + + ## Fuse pix_feats and downsampled masks + # in case the visual features are on CPU, cast them to CUDA + pix_feat = pix_feat.to(masks.device) + + x = self.pix_feat_proj(pix_feat) + x = x + masks + x = self.fuser(x) + x = self.out_proj(x) + + pos = self.position_encoding(x).to(x.dtype) + + return {"vision_features": x, "vision_pos_enc": [pos]} diff --git a/sam2/modeling/position_encoding.py b/sam2/modeling/position_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..8f41cf91739001ccedbd61e174df8d661310aee1 --- /dev/null +++ b/sam2/modeling/position_encoding.py @@ -0,0 +1,216 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import math +from typing import Any, Optional, Tuple + +import numpy as np + +import torch +from torch import nn + + +class PositionEmbeddingSine(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. + """ + + def __init__( + self, + num_pos_feats, + temperature: int = 10000, + normalize: bool = True, + scale: Optional[float] = None, + ): + super().__init__() + assert num_pos_feats % 2 == 0, "Expecting even model width" + self.num_pos_feats = num_pos_feats // 2 + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + self.cache = {} + + def _encode_xy(self, x, y): + # The positions are expected to be normalized + assert len(x) == len(y) and x.ndim == y.ndim == 1 + x_embed = x * self.scale + y_embed = y * self.scale + + dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) + + pos_x = x_embed[:, None] / dim_t + pos_y = y_embed[:, None] / dim_t + pos_x = torch.stack( + (pos_x[:, 0::2].sin(), pos_x[:, 1::2].cos()), dim=2 + ).flatten(1) + pos_y = torch.stack( + (pos_y[:, 0::2].sin(), pos_y[:, 1::2].cos()), dim=2 + ).flatten(1) + return pos_x, pos_y + + @torch.no_grad() + def encode_boxes(self, x, y, w, h): + pos_x, pos_y = self._encode_xy(x, y) + pos = torch.cat((pos_y, pos_x, h[:, None], w[:, None]), dim=1) + return pos + + encode = encode_boxes # Backwards compatibility + + @torch.no_grad() + def encode_points(self, x, y, labels): + (bx, nx), (by, ny), (bl, nl) = x.shape, y.shape, labels.shape + assert bx == by and nx == ny and bx == bl and nx == nl + pos_x, pos_y = self._encode_xy(x.flatten(), y.flatten()) + pos_x, pos_y = pos_x.reshape(bx, nx, -1), pos_y.reshape(by, ny, -1) + pos = torch.cat((pos_y, pos_x, labels[:, :, None]), dim=2) + return pos + + @torch.no_grad() + def forward(self, x: torch.Tensor): + cache_key = (x.shape[-2], x.shape[-1]) + if cache_key in self.cache: + return self.cache[cache_key][None].repeat(x.shape[0], 1, 1, 1) + y_embed = ( + torch.arange(1, x.shape[-2] + 1, dtype=torch.float32, device=x.device) + .view(1, -1, 1) + .repeat(x.shape[0], 1, x.shape[-1]) + ) + x_embed = ( + torch.arange(1, x.shape[-1] + 1, dtype=torch.float32, device=x.device) + .view(1, 1, -1) + 
.repeat(x.shape[0], x.shape[-2], 1) + ) + + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + self.cache[cache_key] = pos[0] + return pos + + +class PositionEmbeddingRandom(nn.Module): + """ + Positional encoding using random spatial frequencies. + """ + + def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None: + super().__init__() + if scale is None or scale <= 0.0: + scale = 1.0 + self.register_buffer( + "positional_encoding_gaussian_matrix", + scale * torch.randn((2, num_pos_feats)), + ) + + def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor: + """Positionally encode points that are normalized to [0,1].""" + # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape + coords = 2 * coords - 1 + coords = coords @ self.positional_encoding_gaussian_matrix + coords = 2 * np.pi * coords + # outputs d_1 x ... x d_n x C shape + return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1) + + def forward(self, size: Tuple[int, int]) -> torch.Tensor: + """Generate positional encoding for a grid of the specified size.""" + h, w = size + device: Any = self.positional_encoding_gaussian_matrix.device + grid = torch.ones((h, w), device=device, dtype=torch.float32) + y_embed = grid.cumsum(dim=0) - 0.5 + x_embed = grid.cumsum(dim=1) - 0.5 + y_embed = y_embed / h + x_embed = x_embed / w + + pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1)) + return pe.permute(2, 0, 1) # C x H x W + + def forward_with_coords( + self, coords_input: torch.Tensor, image_size: Tuple[int, int] + ) -> torch.Tensor: + """Positionally encode points that are not normalized to [0,1].""" + coords = coords_input.clone() + coords[:, :, 0] = coords[:, :, 0] / image_size[1] + coords[:, :, 1] = coords[:, :, 1] / image_size[0] + return self._pe_encoding(coords.to(torch.float)) # B x N x C + + +# Rotary Positional Encoding, adapted from: +# 1. https://github.com/meta-llama/codellama/blob/main/llama/model.py +# 2. https://github.com/naver-ai/rope-vit +# 3. 
https://github.com/lucidrains/rotary-embedding-torch + + +def init_t_xy(end_x: int, end_y: int): + t = torch.arange(end_x * end_y, dtype=torch.float32) + t_x = (t % end_x).float() + t_y = torch.div(t, end_x, rounding_mode="floor").float() + return t_x, t_y + + +def compute_axial_cis(dim: int, end_x: int, end_y: int, theta: float = 10000.0): + freqs_x = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim)) + freqs_y = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim)) + + t_x, t_y = init_t_xy(end_x, end_y) + freqs_x = torch.outer(t_x, freqs_x) + freqs_y = torch.outer(t_y, freqs_y) + freqs_cis_x = torch.polar(torch.ones_like(freqs_x), freqs_x) + freqs_cis_y = torch.polar(torch.ones_like(freqs_y), freqs_y) + return torch.cat([freqs_cis_x, freqs_cis_y], dim=-1) + + +def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): + ndim = x.ndim + assert 0 <= 1 < ndim + assert freqs_cis.shape == (x.shape[-2], x.shape[-1]) + shape = [d if i >= ndim - 2 else 1 for i, d in enumerate(x.shape)] + return freqs_cis.view(*shape) + + +def apply_rotary_enc( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cis: torch.Tensor, + repeat_freqs_k: bool = False, +): + xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) + xk_ = ( + torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) + if xk.shape[-2] != 0 + else None + ) + freqs_cis = reshape_for_broadcast(freqs_cis, xq_) + xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) + if xk_ is None: + # no keys to rotate, due to dropout + return xq_out.type_as(xq).to(xq.device), xk + # repeat freqs along seq_len dim to match k seq_len + if repeat_freqs_k: + r = xk_.shape[-2] // xq_.shape[-2] + freqs_cis = freqs_cis.repeat(*([1] * (freqs_cis.ndim - 2)), r, 1) + xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) + return xq_out.type_as(xq).to(xq.device), xk_out.type_as(xk).to(xk.device) diff --git a/sam2/modeling/sam/__init__.py b/sam2/modeling/sam/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4547e070da2f3ddc5bf2f466cb2242e6135c7dc3 --- /dev/null +++ b/sam2/modeling/sam/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
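Note: the rotary-embedding helpers at the end of sam2/modeling/position_encoding.py above operate on flattened 2D token grids. A small shape-checking sketch of how compute_axial_cis and apply_rotary_enc fit together (tensor sizes are illustrative assumptions only):

import torch
from sam2.modeling.position_encoding import apply_rotary_enc, compute_axial_cis

# Illustrative sizes: one image, 2 heads of dim 64 over an 8x8 grid of tokens.
B, n_heads, head_dim, H, W = 1, 2, 64, 8, 8
xq = torch.randn(B, n_heads, H * W, head_dim)
xk = torch.randn(B, n_heads, H * W, head_dim)

# Axial frequencies: a complex tensor of shape (H * W, head_dim // 2).
freqs_cis = compute_axial_cis(dim=head_dim, end_x=W, end_y=H)
xq_rot, xk_rot = apply_rotary_enc(xq, xk, freqs_cis)
# Shapes are unchanged; queries and keys now encode their (x, y) grid positions via the rotation.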
diff --git a/sam2/modeling/sam/__pycache__/__init__.cpython-312.pyc b/sam2/modeling/sam/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..409ff4ff8925dbfab0b361e975d61fadaff1c634 Binary files /dev/null and b/sam2/modeling/sam/__pycache__/__init__.cpython-312.pyc differ diff --git a/sam2/modeling/sam/__pycache__/mask_decoder.cpython-312.pyc b/sam2/modeling/sam/__pycache__/mask_decoder.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2bb9b2d15d9b486d2d53d6a28bf5045b35b2a03 Binary files /dev/null and b/sam2/modeling/sam/__pycache__/mask_decoder.cpython-312.pyc differ diff --git a/sam2/modeling/sam/__pycache__/prompt_encoder.cpython-312.pyc b/sam2/modeling/sam/__pycache__/prompt_encoder.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ebaafe7e21eb2710851043006ba921cbfaa11d3 Binary files /dev/null and b/sam2/modeling/sam/__pycache__/prompt_encoder.cpython-312.pyc differ diff --git a/sam2/modeling/sam/__pycache__/transformer.cpython-312.pyc b/sam2/modeling/sam/__pycache__/transformer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8bbd04c011408316b55c857e30b33a1aa7642205 Binary files /dev/null and b/sam2/modeling/sam/__pycache__/transformer.cpython-312.pyc differ diff --git a/sam2/modeling/sam/mask_decoder.py b/sam2/modeling/sam/mask_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..19a45f49a294f72e39cd7006eeb1ca91a4266c94 --- /dev/null +++ b/sam2/modeling/sam/mask_decoder.py @@ -0,0 +1,295 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import List, Optional, Tuple, Type + +import torch +from torch import nn + +from sam2.modeling.sam2_utils import LayerNorm2d, MLP + + +class MaskDecoder(nn.Module): + def __init__( + self, + *, + transformer_dim: int, + transformer: nn.Module, + num_multimask_outputs: int = 3, + activation: Type[nn.Module] = nn.GELU, + iou_head_depth: int = 3, + iou_head_hidden_dim: int = 256, + use_high_res_features: bool = False, + iou_prediction_use_sigmoid=False, + dynamic_multimask_via_stability=False, + dynamic_multimask_stability_delta=0.05, + dynamic_multimask_stability_thresh=0.98, + pred_obj_scores: bool = False, + pred_obj_scores_mlp: bool = False, + use_multimask_token_for_obj_ptr: bool = False, + ) -> None: + """ + Predicts masks given an image and prompt embeddings, using a + transformer architecture. 
+ + Arguments: + transformer_dim (int): the channel dimension of the transformer + transformer (nn.Module): the transformer used to predict masks + num_multimask_outputs (int): the number of masks to predict + when disambiguating masks + activation (nn.Module): the type of activation to use when + upscaling masks + iou_head_depth (int): the depth of the MLP used to predict + mask quality + iou_head_hidden_dim (int): the hidden dimension of the MLP + used to predict mask quality + """ + super().__init__() + self.transformer_dim = transformer_dim + self.transformer = transformer + + self.num_multimask_outputs = num_multimask_outputs + + self.iou_token = nn.Embedding(1, transformer_dim) + self.num_mask_tokens = num_multimask_outputs + 1 + self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim) + + self.pred_obj_scores = pred_obj_scores + if self.pred_obj_scores: + self.obj_score_token = nn.Embedding(1, transformer_dim) + self.use_multimask_token_for_obj_ptr = use_multimask_token_for_obj_ptr + + self.output_upscaling = nn.Sequential( + nn.ConvTranspose2d( + transformer_dim, transformer_dim // 4, kernel_size=2, stride=2 + ), + LayerNorm2d(transformer_dim // 4), + activation(), + nn.ConvTranspose2d( + transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2 + ), + activation(), + ) + self.use_high_res_features = use_high_res_features + if use_high_res_features: + self.conv_s0 = nn.Conv2d( + transformer_dim, transformer_dim // 8, kernel_size=1, stride=1 + ) + self.conv_s1 = nn.Conv2d( + transformer_dim, transformer_dim // 4, kernel_size=1, stride=1 + ) + + self.output_hypernetworks_mlps = nn.ModuleList( + [ + MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3) + for i in range(self.num_mask_tokens) + ] + ) + + self.iou_prediction_head = MLP( + transformer_dim, + iou_head_hidden_dim, + self.num_mask_tokens, + iou_head_depth, + sigmoid_output=iou_prediction_use_sigmoid, + ) + if self.pred_obj_scores: + self.pred_obj_score_head = nn.Linear(transformer_dim, 1) + if pred_obj_scores_mlp: + self.pred_obj_score_head = MLP(transformer_dim, transformer_dim, 1, 3) + + # When outputting a single mask, optionally we can dynamically fall back to the best + # multimask output token if the single mask output token gives low stability scores. + self.dynamic_multimask_via_stability = dynamic_multimask_via_stability + self.dynamic_multimask_stability_delta = dynamic_multimask_stability_delta + self.dynamic_multimask_stability_thresh = dynamic_multimask_stability_thresh + + def forward( + self, + image_embeddings: torch.Tensor, + image_pe: torch.Tensor, + sparse_prompt_embeddings: torch.Tensor, + dense_prompt_embeddings: torch.Tensor, + multimask_output: bool, + repeat_image: bool, + high_res_features: Optional[List[torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Predict masks given image and prompt embeddings. + + Arguments: + image_embeddings (torch.Tensor): the embeddings from the image encoder + image_pe (torch.Tensor): positional encoding with the shape of image_embeddings + sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes + dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs + multimask_output (bool): Whether to return multiple masks or a single + mask. 
+ + Returns: + torch.Tensor: batched predicted masks + torch.Tensor: batched predictions of mask quality + torch.Tensor: batched SAM token for mask output + """ + masks, iou_pred, mask_tokens_out, object_score_logits = self.predict_masks( + image_embeddings=image_embeddings, + image_pe=image_pe, + sparse_prompt_embeddings=sparse_prompt_embeddings, + dense_prompt_embeddings=dense_prompt_embeddings, + repeat_image=repeat_image, + high_res_features=high_res_features, + ) + + # Select the correct mask or masks for output + if multimask_output: + masks = masks[:, 1:, :, :] + iou_pred = iou_pred[:, 1:] + elif self.dynamic_multimask_via_stability and not self.training: + masks, iou_pred = self._dynamic_multimask_via_stability(masks, iou_pred) + else: + masks = masks[:, 0:1, :, :] + iou_pred = iou_pred[:, 0:1] + + if multimask_output and self.use_multimask_token_for_obj_ptr: + sam_tokens_out = mask_tokens_out[:, 1:] # [b, 3, c] shape + else: + # Take the mask output token. Here we *always* use the token for single mask output. + # At test time, even if we track after 1-click (and using multimask_output=True), + # we still take the single mask token here. The rationale is that we always track + # after multiple clicks during training, so the past tokens seen during training + # are always the single mask token (and we'll let it be the object-memory token). + sam_tokens_out = mask_tokens_out[:, 0:1] # [b, 1, c] shape + + # Prepare output + return masks, iou_pred, sam_tokens_out, object_score_logits + + def predict_masks( + self, + image_embeddings: torch.Tensor, + image_pe: torch.Tensor, + sparse_prompt_embeddings: torch.Tensor, + dense_prompt_embeddings: torch.Tensor, + repeat_image: bool, + high_res_features: Optional[List[torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Predicts masks. 
See 'forward' for more details.""" + # Concatenate output tokens + s = 0 + if self.pred_obj_scores: + output_tokens = torch.cat( + [ + self.obj_score_token.weight, + self.iou_token.weight, + self.mask_tokens.weight, + ], + dim=0, + ) + s = 1 + else: + output_tokens = torch.cat( + [self.iou_token.weight, self.mask_tokens.weight], dim=0 + ) + output_tokens = output_tokens.unsqueeze(0).expand( + sparse_prompt_embeddings.size(0), -1, -1 + ) + tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1) + + # Expand per-image data in batch direction to be per-mask + if repeat_image: + src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0) + else: + assert image_embeddings.shape[0] == tokens.shape[0] + src = image_embeddings + src = src + dense_prompt_embeddings + assert ( + image_pe.size(0) == 1 + ), "image_pe should have size 1 in batch dim (from `get_dense_pe()`)" + pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0) + b, c, h, w = src.shape + + # Run the transformer + hs, src = self.transformer(src, pos_src, tokens) + iou_token_out = hs[:, s, :] + mask_tokens_out = hs[:, s + 1 : (s + 1 + self.num_mask_tokens), :] + + # Upscale mask embeddings and predict masks using the mask tokens + src = src.transpose(1, 2).view(b, c, h, w) + if not self.use_high_res_features: + upscaled_embedding = self.output_upscaling(src) + else: + dc1, ln1, act1, dc2, act2 = self.output_upscaling + feat_s0, feat_s1 = high_res_features + upscaled_embedding = act1(ln1(dc1(src) + feat_s1)) + upscaled_embedding = act2(dc2(upscaled_embedding) + feat_s0) + + hyper_in_list: List[torch.Tensor] = [] + for i in range(self.num_mask_tokens): + hyper_in_list.append( + self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]) + ) + hyper_in = torch.stack(hyper_in_list, dim=1) + b, c, h, w = upscaled_embedding.shape + masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w) + + # Generate mask quality predictions + iou_pred = self.iou_prediction_head(iou_token_out) + if self.pred_obj_scores: + assert s == 1 + object_score_logits = self.pred_obj_score_head(hs[:, 0, :]) + else: + # Obj scores logits - default to 10.0, i.e. assuming the object is present, sigmoid(10)=1 + object_score_logits = 10.0 * iou_pred.new_ones(iou_pred.shape[0], 1) + + return masks, iou_pred, mask_tokens_out, object_score_logits + + def _get_stability_scores(self, mask_logits): + """ + Compute stability scores of the mask logits based on the IoU between upper and + lower thresholds, similar to https://github.com/fairinternal/onevision/pull/568. + """ + mask_logits = mask_logits.flatten(-2) + stability_delta = self.dynamic_multimask_stability_delta + area_i = torch.sum(mask_logits > stability_delta, dim=-1).float() + area_u = torch.sum(mask_logits > -stability_delta, dim=-1).float() + stability_scores = torch.where(area_u > 0, area_i / area_u, 1.0) + return stability_scores + + def _dynamic_multimask_via_stability(self, all_mask_logits, all_iou_scores): + """ + When outputting a single mask, if the stability score from the current single-mask + output (based on output token 0) falls below a threshold, we instead select from + multi-mask outputs (based on output token 1~3) the mask with the highest predicted + IoU score. This is intended to ensure a valid mask for both clicking and tracking. 
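+ (The stability score here is the IoU between the mask binarized at the +delta and -delta logit thresholds; see _get_stability_scores.)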
+ """ + # The best mask from multimask output tokens (1~3) + multimask_logits = all_mask_logits[:, 1:, :, :] + multimask_iou_scores = all_iou_scores[:, 1:] + best_scores_inds = torch.argmax(multimask_iou_scores, dim=-1) + batch_inds = torch.arange( + multimask_iou_scores.size(0), device=all_iou_scores.device + ) + best_multimask_logits = multimask_logits[batch_inds, best_scores_inds] + best_multimask_logits = best_multimask_logits.unsqueeze(1) + best_multimask_iou_scores = multimask_iou_scores[batch_inds, best_scores_inds] + best_multimask_iou_scores = best_multimask_iou_scores.unsqueeze(1) + + # The mask from singlemask output token 0 and its stability score + singlemask_logits = all_mask_logits[:, 0:1, :, :] + singlemask_iou_scores = all_iou_scores[:, 0:1] + stability_scores = self._get_stability_scores(singlemask_logits) + is_stable = stability_scores >= self.dynamic_multimask_stability_thresh + + # Dynamically fall back to best multimask output upon low stability scores. + mask_logits_out = torch.where( + is_stable[..., None, None].expand_as(singlemask_logits), + singlemask_logits, + best_multimask_logits, + ) + iou_scores_out = torch.where( + is_stable.expand_as(singlemask_iou_scores), + singlemask_iou_scores, + best_multimask_iou_scores, + ) + return mask_logits_out, iou_scores_out diff --git a/sam2/modeling/sam/prompt_encoder.py b/sam2/modeling/sam/prompt_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..91d9952ca8078bedd04fdc2ea0d900529e432528 --- /dev/null +++ b/sam2/modeling/sam/prompt_encoder.py @@ -0,0 +1,182 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Optional, Tuple, Type + +import torch +from torch import nn + +from sam2.modeling.position_encoding import PositionEmbeddingRandom + +from sam2.modeling.sam2_utils import LayerNorm2d + + +class PromptEncoder(nn.Module): + def __init__( + self, + embed_dim: int, + image_embedding_size: Tuple[int, int], + input_image_size: Tuple[int, int], + mask_in_chans: int, + activation: Type[nn.Module] = nn.GELU, + ) -> None: + """ + Encodes prompts for input to SAM's mask decoder. + + Arguments: + embed_dim (int): The prompts' embedding dimension + image_embedding_size (tuple(int, int)): The spatial size of the + image embedding, as (H, W). + input_image_size (int): The padded size of the image as input + to the image encoder, as (H, W). + mask_in_chans (int): The number of hidden channels used for + encoding input masks. + activation (nn.Module): The activation to use when encoding + input masks. 
+ """ + super().__init__() + self.embed_dim = embed_dim + self.input_image_size = input_image_size + self.image_embedding_size = image_embedding_size + self.pe_layer = PositionEmbeddingRandom(embed_dim // 2) + + self.num_point_embeddings: int = 4 # pos/neg point + 2 box corners + point_embeddings = [ + nn.Embedding(1, embed_dim) for i in range(self.num_point_embeddings) + ] + self.point_embeddings = nn.ModuleList(point_embeddings) + self.not_a_point_embed = nn.Embedding(1, embed_dim) + + self.mask_input_size = ( + 4 * image_embedding_size[0], + 4 * image_embedding_size[1], + ) + self.mask_downscaling = nn.Sequential( + nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2), + LayerNorm2d(mask_in_chans // 4), + activation(), + nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=2, stride=2), + LayerNorm2d(mask_in_chans), + activation(), + nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1), + ) + self.no_mask_embed = nn.Embedding(1, embed_dim) + + def get_dense_pe(self) -> torch.Tensor: + """ + Returns the positional encoding used to encode point prompts, + applied to a dense set of points the shape of the image encoding. + + Returns: + torch.Tensor: Positional encoding with shape + 1x(embed_dim)x(embedding_h)x(embedding_w) + """ + return self.pe_layer(self.image_embedding_size).unsqueeze(0) + + def _embed_points( + self, + points: torch.Tensor, + labels: torch.Tensor, + pad: bool, + ) -> torch.Tensor: + """Embeds point prompts.""" + points = points + 0.5 # Shift to center of pixel + if pad: + padding_point = torch.zeros((points.shape[0], 1, 2), device=points.device) + padding_label = -torch.ones((labels.shape[0], 1), device=labels.device) + points = torch.cat([points, padding_point], dim=1) + labels = torch.cat([labels, padding_label], dim=1) + point_embedding = self.pe_layer.forward_with_coords( + points, self.input_image_size + ) + point_embedding[labels == -1] = 0.0 + point_embedding[labels == -1] += self.not_a_point_embed.weight + point_embedding[labels == 0] += self.point_embeddings[0].weight + point_embedding[labels == 1] += self.point_embeddings[1].weight + point_embedding[labels == 2] += self.point_embeddings[2].weight + point_embedding[labels == 3] += self.point_embeddings[3].weight + return point_embedding + + def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor: + """Embeds box prompts.""" + boxes = boxes + 0.5 # Shift to center of pixel + coords = boxes.reshape(-1, 2, 2) + corner_embedding = self.pe_layer.forward_with_coords( + coords, self.input_image_size + ) + corner_embedding[:, 0, :] += self.point_embeddings[2].weight + corner_embedding[:, 1, :] += self.point_embeddings[3].weight + return corner_embedding + + def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor: + """Embeds mask inputs.""" + mask_embedding = self.mask_downscaling(masks) + return mask_embedding + + def _get_batch_size( + self, + points: Optional[Tuple[torch.Tensor, torch.Tensor]], + boxes: Optional[torch.Tensor], + masks: Optional[torch.Tensor], + ) -> int: + """ + Gets the batch size of the output given the batch size of the input prompts. 
+ """ + if points is not None: + return points[0].shape[0] + elif boxes is not None: + return boxes.shape[0] + elif masks is not None: + return masks.shape[0] + else: + return 1 + + def _get_device(self) -> torch.device: + return self.point_embeddings[0].weight.device + + def forward( + self, + points: Optional[Tuple[torch.Tensor, torch.Tensor]], + boxes: Optional[torch.Tensor], + masks: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Embeds different types of prompts, returning both sparse and dense + embeddings. + + Arguments: + points (tuple(torch.Tensor, torch.Tensor) or none): point coordinates + and labels to embed. + boxes (torch.Tensor or none): boxes to embed + masks (torch.Tensor or none): masks to embed + + Returns: + torch.Tensor: sparse embeddings for the points and boxes, with shape + BxNx(embed_dim), where N is determined by the number of input points + and boxes. + torch.Tensor: dense embeddings for the masks, in the shape + Bx(embed_dim)x(embed_H)x(embed_W) + """ + bs = self._get_batch_size(points, boxes, masks) + sparse_embeddings = torch.empty( + (bs, 0, self.embed_dim), device=self._get_device() + ) + if points is not None: + coords, labels = points + point_embeddings = self._embed_points(coords, labels, pad=(boxes is None)) + sparse_embeddings = torch.cat([sparse_embeddings, point_embeddings], dim=1) + if boxes is not None: + box_embeddings = self._embed_boxes(boxes) + sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=1) + + if masks is not None: + dense_embeddings = self._embed_masks(masks) + else: + dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand( + bs, -1, self.image_embedding_size[0], self.image_embedding_size[1] + ) + + return sparse_embeddings, dense_embeddings diff --git a/sam2/modeling/sam/transformer.py b/sam2/modeling/sam/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..73d2e27cf29db67e10c79e4818e55414f1de1d76 --- /dev/null +++ b/sam2/modeling/sam/transformer.py @@ -0,0 +1,327 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import math +import warnings +from functools import partial +from typing import Tuple, Type + +import torch +import torch.nn.functional as F +from torch import nn, Tensor + +from sam2.modeling.position_encoding import apply_rotary_enc, compute_axial_cis + +from sam2.modeling.sam2_utils import MLP +from sam2.utils.misc import get_sdpa_settings + +warnings.simplefilter(action="ignore", category=FutureWarning) +OLD_GPU, USE_FLASH_ATTN, MATH_KERNEL_ON = get_sdpa_settings() + + +class TwoWayTransformer(nn.Module): + def __init__( + self, + depth: int, + embedding_dim: int, + num_heads: int, + mlp_dim: int, + activation: Type[nn.Module] = nn.ReLU, + attention_downsample_rate: int = 2, + ) -> None: + """ + A transformer decoder that attends to an input image using + queries whose positional embedding is supplied. + + Args: + depth (int): number of layers in the transformer + embedding_dim (int): the channel dimension for the input embeddings + num_heads (int): the number of heads for multihead attention. 
Must + divide embedding_dim + mlp_dim (int): the channel dimension internal to the MLP block + activation (nn.Module): the activation to use in the MLP block + """ + super().__init__() + self.depth = depth + self.embedding_dim = embedding_dim + self.num_heads = num_heads + self.mlp_dim = mlp_dim + self.layers = nn.ModuleList() + + for i in range(depth): + self.layers.append( + TwoWayAttentionBlock( + embedding_dim=embedding_dim, + num_heads=num_heads, + mlp_dim=mlp_dim, + activation=activation, + attention_downsample_rate=attention_downsample_rate, + skip_first_layer_pe=(i == 0), + ) + ) + + self.final_attn_token_to_image = Attention( + embedding_dim, num_heads, downsample_rate=attention_downsample_rate + ) + self.norm_final_attn = nn.LayerNorm(embedding_dim) + + def forward( + self, + image_embedding: Tensor, + image_pe: Tensor, + point_embedding: Tensor, + ) -> Tuple[Tensor, Tensor]: + """ + Args: + image_embedding (torch.Tensor): image to attend to. Should be shape + B x embedding_dim x h x w for any h and w. + image_pe (torch.Tensor): the positional encoding to add to the image. Must + have the same shape as image_embedding. + point_embedding (torch.Tensor): the embedding to add to the query points. + Must have shape B x N_points x embedding_dim for any N_points. + + Returns: + torch.Tensor: the processed point_embedding + torch.Tensor: the processed image_embedding + """ + # BxCxHxW -> BxHWxC == B x N_image_tokens x C + bs, c, h, w = image_embedding.shape + image_embedding = image_embedding.flatten(2).permute(0, 2, 1) + image_pe = image_pe.flatten(2).permute(0, 2, 1) + + # Prepare queries + queries = point_embedding + keys = image_embedding + + # Apply transformer blocks and final layernorm + for layer in self.layers: + queries, keys = layer( + queries=queries, + keys=keys, + query_pe=point_embedding, + key_pe=image_pe, + ) + + # Apply the final attention layer from the points to the image + q = queries + point_embedding + k = keys + image_pe + attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys) + queries = queries + attn_out + queries = self.norm_final_attn(queries) + + return queries, keys + + +class TwoWayAttentionBlock(nn.Module): + def __init__( + self, + embedding_dim: int, + num_heads: int, + mlp_dim: int = 2048, + activation: Type[nn.Module] = nn.ReLU, + attention_downsample_rate: int = 2, + skip_first_layer_pe: bool = False, + ) -> None: + """ + A transformer block with four layers: (1) self-attention of sparse + inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp + block on sparse inputs, and (4) cross attention of dense inputs to sparse + inputs. 
+ + Arguments: + embedding_dim (int): the channel dimension of the embeddings + num_heads (int): the number of heads in the attention layers + mlp_dim (int): the hidden dimension of the mlp block + activation (nn.Module): the activation of the mlp block + skip_first_layer_pe (bool): skip the PE on the first layer + """ + super().__init__() + self.self_attn = Attention(embedding_dim, num_heads) + self.norm1 = nn.LayerNorm(embedding_dim) + + self.cross_attn_token_to_image = Attention( + embedding_dim, num_heads, downsample_rate=attention_downsample_rate + ) + self.norm2 = nn.LayerNorm(embedding_dim) + + self.mlp = MLP( + embedding_dim, mlp_dim, embedding_dim, num_layers=2, activation=activation + ) + self.norm3 = nn.LayerNorm(embedding_dim) + + self.norm4 = nn.LayerNorm(embedding_dim) + self.cross_attn_image_to_token = Attention( + embedding_dim, num_heads, downsample_rate=attention_downsample_rate + ) + + self.skip_first_layer_pe = skip_first_layer_pe + + def forward( + self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor + ) -> Tuple[Tensor, Tensor]: + # Self attention block + if self.skip_first_layer_pe: + queries = self.self_attn(q=queries, k=queries, v=queries) + else: + q = queries + query_pe + attn_out = self.self_attn(q=q, k=q, v=queries) + queries = queries + attn_out + queries = self.norm1(queries) + + # Cross attention block, tokens attending to image embedding + q = queries + query_pe + k = keys + key_pe + attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys) + queries = queries + attn_out + queries = self.norm2(queries) + + # MLP block + mlp_out = self.mlp(queries) + queries = queries + mlp_out + queries = self.norm3(queries) + + # Cross attention block, image embedding attending to tokens + q = queries + query_pe + k = keys + key_pe + attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries) + keys = keys + attn_out + keys = self.norm4(keys) + + return queries, keys + + +class Attention(nn.Module): + """ + An attention layer that allows for downscaling the size of the embedding + after projection to queries, keys, and values. + """ + + def __init__( + self, + embedding_dim: int, + num_heads: int, + downsample_rate: int = 1, + dropout: float = 0.0, + kv_in_dim: int = None, + ) -> None: + super().__init__() + self.embedding_dim = embedding_dim + self.kv_in_dim = kv_in_dim if kv_in_dim is not None else embedding_dim + self.internal_dim = embedding_dim // downsample_rate + self.num_heads = num_heads + assert ( + self.internal_dim % num_heads == 0 + ), "num_heads must divide embedding_dim." 
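+ # q is projected from embedding_dim and k/v from kv_in_dim; all three are reduced to internal_dim (= embedding_dim // downsample_rate), and out_proj maps the attention output back to embedding_dim.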
+ + self.q_proj = nn.Linear(embedding_dim, self.internal_dim) + self.k_proj = nn.Linear(self.kv_in_dim, self.internal_dim) + self.v_proj = nn.Linear(self.kv_in_dim, self.internal_dim) + self.out_proj = nn.Linear(self.internal_dim, embedding_dim) + + self.dropout_p = dropout + + def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor: + b, n, c = x.shape + x = x.reshape(b, n, num_heads, c // num_heads) + return x.transpose(1, 2) # B x N_heads x N_tokens x C_per_head + + def _recombine_heads(self, x: Tensor) -> Tensor: + b, n_heads, n_tokens, c_per_head = x.shape + x = x.transpose(1, 2) + return x.reshape(b, n_tokens, n_heads * c_per_head) # B x N_tokens x C + + def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor: + # Input projections + q = self.q_proj(q) + k = self.k_proj(k) + v = self.v_proj(v) + + # Separate into heads + q = self._separate_heads(q, self.num_heads) + k = self._separate_heads(k, self.num_heads) + v = self._separate_heads(v, self.num_heads) + + dropout_p = self.dropout_p if self.training else 0.0 + # Attention + with torch.backends.cuda.sdp_kernel( + enable_flash=USE_FLASH_ATTN, + # if Flash attention kernel is off, then math kernel needs to be enabled + enable_math=(OLD_GPU and dropout_p > 0.0) or MATH_KERNEL_ON, + enable_mem_efficient=OLD_GPU, + ): + out = F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p) + + out = self._recombine_heads(out) + out = self.out_proj(out) + + return out + + +class RoPEAttention(Attention): + """Attention with rotary position encoding.""" + + def __init__( + self, + *args, + rope_theta=10000.0, + # whether to repeat q rope to match k length + # this is needed for cross-attention to memories + rope_k_repeat=False, + feat_sizes=(32, 32), # [w, h] for stride 16 feats at 512 resolution + **kwargs, + ): + super().__init__(*args, **kwargs) + + self.compute_cis = partial( + compute_axial_cis, dim=self.internal_dim // self.num_heads, theta=rope_theta + ) + freqs_cis = self.compute_cis(end_x=feat_sizes[0], end_y=feat_sizes[1]) + self.freqs_cis = freqs_cis + self.rope_k_repeat = rope_k_repeat + + def forward( + self, q: Tensor, k: Tensor, v: Tensor, num_k_exclude_rope: int = 0 + ) -> Tensor: + # Input projections + q = self.q_proj(q) + k = self.k_proj(k) + v = self.v_proj(v) + + # Separate into heads + q = self._separate_heads(q, self.num_heads) + k = self._separate_heads(k, self.num_heads) + v = self._separate_heads(v, self.num_heads) + + # Apply rotary position encoding + w = h = math.sqrt(q.shape[-2]) + self.freqs_cis = self.freqs_cis.to(q.device) + if self.freqs_cis.shape[0] != q.shape[-2]: + self.freqs_cis = self.compute_cis(end_x=w, end_y=h).to(q.device) + if q.shape[-2] != k.shape[-2]: + assert self.rope_k_repeat + + num_k_rope = k.size(-2) - num_k_exclude_rope + q, k[:, :, :num_k_rope] = apply_rotary_enc( + q, + k[:, :, :num_k_rope], + freqs_cis=self.freqs_cis, + repeat_freqs_k=self.rope_k_repeat, + ) + + dropout_p = self.dropout_p if self.training else 0.0 + # Attention + with torch.backends.cuda.sdp_kernel( + enable_flash=USE_FLASH_ATTN, + # if Flash attention kernel is off, then math kernel needs to be enabled + enable_math=(OLD_GPU and dropout_p > 0.0) or MATH_KERNEL_ON, + enable_mem_efficient=OLD_GPU, + ): + out = F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p) + + out = self._recombine_heads(out) + out = self.out_proj(out) + + return out diff --git a/sam2/modeling/sam2_base.py b/sam2/modeling/sam2_base.py new file mode 100644 index 
0000000000000000000000000000000000000000..1d9014673b4f8f057cc95bd3cb69e74237c95c98 --- /dev/null +++ b/sam2/modeling/sam2_base.py @@ -0,0 +1,829 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.distributed +import torch.nn.functional as F + +from torch.nn.init import trunc_normal_ + +from sam2.modeling.sam.mask_decoder import MaskDecoder +from sam2.modeling.sam.prompt_encoder import PromptEncoder +from sam2.modeling.sam.transformer import TwoWayTransformer +from sam2.modeling.sam2_utils import get_1d_sine_pe, MLP, select_closest_cond_frames + +# a large negative value as a placeholder score for missing objects +NO_OBJ_SCORE = -1024.0 + + +class SAM2Base(torch.nn.Module): + def __init__( + self, + image_encoder, + memory_attention, + memory_encoder, + num_maskmem=7, # default 1 input frame + 6 previous frames + image_size=512, + backbone_stride=16, # stride of the image backbone output + sigmoid_scale_for_mem_enc=1.0, # scale factor for mask sigmoid prob + sigmoid_bias_for_mem_enc=0.0, # bias factor for mask sigmoid prob + # During evaluation, whether to binarize the sigmoid mask logits on interacted frames with clicks + binarize_mask_from_pts_for_mem_enc=False, + use_mask_input_as_output_without_sam=False, # on frames with mask input, whether to directly output the input mask without using a SAM prompt encoder + mask decoder + # The maximum number of conditioning frames to participate in the memory attention (-1 means no limit; if there are more conditioning frames than this limit, + # we only cross-attend to the temporally closest `max_cond_frames_in_attn` conditioning frames in the encoder when tracking each frame). This gives the model + # a temporal locality when handling a large number of annotated frames (since closer frames should be more important) and also avoids GPU OOM. + max_cond_frames_in_attn=-1, + # on the first frame, whether to directly add the no-memory embedding to the image feature + # (instead of using the transformer encoder) + directly_add_no_mem_embed=False, + # whether to use high-resolution feature maps in the SAM mask decoder + use_high_res_features_in_sam=False, + # whether to output multiple (3) masks for the first click on initial conditioning frames + multimask_output_in_sam=False, + # the minimum and maximum number of clicks to use multimask_output_in_sam (only relevant when `multimask_output_in_sam=True`; + # default is 1 for both, meaning that only the first click gives multimask output; also note that a box counts as two points) + multimask_min_pt_num=1, + multimask_max_pt_num=1, + # whether to also use multimask output for tracking (not just for the first click on initial conditioning frames; only relevant when `multimask_output_in_sam=True`) + multimask_output_for_tracking=False, + # Whether to use multimask tokens for obj ptr; Only relevant when both + # use_obj_ptrs_in_encoder=True and multimask_output_for_tracking=True + use_multimask_token_for_obj_ptr: bool = False, + # whether to use sigmoid to restrict ious prediction to [0-1] + iou_prediction_use_sigmoid=False, + # The memory bank's temporal stride during evaluation (i.e. the `r` parameter in XMem and Cutie; XMem and Cutie use r=5). + # For r>1, the (self.num_maskmem - 1) non-conditioning memory frames consist of + # (self.num_maskmem - 2) nearest frames from every r-th frames, plus the last frame. 
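+ # (e.g. with num_maskmem=7 and r=3, the non-conditioning memories are the last frame plus 5 earlier frames sampled from every 3rd frame)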
+ memory_temporal_stride_for_eval=1, + # if `add_all_frames_to_correct_as_cond` is True, we also append to the conditioning frame list any frame that receives a later correction click + # if `add_all_frames_to_correct_as_cond` is False, we conditioning frame list to only use those initial conditioning frames + add_all_frames_to_correct_as_cond=False, + # whether to apply non-overlapping constraints on the object masks in the memory encoder during evaluation (to avoid/alleviate superposing masks) + non_overlap_masks_for_mem_enc=False, + # whether to cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder + use_obj_ptrs_in_encoder=False, + # the maximum number of object pointers from other frames in encoder cross attention (only relevant when `use_obj_ptrs_in_encoder=True`) + max_obj_ptrs_in_encoder=16, + # whether to add temporal positional encoding to the object pointers in the encoder (only relevant when `use_obj_ptrs_in_encoder=True`) + add_tpos_enc_to_obj_ptrs=True, + # whether to add an extra linear projection layer for the temporal positional encoding in the object pointers to avoid potential interference + # with spatial positional encoding (only relevant when both `use_obj_ptrs_in_encoder=True` and `add_tpos_enc_to_obj_ptrs=True`) + proj_tpos_enc_in_obj_ptrs=False, + # whether to only attend to object pointers in the past (before the current frame) in the encoder during evaluation + # (only relevant when `use_obj_ptrs_in_encoder=True`; this might avoid pointer information too far in the future to distract the initial tracking) + only_obj_ptrs_in_the_past_for_eval=False, + # Whether to predict if there is an object in the frame + pred_obj_scores: bool = False, + # Whether to use an MLP to predict object scores + pred_obj_scores_mlp: bool = False, + # Only relevant if pred_obj_scores=True and use_obj_ptrs_in_encoder=True; + # Whether to have a fixed no obj pointer when there is no object present + # or to use it as an additive embedding with obj_ptr produced by decoder + fixed_no_obj_ptr: bool = False, + # Soft no object, i.e. mix in no_obj_ptr softly, + # hope to make recovery easier if there is a mistake and mitigate accumulation of errors + soft_no_obj_ptr: bool = False, + use_mlp_for_obj_ptr_proj: bool = False, + # extra arguments used to construct the SAM mask decoder; if not None, it should be a dict of kwargs to be passed into `MaskDecoder` class. + sam_mask_decoder_extra_args=None, + compile_image_encoder: bool = False, + ): + super().__init__() + + # Part 1: the image backbone + self.image_encoder = image_encoder + # Use level 0, 1, 2 for high-res setting, or just level 2 for the default setting + self.use_high_res_features_in_sam = use_high_res_features_in_sam + self.num_feature_levels = 3 if use_high_res_features_in_sam else 1 + self.use_obj_ptrs_in_encoder = use_obj_ptrs_in_encoder + self.max_obj_ptrs_in_encoder = max_obj_ptrs_in_encoder + if use_obj_ptrs_in_encoder: + # A conv layer to downsample the mask prompt to stride 4 (the same stride as + # low-res SAM mask logits) and to change its scales from 0~1 to SAM logit scale, + # so that it can be fed into the SAM mask decoder to generate a pointer. 
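+ # (e.g. with the default image_size=512, a 512x512 input mask becomes 128x128, matching the stride-4 low-res SAM mask logits)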
+ self.mask_downsample = torch.nn.Conv2d(1, 1, kernel_size=4, stride=4) + self.add_tpos_enc_to_obj_ptrs = add_tpos_enc_to_obj_ptrs + if proj_tpos_enc_in_obj_ptrs: + assert add_tpos_enc_to_obj_ptrs # these options need to be used together + self.proj_tpos_enc_in_obj_ptrs = proj_tpos_enc_in_obj_ptrs + self.only_obj_ptrs_in_the_past_for_eval = only_obj_ptrs_in_the_past_for_eval + + # Part 2: memory attention to condition current frame's visual features + # with memories (and obj ptrs) from past frames + self.memory_attention = memory_attention + self.hidden_dim = memory_attention.d_model + + # Part 3: memory encoder for the previous frame's outputs + self.memory_encoder = memory_encoder + self.mem_dim = self.hidden_dim + if hasattr(self.memory_encoder, "out_proj") and hasattr( + self.memory_encoder.out_proj, "weight" + ): + # if there is compression of memories along channel dim + self.mem_dim = self.memory_encoder.out_proj.weight.shape[0] + self.num_maskmem = num_maskmem # Number of memories accessible + # Temporal encoding of the memories + self.maskmem_tpos_enc = torch.nn.Parameter( + torch.zeros(num_maskmem, 1, 1, self.mem_dim) + ) + trunc_normal_(self.maskmem_tpos_enc, std=0.02) + # a single token to indicate no memory embedding from previous frames + self.no_mem_embed = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim)) + self.no_mem_pos_enc = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim)) + trunc_normal_(self.no_mem_embed, std=0.02) + trunc_normal_(self.no_mem_pos_enc, std=0.02) + self.directly_add_no_mem_embed = directly_add_no_mem_embed + # Apply sigmoid to the output raw mask logits (to turn them from + # range (-inf, +inf) to range (0, 1)) before feeding them into the memory encoder + self.sigmoid_scale_for_mem_enc = sigmoid_scale_for_mem_enc + self.sigmoid_bias_for_mem_enc = sigmoid_bias_for_mem_enc + self.binarize_mask_from_pts_for_mem_enc = binarize_mask_from_pts_for_mem_enc + self.non_overlap_masks_for_mem_enc = non_overlap_masks_for_mem_enc + self.memory_temporal_stride_for_eval = memory_temporal_stride_for_eval + # On frames with mask input, whether to directly output the input mask without + # using a SAM prompt encoder + mask decoder + self.use_mask_input_as_output_without_sam = use_mask_input_as_output_without_sam + self.multimask_output_in_sam = multimask_output_in_sam + self.multimask_min_pt_num = multimask_min_pt_num + self.multimask_max_pt_num = multimask_max_pt_num + self.multimask_output_for_tracking = multimask_output_for_tracking + self.use_multimask_token_for_obj_ptr = use_multimask_token_for_obj_ptr + self.iou_prediction_use_sigmoid = iou_prediction_use_sigmoid + + # Part 4: SAM-style prompt encoder (for both mask and point inputs) + # and SAM-style mask decoder for the final mask output + self.image_size = image_size + self.backbone_stride = backbone_stride + self.sam_mask_decoder_extra_args = sam_mask_decoder_extra_args + self.pred_obj_scores = pred_obj_scores + self.pred_obj_scores_mlp = pred_obj_scores_mlp + self.fixed_no_obj_ptr = fixed_no_obj_ptr + self.soft_no_obj_ptr = soft_no_obj_ptr + if self.fixed_no_obj_ptr: + assert self.pred_obj_scores + assert self.use_obj_ptrs_in_encoder + if self.pred_obj_scores and self.use_obj_ptrs_in_encoder: + self.no_obj_ptr = torch.nn.Parameter(torch.zeros(1, self.hidden_dim)) + trunc_normal_(self.no_obj_ptr, std=0.02) + self.use_mlp_for_obj_ptr_proj = use_mlp_for_obj_ptr_proj + + self._build_sam_heads() + self.add_all_frames_to_correct_as_cond = add_all_frames_to_correct_as_cond + self.max_cond_frames_in_attn = 
max_cond_frames_in_attn + + # Model compilation + if compile_image_encoder: + # Compile the forward function (not the full module) to allow loading checkpoints. + print( + "Image encoder compilation is enabled. First forward pass will be slow." + ) + self.image_encoder.forward = torch.compile( + self.image_encoder.forward, + mode="max-autotune", + fullgraph=True, + dynamic=False, + ) + + @property + def device(self): + return next(self.parameters()).device + + def forward(self, *args, **kwargs): + raise NotImplementedError( + "Please use the corresponding methods in SAM2VideoPredictor for inference." + "See notebooks/video_predictor_example.ipynb for an example." + ) + + def _build_sam_heads(self): + """Build SAM-style prompt encoder and mask decoder.""" + self.sam_prompt_embed_dim = self.hidden_dim + self.sam_image_embedding_size = self.image_size // self.backbone_stride + + # build PromptEncoder and MaskDecoder from SAM + # (their hyperparameters like `mask_in_chans=16` are from SAM code) + self.sam_prompt_encoder = PromptEncoder( + embed_dim=self.sam_prompt_embed_dim, + image_embedding_size=( + self.sam_image_embedding_size, + self.sam_image_embedding_size, + ), + input_image_size=(self.image_size, self.image_size), + mask_in_chans=16, + ) + self.sam_mask_decoder = MaskDecoder( + num_multimask_outputs=3, + transformer=TwoWayTransformer( + depth=2, + embedding_dim=self.sam_prompt_embed_dim, + mlp_dim=2048, + num_heads=8, + ), + transformer_dim=self.sam_prompt_embed_dim, + iou_head_depth=3, + iou_head_hidden_dim=256, + use_high_res_features=self.use_high_res_features_in_sam, + iou_prediction_use_sigmoid=self.iou_prediction_use_sigmoid, + pred_obj_scores=self.pred_obj_scores, + pred_obj_scores_mlp=self.pred_obj_scores_mlp, + use_multimask_token_for_obj_ptr=self.use_multimask_token_for_obj_ptr, + **(self.sam_mask_decoder_extra_args or {}), + ) + if self.use_obj_ptrs_in_encoder: + # a linear projection on SAM output tokens to turn them into object pointers + self.obj_ptr_proj = torch.nn.Linear(self.hidden_dim, self.hidden_dim) + if self.use_mlp_for_obj_ptr_proj: + self.obj_ptr_proj = MLP( + self.hidden_dim, self.hidden_dim, self.hidden_dim, 3 + ) + else: + self.obj_ptr_proj = torch.nn.Identity() + if self.proj_tpos_enc_in_obj_ptrs: + # a linear projection on temporal positional encoding in object pointers to + # avoid potential interference with spatial positional encoding + self.obj_ptr_tpos_proj = torch.nn.Linear(self.hidden_dim, self.mem_dim) + else: + self.obj_ptr_tpos_proj = torch.nn.Identity() + + def _forward_sam_heads( + self, + backbone_features, + point_inputs=None, + mask_inputs=None, + high_res_features=None, + multimask_output=False, + ): + """ + Forward SAM prompt encoders and mask heads. + + Inputs: + - backbone_features: image features of [B, C, H, W] shape + - point_inputs: a dictionary with "point_coords" and "point_labels", where + 1) "point_coords" has [B, P, 2] shape and float32 dtype and contains the + absolute pixel-unit coordinate in (x, y) format of the P input points + 2) "point_labels" has shape [B, P] and int32 dtype, where 1 means + positive clicks, 0 means negative clicks, and -1 means padding + - mask_inputs: a mask of [B, 1, H*16, W*16] shape, float or bool, with the + same spatial size as the image. + - high_res_features: either 1) None or 2) or a list of length 2 containing + two feature maps of [B, C, 4*H, 4*W] and [B, C, 2*H, 2*W] shapes respectively, + which will be used as high-resolution feature maps for SAM decoder. 
+ - multimask_output: if it's True, we output 3 candidate masks and their 3 + corresponding IoU estimates, and if it's False, we output only 1 mask and + its corresponding IoU estimate. + + Outputs: + - low_res_multimasks: [B, M, H*4, W*4] shape (where M = 3 if + `multimask_output=True` and M = 1 if `multimask_output=False`), the SAM + output mask logits (before sigmoid) for the low-resolution masks, with 4x + the resolution (1/4 stride) of the input backbone_features. + - high_res_multimasks: [B, M, H*16, W*16] shape (where M = 3 + if `multimask_output=True` and M = 1 if `multimask_output=False`), + upsampled from the low-resolution masks, with shape size as the image + (stride is 1 pixel). + - ious, [B, M] shape, where (where M = 3 if `multimask_output=True` and M = 1 + if `multimask_output=False`), the estimated IoU of each output mask. + - low_res_masks: [B, 1, H*4, W*4] shape, the best mask in `low_res_multimasks`. + If `multimask_output=True`, it's the mask with the highest IoU estimate. + If `multimask_output=False`, it's the same as `low_res_multimasks`. + - high_res_masks: [B, 1, H*16, W*16] shape, the best mask in `high_res_multimasks`. + If `multimask_output=True`, it's the mask with the highest IoU estimate. + If `multimask_output=False`, it's the same as `high_res_multimasks`. + - obj_ptr: [B, C] shape, the object pointer vector for the output mask, extracted + based on the output token from the SAM mask decoder. + """ + B = backbone_features.size(0) + device = backbone_features.device + assert backbone_features.size(1) == self.sam_prompt_embed_dim + assert backbone_features.size(2) == self.sam_image_embedding_size + assert backbone_features.size(3) == self.sam_image_embedding_size + + # a) Handle point prompts + if point_inputs is not None: + sam_point_coords = point_inputs["point_coords"] + sam_point_labels = point_inputs["point_labels"] + assert sam_point_coords.size(0) == B and sam_point_labels.size(0) == B + else: + # If no points are provide, pad with an empty point (with label -1) + sam_point_coords = torch.zeros(B, 1, 2, device=device) + sam_point_labels = -torch.ones(B, 1, dtype=torch.int32, device=device) + + # b) Handle mask prompts + if mask_inputs is not None: + # If mask_inputs is provided, downsize it into low-res mask input if needed + # and feed it as a dense mask prompt into the SAM mask encoder + assert len(mask_inputs.shape) == 4 and mask_inputs.shape[:2] == (B, 1) + if mask_inputs.shape[-2:] != self.sam_prompt_encoder.mask_input_size: + sam_mask_prompt = F.interpolate( + mask_inputs.float(), + size=self.sam_prompt_encoder.mask_input_size, + align_corners=False, + mode="bilinear", + antialias=True, # use antialias for downsampling + ) + else: + sam_mask_prompt = mask_inputs + else: + # Otherwise, simply feed None (and SAM's prompt encoder will add + # a learned `no_mask_embed` to indicate no mask input in this case). 
+ sam_mask_prompt = None + + sparse_embeddings, dense_embeddings = self.sam_prompt_encoder( + points=(sam_point_coords, sam_point_labels), + boxes=None, + masks=sam_mask_prompt, + ) + ( + low_res_multimasks, + ious, + sam_output_tokens, + object_score_logits, + ) = self.sam_mask_decoder( + image_embeddings=backbone_features, + image_pe=self.sam_prompt_encoder.get_dense_pe(), + sparse_prompt_embeddings=sparse_embeddings, + dense_prompt_embeddings=dense_embeddings, + multimask_output=multimask_output, + repeat_image=False, # the image is already batched + high_res_features=high_res_features, + ) + if self.pred_obj_scores: + is_obj_appearing = object_score_logits > 0 + + # Mask used for spatial memories is always a *hard* choice between obj and no obj, + # consistent with the actual mask prediction + low_res_multimasks = torch.where( + is_obj_appearing[:, None, None], + low_res_multimasks, + NO_OBJ_SCORE, + ) + + # convert masks from possibly bfloat16 (or float16) to float32 + # (older PyTorch versions before 2.1 don't support `interpolate` on bf16) + low_res_multimasks = low_res_multimasks.float() + high_res_multimasks = F.interpolate( + low_res_multimasks, + size=(self.image_size, self.image_size), + mode="bilinear", + align_corners=False, + ) + + sam_output_token = sam_output_tokens[:, 0] + if multimask_output: + # take the best mask prediction (with the highest IoU estimation) + best_iou_inds = torch.argmax(ious, dim=-1) + batch_inds = torch.arange(B, device=device) + low_res_masks = low_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1) + high_res_masks = high_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1) + if sam_output_tokens.size(1) > 1: + sam_output_token = sam_output_tokens[batch_inds, best_iou_inds] + else: + low_res_masks, high_res_masks = low_res_multimasks, high_res_multimasks + + # Extract object pointer from the SAM output token (with occlusion handling) + obj_ptr = self.obj_ptr_proj(sam_output_token) + if self.pred_obj_scores: + # Allow *soft* no obj ptr, unlike for masks + if self.soft_no_obj_ptr: + # Only hard possible with gt + assert not self.teacher_force_obj_scores_for_mem + lambda_is_obj_appearing = object_score_logits.sigmoid() + else: + lambda_is_obj_appearing = is_obj_appearing.float() + + if self.fixed_no_obj_ptr: + obj_ptr = lambda_is_obj_appearing * obj_ptr + obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr + + return ( + low_res_multimasks, + high_res_multimasks, + ious, + low_res_masks, + high_res_masks, + obj_ptr, + object_score_logits, + ) + + def _use_mask_as_output(self, backbone_features, high_res_features, mask_inputs): + """ + Directly turn binary `mask_inputs` into a output mask logits without using SAM. + (same input and output shapes as in _forward_sam_heads above). + """ + # Use -10/+10 as logits for neg/pos pixels (very close to 0/1 in prob after sigmoid). 
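+ # mask values in {0, 1} map to logits {-10, +10} via mask * out_scale + out_bias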
+ out_scale, out_bias = 20.0, -10.0 # sigmoid(-10.0)=4.5398e-05 + mask_inputs_float = mask_inputs.float() + high_res_masks = mask_inputs_float * out_scale + out_bias + low_res_masks = F.interpolate( + high_res_masks, + size=(high_res_masks.size(-2) // 4, high_res_masks.size(-1) // 4), + align_corners=False, + mode="bilinear", + antialias=True, # use antialias for downsampling + ) + # a dummy IoU prediction of all 1's under mask input + ious = mask_inputs.new_ones(mask_inputs.size(0), 1).float() + if not self.use_obj_ptrs_in_encoder: + # all zeros as a dummy object pointer (of shape [B, C]) + obj_ptr = torch.zeros( + mask_inputs.size(0), self.hidden_dim, device=mask_inputs.device + ) + else: + # produce an object pointer using the SAM decoder from the mask input + _, _, _, _, _, obj_ptr, _ = self._forward_sam_heads( + backbone_features=backbone_features, + mask_inputs=self.mask_downsample(mask_inputs_float), + high_res_features=high_res_features, + ) + # In this method, we are treating mask_input as output, e.g. using it directly to create spatial mem; + # Below, we follow the same design axiom to use mask_input to decide if obj appears or not instead of relying + # on the object_scores from the SAM decoder. + is_obj_appearing = torch.any(mask_inputs.flatten(1).float() > 0.0, dim=1) + is_obj_appearing = is_obj_appearing[..., None] + lambda_is_obj_appearing = is_obj_appearing.float() + object_score_logits = out_scale * lambda_is_obj_appearing + out_bias + if self.pred_obj_scores: + if self.fixed_no_obj_ptr: + obj_ptr = lambda_is_obj_appearing * obj_ptr + obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr + + return ( + low_res_masks, + high_res_masks, + ious, + low_res_masks, + high_res_masks, + obj_ptr, + object_score_logits, + ) + + def forward_image(self, img_batch: torch.Tensor): + """Get the image feature on the input batch.""" + backbone_out = self.image_encoder(img_batch) + if self.use_high_res_features_in_sam: + # precompute projected level 0 and level 1 features in SAM decoder + # to avoid running it again on every SAM click + backbone_out["backbone_fpn"][0] = self.sam_mask_decoder.conv_s0( + backbone_out["backbone_fpn"][0] + ) + backbone_out["backbone_fpn"][1] = self.sam_mask_decoder.conv_s1( + backbone_out["backbone_fpn"][1] + ) + return backbone_out + + def _prepare_backbone_features(self, backbone_out): + """Prepare and flatten visual features.""" + backbone_out = backbone_out.copy() + assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"]) + assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels + + feature_maps = backbone_out["backbone_fpn"][-self.num_feature_levels :] + vision_pos_embeds = backbone_out["vision_pos_enc"][-self.num_feature_levels :] + + feat_sizes = [(x.shape[-2], x.shape[-1]) for x in vision_pos_embeds] + # flatten NxCxHxW to HWxNxC + vision_feats = [x.flatten(2).permute(2, 0, 1) for x in feature_maps] + vision_pos_embeds = [x.flatten(2).permute(2, 0, 1) for x in vision_pos_embeds] + + return backbone_out, vision_feats, vision_pos_embeds, feat_sizes + + def _prepare_memory_conditioned_features( + self, + frame_idx, + is_init_cond_frame, + current_vision_feats, + current_vision_pos_embeds, + feat_sizes, + output_dict, + num_frames, + track_in_reverse=False, # tracking in reverse time order (for demo usage) + ): + """Fuse the current frame's visual feature map with previous memory.""" + B = current_vision_feats[-1].size(1) # batch size on this frame + C = self.hidden_dim + H, W = feat_sizes[-1] # top-level 
(lowest-resolution) feature size + device = current_vision_feats[-1].device + # The case of `self.num_maskmem == 0` below is primarily used for reproducing SAM on images. + # In this case, we skip the fusion with any memory. + if self.num_maskmem == 0: # Disable memory and skip fusion + pix_feat = current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W) + return pix_feat + + num_obj_ptr_tokens = 0 + # Step 1: condition the visual features of the current frame on previous memories + if not is_init_cond_frame: + # Retrieve the memories encoded with the maskmem backbone + to_cat_memory, to_cat_memory_pos_embed = [], [] + # Add conditioning frames's output first (all cond frames have t_pos=0 for + # when getting temporal positional embedding below) + assert len(output_dict["cond_frame_outputs"]) > 0 + # Select a maximum number of temporally closest cond frames for cross attention + cond_outputs = output_dict["cond_frame_outputs"] + selected_cond_outputs, unselected_cond_outputs = select_closest_cond_frames( + frame_idx, cond_outputs, self.max_cond_frames_in_attn + ) + t_pos_and_prevs = [(0, out) for out in selected_cond_outputs.values()] + # Add last (self.num_maskmem - 1) frames before current frame for non-conditioning memory + # the earliest one has t_pos=1 and the latest one has t_pos=self.num_maskmem-1 + # We also allow taking the memory frame non-consecutively (with r>1), in which case + # we take (self.num_maskmem - 2) frames among every r-th frames plus the last frame. + r = self.memory_temporal_stride_for_eval + for t_pos in range(1, self.num_maskmem): + t_rel = self.num_maskmem - t_pos # how many frames before current frame + if t_rel == 1: + # for t_rel == 1, we take the last frame (regardless of r) + if not track_in_reverse: + # the frame immediately before this frame (i.e. frame_idx - 1) + prev_frame_idx = frame_idx - t_rel + else: + # the frame immediately after this frame (i.e. frame_idx + 1) + prev_frame_idx = frame_idx + t_rel + else: + # for t_rel >= 2, we take the memory frame from every r-th frames + if not track_in_reverse: + # first find the nearest frame among every r-th frames before this frame + # for r=1, this would be (frame_idx - 2) + prev_frame_idx = ((frame_idx - 2) // r) * r + # then seek further among every r-th frames + prev_frame_idx = prev_frame_idx - (t_rel - 2) * r + else: + # first find the nearest frame among every r-th frames after this frame + # for r=1, this would be (frame_idx + 2) + prev_frame_idx = -(-(frame_idx + 2) // r) * r + # then seek further among every r-th frames + prev_frame_idx = prev_frame_idx + (t_rel - 2) * r + out = output_dict["non_cond_frame_outputs"].get(prev_frame_idx, None) + if out is None: + # If an unselected conditioning frame is among the last (self.num_maskmem - 1) + # frames, we still attend to it as if it's a non-conditioning frame. + out = unselected_cond_outputs.get(prev_frame_idx, None) + t_pos_and_prevs.append((t_pos, out)) + + for t_pos, prev in t_pos_and_prevs: + if prev is None: + continue # skip padding frames + # "maskmem_features" might have been offloaded to CPU in demo use cases, + # so we load it back to GPU (it's a no-op if it's already on GPU). 
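+ # each memory's features are flattened from (B, C, H, W) to (H*W, B, C) tokens before being concatenated below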
+ feats = prev["maskmem_features"].cuda(non_blocking=True) + to_cat_memory.append(feats.flatten(2).permute(2, 0, 1)) + # Spatial positional encoding (it might have been offloaded to CPU in eval) + maskmem_enc = prev["maskmem_pos_enc"][-1].cuda() + maskmem_enc = maskmem_enc.flatten(2).permute(2, 0, 1) + # Temporal positional encoding + maskmem_enc = ( + maskmem_enc + self.maskmem_tpos_enc[self.num_maskmem - t_pos - 1] + ) + to_cat_memory_pos_embed.append(maskmem_enc) + + # Construct the list of past object pointers + if self.use_obj_ptrs_in_encoder: + max_obj_ptrs_in_encoder = min(num_frames, self.max_obj_ptrs_in_encoder) + # First add those object pointers from selected conditioning frames + # (optionally, only include object pointers in the past during evaluation) + if not self.training and self.only_obj_ptrs_in_the_past_for_eval: + ptr_cond_outputs = { + t: out + for t, out in selected_cond_outputs.items() + if (t >= frame_idx if track_in_reverse else t <= frame_idx) + } + else: + ptr_cond_outputs = selected_cond_outputs + pos_and_ptrs = [ + # Temporal pos encoding contains how far away each pointer is from current frame + (abs(frame_idx - t), out["obj_ptr"]) + for t, out in ptr_cond_outputs.items() + ] + # Add up to (max_obj_ptrs_in_encoder - 1) non-conditioning frames before current frame + for t_diff in range(1, max_obj_ptrs_in_encoder): + t = frame_idx + t_diff if track_in_reverse else frame_idx - t_diff + if t < 0 or (num_frames is not None and t >= num_frames): + break + out = output_dict["non_cond_frame_outputs"].get( + t, unselected_cond_outputs.get(t, None) + ) + if out is not None: + pos_and_ptrs.append((t_diff, out["obj_ptr"])) + # If we have at least one object pointer, add them to the across attention + if len(pos_and_ptrs) > 0: + pos_list, ptrs_list = zip(*pos_and_ptrs) + # stack object pointers along dim=0 into [ptr_seq_len, B, C] shape + obj_ptrs = torch.stack(ptrs_list, dim=0) + # a temporal positional embedding based on how far each object pointer is from + # the current frame (sine embedding normalized by the max pointer num). 
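+ # pos_list[i] is the frame distance of pointer i from the current frame; it is normalized by (max_obj_ptrs_in_encoder - 1) before the sine embedding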
+ if self.add_tpos_enc_to_obj_ptrs: + t_diff_max = max_obj_ptrs_in_encoder - 1 + tpos_dim = C if self.proj_tpos_enc_in_obj_ptrs else self.mem_dim + obj_pos = torch.tensor(pos_list, device=device) + obj_pos = get_1d_sine_pe(obj_pos / t_diff_max, dim=tpos_dim) + obj_pos = self.obj_ptr_tpos_proj(obj_pos) + obj_pos = obj_pos.unsqueeze(1).expand(-1, B, self.mem_dim) + else: + obj_pos = obj_ptrs.new_zeros(len(pos_list), B, self.mem_dim) + if self.mem_dim < C: + # split a pointer into (C // self.mem_dim) tokens for self.mem_dim < C + obj_ptrs = obj_ptrs.reshape( + -1, B, C // self.mem_dim, self.mem_dim + ) + obj_ptrs = obj_ptrs.permute(0, 2, 1, 3).flatten(0, 1) + obj_pos = obj_pos.repeat_interleave(C // self.mem_dim, dim=0) + to_cat_memory.append(obj_ptrs) + to_cat_memory_pos_embed.append(obj_pos) + num_obj_ptr_tokens = obj_ptrs.shape[0] + else: + num_obj_ptr_tokens = 0 + else: + # for initial conditioning frames, encode them without using any previous memory + if self.directly_add_no_mem_embed: + # directly add no-mem embedding (instead of using the transformer encoder) + pix_feat_with_mem = current_vision_feats[-1] + self.no_mem_embed + pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W) + return pix_feat_with_mem + + # Use a dummy token on the first frame (to avoid emtpy memory input to tranformer encoder) + to_cat_memory = [self.no_mem_embed.expand(1, B, self.mem_dim)] + to_cat_memory_pos_embed = [self.no_mem_pos_enc.expand(1, B, self.mem_dim)] + + # Step 2: Concatenate the memories and forward through the transformer encoder + memory = torch.cat(to_cat_memory, dim=0) + memory_pos_embed = torch.cat(to_cat_memory_pos_embed, dim=0) + + pix_feat_with_mem = self.memory_attention( + curr=current_vision_feats, + curr_pos=current_vision_pos_embeds, + memory=memory, + memory_pos=memory_pos_embed, + num_obj_ptr_tokens=num_obj_ptr_tokens, + ) + # reshape the output (HW)BC => BCHW + pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W) + return pix_feat_with_mem + + def _encode_new_memory( + self, + current_vision_feats, + feat_sizes, + pred_masks_high_res, + is_mask_from_pts, + ): + """Encode the current image and its prediction into a memory feature.""" + B = current_vision_feats[-1].size(1) # batch size on this frame + C = self.hidden_dim + H, W = feat_sizes[-1] # top-level (lowest-resolution) feature size + # top-level feature, (HW)BC => BCHW + pix_feat = current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W) + if self.non_overlap_masks_for_mem_enc and not self.training: + # optionally, apply non-overlapping constraints to the masks (it's applied + # in the batch dimension and should only be used during eval, where all + # the objects come from the same video under batch size 1). 
+ pred_masks_high_res = self._apply_non_overlapping_constraints( + pred_masks_high_res + ) + # scale the raw mask logits with a temperature before applying sigmoid + binarize = self.binarize_mask_from_pts_for_mem_enc and is_mask_from_pts + if binarize and not self.training: + mask_for_mem = (pred_masks_high_res > 0).float() + else: + # apply sigmoid on the raw mask logits to turn them into range (0, 1) + mask_for_mem = torch.sigmoid(pred_masks_high_res) + # apply scale and bias terms to the sigmoid probabilities + if self.sigmoid_scale_for_mem_enc != 1.0: + mask_for_mem = mask_for_mem * self.sigmoid_scale_for_mem_enc + if self.sigmoid_bias_for_mem_enc != 0.0: + mask_for_mem = mask_for_mem + self.sigmoid_bias_for_mem_enc + maskmem_out = self.memory_encoder( + pix_feat, mask_for_mem, skip_mask_sigmoid=True # sigmoid already applied + ) + maskmem_features = maskmem_out["vision_features"] + maskmem_pos_enc = maskmem_out["vision_pos_enc"] + + return maskmem_features, maskmem_pos_enc + + def track_step( + self, + frame_idx, + is_init_cond_frame, + current_vision_feats, + current_vision_pos_embeds, + feat_sizes, + point_inputs, + mask_inputs, + output_dict, + num_frames, + track_in_reverse=False, # tracking in reverse time order (for demo usage) + # Whether to run the memory encoder on the predicted masks. Sometimes we might want + # to skip the memory encoder with `run_mem_encoder=False`. For example, + # in demo we might call `track_step` multiple times for each user click, + # and only encode the memory when the user finalizes their clicks. And in ablation + # settings like SAM training on static images, we don't need the memory encoder. + run_mem_encoder=True, + # The previously predicted SAM mask logits (which can be fed together with new clicks in demo). + prev_sam_mask_logits=None, + ): + current_out = {"point_inputs": point_inputs, "mask_inputs": mask_inputs} + # High-resolution feature maps for the SAM head, reshape (HW)BC => BCHW + if len(current_vision_feats) > 1: + high_res_features = [ + x.permute(1, 2, 0).view(x.size(1), x.size(2), *s) + for x, s in zip(current_vision_feats[:-1], feat_sizes[:-1]) + ] + else: + high_res_features = None + if mask_inputs is not None and self.use_mask_input_as_output_without_sam: + # When use_mask_input_as_output_without_sam=True, we directly output the mask input + # (see it as a GT mask) without using a SAM prompt encoder + mask decoder. + pix_feat = current_vision_feats[-1].permute(1, 2, 0) + pix_feat = pix_feat.view(-1, self.hidden_dim, *feat_sizes[-1]) + sam_outputs = self._use_mask_as_output( + pix_feat, high_res_features, mask_inputs + ) + else: + # fused the visual feature with previous memory features in the memory bank + pix_feat_with_mem = self._prepare_memory_conditioned_features( + frame_idx=frame_idx, + is_init_cond_frame=is_init_cond_frame, + current_vision_feats=current_vision_feats[-1:], + current_vision_pos_embeds=current_vision_pos_embeds[-1:], + feat_sizes=feat_sizes[-1:], + output_dict=output_dict, + num_frames=num_frames, + track_in_reverse=track_in_reverse, + ) + # apply SAM-style segmentation head + # here we might feed previously predicted low-res SAM mask logits into the SAM mask decoder, + # e.g. 
in demo where such logits come from earlier interaction instead of correction sampling + # (in this case, any `mask_inputs` shouldn't reach here as they are sent to _use_mask_as_output instead) + if prev_sam_mask_logits is not None: + assert point_inputs is not None and mask_inputs is None + mask_inputs = prev_sam_mask_logits + multimask_output = self._use_multimask(is_init_cond_frame, point_inputs) + sam_outputs = self._forward_sam_heads( + backbone_features=pix_feat_with_mem, + point_inputs=point_inputs, + mask_inputs=mask_inputs, + high_res_features=high_res_features, + multimask_output=multimask_output, + ) + ( + _, + _, + _, + low_res_masks, + high_res_masks, + obj_ptr, + _, + ) = sam_outputs + + current_out["pred_masks"] = low_res_masks + current_out["pred_masks_high_res"] = high_res_masks + current_out["obj_ptr"] = obj_ptr + + # Finally run the memory encoder on the predicted mask to encode + # it into a new memory feature (that can be used in future frames) + if run_mem_encoder and self.num_maskmem > 0: + high_res_masks_for_mem_enc = high_res_masks + maskmem_features, maskmem_pos_enc = self._encode_new_memory( + current_vision_feats=current_vision_feats, + feat_sizes=feat_sizes, + pred_masks_high_res=high_res_masks_for_mem_enc, + is_mask_from_pts=(point_inputs is not None), + ) + current_out["maskmem_features"] = maskmem_features + current_out["maskmem_pos_enc"] = maskmem_pos_enc + else: + current_out["maskmem_features"] = None + current_out["maskmem_pos_enc"] = None + + return current_out + + def _use_multimask(self, is_init_cond_frame, point_inputs): + """Whether to use multimask output in the SAM head.""" + num_pts = 0 if point_inputs is None else point_inputs["point_labels"].size(1) + multimask_output = ( + self.multimask_output_in_sam + and (is_init_cond_frame or self.multimask_output_for_tracking) + and (self.multimask_min_pt_num <= num_pts <= self.multimask_max_pt_num) + ) + return multimask_output + + def _apply_non_overlapping_constraints(self, pred_masks): + """ + Apply non-overlapping constraints to the object scores in pred_masks. Here we + keep only the highest scoring object at each spatial location in pred_masks. + """ + batch_size = pred_masks.size(0) + if batch_size == 1: + return pred_masks + + device = pred_masks.device + # "max_obj_inds": object index of the object with the highest score at each location + max_obj_inds = torch.argmax(pred_masks, dim=0, keepdim=True) + # "batch_obj_inds": object index of each object slice (along dim 0) in `pred_masks` + batch_obj_inds = torch.arange(batch_size, device=device)[:, None, None, None] + keep = max_obj_inds == batch_obj_inds + # suppress overlapping regions' scores below -10.0 so that the foreground regions + # don't overlap (here sigmoid(-10.0)=4.5398e-05) + pred_masks = torch.where(keep, pred_masks, torch.clamp(pred_masks, max=-10.0)) + return pred_masks diff --git a/sam2/modeling/sam2_utils.py b/sam2/modeling/sam2_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b28713e3ea2b14da57798b2a8030d4882bb98221 --- /dev/null +++ b/sam2/modeling/sam2_utils.py @@ -0,0 +1,149 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
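+
+# A quick illustration of `select_closest_cond_frames` below (hypothetical frame
+# indices and outputs, shown only to make the selection rule concrete):
+#
+#   selected, unselected = select_closest_cond_frames(
+#       frame_idx=5,
+#       cond_frame_outputs={0: out0, 3: out3, 8: out8, 12: out12},
+#       max_cond_frame_num=2,
+#   )
+#   # selected has keys {3, 8} (the nearest neighbors on both sides);
+#   # unselected has keys {0, 12}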
+ + +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def select_closest_cond_frames(frame_idx, cond_frame_outputs, max_cond_frame_num): + """ + Select up to `max_cond_frame_num` conditioning frames from `cond_frame_outputs` + that are temporally closest to the current frame at `frame_idx`. Here, we take + - a) the closest conditioning frame before `frame_idx` (if any); + - b) the closest conditioning frame after `frame_idx` (if any); + - c) any other temporally closest conditioning frames until reaching a total + of `max_cond_frame_num` conditioning frames. + + Outputs: + - selected_outputs: selected items (keys & values) from `cond_frame_outputs`. + - unselected_outputs: items (keys & values) not selected in `cond_frame_outputs`. + """ + if max_cond_frame_num == -1 or len(cond_frame_outputs) <= max_cond_frame_num: + selected_outputs = cond_frame_outputs + unselected_outputs = {} + else: + assert max_cond_frame_num >= 2, "we should allow using 2+ conditioning frames" + selected_outputs = {} + + # the closest conditioning frame before `frame_idx` (if any) + idx_before = max((t for t in cond_frame_outputs if t < frame_idx), default=None) + if idx_before is not None: + selected_outputs[idx_before] = cond_frame_outputs[idx_before] + + # the closest conditioning frame after `frame_idx` (if any) + idx_after = min((t for t in cond_frame_outputs if t >= frame_idx), default=None) + if idx_after is not None: + selected_outputs[idx_after] = cond_frame_outputs[idx_after] + + # add other temporally closest conditioning frames until reaching a total + # of `max_cond_frame_num` conditioning frames. + num_remain = max_cond_frame_num - len(selected_outputs) + inds_remain = sorted( + (t for t in cond_frame_outputs if t not in selected_outputs), + key=lambda x: abs(x - frame_idx), + )[:num_remain] + selected_outputs.update((t, cond_frame_outputs[t]) for t in inds_remain) + unselected_outputs = { + t: v for t, v in cond_frame_outputs.items() if t not in selected_outputs + } + + return selected_outputs, unselected_outputs + + +def get_1d_sine_pe(pos_inds, dim, temperature=10000): + """ + Get 1D sine positional embedding as in the original Transformer paper. 
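+
+    `pos_inds` may be fractional (e.g. temporal distances normalized to [0, 1]);
+    the output has shape `(*pos_inds.shape, dim)` when `dim` is even, with the
+    first `dim // 2` channels holding sines and the rest cosines.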
+ """ + pe_dim = dim // 2 + dim_t = torch.arange(pe_dim, dtype=torch.float32, device=pos_inds.device) + dim_t = temperature ** (2 * (dim_t // 2) / pe_dim) + + pos_embed = pos_inds.unsqueeze(-1) / dim_t + pos_embed = torch.cat([pos_embed.sin(), pos_embed.cos()], dim=-1) + return pos_embed + + +def get_activation_fn(activation): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(f"activation should be relu/gelu, not {activation}.") + + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +class DropPath(nn.Module): + # adapted from https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py + def __init__(self, drop_prob=0.0, scale_by_keep=True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + if self.drop_prob == 0.0 or not self.training: + return x + keep_prob = 1 - self.drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0 and self.scale_by_keep: + random_tensor.div_(keep_prob) + return x * random_tensor + + +# Lightly adapted from +# https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py # noqa +class MLP(nn.Module): + def __init__( + self, + input_dim: int, + hidden_dim: int, + output_dim: int, + num_layers: int, + activation: nn.Module = nn.ReLU, + sigmoid_output: bool = False, + ) -> None: + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) + ) + self.sigmoid_output = sigmoid_output + self.act = activation() + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = self.act(layer(x)) if i < self.num_layers - 1 else layer(x) + if self.sigmoid_output: + x = F.sigmoid(x) + return x + + +# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa +# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa +class LayerNorm2d(nn.Module): + def __init__(self, num_channels: int, eps: float = 1e-6) -> None: + super().__init__() + self.weight = nn.Parameter(torch.ones(num_channels)) + self.bias = nn.Parameter(torch.zeros(num_channels)) + self.eps = eps + + def forward(self, x: torch.Tensor) -> torch.Tensor: + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x diff --git a/sam2/sam2_image_predictor.py b/sam2/sam2_image_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..89cb0468e1fcb7e3b1e359949f5d5a0c26c0c0e2 --- /dev/null +++ b/sam2/sam2_image_predictor.py @@ -0,0 +1,446 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
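+
+# Typical interactive flow (an illustrative sketch; how `sam_model` is built
+# depends on this repo's checkpoint/config loading and is assumed here):
+#
+#   predictor = SAM2ImagePredictor(sam_model)
+#   predictor.set_image(image_rgb)  # np.ndarray of shape (H, W, 3) in [0, 255]
+#   masks, ious, low_res_logits = predictor.predict(
+#       point_coords=np.array([[500, 375]]),  # one foreground click, in pixels
+#       point_labels=np.array([1]),
+#   )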
+
+import logging
+
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from PIL.Image import Image
+
+from sam2.modeling.sam2_base import SAM2Base
+
+from sam2.utils.transforms import SAM2Transforms
+
+
+class SAM2ImagePredictor:
+    def __init__(
+        self,
+        sam_model: SAM2Base,
+        mask_threshold=0.0,
+        max_hole_area=0.0,
+        max_sprinkle_area=0.0,
+    ) -> None:
+        """
+        Uses SAM-2 to calculate the image embedding for an image, and then
+        allows repeated, efficient mask prediction given prompts.
+
+        Arguments:
+          sam_model (SAM2Base): The model to use for mask prediction.
+          mask_threshold (float): The threshold to use when converting mask logits
+            to binary masks. Masks are thresholded at 0 by default.
+          max_hole_area (int): If max_hole_area > 0, we fill small holes of up to
+            this maximum area in low_res_masks.
+          max_sprinkle_area (int): If max_sprinkle_area > 0, we remove small sprinkles
+            of up to this maximum area in low_res_masks.
+        """
+        super().__init__()
+        self.model = sam_model
+        self._transforms = SAM2Transforms(
+            resolution=self.model.image_size,
+            mask_threshold=mask_threshold,
+            max_hole_area=max_hole_area,
+            max_sprinkle_area=max_sprinkle_area,
+        )
+
+        # Predictor state
+        self._is_image_set = False
+        self._features = None
+        self._orig_hw = None
+        # Whether the predictor is set for single image or a batch of images
+        self._is_batch = False
+
+        # Predictor config
+        self.mask_threshold = mask_threshold
+
+        # Spatial dim for backbone feature maps
+        self._bb_feat_sizes = [
+            (256, 256),
+            (128, 128),
+            (64, 64),
+        ]
+
+    @torch.no_grad()
+    def set_image(
+        self,
+        image: Union[np.ndarray, Image],
+    ) -> None:
+        """
+        Calculates the image embeddings for the provided image, allowing
+        masks to be predicted with the 'predict' method.
+
+        Arguments:
+          image (np.ndarray or PIL Image): The input image to embed, in RGB format
+            with pixel values in [0, 255]. The image should be in HWC format if np.ndarray.
+        """
+        self.reset_predictor()
+        # Transform the image to the form expected by the model
+        if isinstance(image, np.ndarray):
+            logging.info("For numpy array image, we assume (HxWxC) format")
+            self._orig_hw = [image.shape[:2]]
+        elif isinstance(image, Image):
+            w, h = image.size
+            self._orig_hw = [(h, w)]
+        else:
+            raise NotImplementedError("Image format not supported")
+
+        input_image = self._transforms(image)
+        input_image = input_image[None, ...].to(self.device)
+
+        assert (
+            len(input_image.shape) == 4 and input_image.shape[1] == 3
+        ), f"input_image must be of size 1x3xHxW, got {input_image.shape}"
+        logging.info("Computing image embeddings for the provided image...")
+        backbone_out = self.model.forward_image(input_image)
+        _, vision_feats, _, _ = self.model._prepare_backbone_features(backbone_out)
+        # Add no_mem_embed, which is added to the lowest-res feature map during training on videos
+        if self.model.directly_add_no_mem_embed:
+            vision_feats[-1] = vision_feats[-1] + self.model.no_mem_embed
+
+        feats = [
+            feat.permute(1, 2, 0).view(1, -1, *feat_size)
+            for feat, feat_size in zip(vision_feats[::-1], self._bb_feat_sizes[::-1])
+        ][::-1]
+        self._features = {"image_embed": feats[-1], "high_res_feats": feats[:-1]}
+        self._is_image_set = True
+        logging.info("Image embeddings computed.")
+
+    @torch.no_grad()
+    def set_image_batch(
+        self,
+        image_list: List[np.ndarray],
+    ) -> None:
+        """
+        Calculates the image embeddings for the provided image batch, allowing
+        masks to be predicted with the 'predict_batch' method.
+
+        Arguments:
+          image_list (List[np.ndarray]): The input images to embed in RGB format. The image should be in HWC format if np.ndarray
+            with pixel values in [0, 255].
+        """
+        self.reset_predictor()
+        assert isinstance(image_list, list)
+        self._orig_hw = []
+        for image in image_list:
+            assert isinstance(
+                image, np.ndarray
+            ), "Images are expected to be an np.ndarray in RGB format, and of shape HWC"
+            self._orig_hw.append(image.shape[:2])
+        # Transform the image to the form expected by the model
+        img_batch = self._transforms.forward_batch(image_list)
+        img_batch = img_batch.to(self.device)
+        batch_size = img_batch.shape[0]
+        assert (
+            len(img_batch.shape) == 4 and img_batch.shape[1] == 3
+        ), f"img_batch must be of size Bx3xHxW, got {img_batch.shape}"
+        logging.info("Computing image embeddings for the provided images...")
+        backbone_out = self.model.forward_image(img_batch)
+        _, vision_feats, _, _ = self.model._prepare_backbone_features(backbone_out)
+        # Add no_mem_embed, which is added to the lowest-res feature map during training on videos
+        if self.model.directly_add_no_mem_embed:
+            vision_feats[-1] = vision_feats[-1] + self.model.no_mem_embed
+
+        feats = [
+            feat.permute(1, 2, 0).view(batch_size, -1, *feat_size)
+            for feat, feat_size in zip(vision_feats[::-1], self._bb_feat_sizes[::-1])
+        ][::-1]
+        self._features = {"image_embed": feats[-1], "high_res_feats": feats[:-1]}
+        self._is_image_set = True
+        self._is_batch = True
+        logging.info("Image embeddings computed.")
+
+    def predict_batch(
+        self,
+        point_coords_batch: List[np.ndarray] = None,
+        point_labels_batch: List[np.ndarray] = None,
+        box_batch: List[np.ndarray] = None,
+        mask_input_batch: List[np.ndarray] = None,
+        multimask_output: bool = True,
+        return_logits: bool = False,
+        normalize_coords=True,
+    ) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
+        """This function is very similar to predict(...), however it is used for batched mode, when the model is expected to generate predictions on multiple images.
+        It returns a tuple of lists of masks, ious, and low_res_masks_logits.
+        """
+        assert self._is_batch, "This function should only be used when in batched mode"
+        if not self._is_image_set:
+            raise RuntimeError(
+                "An image must be set with .set_image_batch(...) before mask prediction."
+ ) + num_images = len(self._features["image_embed"]) + all_masks = [] + all_ious = [] + all_low_res_masks = [] + for img_idx in range(num_images): + # Transform input prompts + point_coords = ( + point_coords_batch[img_idx] if point_coords_batch is not None else None + ) + point_labels = ( + point_labels_batch[img_idx] if point_labels_batch is not None else None + ) + box = box_batch[img_idx] if box_batch is not None else None + mask_input = ( + mask_input_batch[img_idx] if mask_input_batch is not None else None + ) + mask_input, unnorm_coords, labels, unnorm_box = self._prep_prompts( + point_coords, + point_labels, + box, + mask_input, + normalize_coords, + img_idx=img_idx, + ) + masks, iou_predictions, low_res_masks = self._predict( + unnorm_coords, + labels, + unnorm_box, + mask_input, + multimask_output, + return_logits=return_logits, + img_idx=img_idx, + ) + masks_np = masks.squeeze(0).float().detach().cpu().numpy() + iou_predictions_np = ( + iou_predictions.squeeze(0).float().detach().cpu().numpy() + ) + low_res_masks_np = low_res_masks.squeeze(0).float().detach().cpu().numpy() + all_masks.append(masks_np) + all_ious.append(iou_predictions_np) + all_low_res_masks.append(low_res_masks_np) + + return all_masks, all_ious, all_low_res_masks + + def predict( + self, + point_coords: Optional[np.ndarray] = None, + point_labels: Optional[np.ndarray] = None, + box: Optional[np.ndarray] = None, + mask_input: Optional[np.ndarray] = None, + multimask_output: bool = True, + return_logits: bool = False, + normalize_coords=True, + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Predict masks for the given input prompts, using the currently set image. + + Arguments: + point_coords (np.ndarray or None): A Nx2 array of point prompts to the + model. Each point is in (X,Y) in pixels. + point_labels (np.ndarray or None): A length N array of labels for the + point prompts. 1 indicates a foreground point and 0 indicates a + background point. + box (np.ndarray or None): A length 4 array given a box prompt to the + model, in XYXY format. + mask_input (np.ndarray): A low resolution mask input to the model, typically + coming from a previous prediction iteration. Has form 1xHxW, where + for SAM, H=W=256. + multimask_output (bool): If true, the model will return three masks. + For ambiguous input prompts (such as a single click), this will often + produce better masks than a single prediction. If only a single + mask is needed, the model's predicted quality score can be used + to select the best mask. For non-ambiguous prompts, such as multiple + input prompts, multimask_output=False can give better results. + return_logits (bool): If true, returns un-thresholded masks logits + instead of a binary mask. + normalize_coords (bool): If true, the point coordinates will be normalized to the range [0,1] and point_coords is expected to be wrt. image dimensions. + + Returns: + (np.ndarray): The output masks in CxHxW format, where C is the + number of masks, and (H, W) is the original image size. + (np.ndarray): An array of length C containing the model's + predictions for the quality of each mask. + (np.ndarray): An array of shape CxHxW, where C is the number + of masks and H=W=256. These low resolution logits can be passed to + a subsequent iteration as mask input. + """ + if not self._is_image_set: + raise RuntimeError( + "An image must be set with .set_image(...) before mask prediction." 
+ ) + + # Transform input prompts + + mask_input, unnorm_coords, labels, unnorm_box = self._prep_prompts( + point_coords, point_labels, box, mask_input, normalize_coords + ) + + masks, iou_predictions, low_res_masks = self._predict( + unnorm_coords, + labels, + unnorm_box, + mask_input, + multimask_output, + return_logits=return_logits, + ) + + masks_np = masks.squeeze(0).float().detach().cpu().numpy() + iou_predictions_np = iou_predictions.squeeze(0).float().detach().cpu().numpy() + low_res_masks_np = low_res_masks.squeeze(0).float().detach().cpu().numpy() + return masks_np, iou_predictions_np, low_res_masks_np + + def _prep_prompts( + self, point_coords, point_labels, box, mask_logits, normalize_coords, img_idx=-1 + ): + + unnorm_coords, labels, unnorm_box, mask_input = None, None, None, None + if point_coords is not None: + assert ( + point_labels is not None + ), "point_labels must be supplied if point_coords is supplied." + point_coords = torch.as_tensor( + point_coords, dtype=torch.float, device=self.device + ) + unnorm_coords = self._transforms.transform_coords( + point_coords, normalize=normalize_coords, orig_hw=self._orig_hw[img_idx] + ) + labels = torch.as_tensor(point_labels, dtype=torch.int, device=self.device) + if len(unnorm_coords.shape) == 2: + unnorm_coords, labels = unnorm_coords[None, ...], labels[None, ...] + if box is not None: + box = torch.as_tensor(box, dtype=torch.float, device=self.device) + unnorm_box = self._transforms.transform_boxes( + box, normalize=normalize_coords, orig_hw=self._orig_hw[img_idx] + ) # Bx2x2 + if mask_logits is not None: + mask_input = torch.as_tensor( + mask_logits, dtype=torch.float, device=self.device + ) + if len(mask_input.shape) == 3: + mask_input = mask_input[None, :, :, :] + return mask_input, unnorm_coords, labels, unnorm_box + + @torch.no_grad() + def _predict( + self, + point_coords: Optional[torch.Tensor], + point_labels: Optional[torch.Tensor], + boxes: Optional[torch.Tensor] = None, + mask_input: Optional[torch.Tensor] = None, + multimask_output: bool = True, + return_logits: bool = False, + img_idx: int = -1, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Predict masks for the given input prompts, using the currently set image. + Input prompts are batched torch tensors and are expected to already be + transformed to the input frame using SAM2Transforms. + + Arguments: + point_coords (torch.Tensor or None): A BxNx2 array of point prompts to the + model. Each point is in (X,Y) in pixels. + point_labels (torch.Tensor or None): A BxN array of labels for the + point prompts. 1 indicates a foreground point and 0 indicates a + background point. + boxes (np.ndarray or None): A Bx4 array given a box prompt to the + model, in XYXY format. + mask_input (np.ndarray): A low resolution mask input to the model, typically + coming from a previous prediction iteration. Has form Bx1xHxW, where + for SAM, H=W=256. Masks returned by a previous iteration of the + predict method do not need further transformation. + multimask_output (bool): If true, the model will return three masks. + For ambiguous input prompts (such as a single click), this will often + produce better masks than a single prediction. If only a single + mask is needed, the model's predicted quality score can be used + to select the best mask. For non-ambiguous prompts, such as multiple + input prompts, multimask_output=False can give better results. + return_logits (bool): If true, returns un-thresholded masks logits + instead of a binary mask. 
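+          img_idx (int): In batched mode, the index of the image in the batch to
+            run prediction on; the default of -1 refers to the only (or last)
+            image set on the predictor.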
+ + Returns: + (torch.Tensor): The output masks in BxCxHxW format, where C is the + number of masks, and (H, W) is the original image size. + (torch.Tensor): An array of shape BxC containing the model's + predictions for the quality of each mask. + (torch.Tensor): An array of shape BxCxHxW, where C is the number + of masks and H=W=256. These low res logits can be passed to + a subsequent iteration as mask input. + """ + if not self._is_image_set: + raise RuntimeError( + "An image must be set with .set_image(...) before mask prediction." + ) + + if point_coords is not None: + concat_points = (point_coords, point_labels) + else: + concat_points = None + + # Embed prompts + if boxes is not None: + box_coords = boxes.reshape(-1, 2, 2) + box_labels = torch.tensor([[2, 3]], dtype=torch.int, device=boxes.device) + box_labels = box_labels.repeat(boxes.size(0), 1) + # we merge "boxes" and "points" into a single "concat_points" input (where + # boxes are added at the beginning) to sam_prompt_encoder + if concat_points is not None: + concat_coords = torch.cat([box_coords, concat_points[0]], dim=1) + concat_labels = torch.cat([box_labels, concat_points[1]], dim=1) + concat_points = (concat_coords, concat_labels) + else: + concat_points = (box_coords, box_labels) + + sparse_embeddings, dense_embeddings = self.model.sam_prompt_encoder( + points=concat_points, + boxes=None, + masks=mask_input, + ) + + # Predict masks + batched_mode = ( + concat_points is not None and concat_points[0].shape[0] > 1 + ) # multi object prediction + high_res_features = [ + feat_level[img_idx].unsqueeze(0) + for feat_level in self._features["high_res_feats"] + ] + low_res_masks, iou_predictions, _, _ = self.model.sam_mask_decoder( + image_embeddings=self._features["image_embed"][img_idx].unsqueeze(0), + image_pe=self.model.sam_prompt_encoder.get_dense_pe(), + sparse_prompt_embeddings=sparse_embeddings, + dense_prompt_embeddings=dense_embeddings, + multimask_output=multimask_output, + repeat_image=batched_mode, + high_res_features=high_res_features, + ) + + # Upscale the masks to the original image resolution + masks = self._transforms.postprocess_masks( + low_res_masks, self._orig_hw[img_idx] + ) + low_res_masks = torch.clamp(low_res_masks, -32.0, 32.0) + if not return_logits: + masks = masks > self.mask_threshold + + return masks, iou_predictions, low_res_masks + + def get_image_embedding(self) -> torch.Tensor: + """ + Returns the image embeddings for the currently set image, with + shape 1xCxHxW, where C is the embedding dimension and (H,W) are + the embedding spatial dimension of SAM (typically C=256, H=W=64). + """ + if not self._is_image_set: + raise RuntimeError( + "An image must be set with .set_image(...) to generate an embedding." + ) + assert ( + self._features is not None + ), "Features must exist if an image has been set." + return self._features["image_embed"] + + @property + def device(self) -> torch.device: + return self.model.device + + def reset_predictor(self) -> None: + """ + Resets the image embeddings and other state variables. + """ + self._is_image_set = False + self._features = None + self._orig_hw = None + self._is_batch = False diff --git a/sam2/sam2_video_predictor.py b/sam2/sam2_video_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..d4109a43f2e9655032a0400f73dba25b664db59c --- /dev/null +++ b/sam2/sam2_video_predictor.py @@ -0,0 +1,898 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import OrderedDict
+
+import torch
+
+from tqdm import tqdm
+
+from sam2.modeling.sam2_base import NO_OBJ_SCORE, SAM2Base
+from sam2.utils.misc import concat_points, fill_holes_in_mask_scores, load_video_frames
+
+
+class SAM2VideoPredictor(SAM2Base):
+    """The predictor class to handle user interactions and manage inference states."""
+
+    def __init__(
+        self,
+        fill_hole_area=0,
+        # whether to apply non-overlapping constraints on the output object masks
+        non_overlap_masks=False,
+        # whether to clear non-conditioning memory of the surrounding frames (which may contain outdated information) after adding correction clicks;
+        # note that this would only apply to *single-object tracking* unless `clear_non_cond_mem_for_multi_obj` is also set to True
+        clear_non_cond_mem_around_input=False,
+        # whether to also clear non-conditioning memory of the surrounding frames (only effective when `clear_non_cond_mem_around_input` is True).
+        clear_non_cond_mem_for_multi_obj=False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.fill_hole_area = fill_hole_area
+        self.non_overlap_masks = non_overlap_masks
+        self.clear_non_cond_mem_around_input = clear_non_cond_mem_around_input
+        self.clear_non_cond_mem_for_multi_obj = clear_non_cond_mem_for_multi_obj
+
+    @torch.inference_mode()
+    def init_state(
+        self,
+        video_path,
+        offload_video_to_cpu=False,
+        offload_state_to_cpu=False,
+        async_loading_frames=False,
+    ):
+        """Initialize an inference state."""
+        images, video_height, video_width = load_video_frames(
+            video_path=video_path,
+            image_size=self.image_size,
+            offload_video_to_cpu=offload_video_to_cpu,
+            async_loading_frames=async_loading_frames,
+        )
+        inference_state = {}
+        inference_state["images"] = images
+        inference_state["num_frames"] = len(images)
+        # whether to offload the video frames to CPU memory
+        # turning on this option saves the GPU memory with only a very small overhead
+        inference_state["offload_video_to_cpu"] = offload_video_to_cpu
+        # whether to offload the inference state to CPU memory
+        # turning on this option saves the GPU memory at the cost of a lower tracking fps
+        # (e.g. in a test case of 768x768 model, fps dropped from 27 to 24 when tracking one object
+        # and from 24 to 21 when tracking two objects)
+        inference_state["offload_state_to_cpu"] = offload_state_to_cpu
+        # the original video height and width, used for resizing final output scores
+        inference_state["video_height"] = video_height
+        inference_state["video_width"] = video_width
+        inference_state["device"] = torch.device("cpu")
+        if offload_state_to_cpu:
+            inference_state["storage_device"] = torch.device("cpu")
+        else:
+            inference_state["storage_device"] = torch.device("cuda")
+        # inputs on each frame
+        inference_state["point_inputs_per_obj"] = {}
+        inference_state["mask_inputs_per_obj"] = {}
+        # visual features on a small number of recently visited frames for quick interactions
+        inference_state["cached_features"] = {}
+        # values that don't change across frames (so we only need to hold one copy of them)
+        inference_state["constants"] = {}
+        # mapping between client-side object id and model-side object index
+        inference_state["obj_id_to_idx"] = OrderedDict()
+        inference_state["obj_idx_to_id"] = OrderedDict()
+        inference_state["obj_ids"] = []
+        # A storage to hold the model's tracking results and states on each frame
+        inference_state["output_dict"] = {
+            "cond_frame_outputs": {},  # dict containing {frame_idx: <out>}
+            "non_cond_frame_outputs": {},  # dict containing {frame_idx: <out>}
+        }
+        # Slice (view) of each object tracking results, sharing the same memory with "output_dict"
+        inference_state["output_dict_per_obj"] = {}
+        # A temporary storage to hold new outputs when the user interacts with a frame
+        # to add clicks or mask (it's merged into "output_dict" before propagation starts)
+        inference_state["temp_output_dict_per_obj"] = {}
+        # Frames that already hold consolidated outputs from click or mask inputs
+        # (we directly use their consolidated outputs during tracking)
+        inference_state["consolidated_frame_inds"] = {
+            "cond_frame_outputs": set(),  # set containing frame indices
+            "non_cond_frame_outputs": set(),  # set containing frame indices
+        }
+        # metadata for each tracking frame (e.g. which direction it's tracked)
+        inference_state["tracking_has_started"] = False
+        inference_state["frames_already_tracked"] = {}
+        # Warm up the visual backbone and cache the image feature on frame 0
+        self._get_image_feature(inference_state, frame_idx=0, batch_size=1)
+        return inference_state
+
+    def _obj_id_to_idx(self, inference_state, obj_id):
+        """Map client-side object id to model-side object index."""
+        obj_idx = inference_state["obj_id_to_idx"].get(obj_id, None)
+        if obj_idx is not None:
+            return obj_idx
+
+        # This is a new object id not sent to the server before. We only allow adding
+        # new objects *before* the tracking starts.
+        allow_new_object = not inference_state["tracking_has_started"]
+        if allow_new_object:
+            # get the next object slot
+            obj_idx = len(inference_state["obj_id_to_idx"])
+            inference_state["obj_id_to_idx"][obj_id] = obj_idx
+            inference_state["obj_idx_to_id"][obj_idx] = obj_id
+            inference_state["obj_ids"] = list(inference_state["obj_id_to_idx"])
+            # set up input and output structures for this object
+            inference_state["point_inputs_per_obj"][obj_idx] = {}
+            inference_state["mask_inputs_per_obj"][obj_idx] = {}
+            inference_state["output_dict_per_obj"][obj_idx] = {
+                "cond_frame_outputs": {},  # dict containing {frame_idx: <out>}
+                "non_cond_frame_outputs": {},  # dict containing {frame_idx: <out>}
+            }
+            inference_state["temp_output_dict_per_obj"][obj_idx] = {
+                "cond_frame_outputs": {},  # dict containing {frame_idx: <out>}
+                "non_cond_frame_outputs": {},  # dict containing {frame_idx: <out>}
+            }
+            return obj_idx
+        else:
+            raise RuntimeError(
+                f"Cannot add new object id {obj_id} after tracking starts. "
+                f"All existing object ids: {inference_state['obj_ids']}. "
+                f"Please call 'reset_state' to restart from scratch."
+            )
+
+    def _obj_idx_to_id(self, inference_state, obj_idx):
+        """Map model-side object index to client-side object id."""
+        return inference_state["obj_idx_to_id"][obj_idx]
+
+    def _get_obj_num(self, inference_state):
+        """Get the total number of unique object ids received so far in this session."""
+        return len(inference_state["obj_idx_to_id"])
+
+    @torch.inference_mode()
+    def add_new_points(
+        self,
+        inference_state,
+        frame_idx,
+        obj_id,
+        points,
+        labels,
+        clear_old_points=True,
+        normalize_coords=True,
+    ):
+        """Add new points to a frame."""
+        obj_idx = self._obj_id_to_idx(inference_state, obj_id)
+        point_inputs_per_frame = inference_state["point_inputs_per_obj"][obj_idx]
+        mask_inputs_per_frame = inference_state["mask_inputs_per_obj"][obj_idx]
+
+        if not isinstance(points, torch.Tensor):
+            points = torch.tensor(points, dtype=torch.float32)
+        if not isinstance(labels, torch.Tensor):
+            labels = torch.tensor(labels, dtype=torch.int32)
+        if points.dim() == 2:
+            points = points.unsqueeze(0)  # add batch dimension
+        if labels.dim() == 1:
+            labels = labels.unsqueeze(0)  # add batch dimension
+        if normalize_coords:
+            video_H = inference_state["video_height"]
+            video_W = inference_state["video_width"]
+            points = points / torch.tensor([video_W, video_H]).to(points.device)
+        # scale the (normalized) coordinates by the model's internal image size
+        points = points * self.image_size
+        points = points.to(inference_state["device"])
+        labels = labels.to(inference_state["device"])
+
+        if not clear_old_points:
+            point_inputs = point_inputs_per_frame.get(frame_idx, None)
+        else:
+            point_inputs = None
+        point_inputs = concat_points(point_inputs, points, labels)
+
+        point_inputs_per_frame[frame_idx] = point_inputs
+        mask_inputs_per_frame.pop(frame_idx, None)
+        # If this frame hasn't been tracked before, we treat it as an initial conditioning
+        # frame, meaning that the input points are to generate segments on this frame without
+        # using any memory from other frames, like in SAM. Otherwise (if it has been tracked),
+        # the input points will be used to correct the already tracked masks.
+ is_init_cond_frame = frame_idx not in inference_state["frames_already_tracked"] + # whether to track in reverse time order + if is_init_cond_frame: + reverse = False + else: + reverse = inference_state["frames_already_tracked"][frame_idx]["reverse"] + obj_output_dict = inference_state["output_dict_per_obj"][obj_idx] + obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx] + # Add a frame to conditioning output if it's an initial conditioning frame or + # if the model sees all frames receiving clicks/mask as conditioning frames. + is_cond = is_init_cond_frame or self.add_all_frames_to_correct_as_cond + storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs" + + # Get any previously predicted mask logits on this object and feed it along with + # the new clicks into the SAM mask decoder. + prev_sam_mask_logits = None + # lookup temporary output dict first, which contains the most recent output + # (if not found, then lookup conditioning and non-conditioning frame output) + prev_out = obj_temp_output_dict[storage_key].get(frame_idx) + if prev_out is None: + prev_out = obj_output_dict["cond_frame_outputs"].get(frame_idx) + if prev_out is None: + prev_out = obj_output_dict["non_cond_frame_outputs"].get(frame_idx) + + if prev_out is not None and prev_out["pred_masks"] is not None: + prev_sam_mask_logits = prev_out["pred_masks"].cuda(non_blocking=True) + # Clamp the scale of prev_sam_mask_logits to avoid rare numerical issues. + prev_sam_mask_logits = torch.clamp(prev_sam_mask_logits, -32.0, 32.0) + current_out, _ = self._run_single_frame_inference( + inference_state=inference_state, + output_dict=obj_output_dict, # run on the slice of a single object + frame_idx=frame_idx, + batch_size=1, # run on the slice of a single object + is_init_cond_frame=is_init_cond_frame, + point_inputs=point_inputs, + mask_inputs=None, + reverse=reverse, + # Skip the memory encoder when adding clicks or mask. We execute the memory encoder + # at the beginning of `propagate_in_video` (after user finalize their clicks). This + # allows us to enforce non-overlapping constraints on all objects before encoding + # them into memory. 
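+            # (the deferred memory encoding is then done inside
+            # `propagate_in_video_preflight` via `_consolidate_temp_output_across_obj`)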
+ run_mem_encoder=False, + prev_sam_mask_logits=prev_sam_mask_logits, + ) + # Add the output to the output dict (to be used as future memory) + obj_temp_output_dict[storage_key][frame_idx] = current_out + + # Resize the output mask to the original video resolution + obj_ids = inference_state["obj_ids"] + consolidated_out = self._consolidate_temp_output_across_obj( + inference_state, + frame_idx, + is_cond=is_cond, + run_mem_encoder=False, + consolidate_at_video_res=True, + ) + _, video_res_masks = self._get_orig_video_res_output( + inference_state, consolidated_out["pred_masks_video_res"] + ) + return frame_idx, obj_ids, video_res_masks + + @torch.inference_mode() + def add_new_mask( + self, + inference_state, + frame_idx, + obj_id, + mask, + ): + """Add new mask to a frame.""" + obj_idx = self._obj_id_to_idx(inference_state, obj_id) + point_inputs_per_frame = inference_state["point_inputs_per_obj"][obj_idx] + mask_inputs_per_frame = inference_state["mask_inputs_per_obj"][obj_idx] + + if not isinstance(mask, torch.Tensor): + mask = torch.tensor(mask, dtype=torch.bool) + assert mask.dim() == 2 + mask_H, mask_W = mask.shape + mask_inputs_orig = mask[None, None] # add batch and channel dimension + mask_inputs_orig = mask_inputs_orig.float().to(inference_state["device"]) + + # resize the mask if it doesn't match the model's image size + if mask_H != self.image_size or mask_W != self.image_size: + mask_inputs = torch.nn.functional.interpolate( + mask_inputs_orig, + size=(self.image_size, self.image_size), + align_corners=False, + mode="bilinear", + antialias=True, # use antialias for downsampling + ) + mask_inputs = (mask_inputs >= 0.5).float() + else: + mask_inputs = mask_inputs_orig + + mask_inputs_per_frame[frame_idx] = mask_inputs + point_inputs_per_frame.pop(frame_idx, None) + # If this frame hasn't been tracked before, we treat it as an initial conditioning + # frame, meaning that the inputs points are to generate segments on this frame without + # using any memory from other frames, like in SAM. Otherwise (if it has been tracked), + # the input points will be used to correct the already tracked masks. + is_init_cond_frame = frame_idx not in inference_state["frames_already_tracked"] + # whether to track in reverse time order + if is_init_cond_frame: + reverse = False + else: + reverse = inference_state["frames_already_tracked"][frame_idx]["reverse"] + obj_output_dict = inference_state["output_dict_per_obj"][obj_idx] + obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx] + # Add a frame to conditioning output if it's an initial conditioning frame or + # if the model sees all frames receiving clicks/mask as conditioning frames. + is_cond = is_init_cond_frame or self.add_all_frames_to_correct_as_cond + storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs" + + current_out, _ = self._run_single_frame_inference( + inference_state=inference_state, + output_dict=obj_output_dict, # run on the slice of a single object + frame_idx=frame_idx, + batch_size=1, # run on the slice of a single object + is_init_cond_frame=is_init_cond_frame, + point_inputs=None, + mask_inputs=mask_inputs, + reverse=reverse, + # Skip the memory encoder when adding clicks or mask. We execute the memory encoder + # at the beginning of `propagate_in_video` (after user finalize their clicks). This + # allows us to enforce non-overlapping constraints on all objects before encoding + # them into memory. 
+ run_mem_encoder=False, + ) + # Add the output to the output dict (to be used as future memory) + obj_temp_output_dict[storage_key][frame_idx] = current_out + + # Resize the output mask to the original video resolution + obj_ids = inference_state["obj_ids"] + consolidated_out = self._consolidate_temp_output_across_obj( + inference_state, + frame_idx, + is_cond=is_cond, + run_mem_encoder=False, + consolidate_at_video_res=True, + ) + _, video_res_masks = self._get_orig_video_res_output( + inference_state, consolidated_out["pred_masks_video_res"] + ) + return frame_idx, obj_ids, video_res_masks + + def _get_orig_video_res_output(self, inference_state, any_res_masks): + """ + Resize the object scores to the original video resolution (video_res_masks) + and apply non-overlapping constraints for final output. + """ + device = inference_state["device"] + video_H = inference_state["video_height"] + video_W = inference_state["video_width"] + any_res_masks = any_res_masks.to(device, non_blocking=True) + if any_res_masks.shape[-2:] == (video_H, video_W): + video_res_masks = any_res_masks + else: + video_res_masks = torch.nn.functional.interpolate( + any_res_masks, + size=(video_H, video_W), + mode="bilinear", + align_corners=False, + ) + if self.non_overlap_masks: + video_res_masks = self._apply_non_overlapping_constraints(video_res_masks) + return any_res_masks, video_res_masks + + def _consolidate_temp_output_across_obj( + self, + inference_state, + frame_idx, + is_cond, + run_mem_encoder, + consolidate_at_video_res=False, + ): + """ + Consolidate the per-object temporary outputs in `temp_output_dict_per_obj` on + a frame into a single output for all objects, including + 1) fill any missing objects either from `output_dict_per_obj` (if they exist in + `output_dict_per_obj` for this frame) or leave them as placeholder values + (if they don't exist in `output_dict_per_obj` for this frame); + 2) if specified, rerun memory encoder after apply non-overlapping constraints + on the object scores. + """ + batch_size = self._get_obj_num(inference_state) + storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs" + # Optionally, we allow consolidating the temporary outputs at the original + # video resolution (to provide a better editing experience for mask prompts). + if consolidate_at_video_res: + assert not run_mem_encoder, "memory encoder cannot run at video resolution" + consolidated_H = inference_state["video_height"] + consolidated_W = inference_state["video_width"] + consolidated_mask_key = "pred_masks_video_res" + else: + consolidated_H = consolidated_W = self.image_size // 4 + consolidated_mask_key = "pred_masks" + + # Initialize `consolidated_out`. Its "maskmem_features" and "maskmem_pos_enc" + # will be added when rerunning the memory encoder after applying non-overlapping + # constraints to object scores. Its "pred_masks" are prefilled with a large + # negative value (NO_OBJ_SCORE) to represent missing objects. 
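+        # (the consolidated mask tensor below is (num_objects, 1, consolidated_H,
+        # consolidated_W) and the pointer tensor is (num_objects, hidden_dim))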
+ consolidated_out = { + "maskmem_features": None, + "maskmem_pos_enc": None, + consolidated_mask_key: torch.full( + size=(batch_size, 1, consolidated_H, consolidated_W), + fill_value=NO_OBJ_SCORE, + dtype=torch.float32, + device=inference_state["storage_device"], + ), + "obj_ptr": torch.full( + size=(batch_size, self.hidden_dim), + fill_value=NO_OBJ_SCORE, + dtype=torch.float32, + device=inference_state["device"], + ), + } + empty_mask_ptr = None + for obj_idx in range(batch_size): + obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx] + obj_output_dict = inference_state["output_dict_per_obj"][obj_idx] + out = obj_temp_output_dict[storage_key].get(frame_idx, None) + # If the object doesn't appear in "temp_output_dict_per_obj" on this frame, + # we fall back and look up its previous output in "output_dict_per_obj". + # We look up both "cond_frame_outputs" and "non_cond_frame_outputs" in + # "output_dict_per_obj" to find a previous output for this object. + if out is None: + out = obj_output_dict["cond_frame_outputs"].get(frame_idx, None) + if out is None: + out = obj_output_dict["non_cond_frame_outputs"].get(frame_idx, None) + # If the object doesn't appear in "output_dict_per_obj" either, we skip it + # and leave its mask scores to the default scores (i.e. the NO_OBJ_SCORE + # placeholder above) and set its object pointer to be a dummy pointer. + if out is None: + # Fill in dummy object pointers for those objects without any inputs or + # tracking outcomes on this frame (only do it under `run_mem_encoder=True`, + # i.e. when we need to build the memory for tracking). + if run_mem_encoder: + if empty_mask_ptr is None: + empty_mask_ptr = self._get_empty_mask_ptr( + inference_state, frame_idx + ) + # fill object pointer with a dummy pointer (based on an empty mask) + consolidated_out["obj_ptr"][obj_idx : obj_idx + 1] = empty_mask_ptr + continue + # Add the temporary object output mask to consolidated output mask + obj_mask = out["pred_masks"] + consolidated_pred_masks = consolidated_out[consolidated_mask_key] + if obj_mask.shape[-2:] == consolidated_pred_masks.shape[-2:]: + consolidated_pred_masks[obj_idx : obj_idx + 1] = obj_mask + else: + # Resize first if temporary object mask has a different resolution + resized_obj_mask = torch.nn.functional.interpolate( + obj_mask, + size=consolidated_pred_masks.shape[-2:], + mode="bilinear", + align_corners=False, + ) + consolidated_pred_masks[obj_idx : obj_idx + 1] = resized_obj_mask + consolidated_out["obj_ptr"][obj_idx : obj_idx + 1] = out["obj_ptr"] + + # Optionally, apply non-overlapping constraints on the consolidated scores + # and rerun the memory encoder + if run_mem_encoder: + device = inference_state["device"] + high_res_masks = torch.nn.functional.interpolate( + consolidated_out["pred_masks"].to(device, non_blocking=True), + size=(self.image_size, self.image_size), + mode="bilinear", + align_corners=False, + ) + if self.non_overlap_masks_for_mem_enc: + high_res_masks = self._apply_non_overlapping_constraints(high_res_masks) + maskmem_features, maskmem_pos_enc = self._run_memory_encoder( + inference_state=inference_state, + frame_idx=frame_idx, + batch_size=batch_size, + high_res_masks=high_res_masks, + is_mask_from_pts=True, # these frames are what the user interacted with + ) + consolidated_out["maskmem_features"] = maskmem_features + consolidated_out["maskmem_pos_enc"] = maskmem_pos_enc + + return consolidated_out + + def _get_empty_mask_ptr(self, inference_state, frame_idx): + """Get a dummy object pointer based 
on an empty mask on the current frame."""
+        # A dummy (empty) mask with a single object
+        batch_size = 1
+        mask_inputs = torch.zeros(
+            (batch_size, 1, self.image_size, self.image_size),
+            dtype=torch.float32,
+            device=inference_state["device"],
+        )
+
+        # Retrieve correct image features
+        (
+            _,
+            _,
+            current_vision_feats,
+            current_vision_pos_embeds,
+            feat_sizes,
+        ) = self._get_image_feature(inference_state, frame_idx, batch_size)
+
+        # Feed the empty mask and image feature above to get a dummy object pointer
+        current_out = self.track_step(
+            frame_idx=frame_idx,
+            is_init_cond_frame=True,
+            current_vision_feats=current_vision_feats,
+            current_vision_pos_embeds=current_vision_pos_embeds,
+            feat_sizes=feat_sizes,
+            point_inputs=None,
+            mask_inputs=mask_inputs,
+            output_dict={},
+            num_frames=inference_state["num_frames"],
+            track_in_reverse=False,
+            run_mem_encoder=False,
+            prev_sam_mask_logits=None,
+        )
+        return current_out["obj_ptr"]
+
+    @torch.inference_mode()
+    def propagate_in_video_preflight(self, inference_state):
+        """Prepare inference_state and consolidate temporary outputs before tracking."""
+        # Tracking has started and we don't allow adding new objects until session is reset.
+        inference_state["tracking_has_started"] = True
+        batch_size = self._get_obj_num(inference_state)
+
+        # Consolidate per-object temporary outputs in "temp_output_dict_per_obj" and
+        # add them into "output_dict".
+        temp_output_dict_per_obj = inference_state["temp_output_dict_per_obj"]
+        output_dict = inference_state["output_dict"]
+        # "consolidated_frame_inds" contains indices of those frames where consolidated
+        # temporary outputs have been added (either in this call or any previous calls
+        # to `propagate_in_video_preflight`).
+        consolidated_frame_inds = inference_state["consolidated_frame_inds"]
+        for is_cond in [False, True]:
+            # Separately consolidate conditioning and non-conditioning temp outputs
+            storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
+            # Find all the frames that contain temporary outputs for any objects
+            # (these should be the frames that have just received clicks or mask inputs
+            # via `add_new_points` or `add_new_mask`)
+            temp_frame_inds = set()
+            for obj_temp_output_dict in temp_output_dict_per_obj.values():
+                temp_frame_inds.update(obj_temp_output_dict[storage_key].keys())
+            consolidated_frame_inds[storage_key].update(temp_frame_inds)
+            # consolidate the temporary output across all objects on this frame
+            for frame_idx in temp_frame_inds:
+                consolidated_out = self._consolidate_temp_output_across_obj(
+                    inference_state, frame_idx, is_cond=is_cond, run_mem_encoder=True
+                )
+                # merge them into "output_dict" and also create per-object slices
+                output_dict[storage_key][frame_idx] = consolidated_out
+                self._add_output_per_object(
+                    inference_state, frame_idx, consolidated_out, storage_key
+                )
+                clear_non_cond_mem = self.clear_non_cond_mem_around_input and (
+                    self.clear_non_cond_mem_for_multi_obj or batch_size <= 1
+                )
+                if clear_non_cond_mem:
+                    # clear non-conditioning memory of the surrounding frames
+                    self._clear_non_cond_mem_around_input(inference_state, frame_idx)
+
+            # clear temporary outputs in `temp_output_dict_per_obj`
+            for obj_temp_output_dict in temp_output_dict_per_obj.values():
+                obj_temp_output_dict[storage_key].clear()
+
+        # edge case: if an output is added to "cond_frame_outputs", we remove any prior
+        # output on the same frame in "non_cond_frame_outputs"
+        for frame_idx in output_dict["cond_frame_outputs"]:
+
output_dict["non_cond_frame_outputs"].pop(frame_idx, None) + for obj_output_dict in inference_state["output_dict_per_obj"].values(): + for frame_idx in obj_output_dict["cond_frame_outputs"]: + obj_output_dict["non_cond_frame_outputs"].pop(frame_idx, None) + for frame_idx in consolidated_frame_inds["cond_frame_outputs"]: + assert frame_idx in output_dict["cond_frame_outputs"] + consolidated_frame_inds["non_cond_frame_outputs"].discard(frame_idx) + + # Make sure that the frame indices in "consolidated_frame_inds" are exactly those frames + # with either points or mask inputs (which should be true under a correct workflow). + all_consolidated_frame_inds = ( + consolidated_frame_inds["cond_frame_outputs"] + | consolidated_frame_inds["non_cond_frame_outputs"] + ) + input_frames_inds = set() + for point_inputs_per_frame in inference_state["point_inputs_per_obj"].values(): + input_frames_inds.update(point_inputs_per_frame.keys()) + for mask_inputs_per_frame in inference_state["mask_inputs_per_obj"].values(): + input_frames_inds.update(mask_inputs_per_frame.keys()) + assert all_consolidated_frame_inds == input_frames_inds + + @torch.inference_mode() + def propagate_in_video( + self, + inference_state, + start_frame_idx=None, + max_frame_num_to_track=None, + reverse=False, + ): + """Propagate the input points across frames to track in the entire video.""" + self.propagate_in_video_preflight(inference_state) + + output_dict = inference_state["output_dict"] + consolidated_frame_inds = inference_state["consolidated_frame_inds"] + obj_ids = inference_state["obj_ids"] + num_frames = inference_state["num_frames"] + batch_size = self._get_obj_num(inference_state) + if len(output_dict["cond_frame_outputs"]) == 0: + raise RuntimeError("No points are provided; please add points first") + clear_non_cond_mem = self.clear_non_cond_mem_around_input and ( + self.clear_non_cond_mem_for_multi_obj or batch_size <= 1 + ) + + # set start index, end index, and processing order + if start_frame_idx is None: + # default: start from the earliest frame with input points + start_frame_idx = min(output_dict["cond_frame_outputs"]) + if max_frame_num_to_track is None: + # default: track all the frames in the video + max_frame_num_to_track = num_frames + if reverse: + end_frame_idx = max(start_frame_idx - max_frame_num_to_track, 0) + if start_frame_idx > 0: + processing_order = range(start_frame_idx, end_frame_idx - 1, -1) + else: + processing_order = [] # skip reverse tracking if starting from frame 0 + else: + end_frame_idx = min( + start_frame_idx + max_frame_num_to_track, num_frames - 1 + ) + processing_order = range(start_frame_idx, end_frame_idx + 1) + + for frame_idx in tqdm(processing_order, desc="propagate in video"): + # We skip those frames already in consolidated outputs (these are frames + # that received input clicks or mask). Note that we cannot directly run + # batched forward on them via `_run_single_frame_inference` because the + # number of clicks on each object might be different. 
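+            # Instead, their consolidated `pred_masks` are reused directly below.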
+ if frame_idx in consolidated_frame_inds["cond_frame_outputs"]: + storage_key = "cond_frame_outputs" + current_out = output_dict[storage_key][frame_idx] + pred_masks = current_out["pred_masks"] + if clear_non_cond_mem: + # clear non-conditioning memory of the surrounding frames + self._clear_non_cond_mem_around_input(inference_state, frame_idx) + elif frame_idx in consolidated_frame_inds["non_cond_frame_outputs"]: + storage_key = "non_cond_frame_outputs" + current_out = output_dict[storage_key][frame_idx] + pred_masks = current_out["pred_masks"] + else: + storage_key = "non_cond_frame_outputs" + current_out, pred_masks = self._run_single_frame_inference( + inference_state=inference_state, + output_dict=output_dict, + frame_idx=frame_idx, + batch_size=batch_size, + is_init_cond_frame=False, + point_inputs=None, + mask_inputs=None, + reverse=reverse, + run_mem_encoder=True, + ) + output_dict[storage_key][frame_idx] = current_out + # Create slices of per-object outputs for subsequent interaction with each + # individual object after tracking. + self._add_output_per_object( + inference_state, frame_idx, current_out, storage_key + ) + inference_state["frames_already_tracked"][frame_idx] = {"reverse": reverse} + + # Resize the output mask to the original video resolution (we directly use + # the mask scores on GPU for output to avoid any CPU conversion in between) + _, video_res_masks = self._get_orig_video_res_output( + inference_state, pred_masks + ) + yield frame_idx, obj_ids, video_res_masks + + def _add_output_per_object( + self, inference_state, frame_idx, current_out, storage_key + ): + """ + Split a multi-object output into per-object output slices and add them into + `output_dict_per_obj`. The resulting slices share the same tensor storage. + """ + maskmem_features = current_out["maskmem_features"] + assert maskmem_features is None or isinstance(maskmem_features, torch.Tensor) + + maskmem_pos_enc = current_out["maskmem_pos_enc"] + assert maskmem_pos_enc is None or isinstance(maskmem_pos_enc, list) + + output_dict_per_obj = inference_state["output_dict_per_obj"] + for obj_idx, obj_output_dict in output_dict_per_obj.items(): + obj_slice = slice(obj_idx, obj_idx + 1) + obj_out = { + "maskmem_features": None, + "maskmem_pos_enc": None, + "pred_masks": current_out["pred_masks"][obj_slice], + "obj_ptr": current_out["obj_ptr"][obj_slice], + } + if maskmem_features is not None: + obj_out["maskmem_features"] = maskmem_features[obj_slice] + if maskmem_pos_enc is not None: + obj_out["maskmem_pos_enc"] = [x[obj_slice] for x in maskmem_pos_enc] + obj_output_dict[storage_key][frame_idx] = obj_out + + @torch.inference_mode() + def reset_state(self, inference_state): + """Remove all input points or mask in all frames throughout the video.""" + self._reset_tracking_results(inference_state) + # Remove all object ids + inference_state["obj_id_to_idx"].clear() + inference_state["obj_idx_to_id"].clear() + inference_state["obj_ids"].clear() + inference_state["point_inputs_per_obj"].clear() + inference_state["mask_inputs_per_obj"].clear() + inference_state["output_dict_per_obj"].clear() + inference_state["temp_output_dict_per_obj"].clear() + + def _reset_tracking_results(self, inference_state): + """Reset all tracking inputs and results across the videos.""" + for v in inference_state["point_inputs_per_obj"].values(): + v.clear() + for v in inference_state["mask_inputs_per_obj"].values(): + v.clear() + for v in inference_state["output_dict_per_obj"].values(): + v["cond_frame_outputs"].clear() + 
v["non_cond_frame_outputs"].clear() + for v in inference_state["temp_output_dict_per_obj"].values(): + v["cond_frame_outputs"].clear() + v["non_cond_frame_outputs"].clear() + inference_state["output_dict"]["cond_frame_outputs"].clear() + inference_state["output_dict"]["non_cond_frame_outputs"].clear() + inference_state["consolidated_frame_inds"]["cond_frame_outputs"].clear() + inference_state["consolidated_frame_inds"]["non_cond_frame_outputs"].clear() + inference_state["tracking_has_started"] = False + inference_state["frames_already_tracked"].clear() + + def _get_image_feature(self, inference_state, frame_idx, batch_size): + """Compute the image features on a given frame.""" + # Look up in the cache first + image, backbone_out = inference_state["cached_features"].get( + frame_idx, (None, None) + ) + if backbone_out is None: + # Cache miss -- we will run inference on a single image + image = inference_state["images"][frame_idx].cpu().float().unsqueeze(0) + backbone_out = self.forward_image(image) + # Cache the most recent frame's feature (for repeated interactions with + # a frame; we can use an LRU cache for more frames in the future). + inference_state["cached_features"] = {frame_idx: (image, backbone_out)} + + # expand the features to have the same dimension as the number of objects + expanded_image = image.expand(batch_size, -1, -1, -1) + expanded_backbone_out = { + "backbone_fpn": backbone_out["backbone_fpn"].copy(), + "vision_pos_enc": backbone_out["vision_pos_enc"].copy(), + } + for i, feat in enumerate(expanded_backbone_out["backbone_fpn"]): + expanded_backbone_out["backbone_fpn"][i] = feat.expand( + batch_size, -1, -1, -1 + ) + for i, pos in enumerate(expanded_backbone_out["vision_pos_enc"]): + pos = pos.expand(batch_size, -1, -1, -1) + expanded_backbone_out["vision_pos_enc"][i] = pos + + features = self._prepare_backbone_features(expanded_backbone_out) + features = (expanded_image,) + features + return features + + def _run_single_frame_inference( + self, + inference_state, + output_dict, + frame_idx, + batch_size, + is_init_cond_frame, + point_inputs, + mask_inputs, + reverse, + run_mem_encoder, + prev_sam_mask_logits=None, + ): + """Run tracking on a single frame based on current inputs and previous memory.""" + # Retrieve correct image features + ( + _, + _, + current_vision_feats, + current_vision_pos_embeds, + feat_sizes, + ) = self._get_image_feature(inference_state, frame_idx, batch_size) + + # point and mask should not appear as input simultaneously on the same frame + assert point_inputs is None or mask_inputs is None + current_out = self.track_step( + frame_idx=frame_idx, + is_init_cond_frame=is_init_cond_frame, + current_vision_feats=current_vision_feats, + current_vision_pos_embeds=current_vision_pos_embeds, + feat_sizes=feat_sizes, + point_inputs=point_inputs, + mask_inputs=mask_inputs, + output_dict=output_dict, + num_frames=inference_state["num_frames"], + track_in_reverse=reverse, + run_mem_encoder=run_mem_encoder, + prev_sam_mask_logits=prev_sam_mask_logits, + ) + + # optionally offload the output to CPU memory to save GPU space + storage_device = inference_state["storage_device"] + maskmem_features = current_out["maskmem_features"] + if maskmem_features is not None: + maskmem_features = maskmem_features.to(torch.bfloat16) + maskmem_features = maskmem_features.to(storage_device, non_blocking=True) + pred_masks_gpu = current_out["pred_masks"] + # potentially fill holes in the predicted masks + if self.fill_hole_area > 0: + pred_masks_gpu = 
fill_holes_in_mask_scores( + pred_masks_gpu, self.fill_hole_area + ) + pred_masks = pred_masks_gpu.to(storage_device, non_blocking=True) + # "maskmem_pos_enc" is the same across frames, so we only need to store one copy of it + maskmem_pos_enc = self._get_maskmem_pos_enc(inference_state, current_out) + # object pointer is a small tensor, so we always keep it on GPU memory for fast access + obj_ptr = current_out["obj_ptr"] + # make a compact version of this frame's output to reduce the state size + compact_current_out = { + "maskmem_features": maskmem_features, + "maskmem_pos_enc": maskmem_pos_enc, + "pred_masks": pred_masks, + "obj_ptr": obj_ptr, + } + return compact_current_out, pred_masks_gpu + + def _run_memory_encoder( + self, inference_state, frame_idx, batch_size, high_res_masks, is_mask_from_pts + ): + """ + Run the memory encoder on `high_res_masks`. This is usually after applying + non-overlapping constraints to object scores. Since their scores changed, their + memory also need to be computed again with the memory encoder. + """ + # Retrieve correct image features + _, _, current_vision_feats, _, feat_sizes = self._get_image_feature( + inference_state, frame_idx, batch_size + ) + maskmem_features, maskmem_pos_enc = self._encode_new_memory( + current_vision_feats=current_vision_feats, + feat_sizes=feat_sizes, + pred_masks_high_res=high_res_masks, + is_mask_from_pts=is_mask_from_pts, + ) + + # optionally offload the output to CPU memory to save GPU space + storage_device = inference_state["storage_device"] + maskmem_features = maskmem_features.to(torch.bfloat16) + maskmem_features = maskmem_features.to(storage_device, non_blocking=True) + # "maskmem_pos_enc" is the same across frames, so we only need to store one copy of it + maskmem_pos_enc = self._get_maskmem_pos_enc( + inference_state, {"maskmem_pos_enc": maskmem_pos_enc} + ) + return maskmem_features, maskmem_pos_enc + + def _get_maskmem_pos_enc(self, inference_state, current_out): + """ + `maskmem_pos_enc` is the same across frames and objects, so we cache it as + a constant in the inference session to reduce session storage size. + """ + model_constants = inference_state["constants"] + # "out_maskmem_pos_enc" should be either a list of tensors or None + out_maskmem_pos_enc = current_out["maskmem_pos_enc"] + if out_maskmem_pos_enc is not None: + if "maskmem_pos_enc" not in model_constants: + assert isinstance(out_maskmem_pos_enc, list) + # only take the slice for one object, since it's same across objects + maskmem_pos_enc = [x[0:1].clone() for x in out_maskmem_pos_enc] + model_constants["maskmem_pos_enc"] = maskmem_pos_enc + else: + maskmem_pos_enc = model_constants["maskmem_pos_enc"] + # expand the cached maskmem_pos_enc to the actual batch size + batch_size = out_maskmem_pos_enc[0].size(0) + expanded_maskmem_pos_enc = [ + x.expand(batch_size, -1, -1, -1) for x in maskmem_pos_enc + ] + else: + expanded_maskmem_pos_enc = None + return expanded_maskmem_pos_enc + + def _clear_non_cond_mem_around_input(self, inference_state, frame_idx): + """ + Remove the non-conditioning memory around the input frame. When users provide + correction clicks, the surrounding frames' non-conditioning memories can still + contain outdated object appearance information and could confuse the model. + + This method clears those non-conditioning memories surrounding the interacted + frame to avoid giving the model both old and new information about the object. 
+ """ + r = self.memory_temporal_stride_for_eval + frame_idx_begin = frame_idx - r * self.num_maskmem + frame_idx_end = frame_idx + r * self.num_maskmem + output_dict = inference_state["output_dict"] + non_cond_frame_outputs = output_dict["non_cond_frame_outputs"] + for t in range(frame_idx_begin, frame_idx_end + 1): + non_cond_frame_outputs.pop(t, None) + for obj_output_dict in inference_state["output_dict_per_obj"].values(): + obj_output_dict["non_cond_frame_outputs"].pop(t, None) diff --git a/sam2/utils/__init__.py b/sam2/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4547e070da2f3ddc5bf2f466cb2242e6135c7dc3 --- /dev/null +++ b/sam2/utils/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. diff --git a/sam2/utils/__pycache__/__init__.cpython-312.pyc b/sam2/utils/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3094ef39a0fad6e71aaaa8d9cf2be91a8790a522 Binary files /dev/null and b/sam2/utils/__pycache__/__init__.cpython-312.pyc differ diff --git a/sam2/utils/__pycache__/misc.cpython-312.pyc b/sam2/utils/__pycache__/misc.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f080e227df56cfc02a328f7a53d2fab39d0ead9 Binary files /dev/null and b/sam2/utils/__pycache__/misc.cpython-312.pyc differ diff --git a/sam2/utils/__pycache__/transforms.cpython-312.pyc b/sam2/utils/__pycache__/transforms.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f6990dd23c3c3b0f2b3ccf5ecdea7921fc594d7 Binary files /dev/null and b/sam2/utils/__pycache__/transforms.cpython-312.pyc differ diff --git a/sam2/utils/amg.py b/sam2/utils/amg.py new file mode 100644 index 0000000000000000000000000000000000000000..08a56abdec44fef5539f97c0f532fab1e3aefe68 --- /dev/null +++ b/sam2/utils/amg.py @@ -0,0 +1,348 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import math +from copy import deepcopy +from itertools import product +from typing import Any, Dict, Generator, ItemsView, List, Tuple + +import numpy as np +import torch + +# Very lightly adapted from https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/utils/amg.py + + +class MaskData: + """ + A structure for storing masks and their related data in batched format. + Implements basic filtering and concatenation. + """ + + def __init__(self, **kwargs) -> None: + for v in kwargs.values(): + assert isinstance( + v, (list, np.ndarray, torch.Tensor) + ), "MaskData only supports list, numpy arrays, and torch tensors." + self._stats = dict(**kwargs) + + def __setitem__(self, key: str, item: Any) -> None: + assert isinstance( + item, (list, np.ndarray, torch.Tensor) + ), "MaskData only supports list, numpy arrays, and torch tensors." 
+ self._stats[key] = item + + def __delitem__(self, key: str) -> None: + del self._stats[key] + + def __getitem__(self, key: str) -> Any: + return self._stats[key] + + def items(self) -> ItemsView[str, Any]: + return self._stats.items() + + def filter(self, keep: torch.Tensor) -> None: + for k, v in self._stats.items(): + if v is None: + self._stats[k] = None + elif isinstance(v, torch.Tensor): + self._stats[k] = v[torch.as_tensor(keep, device=v.device)] + elif isinstance(v, np.ndarray): + self._stats[k] = v[keep.detach().cpu().numpy()] + elif isinstance(v, list) and keep.dtype == torch.bool: + self._stats[k] = [a for i, a in enumerate(v) if keep[i]] + elif isinstance(v, list): + self._stats[k] = [v[i] for i in keep] + else: + raise TypeError(f"MaskData key {k} has an unsupported type {type(v)}.") + + def cat(self, new_stats: "MaskData") -> None: + for k, v in new_stats.items(): + if k not in self._stats or self._stats[k] is None: + self._stats[k] = deepcopy(v) + elif isinstance(v, torch.Tensor): + self._stats[k] = torch.cat([self._stats[k], v], dim=0) + elif isinstance(v, np.ndarray): + self._stats[k] = np.concatenate([self._stats[k], v], axis=0) + elif isinstance(v, list): + self._stats[k] = self._stats[k] + deepcopy(v) + else: + raise TypeError(f"MaskData key {k} has an unsupported type {type(v)}.") + + def to_numpy(self) -> None: + for k, v in self._stats.items(): + if isinstance(v, torch.Tensor): + self._stats[k] = v.float().detach().cpu().numpy() + + +def is_box_near_crop_edge( + boxes: torch.Tensor, crop_box: List[int], orig_box: List[int], atol: float = 20.0 +) -> torch.Tensor: + """Filter masks at the edge of a crop, but not at the edge of the original image.""" + crop_box_torch = torch.as_tensor(crop_box, dtype=torch.float, device=boxes.device) + orig_box_torch = torch.as_tensor(orig_box, dtype=torch.float, device=boxes.device) + boxes = uncrop_boxes_xyxy(boxes, crop_box).float() + near_crop_edge = torch.isclose(boxes, crop_box_torch[None, :], atol=atol, rtol=0) + near_image_edge = torch.isclose(boxes, orig_box_torch[None, :], atol=atol, rtol=0) + near_crop_edge = torch.logical_and(near_crop_edge, ~near_image_edge) + return torch.any(near_crop_edge, dim=1) + + +def box_xyxy_to_xywh(box_xyxy: torch.Tensor) -> torch.Tensor: + box_xywh = deepcopy(box_xyxy) + box_xywh[2] = box_xywh[2] - box_xywh[0] + box_xywh[3] = box_xywh[3] - box_xywh[1] + return box_xywh + + +def batch_iterator(batch_size: int, *args) -> Generator[List[Any], None, None]: + assert len(args) > 0 and all( + len(a) == len(args[0]) for a in args + ), "Batched iteration must have inputs of all the same size." + n_batches = len(args[0]) // batch_size + int(len(args[0]) % batch_size != 0) + for b in range(n_batches): + yield [arg[b * batch_size : (b + 1) * batch_size] for arg in args] + + +def mask_to_rle_pytorch(tensor: torch.Tensor) -> List[Dict[str, Any]]: + """ + Encodes masks to an uncompressed RLE, in the format expected by + pycoco tools. 
+ """ + # Put in fortran order and flatten h,w + b, h, w = tensor.shape + tensor = tensor.permute(0, 2, 1).flatten(1) + + # Compute change indices + diff = tensor[:, 1:] ^ tensor[:, :-1] + change_indices = diff.nonzero() + + # Encode run length + out = [] + for i in range(b): + cur_idxs = change_indices[change_indices[:, 0] == i, 1] + cur_idxs = torch.cat( + [ + torch.tensor([0], dtype=cur_idxs.dtype, device=cur_idxs.device), + cur_idxs + 1, + torch.tensor([h * w], dtype=cur_idxs.dtype, device=cur_idxs.device), + ] + ) + btw_idxs = cur_idxs[1:] - cur_idxs[:-1] + counts = [] if tensor[i, 0] == 0 else [0] + counts.extend(btw_idxs.detach().cpu().tolist()) + out.append({"size": [h, w], "counts": counts}) + return out + + +def rle_to_mask(rle: Dict[str, Any]) -> np.ndarray: + """Compute a binary mask from an uncompressed RLE.""" + h, w = rle["size"] + mask = np.empty(h * w, dtype=bool) + idx = 0 + parity = False + for count in rle["counts"]: + mask[idx : idx + count] = parity + idx += count + parity ^= True + mask = mask.reshape(w, h) + return mask.transpose() # Put in C order + + +def area_from_rle(rle: Dict[str, Any]) -> int: + return sum(rle["counts"][1::2]) + + +def calculate_stability_score( + masks: torch.Tensor, mask_threshold: float, threshold_offset: float +) -> torch.Tensor: + """ + Computes the stability score for a batch of masks. The stability + score is the IoU between the binary masks obtained by thresholding + the predicted mask logits at high and low values. + """ + # One mask is always contained inside the other. + # Save memory by preventing unnecessary cast to torch.int64 + intersections = ( + (masks > (mask_threshold + threshold_offset)) + .sum(-1, dtype=torch.int16) + .sum(-1, dtype=torch.int32) + ) + unions = ( + (masks > (mask_threshold - threshold_offset)) + .sum(-1, dtype=torch.int16) + .sum(-1, dtype=torch.int32) + ) + return intersections / unions + + +def build_point_grid(n_per_side: int) -> np.ndarray: + """Generates a 2D grid of points evenly spaced in [0,1]x[0,1].""" + offset = 1 / (2 * n_per_side) + points_one_side = np.linspace(offset, 1 - offset, n_per_side) + points_x = np.tile(points_one_side[None, :], (n_per_side, 1)) + points_y = np.tile(points_one_side[:, None], (1, n_per_side)) + points = np.stack([points_x, points_y], axis=-1).reshape(-1, 2) + return points + + +def build_all_layer_point_grids( + n_per_side: int, n_layers: int, scale_per_layer: int +) -> List[np.ndarray]: + """Generates point grids for all crop layers.""" + points_by_layer = [] + for i in range(n_layers + 1): + n_points = int(n_per_side / (scale_per_layer**i)) + points_by_layer.append(build_point_grid(n_points)) + return points_by_layer + + +def generate_crop_boxes( + im_size: Tuple[int, ...], n_layers: int, overlap_ratio: float +) -> Tuple[List[List[int]], List[int]]: + """ + Generates a list of crop boxes of different sizes. Each layer + has (2**i)**2 boxes for the ith layer. 
+ """ + crop_boxes, layer_idxs = [], [] + im_h, im_w = im_size + short_side = min(im_h, im_w) + + # Original image + crop_boxes.append([0, 0, im_w, im_h]) + layer_idxs.append(0) + + def crop_len(orig_len, n_crops, overlap): + return int(math.ceil((overlap * (n_crops - 1) + orig_len) / n_crops)) + + for i_layer in range(n_layers): + n_crops_per_side = 2 ** (i_layer + 1) + overlap = int(overlap_ratio * short_side * (2 / n_crops_per_side)) + + crop_w = crop_len(im_w, n_crops_per_side, overlap) + crop_h = crop_len(im_h, n_crops_per_side, overlap) + + crop_box_x0 = [int((crop_w - overlap) * i) for i in range(n_crops_per_side)] + crop_box_y0 = [int((crop_h - overlap) * i) for i in range(n_crops_per_side)] + + # Crops in XYWH format + for x0, y0 in product(crop_box_x0, crop_box_y0): + box = [x0, y0, min(x0 + crop_w, im_w), min(y0 + crop_h, im_h)] + crop_boxes.append(box) + layer_idxs.append(i_layer + 1) + + return crop_boxes, layer_idxs + + +def uncrop_boxes_xyxy(boxes: torch.Tensor, crop_box: List[int]) -> torch.Tensor: + x0, y0, _, _ = crop_box + offset = torch.tensor([[x0, y0, x0, y0]], device=boxes.device) + # Check if boxes has a channel dimension + if len(boxes.shape) == 3: + offset = offset.unsqueeze(1) + return boxes + offset + + +def uncrop_points(points: torch.Tensor, crop_box: List[int]) -> torch.Tensor: + x0, y0, _, _ = crop_box + offset = torch.tensor([[x0, y0]], device=points.device) + # Check if points has a channel dimension + if len(points.shape) == 3: + offset = offset.unsqueeze(1) + return points + offset + + +def uncrop_masks( + masks: torch.Tensor, crop_box: List[int], orig_h: int, orig_w: int +) -> torch.Tensor: + x0, y0, x1, y1 = crop_box + if x0 == 0 and y0 == 0 and x1 == orig_w and y1 == orig_h: + return masks + # Coordinate transform masks + pad_x, pad_y = orig_w - (x1 - x0), orig_h - (y1 - y0) + pad = (x0, pad_x - x0, y0, pad_y - y0) + return torch.nn.functional.pad(masks, pad, value=0) + + +def remove_small_regions( + mask: np.ndarray, area_thresh: float, mode: str +) -> Tuple[np.ndarray, bool]: + """ + Removes small disconnected regions and holes in a mask. Returns the + mask and an indicator of if the mask has been modified. + """ + import cv2 # type: ignore + + assert mode in ["holes", "islands"] + correct_holes = mode == "holes" + working_mask = (correct_holes ^ mask).astype(np.uint8) + n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8) + sizes = stats[:, -1][1:] # Row 0 is background label + small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh] + if len(small_regions) == 0: + return mask, False + fill_labels = [0] + small_regions + if not correct_holes: + fill_labels = [i for i in range(n_labels) if i not in fill_labels] + # If every region is below threshold, keep largest + if len(fill_labels) == 0: + fill_labels = [int(np.argmax(sizes)) + 1] + mask = np.isin(regions, fill_labels) + return mask, True + + +def coco_encode_rle(uncompressed_rle: Dict[str, Any]) -> Dict[str, Any]: + from pycocotools import mask as mask_utils # type: ignore + + h, w = uncompressed_rle["size"] + rle = mask_utils.frPyObjects(uncompressed_rle, h, w) + rle["counts"] = rle["counts"].decode("utf-8") # Necessary to serialize with json + return rle + + +def batched_mask_to_box(masks: torch.Tensor) -> torch.Tensor: + """ + Calculates boxes in XYXY format around masks. Return [0,0,0,0] for + an empty mask. For input shape C1xC2x...xHxW, the output shape is C1xC2x...x4. 
+ """ + # torch.max below raises an error on empty inputs, just skip in this case + if torch.numel(masks) == 0: + return torch.zeros(*masks.shape[:-2], 4, device=masks.device) + + # Normalize shape to CxHxW + shape = masks.shape + h, w = shape[-2:] + if len(shape) > 2: + masks = masks.flatten(0, -3) + else: + masks = masks.unsqueeze(0) + + # Get top and bottom edges + in_height, _ = torch.max(masks, dim=-1) + in_height_coords = in_height * torch.arange(h, device=in_height.device)[None, :] + bottom_edges, _ = torch.max(in_height_coords, dim=-1) + in_height_coords = in_height_coords + h * (~in_height) + top_edges, _ = torch.min(in_height_coords, dim=-1) + + # Get left and right edges + in_width, _ = torch.max(masks, dim=-2) + in_width_coords = in_width * torch.arange(w, device=in_width.device)[None, :] + right_edges, _ = torch.max(in_width_coords, dim=-1) + in_width_coords = in_width_coords + w * (~in_width) + left_edges, _ = torch.min(in_width_coords, dim=-1) + + # If the mask is empty the right edge will be to the left of the left edge. + # Replace these boxes with [0, 0, 0, 0] + empty_filter = (right_edges < left_edges) | (bottom_edges < top_edges) + out = torch.stack([left_edges, top_edges, right_edges, bottom_edges], dim=-1) + out = out * (~empty_filter).unsqueeze(-1) + + # Return to original shape + if len(shape) > 2: + out = out.reshape(*shape[:-2], 4) + else: + out = out[0] + + return out diff --git a/sam2/utils/misc.py b/sam2/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..18f6e79c180b2aabda677c7b319e9b22ad34cafa --- /dev/null +++ b/sam2/utils/misc.py @@ -0,0 +1,238 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import os +import warnings +from threading import Thread + +import numpy as np +import torch +from PIL import Image +from tqdm import tqdm + + +def get_sdpa_settings(): + if torch.cuda.is_available(): + old_gpu = torch.cuda.get_device_properties(0).major < 7 + # only use Flash Attention on Ampere (8.0) or newer GPUs + use_flash_attn = torch.cuda.get_device_properties(0).major >= 8 + if not use_flash_attn: + warnings.warn( + "Flash Attention is disabled as it requires a GPU with Ampere (8.0) CUDA capability.", + category=UserWarning, + stacklevel=2, + ) + # keep math kernel for PyTorch versions before 2.2 (Flash Attention v2 is only + # available on PyTorch 2.2+, while Flash Attention v1 cannot handle all cases) + pytorch_version = tuple(int(v) for v in torch.__version__.split(".")[:2]) + if pytorch_version < (2, 2): + warnings.warn( + f"You are using PyTorch {torch.__version__} without Flash Attention v2 support. " + "Consider upgrading to PyTorch 2.2+ for Flash Attention v2 (which could be faster).", + category=UserWarning, + stacklevel=2, + ) + math_kernel_on = pytorch_version < (2, 2) or not use_flash_attn + else: + old_gpu = True + use_flash_attn = False + math_kernel_on = True + + return old_gpu, use_flash_attn, math_kernel_on + + +def get_connected_components(mask): + """ + Get the connected components (8-connectivity) of binary masks of shape (N, 1, H, W). + + Inputs: + - mask: A binary mask tensor of shape (N, 1, H, W), where 1 is foreground and 0 is + background. + + Outputs: + - labels: A tensor of shape (N, 1, H, W) containing the connected component labels + for foreground pixels and 0 for background pixels. 
+ - counts: A tensor of shape (N, 1, H, W) containing the area of the connected + components for foreground pixels and 0 for background pixels. + """ + # from sam2 import _C + + # return _C.get_connected_componnets(mask.to(torch.uint8).contiguous()) + pass + +def mask_to_box(masks: torch.Tensor): + """ + compute bounding box given an input mask + + Inputs: + - masks: [B, 1, H, W] boxes, dtype=torch.Tensor + + Returns: + - box_coords: [B, 1, 4], contains (x, y) coordinates of top left and bottom right box corners, dtype=torch.Tensor + """ + B, _, h, w = masks.shape + device = masks.device + xs = torch.arange(w, device=device, dtype=torch.int32) + ys = torch.arange(h, device=device, dtype=torch.int32) + grid_xs, grid_ys = torch.meshgrid(xs, ys, indexing="xy") + grid_xs = grid_xs[None, None, ...].expand(B, 1, h, w) + grid_ys = grid_ys[None, None, ...].expand(B, 1, h, w) + min_xs, _ = torch.min(torch.where(masks, grid_xs, w).flatten(-2), dim=-1) + max_xs, _ = torch.max(torch.where(masks, grid_xs, -1).flatten(-2), dim=-1) + min_ys, _ = torch.min(torch.where(masks, grid_ys, h).flatten(-2), dim=-1) + max_ys, _ = torch.max(torch.where(masks, grid_ys, -1).flatten(-2), dim=-1) + bbox_coords = torch.stack((min_xs, min_ys, max_xs, max_ys), dim=-1) + + return bbox_coords + + +def _load_img_as_tensor(img_path, image_size): + img_pil = Image.open(img_path) + img_np = np.array(img_pil.convert("RGB").resize((image_size, image_size))) + if img_np.dtype == np.uint8: # np.uint8 is expected for JPEG images + img_np = img_np / 255.0 + else: + raise RuntimeError(f"Unknown image dtype: {img_np.dtype} on {img_path}") + img = torch.from_numpy(img_np).permute(2, 0, 1) + video_width, video_height = img_pil.size # the original video size + return img, video_height, video_width + + +class AsyncVideoFrameLoader: + """ + A list of video frames to be load asynchronously without blocking session start. 
+ """ + + def __init__(self, img_paths, image_size, offload_video_to_cpu, img_mean, img_std): + self.img_paths = img_paths + self.image_size = image_size + self.offload_video_to_cpu = offload_video_to_cpu + self.img_mean = img_mean + self.img_std = img_std + # items in `self._images` will be loaded asynchronously + self.images = [None] * len(img_paths) + # catch and raise any exceptions in the async loading thread + self.exception = None + # video_height and video_width be filled when loading the first image + self.video_height = None + self.video_width = None + + # load the first frame to fill video_height and video_width and also + # to cache it (since it's most likely where the user will click) + self.__getitem__(0) + + # load the rest of frames asynchronously without blocking the session start + def _load_frames(): + try: + for n in tqdm(range(len(self.images)), desc="frame loading (JPEG)"): + self.__getitem__(n) + except Exception as e: + self.exception = e + + self.thread = Thread(target=_load_frames, daemon=True) + self.thread.start() + + def __getitem__(self, index): + if self.exception is not None: + raise RuntimeError("Failure in frame loading thread") from self.exception + + img = self.images[index] + if img is not None: + return img + + img, video_height, video_width = _load_img_as_tensor( + self.img_paths[index], self.image_size + ) + self.video_height = video_height + self.video_width = video_width + # normalize by mean and std + img -= self.img_mean + img /= self.img_std + if not self.offload_video_to_cpu: + img = img.cuda(non_blocking=True) + self.images[index] = img + return img + + def __len__(self): + return len(self.images) + + +def load_video_frames( + video_path, + image_size, + offload_video_to_cpu, + img_mean=(0.485, 0.456, 0.406), + img_std=(0.229, 0.224, 0.225), + async_loading_frames=False, +): + """ + Load the video frames from a directory of JPEG files (".jpg" format). + + The frames are resized to image_size x image_size and are loaded to GPU if + `offload_video_to_cpu` is `False` and to CPU if `offload_video_to_cpu` is `True`. + + You can load a frame asynchronously by setting `async_loading_frames` to `True`. 
+ """ + if isinstance(video_path, str) and os.path.isdir(video_path): + jpg_folder = video_path + else: + raise NotImplementedError("Only JPEG frames are supported at this moment") + + frame_names = [ + p + for p in os.listdir(jpg_folder) + if os.path.splitext(p)[-1] in [".jpg", ".jpeg", ".JPG", ".JPEG"] + ] + frame_names.sort(key=lambda p: int(os.path.splitext(p)[0])) + num_frames = len(frame_names) + if num_frames == 0: + raise RuntimeError(f"no images found in {jpg_folder}") + img_paths = [os.path.join(jpg_folder, frame_name) for frame_name in frame_names] + img_mean = torch.tensor(img_mean, dtype=torch.float32)[:, None, None] + img_std = torch.tensor(img_std, dtype=torch.float32)[:, None, None] + + if async_loading_frames: + lazy_images = AsyncVideoFrameLoader( + img_paths, image_size, offload_video_to_cpu, img_mean, img_std + ) + return lazy_images, lazy_images.video_height, lazy_images.video_width + + images = torch.zeros(num_frames, 3, image_size, image_size, dtype=torch.float32) + for n, img_path in enumerate(tqdm(img_paths, desc="frame loading (JPEG)")): + images[n], video_height, video_width = _load_img_as_tensor(img_path, image_size) + if not offload_video_to_cpu: + images = images.cpu() + img_mean = img_mean.cpu() + img_std = img_std.cpu() + # normalize by mean and std + images -= img_mean + images /= img_std + return images, video_height, video_width + + +def fill_holes_in_mask_scores(mask, max_area): + """ + A post processor to fill small holes in mask scores with area under `max_area`. + """ + # Holes are those connected components in background with area <= self.max_area + # (background regions are those with mask scores <= 0) + assert max_area > 0, "max_area must be positive" + labels, areas = get_connected_components(mask <= 0) + is_hole = (labels > 0) & (areas <= max_area) + # We fill holes with a small positive mask score (0.1) to change them to foreground. + mask = torch.where(is_hole, 0.1, mask) + return mask + + +def concat_points(old_point_inputs, new_points, new_labels): + """Add new points and labels to previous point inputs (add at the end).""" + if old_point_inputs is None: + points, labels = new_points, new_labels + else: + points = torch.cat([old_point_inputs["point_coords"], new_points], dim=1) + labels = torch.cat([old_point_inputs["point_labels"], new_labels], dim=1) + + return {"point_coords": points, "point_labels": labels} diff --git a/sam2/utils/transforms.py b/sam2/utils/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..7d2c53a912c68c9fad0054a03268b3efae2d6d35 --- /dev/null +++ b/sam2/utils/transforms.py @@ -0,0 +1,99 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchvision.transforms import Normalize, Resize, ToTensor + + +class SAM2Transforms(nn.Module): + def __init__( + self, resolution, mask_threshold, max_hole_area=0.0, max_sprinkle_area=0.0 + ): + """ + Transforms for SAM2. 
+ """ + super().__init__() + self.resolution = resolution + self.mask_threshold = mask_threshold + self.max_hole_area = max_hole_area + self.max_sprinkle_area = max_sprinkle_area + self.mean = [0.485, 0.456, 0.406] + self.std = [0.229, 0.224, 0.225] + self.to_tensor = ToTensor() + self.transforms = torch.jit.script( + nn.Sequential( + Resize((self.resolution, self.resolution)), + Normalize(self.mean, self.std), + ) + ) + + def __call__(self, x): + x = self.to_tensor(x) + return self.transforms(x) + + def forward_batch(self, img_list): + img_batch = [self.transforms(self.to_tensor(img)) for img in img_list] + img_batch = torch.stack(img_batch, dim=0) + return img_batch + + def transform_coords( + self, coords: torch.Tensor, normalize=False, orig_hw=None + ) -> torch.Tensor: + """ + Expects a torch tensor with length 2 in the last dimension. The coordinates can be in absolute image or normalized coordinates, + If the coords are in absolute image coordinates, normalize should be set to True and original image size is required. + + Returns + Un-normalized coordinates in the range of [0, 1] which is expected by the SAM2 model. + """ + if normalize: + assert orig_hw is not None + h, w = orig_hw + coords = coords.clone() + coords[..., 0] = coords[..., 0] / w + coords[..., 1] = coords[..., 1] / h + + coords = coords * self.resolution # unnormalize coords + return coords + + def transform_boxes( + self, boxes: torch.Tensor, normalize=False, orig_hw=None + ) -> torch.Tensor: + """ + Expects a tensor of shape Bx4. The coordinates can be in absolute image or normalized coordinates, + if the coords are in absolute image coordinates, normalize should be set to True and original image size is required. + """ + boxes = self.transform_coords(boxes.reshape(-1, 2, 2), normalize, orig_hw) + return boxes + + def postprocess_masks(self, masks: torch.Tensor, orig_hw) -> torch.Tensor: + """ + Perform PostProcessing on output masks. + """ + from sam2.utils.misc import get_connected_components + + masks = masks.float() + if self.max_hole_area > 0: + # Holes are those connected components in background with area <= self.fill_hole_area + # (background regions are those with mask scores <= self.mask_threshold) + mask_flat = masks.flatten(0, 1).unsqueeze(1) # flatten as 1-channel image + labels, areas = get_connected_components(mask_flat <= self.mask_threshold) + is_hole = (labels > 0) & (areas <= self.max_hole_area) + is_hole = is_hole.reshape_as(masks) + # We fill holes with a small positive mask score (10.0) to change them to foreground. + masks = torch.where(is_hole, self.mask_threshold + 10.0, masks) + + if self.max_sprinkle_area > 0: + labels, areas = get_connected_components(mask_flat > self.mask_threshold) + is_hole = (labels > 0) & (areas <= self.max_sprinkle_area) + is_hole = is_hole.reshape_as(masks) + # We fill holes with negative mask score (-10.0) to change them to background. + masks = torch.where(is_hole, self.mask_threshold - 10.0, masks) + + masks = F.interpolate(masks, orig_hw, mode="bilinear", align_corners=False) + return masks diff --git a/sam2_configs/__init__.py b/sam2_configs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4547e070da2f3ddc5bf2f466cb2242e6135c7dc3 --- /dev/null +++ b/sam2_configs/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
diff --git a/sam2_configs/__pycache__/__init__.cpython-312.pyc b/sam2_configs/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..52283c47a5fce3f3b3afa687aba89c02c303b87b Binary files /dev/null and b/sam2_configs/__pycache__/__init__.cpython-312.pyc differ diff --git a/sam2_configs/sam2_hiera_b+.yaml b/sam2_configs/sam2_hiera_b+.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9fdcfa4054b6d03a159c4fad01515fd3153d23d8 --- /dev/null +++ b/sam2_configs/sam2_hiera_b+.yaml @@ -0,0 +1,113 @@ +# @package _global_ + +# Model +model: + _target_: sam2.modeling.sam2_base.SAM2Base + image_encoder: + _target_: sam2.modeling.backbones.image_encoder.ImageEncoder + scalp: 1 + trunk: + _target_: sam2.modeling.backbones.hieradet.Hiera + embed_dim: 112 + num_heads: 2 + neck: + _target_: sam2.modeling.backbones.image_encoder.FpnNeck + position_encoding: + _target_: sam2.modeling.position_encoding.PositionEmbeddingSine + num_pos_feats: 256 + normalize: true + scale: null + temperature: 10000 + d_model: 256 + backbone_channel_list: [896, 448, 224, 112] + fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features + fpn_interp_model: nearest + + memory_attention: + _target_: sam2.modeling.memory_attention.MemoryAttention + d_model: 256 + pos_enc_at_input: true + layer: + _target_: sam2.modeling.memory_attention.MemoryAttentionLayer + activation: relu + dim_feedforward: 2048 + dropout: 0.1 + pos_enc_at_attn: false + self_attention: + _target_: sam2.modeling.sam.transformer.RoPEAttention + rope_theta: 10000.0 + feat_sizes: [32, 32] + embedding_dim: 256 + num_heads: 1 + downsample_rate: 1 + dropout: 0.1 + d_model: 256 + pos_enc_at_cross_attn_keys: true + pos_enc_at_cross_attn_queries: false + cross_attention: + _target_: sam2.modeling.sam.transformer.RoPEAttention + rope_theta: 10000.0 + feat_sizes: [32, 32] + rope_k_repeat: True + embedding_dim: 256 + num_heads: 1 + downsample_rate: 1 + dropout: 0.1 + kv_in_dim: 64 + num_layers: 4 + + memory_encoder: + _target_: sam2.modeling.memory_encoder.MemoryEncoder + out_dim: 64 + position_encoding: + _target_: sam2.modeling.position_encoding.PositionEmbeddingSine + num_pos_feats: 64 + normalize: true + scale: null + temperature: 10000 + mask_downsampler: + _target_: sam2.modeling.memory_encoder.MaskDownSampler + kernel_size: 3 + stride: 2 + padding: 1 + fuser: + _target_: sam2.modeling.memory_encoder.Fuser + layer: + _target_: sam2.modeling.memory_encoder.CXBlock + dim: 256 + kernel_size: 7 + padding: 3 + layer_scale_init_value: 1e-6 + use_dwconv: True # depth-wise convs + num_layers: 2 + + num_maskmem: 7 + image_size: 1024 + # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask + sigmoid_scale_for_mem_enc: 20.0 + sigmoid_bias_for_mem_enc: -10.0 + use_mask_input_as_output_without_sam: true + # Memory + directly_add_no_mem_embed: true + # use high-resolution feature map in the SAM mask decoder + use_high_res_features_in_sam: true + # output 3 masks on the first click on initial conditioning frames + multimask_output_in_sam: true + # SAM heads + iou_prediction_use_sigmoid: True + # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder + use_obj_ptrs_in_encoder: true + add_tpos_enc_to_obj_ptrs: false + only_obj_ptrs_in_the_past_for_eval: true + # object occlusion prediction + pred_obj_scores: true + pred_obj_scores_mlp: true + fixed_no_obj_ptr: true + # multimask tracking settings + 
multimask_output_for_tracking: true + use_multimask_token_for_obj_ptr: true + multimask_min_pt_num: 0 + multimask_max_pt_num: 1 + use_mlp_for_obj_ptr_proj: true + # Compilation flag + compile_image_encoder: False diff --git a/sam2_configs/sam2_hiera_l.yaml b/sam2_configs/sam2_hiera_l.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5ef6cc3a4c7656f2791d12575b70b3dbb665bb25 --- /dev/null +++ b/sam2_configs/sam2_hiera_l.yaml @@ -0,0 +1,117 @@ +# @package _global_ + +# Model +model: + _target_: sam2.modeling.sam2_base.SAM2Base + image_encoder: + _target_: sam2.modeling.backbones.image_encoder.ImageEncoder + scalp: 1 + trunk: + _target_: sam2.modeling.backbones.hieradet.Hiera + embed_dim: 144 + num_heads: 2 + stages: [2, 6, 36, 4] + global_att_blocks: [23, 33, 43] + window_pos_embed_bkg_spatial_size: [7, 7] + window_spec: [8, 4, 16, 8] + neck: + _target_: sam2.modeling.backbones.image_encoder.FpnNeck + position_encoding: + _target_: sam2.modeling.position_encoding.PositionEmbeddingSine + num_pos_feats: 256 + normalize: true + scale: null + temperature: 10000 + d_model: 256 + backbone_channel_list: [1152, 576, 288, 144] + fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features + fpn_interp_model: nearest + + memory_attention: + _target_: sam2.modeling.memory_attention.MemoryAttention + d_model: 256 + pos_enc_at_input: true + layer: + _target_: sam2.modeling.memory_attention.MemoryAttentionLayer + activation: relu + dim_feedforward: 2048 + dropout: 0.1 + pos_enc_at_attn: false + self_attention: + _target_: sam2.modeling.sam.transformer.RoPEAttention + rope_theta: 10000.0 + feat_sizes: [32, 32] + embedding_dim: 256 + num_heads: 1 + downsample_rate: 1 + dropout: 0.1 + d_model: 256 + pos_enc_at_cross_attn_keys: true + pos_enc_at_cross_attn_queries: false + cross_attention: + _target_: sam2.modeling.sam.transformer.RoPEAttention + rope_theta: 10000.0 + feat_sizes: [32, 32] + rope_k_repeat: True + embedding_dim: 256 + num_heads: 1 + downsample_rate: 1 + dropout: 0.1 + kv_in_dim: 64 + num_layers: 4 + + memory_encoder: + _target_: sam2.modeling.memory_encoder.MemoryEncoder + out_dim: 64 + position_encoding: + _target_: sam2.modeling.position_encoding.PositionEmbeddingSine + num_pos_feats: 64 + normalize: true + scale: null + temperature: 10000 + mask_downsampler: + _target_: sam2.modeling.memory_encoder.MaskDownSampler + kernel_size: 3 + stride: 2 + padding: 1 + fuser: + _target_: sam2.modeling.memory_encoder.Fuser + layer: + _target_: sam2.modeling.memory_encoder.CXBlock + dim: 256 + kernel_size: 7 + padding: 3 + layer_scale_init_value: 1e-6 + use_dwconv: True # depth-wise convs + num_layers: 2 + + num_maskmem: 7 + image_size: 1024 + # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask + sigmoid_scale_for_mem_enc: 20.0 + sigmoid_bias_for_mem_enc: -10.0 + use_mask_input_as_output_without_sam: true + # Memory + directly_add_no_mem_embed: true + # use high-resolution feature map in the SAM mask decoder + use_high_res_features_in_sam: true + # output 3 masks on the first click on initial conditioning frames + multimask_output_in_sam: true + # SAM heads + iou_prediction_use_sigmoid: True + # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder + use_obj_ptrs_in_encoder: true + add_tpos_enc_to_obj_ptrs: false + only_obj_ptrs_in_the_past_for_eval: true + # object occlusion prediction + pred_obj_scores: true + pred_obj_scores_mlp: true + fixed_no_obj_ptr: true + # 
multimask tracking settings + multimask_output_for_tracking: true + use_multimask_token_for_obj_ptr: true + multimask_min_pt_num: 0 + multimask_max_pt_num: 1 + use_mlp_for_obj_ptr_proj: true + # Compilation flag + compile_image_encoder: False diff --git a/sam2_configs/sam2_hiera_s.yaml b/sam2_configs/sam2_hiera_s.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e6ebeeae747874ba1938ffdf69876202e7a98c0a --- /dev/null +++ b/sam2_configs/sam2_hiera_s.yaml @@ -0,0 +1,116 @@ +# @package _global_ + +# Model +model: + _target_: sam2.modeling.sam2_base.SAM2Base + image_encoder: + _target_: sam2.modeling.backbones.image_encoder.ImageEncoder + scalp: 1 + trunk: + _target_: sam2.modeling.backbones.hieradet.Hiera + embed_dim: 96 + num_heads: 1 + stages: [1, 2, 11, 2] + global_att_blocks: [7, 10, 13] + window_pos_embed_bkg_spatial_size: [7, 7] + neck: + _target_: sam2.modeling.backbones.image_encoder.FpnNeck + position_encoding: + _target_: sam2.modeling.position_encoding.PositionEmbeddingSine + num_pos_feats: 256 + normalize: true + scale: null + temperature: 10000 + d_model: 256 + backbone_channel_list: [768, 384, 192, 96] + fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features + fpn_interp_model: nearest + + memory_attention: + _target_: sam2.modeling.memory_attention.MemoryAttention + d_model: 256 + pos_enc_at_input: true + layer: + _target_: sam2.modeling.memory_attention.MemoryAttentionLayer + activation: relu + dim_feedforward: 2048 + dropout: 0.1 + pos_enc_at_attn: false + self_attention: + _target_: sam2.modeling.sam.transformer.RoPEAttention + rope_theta: 10000.0 + feat_sizes: [32, 32] + embedding_dim: 256 + num_heads: 1 + downsample_rate: 1 + dropout: 0.1 + d_model: 256 + pos_enc_at_cross_attn_keys: true + pos_enc_at_cross_attn_queries: false + cross_attention: + _target_: sam2.modeling.sam.transformer.RoPEAttention + rope_theta: 10000.0 + feat_sizes: [32, 32] + rope_k_repeat: True + embedding_dim: 256 + num_heads: 1 + downsample_rate: 1 + dropout: 0.1 + kv_in_dim: 64 + num_layers: 4 + + memory_encoder: + _target_: sam2.modeling.memory_encoder.MemoryEncoder + out_dim: 64 + position_encoding: + _target_: sam2.modeling.position_encoding.PositionEmbeddingSine + num_pos_feats: 64 + normalize: true + scale: null + temperature: 10000 + mask_downsampler: + _target_: sam2.modeling.memory_encoder.MaskDownSampler + kernel_size: 3 + stride: 2 + padding: 1 + fuser: + _target_: sam2.modeling.memory_encoder.Fuser + layer: + _target_: sam2.modeling.memory_encoder.CXBlock + dim: 256 + kernel_size: 7 + padding: 3 + layer_scale_init_value: 1e-6 + use_dwconv: True # depth-wise convs + num_layers: 2 + + num_maskmem: 7 + image_size: 1024 + # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask + sigmoid_scale_for_mem_enc: 20.0 + sigmoid_bias_for_mem_enc: -10.0 + use_mask_input_as_output_without_sam: true + # Memory + directly_add_no_mem_embed: true + # use high-resolution feature map in the SAM mask decoder + use_high_res_features_in_sam: true + # output 3 masks on the first click on initial conditioning frames + multimask_output_in_sam: true + # SAM heads + iou_prediction_use_sigmoid: True + # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder + use_obj_ptrs_in_encoder: true + add_tpos_enc_to_obj_ptrs: false + only_obj_ptrs_in_the_past_for_eval: true + # object occlusion prediction + pred_obj_scores: true + pred_obj_scores_mlp: true + fixed_no_obj_ptr: true + # 
multimask tracking settings + multimask_output_for_tracking: true + use_multimask_token_for_obj_ptr: true + multimask_min_pt_num: 0 + multimask_max_pt_num: 1 + use_mlp_for_obj_ptr_proj: true + # Compilation flag + compile_image_encoder: False diff --git a/sam2_configs/sam2_hiera_t.yaml b/sam2_configs/sam2_hiera_t.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0c0b3a36094f1cee9b6d320eacd1c5774e019fb2 --- /dev/null +++ b/sam2_configs/sam2_hiera_t.yaml @@ -0,0 +1,118 @@ +# @package _global_ + +# Model +model: + _target_: sam2.modeling.sam2_base.SAM2Base + image_encoder: + _target_: sam2.modeling.backbones.image_encoder.ImageEncoder + scalp: 1 + trunk: + _target_: sam2.modeling.backbones.hieradet.Hiera + embed_dim: 96 + num_heads: 1 + stages: [1, 2, 7, 2] + global_att_blocks: [5, 7, 9] + window_pos_embed_bkg_spatial_size: [7, 7] + neck: + _target_: sam2.modeling.backbones.image_encoder.FpnNeck + position_encoding: + _target_: sam2.modeling.position_encoding.PositionEmbeddingSine + num_pos_feats: 256 + normalize: true + scale: null + temperature: 10000 + d_model: 256 + backbone_channel_list: [768, 384, 192, 96] + fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features + fpn_interp_model: nearest + + memory_attention: + _target_: sam2.modeling.memory_attention.MemoryAttention + d_model: 256 + pos_enc_at_input: true + layer: + _target_: sam2.modeling.memory_attention.MemoryAttentionLayer + activation: relu + dim_feedforward: 2048 + dropout: 0.1 + pos_enc_at_attn: false + self_attention: + _target_: sam2.modeling.sam.transformer.RoPEAttention + rope_theta: 10000.0 + feat_sizes: [32, 32] + embedding_dim: 256 + num_heads: 1 + downsample_rate: 1 + dropout: 0.1 + d_model: 256 + pos_enc_at_cross_attn_keys: true + pos_enc_at_cross_attn_queries: false + cross_attention: + _target_: sam2.modeling.sam.transformer.RoPEAttention + rope_theta: 10000.0 + feat_sizes: [32, 32] + rope_k_repeat: True + embedding_dim: 256 + num_heads: 1 + downsample_rate: 1 + dropout: 0.1 + kv_in_dim: 64 + num_layers: 4 + + memory_encoder: + _target_: sam2.modeling.memory_encoder.MemoryEncoder + out_dim: 64 + position_encoding: + _target_: sam2.modeling.position_encoding.PositionEmbeddingSine + num_pos_feats: 64 + normalize: true + scale: null + temperature: 10000 + mask_downsampler: + _target_: sam2.modeling.memory_encoder.MaskDownSampler + kernel_size: 3 + stride: 2 + padding: 1 + fuser: + _target_: sam2.modeling.memory_encoder.Fuser + layer: + _target_: sam2.modeling.memory_encoder.CXBlock + dim: 256 + kernel_size: 7 + padding: 3 + layer_scale_init_value: 1e-6 + use_dwconv: True # depth-wise convs + num_layers: 2 + + num_maskmem: 7 + image_size: 1024 + # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask + # SAM decoder + sigmoid_scale_for_mem_enc: 20.0 + sigmoid_bias_for_mem_enc: -10.0 + use_mask_input_as_output_without_sam: true + # Memory + directly_add_no_mem_embed: true + # use high-resolution feature map in the SAM mask decoder + use_high_res_features_in_sam: true + # output 3 masks on the first click on initial conditioning frames + multimask_output_in_sam: true + # SAM heads + iou_prediction_use_sigmoid: True + # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder + use_obj_ptrs_in_encoder: true + add_tpos_enc_to_obj_ptrs: false + only_obj_ptrs_in_the_past_for_eval: true + # object occlusion prediction + pred_obj_scores: true + pred_obj_scores_mlp: true + fixed_no_obj_ptr: 
true + # multimask tracking settings + multimask_output_for_tracking: true + use_multimask_token_for_obj_ptr: true + multimask_min_pt_num: 0 + multimask_max_pt_num: 1 + use_mlp_for_obj_ptr_proj: true + # Compilation flag + # HieraT does not currently support compilation, should always be set to False + compile_image_encoder: False diff --git a/sam2_hiera_b+.yaml b/sam2_hiera_b+.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9fdcfa4054b6d03a159c4fad01515fd3153d23d8 --- /dev/null +++ b/sam2_hiera_b+.yaml @@ -0,0 +1,113 @@ +# @package _global_ + +# Model +model: + _target_: sam2.modeling.sam2_base.SAM2Base + image_encoder: + _target_: sam2.modeling.backbones.image_encoder.ImageEncoder + scalp: 1 + trunk: + _target_: sam2.modeling.backbones.hieradet.Hiera + embed_dim: 112 + num_heads: 2 + neck: + _target_: sam2.modeling.backbones.image_encoder.FpnNeck + position_encoding: + _target_: sam2.modeling.position_encoding.PositionEmbeddingSine + num_pos_feats: 256 + normalize: true + scale: null + temperature: 10000 + d_model: 256 + backbone_channel_list: [896, 448, 224, 112] + fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features + fpn_interp_model: nearest + + memory_attention: + _target_: sam2.modeling.memory_attention.MemoryAttention + d_model: 256 + pos_enc_at_input: true + layer: + _target_: sam2.modeling.memory_attention.MemoryAttentionLayer + activation: relu + dim_feedforward: 2048 + dropout: 0.1 + pos_enc_at_attn: false + self_attention: + _target_: sam2.modeling.sam.transformer.RoPEAttention + rope_theta: 10000.0 + feat_sizes: [32, 32] + embedding_dim: 256 + num_heads: 1 + downsample_rate: 1 + dropout: 0.1 + d_model: 256 + pos_enc_at_cross_attn_keys: true + pos_enc_at_cross_attn_queries: false + cross_attention: + _target_: sam2.modeling.sam.transformer.RoPEAttention + rope_theta: 10000.0 + feat_sizes: [32, 32] + rope_k_repeat: True + embedding_dim: 256 + num_heads: 1 + downsample_rate: 1 + dropout: 0.1 + kv_in_dim: 64 + num_layers: 4 + + memory_encoder: + _target_: sam2.modeling.memory_encoder.MemoryEncoder + out_dim: 64 + position_encoding: + _target_: sam2.modeling.position_encoding.PositionEmbeddingSine + num_pos_feats: 64 + normalize: true + scale: null + temperature: 10000 + mask_downsampler: + _target_: sam2.modeling.memory_encoder.MaskDownSampler + kernel_size: 3 + stride: 2 + padding: 1 + fuser: + _target_: sam2.modeling.memory_encoder.Fuser + layer: + _target_: sam2.modeling.memory_encoder.CXBlock + dim: 256 + kernel_size: 7 + padding: 3 + layer_scale_init_value: 1e-6 + use_dwconv: True # depth-wise convs + num_layers: 2 + + num_maskmem: 7 + image_size: 1024 + # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask + sigmoid_scale_for_mem_enc: 20.0 + sigmoid_bias_for_mem_enc: -10.0 + use_mask_input_as_output_without_sam: true + # Memory + directly_add_no_mem_embed: true + # use high-resolution feature map in the SAM mask decoder + use_high_res_features_in_sam: true + # output 3 masks on the first click on initial conditioning frames + multimask_output_in_sam: true + # SAM heads + iou_prediction_use_sigmoid: True + # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder + use_obj_ptrs_in_encoder: true + add_tpos_enc_to_obj_ptrs: false + only_obj_ptrs_in_the_past_for_eval: true + # object occlusion prediction + pred_obj_scores: true + pred_obj_scores_mlp: true + fixed_no_obj_ptr: true + # multimask tracking settings + 
multimask_output_for_tracking: true + use_multimask_token_for_obj_ptr: true + multimask_min_pt_num: 0 + multimask_max_pt_num: 1 + use_mlp_for_obj_ptr_proj: true + # Compilation flag + compile_image_encoder: False diff --git a/sam2_hiera_base_plus.pt b/sam2_hiera_base_plus.pt new file mode 100644 index 0000000000000000000000000000000000000000..604440531fd79487b117597f70fe7031899b499e --- /dev/null +++ b/sam2_hiera_base_plus.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0bb7f236400a49669ffdd1be617959a8b1d1065081789d7bbff88eded3a8071 +size 323493298 diff --git a/sam_2_image_generation.ipynb b/sam_2_image_generation.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..75062246c5890d7568bb8b0a6faab90c56583d0d --- /dev/null +++ b/sam_2_image_generation.ipynb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72c184e77b0f986c83632ed71f3bd9c0f1fe281ecbce902f54f763c51e34e854 +size 19079204 diff --git a/use_cases.py b/use_cases.py new file mode 100644 index 0000000000000000000000000000000000000000..f9e03464bccc65dfd8a6943e1fb97bb410850164 --- /dev/null +++ b/use_cases.py @@ -0,0 +1,79 @@ +import streamlit as st + +def use_case(): + st.title("Video Augmentation Use Cases") + + st.markdown("### 1. Face Blur in Privacy Protection") + st.write(""" + In scenarios where privacy is a concern, such as public surveillance or social media content, blurring faces is crucial to protect identities. + Video augmentation techniques can automatically detect and blur faces in video footage, ensuring compliance with privacy regulations and protecting individuals' identities. + """) + st.video('images/pix_output_video (1).mp4') + + st.markdown("### 2. Enhanced Video Editing and Post-Production") + st.write(""" + In video production, object masks allow editors to isolate and manipulate specific elements within a scene. + Whether itโ€™s changing backgrounds, applying effects, or removing unwanted objects, the appโ€™s masking capabilities make complex editing tasks more accessible and efficient. + """) + col1 ,col2 = st.columns(2) + with col1: + st.video('images/zoe.mp4') + with col2: + st.video("images/redhulk.mp4") + + st.markdown("### 3. Content Creation and Entertainment") + st.write(""" + In media and entertainment, content creators often need innovative and visually appealing effects in videos. + Video augmentation can apply artistic filters, color grading, and other visual effects, allowing creators to experiment with different styles and generate engaging content quickly. + """) + st.video('images/with_replacement_output_video.mp4') + + st.markdown("### 4. Creative Content Generation with Generative AI") + st.write(""" + By leveraging generative AI on object masks, the app can transform or replace the masked areas with entirely new content. + For example, in advertising or entertainment, a product or character can be dynamically altered to fit different themes or environments, providing unique and personalized experiences for viewers. + This technique can also be used in film production to create special effects, in digital art to generate novel compositions, or in marketing to produce customized visuals that resonate with diverse audiences. + """) + st.video('images/genai shaolin.mp4') + + st.divider() + + st.header("Uncharted Use cases") + + st.markdown("### 5. 
Data Augmentation for Limited Datasets") + st.write(""" + When working with limited video data, augmentation can help create a larger and more diverse dataset without additional data collection. + Techniques like temporal jittering, speed variations, color adjustments, and geometric transformations can generate synthetic videos, improving model performance. + """) + + st.markdown("### 6. Enhancing Security and Surveillance Systems") + st.write(""" + Security and surveillance systems rely on accurate detection and tracking of objects or individuals in various environments. + Video augmentation simulates different lighting conditions, weather effects (rain, fog), and camera angles, improving detection algorithms' robustness in real-world scenarios. + """) + + st.markdown("### 7. Improving Autonomous Driving Systems") + st.write(""" + Autonomous vehicles need to understand and react to various road conditions, lighting scenarios, and unexpected obstacles. + Video augmentation techniques like altering weather conditions, introducing random obstacles, and simulating different times of day help train resilient autonomous driving systems. + """) + + + + st.markdown("### 8. Medical Video Analysis") + st.write(""" + In medical diagnostics, especially in video-based analysis like endoscopy or ultrasound, the quality and diversity of video data are critical for accurate diagnoses. + Video augmentation can create variations in medical videos by adjusting contrast, adding noise, or simulating different imaging conditions, helping train more robust and accurate AI models. + """) + + st.markdown("### 9. Sports Analytics and Player Performance Evaluation") + st.write(""" + Sports analytics involve evaluating player performance from video footage, which can vary across games and conditions. + Augmenting videos with changes in speed, perspective, or focus can simulate different game scenarios, improving player tracking, action recognition, and strategy analysis. + """) + + st.markdown("### 10. E-commerce and Virtual Try-Ons") + st.write(""" + E-commerce platforms offering virtual try-ons for clothing or accessories need to simulate various conditions such as different lighting or angles. + Augmenting product videos helps create a more realistic virtual try-on experience, improving customer engagement and satisfaction. 
+ """) diff --git a/video_augmentation.py b/video_augmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..64c63aaf7806d8f895b87613ccd3797cb2942afb --- /dev/null +++ b/video_augmentation.py @@ -0,0 +1,86 @@ +import streamlit as st +from PIL import Image +from streamlit_drawable_canvas import st_canvas +import os + + +def image_annoter(): + st.title("Image Annoter for YOLO") + st.write("Enter image files folder Location.") + + # Upload video + file_location = st.text_input("Enter File folder Location",None) + + if file_location is not None: + + # Folder containing your images + label_folder = file_location + + # Get a list of all images in the folder + image_files = [f for f in os.listdir(label_folder) if f.endswith(('png', 'jpg', 'jpeg'))] + + # Initialize session state to keep track of the current image index + if 'current_index' not in st.session_state: + st.session_state.current_index = 0 + + # Set the downscaling factor + downscale_factor = 0.5 # Adjust the downscale factor as needed + + # Navigation buttons + col1, col2, col3 = st.columns([1, 2, 1]) + if col1.button("Previous"): + if st.session_state.current_index > 0: + st.session_state.current_index -= 1 + + if col3.button("Next"): + if st.session_state.current_index < len(image_files) - 1: + st.session_state.current_index += 1 + + # Display the current image + current_image_file = image_files[st.session_state.current_index] + st.write(f"Annotating: {current_image_file}") + image_path = os.path.join(label_folder, current_image_file) + image = Image.open(image_path) + + # Downscale the image for the canvas + scaled_width = int(image.width * downscale_factor) + scaled_height = int(image.height * downscale_factor) + scaled_image = image.resize((scaled_width, scaled_height)) + + # Display the image on the canvas + canvas_result = st_canvas( + fill_color="rgba(255, 0, 0, 0.3)", # Fill color for the bounding box + stroke_width=3, + stroke_color="#ff0000", + background_image=scaled_image, + height=scaled_height, + width=scaled_width, + drawing_mode="rect", + key=current_image_file + ) + + # Save annotations + if st.button("Save Annotation"): + if canvas_result.json_data is not None: + # Extract coordinates of the bounding box + for obj in canvas_result.json_data["objects"]: + if obj["type"] == "rect": + left = obj["left"] / downscale_factor + top = obj["top"] / downscale_factor + width = obj["width"] / downscale_factor + height = obj["height"] / downscale_factor + x_center = left + width / 2 + y_center = top + height / 2 + + # Normalize the coordinates (YOLO format) + x_center /= image.width + y_center /= image.height + width /= image.width + height /= image.height + + # Save the annotation to a .txt file + annotation_path = os.path.join(label_folder, current_image_file.replace('.jpg', '.txt').replace('.png', '.txt')) + with open(annotation_path, 'w') as f: + f.write(f"0 {x_center} {y_center} {width} {height}\n") + + st.success(f"Annotation saved for {current_image_file}")