diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..5dc8af2b1c216ae074434c644fc24a41bca9a839 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+images/background.mp4 filter=lfs diff=lfs merge=lfs -text
+images/genai[[:space:]]shaolin.mp4 filter=lfs diff=lfs merge=lfs -text
+images/image_annote.mp4 filter=lfs diff=lfs merge=lfs -text
+images/image_aug.mp4 filter=lfs diff=lfs merge=lfs -text
+images/pix_output_video[[:space:]](1).mp4 filter=lfs diff=lfs merge=lfs -text
+images/redhulk.mp4 filter=lfs diff=lfs merge=lfs -text
+images/with_replacement_output_video.mp4 filter=lfs diff=lfs merge=lfs -text
+images/zoe.mp4 filter=lfs diff=lfs merge=lfs -text
+sam_2_image_generation.ipynb filter=lfs diff=lfs merge=lfs -text
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..60498de22f69d49e03cdebac9ae5d02c33b72ab6
--- /dev/null
+++ b/app.py
@@ -0,0 +1,138 @@
+import streamlit as st
+import base64
+
+# Set the page configuration
+st.set_page_config(
+ page_title="MetaMorph AI",
+ page_icon="๐",
+ initial_sidebar_state="expanded",
+ layout="wide",
+ menu_items={
+ 'Get help': 'https://www.linkedin.com/in/gaurav-verma-4696bb106/',
+ 'About': "MetaMorph: Revolutionize your media with cutting-edge image and video augmentation using the META Sam-2 model for stunning visual transformations!"
+ }
+)
+
+# Function to load video as base64
+def get_base64_video(video_path):
+ with open(video_path, 'rb') as video_file:
+ video_bytes = video_file.read()
+ return base64.b64encode(video_bytes).decode('utf-8')
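+# Embedding the video as a base64 data URI lets the background HTML below reference it
+# directly, without serving the file through a separate static route.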
+
+# Video file path
+video_path = 'images/background.mp4'
+
+# Get the base64 video
+video_base64 = get_base64_video(video_path)
+
+# Add video as background
+background_video = f"""
+
+
+
+
+ MetaMorphix AI ๐ฆโ๐ฅ
+
+
+
+
+
+ """
+ st.markdown(html_code, unsafe_allow_html=True)
+
+ # Additional content
+
+# Functionality for pages
+from home import home_page
+from image_augmentation import image_augmentation_page
+from video_augmentation import image_annoter
+from use_cases import use_case
+def main():
+ st.sidebar.title("Navigation")
+ page = st.sidebar.selectbox("Go to", ("Home","Use Cases", "Image Augmentation", "Video Augmentation"))
+
+ if page == "Home":
+ home_page()
+ elif page == "Use Cases":
+ use_case()
+ elif page == "Image Augmentation":
+ image_augmentation_page()
+ elif page == "Video Augmentation":
+ image_annoter()
+
+if __name__ == "__main__":
+ main()
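+
+# Run locally with: streamlit run app.py
+# (assumes the SAM-2 checkpoint, the page modules imported above, and the images/ assets are present)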
diff --git a/home.py b/home.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a7d6b5659503bdbc0f0db30e1739cf5fcc74e70
--- /dev/null
+++ b/home.py
@@ -0,0 +1,41 @@
+import streamlit as st
+
+
+
+def home_page():
+ st.title("Welcome to MetaMorphix AI")
+ st.write("""
+ This application uses the **META Sam-2 model** to perform advanced augmentation on images and videos.,
+ \n**YOLO** trained and pretrained model for Object Detection.
+ \n**Stability AI API** for Generative AI - Image to Image generation on mask.
+ \n**Image Annoter** for YOLO training Folder Input, Process Replica That of Roboflow app.
+
+ Navigate to the desired section using the sidebar.
+
+ \nScroll down to see the tutorial.
+
+ """)
+ st.divider()
+ st.header("For Image Augmentation")
+ st.write("""1. Navigate to Image Augmentation page & Upload a Image.
+ \n2. Mark coordinates on canvas **(green for Inclusive points & red for Exclusive points).**
+ \n3. Select Augmentaion method [Pixelated, Hue Change, Mask Replacement, Img2Img Generation] and proceed.""")
+ st.video("images/image_aug.mp4")
+
+ st.divider()
+ st.header("For Image Annotation on an Image Directory")
+ st.write("""1. Navigate to Video Augmentation page & Paste Local Directory link where train images are to annoted.
+ \n2. create Bounding box on canvas.
+ \n3. click on save annoptation and navigate through next button""")
+ st.video("images/image_annote.mp4")
+
+ st.warning("As of now Video Augmentation can only be happen on Jupyter notebook due to certain Limitation")
+ st.write("Go to following link to access Notebook and Use Kaggle GPU")
+ # Define the profile link
+ profile_url = "https://www.kaggle.com/code/gauravverma069/sam-2-meta-video-augmentation-with-yolo-and-genai"
+ st.markdown(f"[Visit my Kaggle Notebook link]({profile_url})")
+
+
+
+
+
\ No newline at end of file
diff --git a/image_augmentation.py b/image_augmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..67fce7ff3c168485390bc49772d0e69f29bbfdc0
--- /dev/null
+++ b/image_augmentation.py
@@ -0,0 +1,296 @@
+import streamlit as st
+from streamlit_drawable_canvas import st_canvas
+from PIL import Image
+import numpy as np
+import matplotlib.pyplot as plt
+import image_mask_gen
+import torch
+from sam2.build_sam import build_sam2
+from sam2.sam2_image_predictor import SAM2ImagePredictor
+import os
+import io
+import warnings
+from stability_sdk import client
+import stability_sdk.interfaces.gooseai.generation.generation_pb2 as generation
+
+
+
+# Function to display points on the image using matplotlib
+def show_points(coords, labels, ax, marker_size=375):
+ pos_points = coords[labels == 1]
+ neg_points = coords[labels == 0]
+ ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
+ ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
+
+def remove_duplicates(coords, labels):
+ unique_coords = []
+ unique_labels = []
+ seen = set()
+
+ for coord, label in zip(coords, labels):
+ coord_tuple = tuple(coord)
+ if coord_tuple not in seen:
+ seen.add(coord_tuple)
+ unique_coords.append(coord)
+ unique_labels.append(label)
+
+ return unique_coords, unique_labels
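+# e.g. remove_duplicates([[10, 20], [10, 20], [30, 40]], [1, 1, 0]) -> ([[10, 20], [30, 40]], [1, 0]),
+# so repeated canvas clicks on the same pixel collapse to a single prompt point.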
+
+
+def image_augmentation_page():
+ st.title("Image Augmentation")
+ st.write("Upload an image to apply augmentation techniques.")
+
+ # Initialize session state variables
+ if "inclusive_points" not in st.session_state:
+ st.session_state.inclusive_points = []
+ if "exclusive_points" not in st.session_state:
+ st.session_state.exclusive_points = []
+
+ # Upload an image
+ uploaded_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])
+
+ if uploaded_file is not None:
+ # Open the uploaded image
+ image = Image.open(uploaded_file)
+
+ # Set the maximum width for display
+ max_display_width = 700 # You can adjust this value
+
+ # Calculate the scaling factor
+ scale_factor = min(max_display_width / image.size[0], 1)
+
+ # Resize the image for display
+ display_width = int(image.size[0] * scale_factor)
+ display_height = int(image.size[1] * scale_factor)
+ resized_image = image.resize((display_width, display_height))
+
+ # Inclusive Points Phase
+ st.subheader("Select Inclusive Points (Green)")
+ canvas_inclusive = st_canvas(
+ fill_color="rgba(0, 0, 0, 0)", # Transparent fill
+ stroke_width=1, # Stroke width for drawing
+ stroke_color="blue", # Color for the outline of clicks
+ background_image=resized_image,
+ update_streamlit=True,
+ height=display_height,
+ width=display_width,
+ drawing_mode="circle", # Drawing mode to capture clicks as circles
+ point_display_radius=3, # Radius of the circle that represents a click
+ key="canvas_inclusive"
+ )
+
+ # Process inclusive clicks
+ if canvas_inclusive.json_data is not None:
+ objects = canvas_inclusive.json_data["objects"]
+ new_clicks = [[(obj["left"] + obj["radius"]) / scale_factor, (obj["top"] + obj["radius"]) / scale_factor] for obj in objects]
+ st.session_state.inclusive_points.extend(new_clicks)
+
+ # Plot the inclusive points on the original image using Matplotlib
+ fig_inclusive, ax = plt.subplots()
+ ax.imshow(image)
+ ax.axis('off') # Hide the axes
+
+ # Prepare data for plotting
+ inclusive_points = np.array(st.session_state.inclusive_points)
+ labels_inclusive = np.array([1] * len(st.session_state.inclusive_points))
+
+ # Call the function to show inclusive points
+ if len(inclusive_points) > 0:
+ show_points(inclusive_points, labels_inclusive, ax)
+
+ st.pyplot(fig_inclusive)
+
+ # Divider
+ st.divider()
+
+ # Exclusive Points Phase
+ st.subheader("Select Exclusive Points (Red)")
+ canvas_exclusive = st_canvas(
+ fill_color="rgba(0, 0, 0, 0)", # Transparent fill
+ stroke_width=1, # Stroke width for drawing
+ stroke_color="blue", # Color for the outline of clicks
+ background_image=resized_image,
+ update_streamlit=True,
+ height=display_height,
+ width=display_width,
+ drawing_mode="circle", # Drawing mode to capture clicks as circles
+ point_display_radius=3, # Radius of the circle that represents a click
+ key="canvas_exclusive"
+ )
+
+ # Process exclusive clicks
+ if canvas_exclusive.json_data is not None:
+ objects = canvas_exclusive.json_data["objects"]
+ new_clicks = [[(obj["left"] + obj["radius"]) / scale_factor, (obj["top"] + obj["radius"]) / scale_factor] for obj in objects]
+ st.session_state.exclusive_points.extend(new_clicks)
+
+ # Plot the exclusive points on the original image using Matplotlib
+ fig_exclusive, ax = plt.subplots()
+ ax.imshow(image)
+ ax.axis('off') # Hide the axes
+
+ # Prepare data for plotting
+ exclusive_points = np.array(st.session_state.exclusive_points)
+ labels_exclusive = np.array([0] * len(st.session_state.exclusive_points))
+
+ # Call the function to show exclusive points
+ if len(exclusive_points) > 0:
+ show_points(exclusive_points, labels_exclusive, ax)
+
+ st.pyplot(fig_exclusive)
+
+ # Grouping coordinates and labels
+ coordinates = st.session_state.inclusive_points + st.session_state.exclusive_points
+ labels = [1] * len(st.session_state.inclusive_points) + [0] * len(st.session_state.exclusive_points)
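+ # Label convention matches SAM-2 point prompts: 1 = include the point in the mask, 0 = exclude it.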
+
+ # # Display grouped coordinates and labels
+ # st.subheader("Coordinates and Labels")
+ # st.write("Coordinates: ", tuple(coordinates))
+ # st.write("Labels: ", labels)
+
+ # Provide an option to clear the coordinates
+ if st.button("Clear All Points"):
+ st.session_state.inclusive_points = []
+ st.session_state.exclusive_points = []
+ # global unique_coordinates, unique_labels
+ unique_coordinates, unique_labels = remove_duplicates(coordinates, labels)
+
+ st.write("Unique Coordinates:", tuple(unique_coordinates))
+ st.write("Unique Labels:", tuple(unique_labels))
+
+ # image_mask_gen.show_masks(image, masks, scores, point_coords=input_point, input_labels=input_label)
+ sam2_checkpoint = "sam2_hiera_base_plus.pt"
+ model_cfg = "sam2_hiera_b+.yaml"
+
+ sam2_model = build_sam2(model_cfg, sam2_checkpoint, device="cpu")
+
+ predictor = SAM2ImagePredictor(sam2_model)
+
+ predictor.set_image(image)
+
+ input_point = np.array(unique_coordinates)
+ input_label = np.array(unique_labels)
+
+ masks, scores, logits = predictor.predict(
+ point_coords=input_point,
+ point_labels=input_label,
+ multimask_output=True,
+ )
+ sorted_ind = np.argsort(scores)[::-1]
+ masks = masks[sorted_ind]
+ scores = scores[sorted_ind]
+ logits = logits[sorted_ind]
+
+ mask_input = logits[np.argmax(scores), :, :]
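+ # Refinement pass: the highest-scoring low-resolution logit from the multi-mask prediction is
+ # fed back as mask_input so the second predict() call returns a single refined mask.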
+
+ masks, scores, _ = predictor.predict(
+ point_coords=input_point,
+ point_labels=input_label,
+ mask_input=mask_input[None, :, :],
+ multimask_output=False,
+ )
+ image_mask_gen.show_masks(image, masks, scores, point_coords=input_point, input_labels=input_label)
+
+
+ # Get masked images
+ original_image = Image.open(uploaded_file)
+ # st.image(original_image, caption='Original Image', use_column_width=True)
+
+ with st.container(border=True):  # Display masked images
+ col1, col2 = st.columns(2)
+ with col1:
+ mask_images = image_mask_gen.show_masks_1(original_image, masks, scores)
+ for idx, (img, score) in enumerate(mask_images):
+ st.image(img, caption=f'Mask {idx+1}, Score: {score:.3f}', use_column_width=True)
+ with col2:
+ inverse_mask_images = image_mask_gen.show_inverse_masks(original_image, masks, scores)
+ for idx, (img, score) in enumerate(inverse_mask_images):
+ st.image(img, caption=f'Inverse Mask {idx+1}, Score: {score:.3f}', use_column_width=True)
+
+ if st.checkbox("Proceed to Image Augmentation"):
+
+ image_aug_select = st.sidebar.selectbox("Select Augmentation for Mask",["Pixelate","Hue Change","Mask Replacement","Generative Img2Img"])
+ if image_aug_select == "Pixelate":
+
+ if st.sidebar.toggle("Proceed to Pixelate Mask"):
+ pixelation_level = st.slider("Select Pixelation Level", min_value=5, max_value=50, value=10)
+ combined_image = image_mask_gen.combine_pixelated_mask(original_image, masks[0], pixelation_level)
+ st.image(combined_image, caption="Combined Pixelated Image", use_column_width=True)
+ elif image_aug_select == "Hue Change":
+
+ if st.sidebar.toggle("Proceed to Hue Change"):
+ # Hue shift slider
+ hue_shift = st.slider("Select Hue Shift", min_value=-180, max_value=180, value=0)
+ # Apply hue change and show the result
+ combined_image = image_mask_gen.combine_hue_changed_mask(original_image, masks[0], hue_shift) # Assuming single mask
+ st.image(combined_image, caption="Combined Hue Changed Image", use_column_width=True)
+ elif image_aug_select == "Mask Replacement":
+
+ if st.sidebar.toggle("Proceed to replace Mask"):
+ replacement_file = st.file_uploader("Upload the replacement image", type=["png", "jpg", "jpeg"])
+ if replacement_file is not None:
+ replacement_image = Image.open(replacement_file) #.convert("RGBA")
+ combined_image = image_mask_gen.combine_mask_replaced_image(original_image, replacement_image, masks[0]) # Assuming single mask
+ st.image(combined_image, caption="Masked Area Replaced Image", use_column_width=True)
+ elif image_aug_select == "Generative Img2Img":
+
+ msk_img = None
+ mask_images_x = image_mask_gen.show_masks_1(original_image, masks, scores)
+ for idx, (img, score) in enumerate(mask_images_x):
+ msk_img = img
+ # st.image(img, caption=f'Mask {idx+1}, Score: {score:.3f}', use_column_width=True)
+
+ rgb_image = msk_img.convert("RGB")
+ # st.image(rgb_image)
+ resized_image = image_mask_gen.resize_image(rgb_image)
+ # st.image(resized_image, caption=f"Resized size: {resized_image.size[0]}x{resized_image.size[1]}", use_column_width=True)
+ width, height = resized_image.size
+
+ # User input for the prompt and API key
+ prompt = st.text_input("Enter your prompt:", "A Beautiful day, in the style reference of starry night by vincent van gogh")
+ api_key = st.text_input("Enter your Stability AI API key:")
+
+ if prompt and api_key:
+ # Set up our connection to the API.
+ os.environ['STABILITY_KEY'] = api_key
+ stability_api = client.StabilityInference(
+ key=os.environ['STABILITY_KEY'], # API Key reference.
+ verbose=True, # Print debug messages.
+ engine="stable-diffusion-xl-1024-v1-0", # Set the engine to use for generation.
+ )
+ style_preset_selector = st.sidebar.selectbox("Select Style Preset",["3d-model", "analog-film", "anime", "cinematic", "comic-book", "digital-art", "enhance", "fantasy-art", "isometric", "line-art", "low-poly", "modeling-compound", "neon-punk",
+ "origami", "photographic", "pixel-art", "tile-texture"],index = 5)
+ if st.sidebar.toggle("Proceed to Generate Image"):
+ # Set up our initial generation parameters.
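+ # Note: start_schedule controls how strongly the masked init image is preserved (lower values
+ # stay closer to the original); cfg_scale and steps trade prompt adherence against generation time.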
+ answers2 = stability_api.generate(
+ prompt=prompt,
+ init_image=resized_image, # Assign our uploaded image as our Initial Image for transformation.
+ start_schedule=0.6,
+ steps=250,
+ cfg_scale=10.0,
+ width=width,
+ height=height,
+ sampler=generation.SAMPLER_K_DPMPP_SDE,
+ style_preset=style_preset_selector
+ )
+
+ # Process the response from the API
+ for resp in answers2:
+ for artifact in resp.artifacts:
+ if artifact.finish_reason == generation.FILTER:
+ warnings.warn(
+ "Your request activated the API's safety filters and could not be processed. "
+ "Please modify the prompt and try again.")
+ if artifact.type == generation.ARTIFACT_IMAGE:
+ img2 = Image.open(io.BytesIO(artifact.binary))
+ # Display the generated image
+ st.image(img2, caption="Generated Image", use_column_width=True)
+
+ # Combine the generated image with the original image using the mask
+ combined_img = image_mask_gen.combine_mask_and_inverse_gen(original_image, img2, masks[0])
+ st.image(combined_img, caption="Combined Image", use_column_width=True)
\ No newline at end of file
diff --git a/image_mask_gen.py b/image_mask_gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..d27685a98c80152396c9e6a7633cff89f2d6f81c
--- /dev/null
+++ b/image_mask_gen.py
@@ -0,0 +1,285 @@
+import streamlit as st
+import cv2
+import numpy as np
+from PIL import Image
+
+def apply_mask(image_cv, mask, color=(0, 255, 0), alpha=0.5):
+ """ Apply a mask to an image with given color and alpha blend """
+ mask_bgr = np.zeros_like(image_cv)
+ mask_bgr[mask > 0] = color
+ return cv2.addWeighted(image_cv, 1 - alpha, mask_bgr, alpha, 0)
+
+def draw_points(image_cv, points, labels):
+ """ Draw points on the image with different colors based on labels """
+ for coord, label in zip(points, labels):
+ color = (0, 255, 0) if label == 1 else (255, 0, 0) # Green for inclusive, Red for exclusive
+ cv2.circle(image_cv, tuple(map(int, coord)), 5, color, -1)
+ return image_cv
+
+def draw_boxes(image_cv, boxes):
+ """ Draw boxes on the image """
+ for box in boxes:
+ x, y, w, h = map(int, box)
+ cv2.rectangle(image_cv, (x, y), (x + w, y + h), (255, 0, 0), 2) # Red boxes
+ return image_cv
+
+def show_masks(image, masks, scores, point_coords=None, box_coords=None, input_labels=None, borders=True):
+ image_cv = np.array(image.convert("RGB"))[..., ::-1] # Convert PIL image to BGR format for OpenCV
+
+ for i, (mask, score) in enumerate(zip(masks, scores)):
+ image_with_mask = apply_mask(image_cv, mask)
+
+ if point_coords is not None:
+ assert input_labels is not None
+ image_with_mask = draw_points(image_with_mask, point_coords, input_labels)
+
+ if box_coords is not None:
+ image_with_mask = draw_boxes(image_with_mask, box_coords)
+
+ # Convert back to RGB and then to PIL for Streamlit
+ image_with_mask = cv2.cvtColor(image_with_mask, cv2.COLOR_BGR2RGB)
+ image_pil = Image.fromarray(image_with_mask)
+
+ # Display the final image with all overlays
+ st.image(image_pil, caption=f"Mask {i+1}, Score: {score:.3f}", use_column_width=True)
+
+
+def apply_mask_to_image(image, mask):
+ # Ensure the image is a NumPy array in BGR format
+ if isinstance(image, Image.Image):
+ image = np.array(image)
+ image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+
+ # Create an alpha channel based on the mask
+ alpha_channel = (mask * 255).astype(np.uint8)
+
+ # Create an image with the mask applied only on masked areas
+ masked_image = np.zeros((image.shape[0], image.shape[1], 4), dtype=np.uint8)
+ for c in range(3): # Apply the mask only to the RGB channels
+ masked_image[..., c] = image[..., c] * mask
+
+ # Add the alpha channel to make areas outside the mask transparent
+ masked_image[..., 3] = alpha_channel
+
+ return masked_image
+
+def show_masks_1(image, masks, scores):
+ mask_images = []
+ for i, (mask, score) in enumerate(zip(masks, scores)):
+ # Apply the mask to the image
+ masked_image = apply_mask_to_image(image, mask)
+
+ # Convert the masked image to PIL format for Streamlit
+ pil_image = Image.fromarray(cv2.cvtColor(masked_image, cv2.COLOR_BGRA2RGBA))
+ mask_images.append((pil_image, score))
+
+ return mask_images
+
+
+def apply_inverse_mask_to_image(image, mask):
+ # Ensure the image is a NumPy array in BGR format
+ if isinstance(image, Image.Image):
+ image = np.array(image)
+ image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+
+ # Create an alpha channel that is transparent inside the mask and opaque outside
+ alpha_channel = (1 - mask) * 255
+
+ # Create an image with the mask applied to the inverse areas
+ inverse_masked_image = np.zeros((image.shape[0], image.shape[1], 4), dtype=np.uint8)
+ for c in range(3): # Apply the inverse mask to RGB channels
+ inverse_masked_image[..., c] = image[..., c] * (1 - mask)
+
+ # Add the alpha channel to make areas inside the mask transparent
+ inverse_masked_image[..., 3] = alpha_channel.astype(np.uint8)
+
+ return inverse_masked_image
+
+def show_inverse_masks(image, masks, scores):
+ mask_images = []
+ for i, (mask, score) in enumerate(zip(masks, scores)):
+ # Apply the inverse mask to the image
+ inverse_masked_image = apply_inverse_mask_to_image(image, mask)
+
+ # Convert the masked image to PIL format for Streamlit
+ pil_image = Image.fromarray(cv2.cvtColor(inverse_masked_image, cv2.COLOR_BGRA2RGBA))
+ mask_images.append((pil_image, score))
+
+ return mask_images
+
+
+def combine_mask_and_inverse(image, mask):
+
+ # Ensure the image is a NumPy array in BGR format
+ if isinstance(image, Image.Image):
+ image = np.array(image)
+ image = cv2.cvtColor(image, cv2.COLOR_RGBA2BGR)
+
+ # Apply the mask to get the masked region (in original color)
+ masked_region = cv2.bitwise_and(image, image, mask=mask.astype(np.uint8))
+
+ # Apply the inverse mask to get the inverse-masked region (in original color)
+ inverse_mask = 1 - mask
+ inverse_masked_region = cv2.bitwise_and(image, image, mask=inverse_mask.astype(np.uint8))
+
+ # Combine both masked and inverse-masked regions
+ combined_image = cv2.add(masked_region, inverse_masked_region)
+
+ # Convert to RGBA format for transparency
+ combined_image_rgba = cv2.cvtColor(combined_image, cv2.COLOR_BGR2RGBA)
+
+ return combined_image_rgba
+
+def show_combined_masks(image, masks, scores):
+
+ mask_images = []
+ for i, (mask, score) in enumerate(zip(masks, scores)):
+ # Combine masked and inverse masked areas
+ combined_image = combine_mask_and_inverse(image, mask)
+
+ # Convert the combined image to PIL format for Streamlit
+ pil_image = Image.fromarray(combined_image)
+ mask_images.append((pil_image, score))
+
+ return mask_images
+
+
+def pixelate_area(image, mask, pixelation_level):
+ """
+ Apply pixelation to the masked area of an image.
+ """
+ pixelated_image = image.copy()
+ h, w, _ = image.shape
+
+ for y in range(0, h, pixelation_level):
+ for x in range(0, w, pixelation_level):
+ block = (slice(y, min(y + pixelation_level, h)), slice(x, min(x + pixelation_level, w)))
+ if np.any(mask[block]):
+ mean_color = image[block].mean(axis=(0, 1)).astype(int)
+ pixelated_image[block] = mean_color
+
+ return pixelated_image
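+# Example: with pixelation_level=10, every 10x10 block that overlaps the mask is filled with that
+# block's mean colour; blocks outside the mask are left untouched.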
+
+def combine_pixelated_mask(image, mask, pixelation_level=10):
+ """
+ Combine the pixelated masked areas with the original image.
+ """
+ image_np = np.array(image)
+ mask_np = np.array(mask)
+
+ pixelated_mask = pixelate_area(image_np, mask_np, pixelation_level)
+ combined_image = Image.fromarray(pixelated_mask)
+ return combined_image
+
+
+def change_hue(image, mask, hue_shift):
+
+ # The input is expected to be RGBA; drop the alpha channel, then convert to HSV
+ hsv_image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
+ hsv_image = cv2.cvtColor(hsv_image, cv2.COLOR_RGB2HSV)
+
+ # Apply the hue shift to the masked area
+ hsv_image[..., 0] = (hsv_image[..., 0] + hue_shift) % 180
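+ # OpenCV stores hue in the range [0, 180), hence the modulo 180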
+
+ # Convert back to RGB format
+ rgb_image = cv2.cvtColor(hsv_image, cv2.COLOR_HSV2RGB)
+
+ # Combine the hue-changed area with the original image using the mask
+ hue_changed_image = np.array(image).copy()
+ hue_changed_image[mask] = np.concatenate((rgb_image[mask], hue_changed_image[mask][..., 3:]), axis=-1)
+
+ return hue_changed_image
+
+def combine_hue_changed_mask(image, mask, hue_shift):
+
+ image_np = np.array(image)
+ mask_np = np.array(mask).astype(bool)
+
+ hue_changed_area = change_hue(image_np, mask_np, hue_shift)
+ combined_image = Image.fromarray(hue_changed_area)
+
+ return combined_image
+
+def replace_masked_area(original_image, replacement_image, mask):
+ # Ensure the replacement image is the same size as the original image
+ replacement_image = cv2.resize(replacement_image, (original_image.shape[1], original_image.shape[0]))
+
+ # Create a copy of the original image
+ replaced_image = original_image.copy()
+
+ # Replace the masked area with the corresponding area from the replacement image
+ replaced_image[mask] = replacement_image[mask]
+
+ return replaced_image
+
+def combine_mask_replaced_image(original_image, replacement_image, mask):
+
+ # Convert images to NumPy arrays
+ original_np = np.array(original_image)
+ replacement_np = np.array(replacement_image)
+ mask_np = np.array(mask).astype(bool)
+
+ # Replace the masked area
+ replaced_area = replace_masked_area(original_np, replacement_np, mask_np)
+ combined_image = Image.fromarray(replaced_area)
+
+ return combined_image
+
+
+def resize_image(image, max_size=1024):
+ # Get the current width and height of the image
+ width, height = image.size
+
+ # Calculate the scaling factor
+ if width > height:
+ scaling_factor = max_size / width
+ else:
+ scaling_factor = max_size / height
+
+ # Only resize if the image is larger than the max_size
+ if scaling_factor < 1:
+ # Calculate new dimensions
+ new_width = int(width * scaling_factor)
+ new_height = int(height * scaling_factor)
+
+ # Resize the image
+ image_resized = image.resize((new_width, new_height))
+ return image_resized
+ else:
+ # Return the original image if it's already within the size limits
+ return image
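+# The 1024-pixel cap keeps the longest side at or below the resolution expected by the
+# stable-diffusion-xl-1024 engine used for Img2Img generation in image_augmentation.py.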
+
+
+def combine_mask_and_inverse_gen(original_img, generated_img, mask):
+ # Ensure images are in RGBA mode
+ original_img = original_img.convert("RGBA")
+ generated_img = generated_img.convert("RGBA")
+
+ # Resize the generated image to match the original image size
+ generated_img = generated_img.resize(original_img.size)
+
+ # Convert images to arrays
+ orig_array = np.array(original_img)
+ gen_array = np.array(generated_img)
+
+ # Resize the mask to match the original image size
+ mask = Image.fromarray((mask * 255).astype(np.uint8)) # Convert mask to image for resizing
+ mask = mask.resize(original_img.size, Image.NEAREST) # Resize the mask
+ bool_mask = np.array(mask).astype(bool)
+
+ # Ensure the mask has the correct shape (H, W, 1)
+ if bool_mask.ndim == 2:
+ bool_mask = bool_mask[:, :, np.newaxis]
+
+ # Combine images using the mask
+ combined_array = np.where(bool_mask, gen_array, orig_array)
+
+ # Convert combined array back to image
+ combined_img = Image.fromarray(combined_array, "RGBA")
+ return combined_img
diff --git a/images/background.mp4 b/images/background.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..a3722fb2078e439a8c1febb9029de0add332ed76
--- /dev/null
+++ b/images/background.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92ca11934ec6540cf3fb0d5225aff2742683ce986f6269852ed18a751fb76a54
+size 28245879
diff --git a/images/genai shaolin.mp4 b/images/genai shaolin.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..bad2525aa0675f99e8c32a2393e482a8fe1bda59
--- /dev/null
+++ b/images/genai shaolin.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:699d86e4be50ca525808198a816a14fdb584bfc3bcaff61afa755c368ed8fb82
+size 1060558
diff --git a/images/image_annote.mp4 b/images/image_annote.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..d1d233a668965d4616026deccd33e0117ed2ca59
--- /dev/null
+++ b/images/image_annote.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6acf56d66dbb2fa3fc2f4f0ba9e4591282f41cf0829c4a81686e89010963a66f
+size 30740936
diff --git a/images/image_aug.mp4 b/images/image_aug.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..f964388ab2640e98d7624ea0333420b6df557dcf
--- /dev/null
+++ b/images/image_aug.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02edb90435a0024388ec09a6c6a28cf7e670e42de0da5792e29592460c4f44dd
+size 70042465
diff --git a/images/pix_output_video (1).mp4 b/images/pix_output_video (1).mp4
new file mode 100644
index 0000000000000000000000000000000000000000..faa0017ebe94a90fd73306089eb32c6b0104becd
--- /dev/null
+++ b/images/pix_output_video (1).mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d68ee725423da70f72c91ee747f566364c1298303e3e0c2d2c863f0a0b4e01a
+size 2042041
diff --git a/images/redhulk.mp4 b/images/redhulk.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..73034b3e936905c9c94e47165a989ca2a80187c2
--- /dev/null
+++ b/images/redhulk.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d98e9dae8d17acfec3baad33cc7f6445309e9eaf270ced2284b93d17eb42666f
+size 2452133
diff --git a/images/with_replacement_output_video.mp4 b/images/with_replacement_output_video.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..17b216134cf9f56121c15f6b6ac7735e45cdc577
--- /dev/null
+++ b/images/with_replacement_output_video.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:70884dd88bb7935ff2d492df82a1940487dc8f2bb0194547b7236f43b009faa9
+size 8324371
diff --git a/images/zoe.mp4 b/images/zoe.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..c4dbf28de8a9d19d73916b909352ca76c33e6ad6
--- /dev/null
+++ b/images/zoe.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e24090d2db21ddd34892666e9b1eb907bd1bf3cfe5516c268b8b522180a6eb16
+size 2368843
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bca84ed6136409b99e60007ecd33ced53333df7d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,10 @@
+torch>=2.3.1
+torchvision>=0.18.1
+numpy>=1.24.4
+tqdm>=4.66.1
+hydra-core>=1.3.2
+iopath>=0.1.10
+pillow>=9.4.0
+streamlit-drawable-canvas>=0.9.3
+opencv-python>=4.10.0.84
+stability-sdk>=0.8.6
+streamlit
+matplotlib
\ No newline at end of file
diff --git a/sam-2-meta-video-augmentation-with-yolo-and-genai.ipynb b/sam-2-meta-video-augmentation-with-yolo-and-genai.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..81ab0a44f65e53b0f445596287ff25429389a20c
--- /dev/null
+++ b/sam-2-meta-video-augmentation-with-yolo-and-genai.ipynb
@@ -0,0 +1 @@
+{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.14","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[],"dockerImageVersionId":30762,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# Video Augmentation using META SAM-2 Model with YOLO model and Stability AI","metadata":{}},{"cell_type":"markdown","source":"### Importing Images with Annoted text file for Yolov8n Model Training","metadata":{}},{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### upload your image directory with .txt annoted file in the format required by yolo model for training, with video on which model has to predict.\n\n### incase if wants to use pre_trained YOLO model, jump to section of pretrained model., or incase want to manually put coordinates on a frame jump to section of video segmenting.","metadata":{}},{"cell_type":"markdown","source":"### Installing Required Libraries","metadata":{}},{"cell_type":"code","source":"!pip install ultralytics opencv-python\n!pip install -U ipywidgets","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Yolov8n Model training ","metadata":{}},{"cell_type":"markdown","source":"## Yaml file creation and model training\n","metadata":{}},{"cell_type":"code","source":"from ultralytics import YOLO\nimport cv2\nimport matplotlib.pyplot as plt\n\n# Load YOLOv8 model configuration (e.g., YOLOv8 nano model)\nmodel = YOLO('yolov8n.yaml')\n\n# Create a dataset.yaml file for YOLOv8 training\ndataset_yaml_content = \"\"\"\ntrain: \"/kaggle/input/yolov-train-data/Bottle\"\nval: \"/kaggle/input/yolov-train-data/Bottle\"\nnc: 1 # Number of classes (1 in this case)\nnames: ['bottle']\n\"\"\"\n\n# Save the dataset.yaml file\nwith open('dataset.yaml', 'w') as f:\n f.write(dataset_yaml_content)\n\n \n\n# Train the model with the specified dataset and parameters\nmodel.train(\n data='dataset.yaml', # Path to the dataset.yaml file\n epochs=100, # Increase epochs for better results with small datasets\n imgsz=1024, # Use the resized image dimensions\n batch=1, # Set batch size to 4 due to 
limited data\n patience=50, # Early stopping if no improvement\n lr0=0.0001, # Start with a lower learning rate\n augment=True, # Enable data augmentation\n# weights='yolov8n.pt' # Start training with pre-trained weights (optional)\n)\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Note: You may have to enter wandb.ai api if using Kaggle","metadata":{}},{"cell_type":"markdown","source":"## prediction on an Image","metadata":{}},{"cell_type":"code","source":"# Load a test image\nimg = cv2.imread('/kaggle/input/yolov-train-data/Bottle/IMG202408142240012.jpg')\n\n# Predict\nresults = model.predict(img)\n\n# Alternatively, you can use matplotlib to display the results\nplt.imshow(results[0].plot()) # `plot` returns an image with bounding boxes drawn\nplt.axis('off')\nplt.show()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Predicting on Video & detecting the First Frame, and its center coordinates","metadata":{}},{"cell_type":"code","source":"# Process the video\nvideo_path = '/kaggle/input/yolov-train-data/VID202408142242002.mp4'\ncap = cv2.VideoCapture(video_path)\n\nx_center=0\ny_center=0\nframe_number = 0\nobject_detected = False\n\nwhile cap.isOpened():\n ret, frame = cap.read()\n if not ret:\n break\n\n frame_number += 1\n\n # Run YOLOv8 detection\n results = model(frame)\n\n for r in results:\n if r.boxes: # Check if any object is detected\n for box in r.boxes:\n # Get the bounding box coordinates\n x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()\n\n # Calculate the center coordinates\n x_center = int((x1 + x2) / 2)\n y_center = int((y1 + y2) / 2)\n \n # Print the first frame number and center coordinates\n print(f\"First detection at frame: {frame_number}\")\n print(f\"Center coordinates: (x={x_center}, y={y_center})\")\n\n object_detected = True\n break\n\n if object_detected:\n break\n\ncap.release()\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(\"x_center:\",x_center)\nprint(\"y_center:\",y_center)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Using Yolov8s pretrained model for direct detection and getting the frame","metadata":{}},{"cell_type":"markdown","source":"#### just mention class name and it will return frame no. 
and coordinates","metadata":{}},{"cell_type":"code","source":"# Load the YOLOv8s model\nmodel = YOLO('yolov8s.pt') # Make sure the model is trained on the \"bottle\" class\n\n# Process the video\nvideo_path = '/kaggle/input/yolov-train-data/VID202408142242002.mp4'\ncap = cv2.VideoCapture(video_path)\n\nx_center = 0\ny_center = 0\nframe_number = 0\nobject_detected = False\nconfidence_threshold = 0.8 # Set the confidence threshold\n\nwhile cap.isOpened():\n ret, frame = cap.read()\n if not ret:\n break\n\n frame_number += 1\n\n # Run YOLOv8 detection\n results = model(frame)\n\n for r in results:\n for box in r.boxes:\n # Get the class label for the detected object\n cls = int(box.cls[0].cpu().numpy())\n class_name = model.names[cls]\n\n # Check if the detected object is a \"bottle\" and has confidence > 0.8\n confidence = box.conf[0].cpu().numpy()\n if class_name == 'bottle' and confidence > confidence_threshold:\n # Get the bounding box coordinates\n x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()\n\n # Calculate the center coordinates\n x_center = int((x1 + x2) / 2)\n y_center = int((y1 + y2) / 2)\n \n # Print the first frame number and center coordinates\n print(f\"First bottle detection at frame: {frame_number}\")\n print(f\"Center coordinates: (x={x_center}, y={y_center}) with confidence {confidence:.2f}\")\n\n object_detected = True\n break # Exit the loop after the first detection\n\n if object_detected:\n break # Exit the main loop after the first detection\n\ncap.release()\n\n# If no bottle was detected with confidence > 0.8\nif not object_detected:\n print(\"No requested Object detected in the video with confidence greater than 0.8.\")\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(\"x_center:\",x_center)\nprint(\"y_center:\",y_center)\nprint(\"Frame No.:\",frame_number)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"#### clearing GPU cache","metadata":{}},{"cell_type":"code","source":"import torch\ntorch.cuda.empty_cache()\nprint(\"Done\")","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Video segmenting","metadata":{}},{"cell_type":"markdown","source":"### importing SAM-2 model (may take a while to download)","metadata":{}},{"cell_type":"code","source":"!git clone https://github.com/facebookresearch/segment-anything-2.git\n%cd /kaggle/working/segment-anything-2\n%pip install -e .\n%cd /kaggle/working/segment-anything-2/checkpoints\n!bash /kaggle/working/segment-anything-2/checkpoints/download_ckpts.sh\n%cd /kaggle/working/segment-anything-2","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import numpy as np\nimport torch\nimport matplotlib.pyplot as plt\nfrom PIL import Image","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# use bfloat16 for the entire notebook\ntorch.autocast(device_type=\"cuda\", dtype=torch.float16).__enter__()\n\nif torch.cuda.get_device_properties(0).major >= 8:\n # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)\n torch.backends.cuda.matmul.allow_tf32 = True\n torch.backends.cudnn.allow_tf32 = True","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## video to frames","metadata":{}},{"cell_type":"code","source":"import cv2\nimport os\nimport shutil\n\ndef video_to_frames(video_path, 
output_folder):\n # Ensure the output folder is clean\n if os.path.exists(output_folder):\n shutil.rmtree(output_folder)\n os.makedirs(output_folder)\n \n # Open the video file\n video_capture = cv2.VideoCapture(video_path)\n \n frame_count = 0\n success = True\n\n while success:\n success, frame = video_capture.read()\n if success:\n # Save the frame with a consistent naming convention\n frame_filename = os.path.join(output_folder, f\"{frame_count:05d}.jpg\")\n cv2.imwrite(frame_filename, frame)\n frame_count += 1\n\n video_capture.release()\n print(f\"Extracted {frame_count} frames to {output_folder}\")\n return frame_count\n\n# Example usage\nvideo_path = \"/kaggle/input/shaolin-soccer/Untitled video - Made with Clipchamp.mp4\"\noutput_folder = \"/kaggle/working/output_frames\"\ntotal_frames = video_to_frames(video_path, output_folder)\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## reordering Frames to video propagation\n","metadata":{}},{"cell_type":"code","source":"frame_number =0 ","metadata":{"execution":{"iopub.status.busy":"2024-08-23T05:45:01.624801Z","iopub.execute_input":"2024-08-23T05:45:01.625582Z","iopub.status.idle":"2024-08-23T05:45:01.636025Z","shell.execute_reply.started":"2024-08-23T05:45:01.625533Z","shell.execute_reply":"2024-08-23T05:45:01.634951Z"},"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"markdown","source":"### (replace it with **frame_number** if using YOLO model)\n\n#### frame_number = frame_number","metadata":{}},{"cell_type":"code","source":"import os\nimport shutil\n\ndef reorder_frames(video_dir, ann_frame_idx, output_dir):\n # Ensure the output directory is clean\n if os.path.exists(output_dir):\n shutil.rmtree(output_dir)\n os.makedirs(output_dir)\n \n # Get and sort the list of frame filenames\n frame_names = [\n p for p in os.listdir(video_dir)\n if os.path.splitext(p)[-1] in [\".jpg\", \".jpeg\", \".JPG\", \".JPEG\"]\n ]\n frame_names.sort(key=lambda p: int(os.path.splitext(p)[0]))\n \n total_frames = len(frame_names)\n \n # Copy and reorder the frames to the new directory\n for i in range(total_frames):\n if i >= ann_frame_idx:\n new_idx = i - ann_frame_idx\n else:\n new_idx = total_frames - ann_frame_idx + i\n old_path = os.path.join(video_dir, frame_names[i])\n new_path = os.path.join(output_dir, f\"{new_idx:05d}.jpg\")\n shutil.copy2(old_path, new_path)\n \n print(f\"Frames reordered and copied to {output_dir} successfully.\")\n return len(os.listdir(output_dir))\n\n# Example usage\nreordered_dir = \"/kaggle/working/reordered_frames\"\nann_frame_idx = frame_number # Frame index to start as 0\nreordered_count = reorder_frames(output_folder, ann_frame_idx, reordered_dir)\n\n# Verify total frame consistency\nif total_frames == reordered_count:\n print(\"Frame count matches after reordering.\")\nelse:\n print(f\"Frame count mismatch! 
Extracted: {total_frames}, Reordered: {reordered_count}\")\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Importing Model and creating predictor","metadata":{}},{"cell_type":"code","source":"from sam2.build_sam import build_sam2_video_predictor\n\nsam2_checkpoint = \"/kaggle/working/segment-anything-2/checkpoints/sam2_hiera_base_plus.pt\"\nmodel_cfg = \"sam2_hiera_b+.yaml\"\n\npredictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## checking image where object is detected","metadata":{}},{"cell_type":"code","source":"frame_no = frame_number\n\ndef show_mask(mask, ax, obj_id=None, random_color=False):\n if random_color:\n color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)\n else:\n cmap = plt.get_cmap(\"tab10\")\n cmap_idx = 0 if obj_id is None else obj_id\n color = np.array([*cmap(cmap_idx)[:3], 0.6])\n h, w = mask.shape[-2:]\n mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)\n ax.imshow(mask_image)\n\n\ndef show_points(coords, labels, ax, marker_size=200):\n pos_points = coords[labels==1]\n neg_points = coords[labels==0]\n ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)\n ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)\n \n# `video_dir` a directory of JPEG frames with filenames like `