gvij committed on
Commit 387d3ff · 1 Parent(s): 5ddf7c9

added repo code

README.md CHANGED
@@ -1,13 +1,14 @@
  ---
- title: Inpainting Segment
+ title: Florence2 + SAM2 Masking
- emoji: 👁
+ emoji: 😷
  colorFrom: purple
- colorTo: gray
+ colorTo: green
  sdk: gradio
- sdk_version: 4.42.0
+ sdk_version: 4.40.0
  app_file: app.py
  pinned: false
  license: apache-2.0
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # florence-2-segment-flux-inpainting
app.py ADDED
@@ -0,0 +1,188 @@
+ import os
+ import random
+ import numpy as np
+ import gradio as gr
+ import spaces
+ import torch
+ import supervision as sv
+ from PIL import Image
+ from typing import Optional, Tuple
+ from diffusers import FluxInpaintPipeline
+
+ from utils.florence import load_florence_model, run_florence_inference, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
+ from utils.sam import load_sam_image_model, run_sam_inference
+
+ # Set up device and environment
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ HF_TOKEN = os.environ.get("HF_TOKEN", None)
+ MAX_SEED = np.iinfo(np.int32).max
+ MAX_IMAGE_SIZE = 2048
+
+ # Load models
+ FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
+ SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
+ FLUX_PIPE = FluxInpaintPipeline.from_pretrained(
+     "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16).to(DEVICE)
+
+ # Set up CUDA optimizations
+ torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
+ if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8:
+     torch.backends.cuda.matmul.allow_tf32 = True
+     torch.backends.cudnn.allow_tf32 = True
+
+ def resize_image_dimensions(
+     original_resolution_wh: Tuple[int, int],
+     maximum_dimension: int = 2048
+ ) -> Tuple[int, int]:
+     width, height = original_resolution_wh
+
+     if width <= maximum_dimension and height <= maximum_dimension:
+         width = width - (width % 32)
+         height = height - (height % 32)
+         return width, height
+
+     if width > height:
+         scaling_factor = maximum_dimension / width
+     else:
+         scaling_factor = maximum_dimension / height
+
+     new_width = int(width * scaling_factor)
+     new_height = int(height * scaling_factor)
+
+     new_width = new_width - (new_width % 32)
+     new_height = new_height - (new_height % 32)
+
+     return new_width, new_height
+
+ @spaces.GPU(duration=150)
+ @torch.inference_mode()
+ @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
+ def process_image(
+     image_input,
+     segmentation_text,
+     inpaint_text,
+     seed_slicer: int,
+     randomize_seed: bool,
+     strength: float,
+     num_inference_steps: int,
+     progress=gr.Progress(track_tqdm=True)
+ ) -> Tuple[Optional[Image.Image], Optional[Image.Image]]:
+     if not image_input:
+         gr.Info("Please upload an image.")
+         return None, None
+
+     if not segmentation_text:
+         gr.Info("Please enter a text prompt for segmentation.")
+         return None, None
+
+     if not inpaint_text:
+         gr.Info("Please enter a text prompt for inpainting.")
+         return None, None
+
+     # Florence-SAM segmentation
+     _, result = run_florence_inference(
+         model=FLORENCE_MODEL,
+         processor=FLORENCE_PROCESSOR,
+         device=DEVICE,
+         image=image_input,
+         task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
+         text=segmentation_text
+     )
+     detections = sv.Detections.from_lmm(
+         lmm=sv.LMM.FLORENCE_2,
+         result=result,
+         resolution_wh=image_input.size
+     )
+     detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
+
+     if len(detections) == 0:
+         gr.Info("No objects detected.")
+         return None, None
+
+     mask = Image.fromarray(detections.mask[0].astype("uint8") * 255)
+
+     # Resize images for FLUX
+     width, height = resize_image_dimensions(original_resolution_wh=image_input.size)
+     resized_image = image_input.resize((width, height), Image.LANCZOS)
+     resized_mask = mask.resize((width, height), Image.NEAREST)
+
+     # FLUX inpainting
+     if randomize_seed:
+         seed_slicer = random.randint(0, MAX_SEED)
+     generator = torch.Generator().manual_seed(seed_slicer)
+
+     result = FLUX_PIPE(
+         prompt=inpaint_text,
+         image=resized_image,
+         mask_image=resized_mask,
+         width=width,
+         height=height,
+         strength=strength,
+         generator=generator,
+         num_inference_steps=num_inference_steps
+     ).images[0]
+
+     return result, resized_mask
+
+ # Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# MonsterAPI Prompt Guided Inpainting")
+     with gr.Row():
+         with gr.Column():
+             image_input = gr.Image(
+                 label='Upload image',
+                 type='pil',
+                 image_mode='RGB',
+             )
+             segmentation_text = gr.Textbox(
+                 label='Segmentation text prompt',
+                 placeholder='Enter text for segmentation'
+             )
+             inpaint_text = gr.Textbox(
+                 label='Inpainting text prompt',
+                 placeholder='Enter text for inpainting'
+             )
+             with gr.Accordion("Advanced Settings", open=False):
+                 seed_slicer = gr.Slider(
+                     label="Seed",
+                     minimum=0,
+                     maximum=MAX_SEED,
+                     step=1,
+                     value=42,
+                 )
+                 randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
+                 strength = gr.Slider(
+                     label="Strength",
+                     minimum=0,
+                     maximum=1,
+                     step=0.01,
+                     value=0.75,
+                 )
+                 num_inference_steps = gr.Slider(
+                     label="Number of inference steps",
+                     minimum=1,
+                     maximum=50,
+                     step=1,
+                     value=20,
+                 )
+             submit_button = gr.Button(value='Process', variant='primary')
+         with gr.Column():
+             output_image = gr.Image(label='Output image')
+             with gr.Accordion("Generated Mask", open=False):
+                 output_mask = gr.Image(label='Segmentation mask')
+
+     submit_button.click(
+         fn=process_image,
+         inputs=[
+             image_input,
+             segmentation_text,
+             inpaint_text,
+             seed_slicer,
+             randomize_seed,
+             strength,
+             num_inference_steps
+         ],
+         outputs=[output_image, output_mask]
+     )
+
+ demo.launch(debug=True, show_error=True, server_name="0.0.0.0", share=True)
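
Before inpainting, app.py only hands FLUX dimensions that are capped at 2048 px and rounded down to multiples of 32. The following standalone sketch restates that rounding logic in condensed form so it can be sanity-checked without any of the heavy dependencies; the function name and example sizes are illustrative, not part of the Space.

```python
# Minimal sketch of the multiple-of-32 rounding app.py applies before inpainting.
# Behaviourally equivalent to resize_image_dimensions above; inputs are made up.
from typing import Tuple


def rounded_dimensions(
    original_resolution_wh: Tuple[int, int],
    maximum_dimension: int = 2048,
) -> Tuple[int, int]:
    width, height = original_resolution_wh
    if width > maximum_dimension or height > maximum_dimension:
        # Shrink the longer side down to the cap, preserving aspect ratio.
        scale = maximum_dimension / max(width, height)
        width, height = int(width * scale), int(height * scale)
    # Round both sides down to the nearest multiple of 32.
    return width - (width % 32), height - (height % 32)


print(rounded_dimensions((4096, 2048)))  # (2048, 1024): halved, already multiples of 32
print(rounded_dimensions((1023, 700)))   # (992, 672): no scaling, only rounding
```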
app_old.py ADDED
@@ -0,0 +1,92 @@
+ from typing import Optional
+
+ import gradio as gr
+ import spaces
+ import supervision as sv
+ import torch
+ from PIL import Image
+
+ from utils.florence import load_florence_model, run_florence_inference, \
+     FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
+ from utils.sam import load_sam_image_model, run_sam_inference
+
+ DEVICE = torch.device("cuda")
+ # DEVICE = torch.device("cpu")
+
+ torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
+ if torch.cuda.get_device_properties(0).major >= 8:
+     torch.backends.cuda.matmul.allow_tf32 = True
+     torch.backends.cudnn.allow_tf32 = True
+
+
+ FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
+ SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
+
+
+ @spaces.GPU(duration=20)
+ @torch.inference_mode()
+ @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
+ def process_image(image_input, text_input) -> Optional[Image.Image]:
+     if not image_input:
+         gr.Info("Please upload an image.")
+         return None
+
+     if not text_input:
+         gr.Info("Please enter a text prompt.")
+         return None
+
+     _, result = run_florence_inference(
+         model=FLORENCE_MODEL,
+         processor=FLORENCE_PROCESSOR,
+         device=DEVICE,
+         image=image_input,
+         task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
+         text=text_input
+     )
+     detections = sv.Detections.from_lmm(
+         lmm=sv.LMM.FLORENCE_2,
+         result=result,
+         resolution_wh=image_input.size
+     )
+     detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
+     if len(detections) == 0:
+         gr.Info("No objects detected.")
+         return None
+     return Image.fromarray(detections.mask[0].astype("uint8") * 255)
+
+
+ with gr.Blocks() as demo:
+     with gr.Row():
+         with gr.Column():
+             image_input_component = gr.Image(
+                 type='pil', label='Upload image')
+             text_input_component = gr.Textbox(
+                 label='Text prompt',
+                 placeholder='Enter text prompts')
+             submit_button_component = gr.Button(
+                 value='Submit', variant='primary')
+         with gr.Column():
+             image_output_component = gr.Image(label='Output mask')
+
+     submit_button_component.click(
+         fn=process_image,
+         inputs=[
+             image_input_component,
+             text_input_component
+         ],
+         outputs=[
+             image_output_component,
+         ]
+     )
+     text_input_component.submit(
+         fn=process_image,
+         inputs=[
+             image_input_component,
+             text_input_component
+         ],
+         outputs=[
+             image_output_component,
+         ]
+     )
+
+ demo.launch(debug=False, show_error=True)
checkpoints/sam2_hiera_base_plus.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0bb7f236400a49669ffdd1be617959a8b1d1065081789d7bbff88eded3a8071
+ size 323493298
checkpoints/sam2_hiera_large.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7442e4e9b732a508f80e141e7c2913437a3610ee0c77381a66658c3a445df87b
+ size 897952466
checkpoints/sam2_hiera_small.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:95949964d4e548409021d47b22712d5f1abf2564cc0c3c765ba599a24ac7dce3
+ size 184309650
checkpoints/sam2_hiera_tiny.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:65b50056e05bcb13694174f51bb6da89c894b57b75ccdf0ba6352c597c5d1125
+ size 155906050
configs/__init__.py ADDED
@@ -0,0 +1,5 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
configs/sam2_hiera_b+.yaml ADDED
@@ -0,0 +1,113 @@
+ # @package _global_
+
+ # Model
+ model:
+   _target_: sam2.modeling.sam2_base.SAM2Base
+   image_encoder:
+     _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+     scalp: 1
+     trunk:
+       _target_: sam2.modeling.backbones.hieradet.Hiera
+       embed_dim: 112
+       num_heads: 2
+     neck:
+       _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+       position_encoding:
+         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+         num_pos_feats: 256
+         normalize: true
+         scale: null
+         temperature: 10000
+       d_model: 256
+       backbone_channel_list: [896, 448, 224, 112]
+       fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
+       fpn_interp_model: nearest
+
+   memory_attention:
+     _target_: sam2.modeling.memory_attention.MemoryAttention
+     d_model: 256
+     pos_enc_at_input: true
+     layer:
+       _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+       activation: relu
+       dim_feedforward: 2048
+       dropout: 0.1
+       pos_enc_at_attn: false
+       self_attention:
+         _target_: sam2.modeling.sam.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+       d_model: 256
+       pos_enc_at_cross_attn_keys: true
+       pos_enc_at_cross_attn_queries: false
+       cross_attention:
+         _target_: sam2.modeling.sam.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         rope_k_repeat: True
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+         kv_in_dim: 64
+     num_layers: 4
+
+   memory_encoder:
+     _target_: sam2.modeling.memory_encoder.MemoryEncoder
+     out_dim: 64
+     position_encoding:
+       _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+       num_pos_feats: 64
+       normalize: true
+       scale: null
+       temperature: 10000
+     mask_downsampler:
+       _target_: sam2.modeling.memory_encoder.MaskDownSampler
+       kernel_size: 3
+       stride: 2
+       padding: 1
+     fuser:
+       _target_: sam2.modeling.memory_encoder.Fuser
+       layer:
+         _target_: sam2.modeling.memory_encoder.CXBlock
+         dim: 256
+         kernel_size: 7
+         padding: 3
+         layer_scale_init_value: 1e-6
+         use_dwconv: True  # depth-wise convs
+       num_layers: 2
+
+   num_maskmem: 7
+   image_size: 1024
+   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+   sigmoid_scale_for_mem_enc: 20.0
+   sigmoid_bias_for_mem_enc: -10.0
+   use_mask_input_as_output_without_sam: true
+   # Memory
+   directly_add_no_mem_embed: true
+   # use high-resolution feature map in the SAM mask decoder
+   use_high_res_features_in_sam: true
+   # output 3 masks on the first click on initial conditioning frames
+   multimask_output_in_sam: true
+   # SAM heads
+   iou_prediction_use_sigmoid: True
+   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+   use_obj_ptrs_in_encoder: true
+   add_tpos_enc_to_obj_ptrs: false
+   only_obj_ptrs_in_the_past_for_eval: true
+   # object occlusion prediction
+   pred_obj_scores: true
+   pred_obj_scores_mlp: true
+   fixed_no_obj_ptr: true
+   # multimask tracking settings
+   multimask_output_for_tracking: true
+   use_multimask_token_for_obj_ptr: true
+   multimask_min_pt_num: 0
+   multimask_max_pt_num: 1
+   use_mlp_for_obj_ptr_proj: true
+   # Compilation flag
+   compile_image_encoder: False
configs/sam2_hiera_l.yaml ADDED
@@ -0,0 +1,117 @@
+ # @package _global_
+
+ # Model
+ model:
+   _target_: sam2.modeling.sam2_base.SAM2Base
+   image_encoder:
+     _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+     scalp: 1
+     trunk:
+       _target_: sam2.modeling.backbones.hieradet.Hiera
+       embed_dim: 144
+       num_heads: 2
+       stages: [2, 6, 36, 4]
+       global_att_blocks: [23, 33, 43]
+       window_pos_embed_bkg_spatial_size: [7, 7]
+       window_spec: [8, 4, 16, 8]
+     neck:
+       _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+       position_encoding:
+         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+         num_pos_feats: 256
+         normalize: true
+         scale: null
+         temperature: 10000
+       d_model: 256
+       backbone_channel_list: [1152, 576, 288, 144]
+       fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
+       fpn_interp_model: nearest
+
+   memory_attention:
+     _target_: sam2.modeling.memory_attention.MemoryAttention
+     d_model: 256
+     pos_enc_at_input: true
+     layer:
+       _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+       activation: relu
+       dim_feedforward: 2048
+       dropout: 0.1
+       pos_enc_at_attn: false
+       self_attention:
+         _target_: sam2.modeling.sam.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+       d_model: 256
+       pos_enc_at_cross_attn_keys: true
+       pos_enc_at_cross_attn_queries: false
+       cross_attention:
+         _target_: sam2.modeling.sam.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         rope_k_repeat: True
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+         kv_in_dim: 64
+     num_layers: 4
+
+   memory_encoder:
+     _target_: sam2.modeling.memory_encoder.MemoryEncoder
+     out_dim: 64
+     position_encoding:
+       _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+       num_pos_feats: 64
+       normalize: true
+       scale: null
+       temperature: 10000
+     mask_downsampler:
+       _target_: sam2.modeling.memory_encoder.MaskDownSampler
+       kernel_size: 3
+       stride: 2
+       padding: 1
+     fuser:
+       _target_: sam2.modeling.memory_encoder.Fuser
+       layer:
+         _target_: sam2.modeling.memory_encoder.CXBlock
+         dim: 256
+         kernel_size: 7
+         padding: 3
+         layer_scale_init_value: 1e-6
+         use_dwconv: True  # depth-wise convs
+       num_layers: 2
+
+   num_maskmem: 7
+   image_size: 1024
+   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+   sigmoid_scale_for_mem_enc: 20.0
+   sigmoid_bias_for_mem_enc: -10.0
+   use_mask_input_as_output_without_sam: true
+   # Memory
+   directly_add_no_mem_embed: true
+   # use high-resolution feature map in the SAM mask decoder
+   use_high_res_features_in_sam: true
+   # output 3 masks on the first click on initial conditioning frames
+   multimask_output_in_sam: true
+   # SAM heads
+   iou_prediction_use_sigmoid: True
+   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+   use_obj_ptrs_in_encoder: true
+   add_tpos_enc_to_obj_ptrs: false
+   only_obj_ptrs_in_the_past_for_eval: true
+   # object occlusion prediction
+   pred_obj_scores: true
+   pred_obj_scores_mlp: true
+   fixed_no_obj_ptr: true
+   # multimask tracking settings
+   multimask_output_for_tracking: true
+   use_multimask_token_for_obj_ptr: true
+   multimask_min_pt_num: 0
+   multimask_max_pt_num: 1
+   use_mlp_for_obj_ptr_proj: true
+   # Compilation flag
+   compile_image_encoder: False
configs/sam2_hiera_s.yaml ADDED
@@ -0,0 +1,116 @@
+ # @package _global_
+
+ # Model
+ model:
+   _target_: sam2.modeling.sam2_base.SAM2Base
+   image_encoder:
+     _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+     scalp: 1
+     trunk:
+       _target_: sam2.modeling.backbones.hieradet.Hiera
+       embed_dim: 96
+       num_heads: 1
+       stages: [1, 2, 11, 2]
+       global_att_blocks: [7, 10, 13]
+       window_pos_embed_bkg_spatial_size: [7, 7]
+     neck:
+       _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+       position_encoding:
+         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+         num_pos_feats: 256
+         normalize: true
+         scale: null
+         temperature: 10000
+       d_model: 256
+       backbone_channel_list: [768, 384, 192, 96]
+       fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
+       fpn_interp_model: nearest
+
+   memory_attention:
+     _target_: sam2.modeling.memory_attention.MemoryAttention
+     d_model: 256
+     pos_enc_at_input: true
+     layer:
+       _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+       activation: relu
+       dim_feedforward: 2048
+       dropout: 0.1
+       pos_enc_at_attn: false
+       self_attention:
+         _target_: sam2.modeling.sam.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+       d_model: 256
+       pos_enc_at_cross_attn_keys: true
+       pos_enc_at_cross_attn_queries: false
+       cross_attention:
+         _target_: sam2.modeling.sam.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         rope_k_repeat: True
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+         kv_in_dim: 64
+     num_layers: 4
+
+   memory_encoder:
+     _target_: sam2.modeling.memory_encoder.MemoryEncoder
+     out_dim: 64
+     position_encoding:
+       _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+       num_pos_feats: 64
+       normalize: true
+       scale: null
+       temperature: 10000
+     mask_downsampler:
+       _target_: sam2.modeling.memory_encoder.MaskDownSampler
+       kernel_size: 3
+       stride: 2
+       padding: 1
+     fuser:
+       _target_: sam2.modeling.memory_encoder.Fuser
+       layer:
+         _target_: sam2.modeling.memory_encoder.CXBlock
+         dim: 256
+         kernel_size: 7
+         padding: 3
+         layer_scale_init_value: 1e-6
+         use_dwconv: True  # depth-wise convs
+       num_layers: 2
+
+   num_maskmem: 7
+   image_size: 1024
+   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+   sigmoid_scale_for_mem_enc: 20.0
+   sigmoid_bias_for_mem_enc: -10.0
+   use_mask_input_as_output_without_sam: true
+   # Memory
+   directly_add_no_mem_embed: true
+   # use high-resolution feature map in the SAM mask decoder
+   use_high_res_features_in_sam: true
+   # output 3 masks on the first click on initial conditioning frames
+   multimask_output_in_sam: true
+   # SAM heads
+   iou_prediction_use_sigmoid: True
+   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+   use_obj_ptrs_in_encoder: true
+   add_tpos_enc_to_obj_ptrs: false
+   only_obj_ptrs_in_the_past_for_eval: true
+   # object occlusion prediction
+   pred_obj_scores: true
+   pred_obj_scores_mlp: true
+   fixed_no_obj_ptr: true
+   # multimask tracking settings
+   multimask_output_for_tracking: true
+   use_multimask_token_for_obj_ptr: true
+   multimask_min_pt_num: 0
+   multimask_max_pt_num: 1
+   use_mlp_for_obj_ptr_proj: true
+   # Compilation flag
+   compile_image_encoder: False
configs/sam2_hiera_t.yaml ADDED
@@ -0,0 +1,118 @@
+ # @package _global_
+
+ # Model
+ model:
+   _target_: sam2.modeling.sam2_base.SAM2Base
+   image_encoder:
+     _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+     scalp: 1
+     trunk:
+       _target_: sam2.modeling.backbones.hieradet.Hiera
+       embed_dim: 96
+       num_heads: 1
+       stages: [1, 2, 7, 2]
+       global_att_blocks: [5, 7, 9]
+       window_pos_embed_bkg_spatial_size: [7, 7]
+     neck:
+       _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+       position_encoding:
+         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+         num_pos_feats: 256
+         normalize: true
+         scale: null
+         temperature: 10000
+       d_model: 256
+       backbone_channel_list: [768, 384, 192, 96]
+       fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
+       fpn_interp_model: nearest
+
+   memory_attention:
+     _target_: sam2.modeling.memory_attention.MemoryAttention
+     d_model: 256
+     pos_enc_at_input: true
+     layer:
+       _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+       activation: relu
+       dim_feedforward: 2048
+       dropout: 0.1
+       pos_enc_at_attn: false
+       self_attention:
+         _target_: sam2.modeling.sam.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+       d_model: 256
+       pos_enc_at_cross_attn_keys: true
+       pos_enc_at_cross_attn_queries: false
+       cross_attention:
+         _target_: sam2.modeling.sam.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         rope_k_repeat: True
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+         kv_in_dim: 64
+     num_layers: 4
+
+   memory_encoder:
+     _target_: sam2.modeling.memory_encoder.MemoryEncoder
+     out_dim: 64
+     position_encoding:
+       _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+       num_pos_feats: 64
+       normalize: true
+       scale: null
+       temperature: 10000
+     mask_downsampler:
+       _target_: sam2.modeling.memory_encoder.MaskDownSampler
+       kernel_size: 3
+       stride: 2
+       padding: 1
+     fuser:
+       _target_: sam2.modeling.memory_encoder.Fuser
+       layer:
+         _target_: sam2.modeling.memory_encoder.CXBlock
+         dim: 256
+         kernel_size: 7
+         padding: 3
+         layer_scale_init_value: 1e-6
+         use_dwconv: True  # depth-wise convs
+       num_layers: 2
+
+   num_maskmem: 7
+   image_size: 1024
+   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+   # SAM decoder
+   sigmoid_scale_for_mem_enc: 20.0
+   sigmoid_bias_for_mem_enc: -10.0
+   use_mask_input_as_output_without_sam: true
+   # Memory
+   directly_add_no_mem_embed: true
+   # use high-resolution feature map in the SAM mask decoder
+   use_high_res_features_in_sam: true
+   # output 3 masks on the first click on initial conditioning frames
+   multimask_output_in_sam: true
+   # SAM heads
+   iou_prediction_use_sigmoid: True
+   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+   use_obj_ptrs_in_encoder: true
+   add_tpos_enc_to_obj_ptrs: false
+   only_obj_ptrs_in_the_past_for_eval: true
+   # object occlusion prediction
+   pred_obj_scores: true
+   pred_obj_scores_mlp: true
+   fixed_no_obj_ptr: true
+   # multimask tracking settings
+   multimask_output_for_tracking: true
+   use_multimask_token_for_obj_ptr: true
+   multimask_min_pt_num: 0
+   multimask_max_pt_num: 1
+   use_mlp_for_obj_ptr_proj: true
+   # Compilation flag
+   # HieraT does not currently support compilation, should always be set to False
+   compile_image_encoder: False
gradio_cached_examples/19/log.csv ADDED
@@ -0,0 +1,2 @@
+ Output image,Segmentation mask,flag,username,timestamp
+ ,,,,2024-09-04 10:42:00.000310
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ tqdm
+ einops
+ spaces
+ timm
+ samv2
+ gradio
+ supervision
+ opencv-python
+ pytest
+ accelerate
+ transformers==4.42.4
+ sentencepiece
+ git+https://github.com/Gothos/diffusers.git@flux-inpaint
utils/__init__.py ADDED
File without changes
utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (165 Bytes).
 
utils/__pycache__/florence.cpython-311.pyc ADDED
Binary file (3.62 kB).
 
utils/__pycache__/sam.cpython-311.pyc ADDED
Binary file (2.22 kB).
 
utils/florence.py ADDED
@@ -0,0 +1,59 @@
+ import os
+ from typing import Union, Any, Tuple, Dict
+ from unittest.mock import patch
+
+ import torch
+ from PIL import Image
+ from transformers import AutoModelForCausalLM, AutoProcessor
+ from transformers.dynamic_module_utils import get_imports
+
+ # FLORENCE_CHECKPOINT = "microsoft/Florence-2-base"
+ FLORENCE_CHECKPOINT = "microsoft/Florence-2-large"
+ FLORENCE_OBJECT_DETECTION_TASK = '<OD>'
+ FLORENCE_DETAILED_CAPTION_TASK = '<MORE_DETAILED_CAPTION>'
+ FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK = '<CAPTION_TO_PHRASE_GROUNDING>'
+ FLORENCE_OPEN_VOCABULARY_DETECTION_TASK = '<OPEN_VOCABULARY_DETECTION>'
+ FLORENCE_DENSE_REGION_CAPTION_TASK = '<DENSE_REGION_CAPTION>'
+
+
+ def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
+     """Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
+     if not str(filename).endswith("/modeling_florence2.py"):
+         return get_imports(filename)
+     imports = get_imports(filename)
+     imports.remove("flash_attn")
+     return imports
+
+
+ def load_florence_model(
+     device: torch.device, checkpoint: str = FLORENCE_CHECKPOINT
+ ) -> Tuple[Any, Any]:
+     with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
+         model = AutoModelForCausalLM.from_pretrained(
+             checkpoint, trust_remote_code=True).to(device).eval()
+         processor = AutoProcessor.from_pretrained(
+             checkpoint, trust_remote_code=True)
+         return model, processor
+
+
+ def run_florence_inference(
+     model: Any,
+     processor: Any,
+     device: torch.device,
+     image: Image.Image,
+     task: str,
+     text: str = ""
+ ) -> Tuple[str, Dict]:
+     prompt = task + text
+     inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
+     generated_ids = model.generate(
+         input_ids=inputs["input_ids"],
+         pixel_values=inputs["pixel_values"],
+         max_new_tokens=1024,
+         num_beams=3
+     )
+     generated_text = processor.batch_decode(
+         generated_ids, skip_special_tokens=False)[0]
+     response = processor.post_process_generation(
+         generated_text, task=task, image_size=image.size)
+     return generated_text, response
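
For reference, a hedged usage sketch of the helpers in utils/florence.py outside the Gradio app. It assumes the repo's utils package is importable from the working directory, that downloading microsoft/Florence-2-large is acceptable, and that example.jpg and the prompt "a dog" are placeholders; CPU works but is slow.

```python
# Sketch: open-vocabulary detection with the helpers above (paths/prompts are illustrative).
import torch
from PIL import Image

from utils.florence import (
    FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
    load_florence_model,
    run_florence_inference,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, processor = load_florence_model(device=device)

image = Image.open("example.jpg").convert("RGB")  # hypothetical local image
generated_text, response = run_florence_inference(
    model=model,
    processor=processor,
    device=device,
    image=image,
    task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
    text="a dog",  # the phrase to ground
)
print(response)  # task-keyed dict returned by processor.post_process_generation
```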
utils/sam.py ADDED
@@ -0,0 +1,47 @@
+ from typing import Any
+
+ import numpy as np
+ import supervision as sv
+ import torch
+ from PIL import Image
+ from sam2.build_sam import build_sam2, build_sam2_video_predictor
+ from sam2.sam2_image_predictor import SAM2ImagePredictor
+
+ # SAM_CHECKPOINT = "checkpoints/sam2_hiera_small.pt"
+ # SAM_CONFIG = "sam2_hiera_s.yaml"
+ SAM_CHECKPOINT = "checkpoints/sam2_hiera_large.pt"
+ SAM_CONFIG = "sam2_hiera_l.yaml"
+
+
+ def load_sam_image_model(
+     device: torch.device,
+     config: str = SAM_CONFIG,
+     checkpoint: str = SAM_CHECKPOINT
+ ) -> SAM2ImagePredictor:
+     model = build_sam2(config, checkpoint, device=device)
+     return SAM2ImagePredictor(sam_model=model)
+
+
+ def load_sam_video_model(
+     device: torch.device,
+     config: str = SAM_CONFIG,
+     checkpoint: str = SAM_CHECKPOINT
+ ) -> Any:
+     return build_sam2_video_predictor(config, checkpoint, device=device)
+
+
+ def run_sam_inference(
+     model: Any,
+     image: Image.Image,
+     detections: sv.Detections
+ ) -> sv.Detections:
+     image = np.array(image.convert("RGB"))
+     model.set_image(image)
+     mask, score, _ = model.predict(box=detections.xyxy, multimask_output=False)
+
+     # dirty fix; remove this later
+     if len(mask.shape) == 4:
+         mask = np.squeeze(mask)
+
+     detections.mask = mask.astype(bool)
+     return detections
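
And a corresponding sketch for utils/sam.py: instead of Florence-2 detections, it feeds run_sam_inference a hand-written box, which is enough to get a binary mask back. It assumes the SAM2 checkpoint and config referenced above are present (as they are in this commit); example.jpg and the box coordinates are placeholders.

```python
# Sketch: segment a manually specified box with the SAM2 helpers above.
import numpy as np
import supervision as sv
import torch
from PIL import Image

from utils.sam import load_sam_image_model, run_sam_inference

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sam_predictor = load_sam_image_model(device=device)

image = Image.open("example.jpg").convert("RGB")  # hypothetical local image
box = np.array([[50.0, 60.0, 300.0, 400.0]])      # illustrative xyxy box
detections = sv.Detections(xyxy=box)

detections = run_sam_inference(sam_predictor, image, detections)
Image.fromarray(detections.mask[0].astype("uint8") * 255).save("mask.png")
```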