gvij committed on
Commit 387d3ff · 1 Parent(s): 5ddf7c9

added repo code

README.md CHANGED
@@ -1,13 +1,14 @@
  ---
- title: Inpainting Segment
+ title: Florence2 + SAM2 Masking
- emoji: 👁
+ emoji: 😷
  colorFrom: purple
- colorTo: gray
+ colorTo: green
  sdk: gradio
- sdk_version: 4.42.0
+ sdk_version: 4.40.0
  app_file: app.py
  pinned: false
  license: apache-2.0
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # florence-2-segment-flux-inpainting
app.py ADDED
@@ -0,0 +1,188 @@
+ import os
+ import random
+ import numpy as np
+ import gradio as gr
+ import spaces
+ import torch
+ import supervision as sv
+ from PIL import Image
+ from typing import Optional, Tuple
+ from diffusers import FluxInpaintPipeline
+
+ from utils.florence import load_florence_model, run_florence_inference, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
+ from utils.sam import load_sam_image_model, run_sam_inference
+
+ # Set up device and environment
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ HF_TOKEN = os.environ.get("HF_TOKEN", None)
+ MAX_SEED = np.iinfo(np.int32).max
+ MAX_IMAGE_SIZE = 2048
+
+ # Load models
+ FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
+ SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
+ FLUX_PIPE = FluxInpaintPipeline.from_pretrained(
+     "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16).to(DEVICE)
+
+ # Set up CUDA optimizations
+ torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
+ if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8:
+     torch.backends.cuda.matmul.allow_tf32 = True
+     torch.backends.cudnn.allow_tf32 = True
+
+ def resize_image_dimensions(
+     original_resolution_wh: Tuple[int, int],
+     maximum_dimension: int = 2048
+ ) -> Tuple[int, int]:
+     width, height = original_resolution_wh
+
+     if width <= maximum_dimension and height <= maximum_dimension:
+         width = width - (width % 32)
+         height = height - (height % 32)
+         return width, height
+
+     if width > height:
+         scaling_factor = maximum_dimension / width
+     else:
+         scaling_factor = maximum_dimension / height
+
+     new_width = int(width * scaling_factor)
+     new_height = int(height * scaling_factor)
+
+     new_width = new_width - (new_width % 32)
+     new_height = new_height - (new_height % 32)
+
+     return new_width, new_height
+
+ @spaces.GPU(duration=150)
+ @torch.inference_mode()
+ @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
+ def process_image(
+     image_input,
+     segmentation_text,
+     inpaint_text,
+     seed_slicer: int,
+     randomize_seed: bool,
+     strength: float,
+     num_inference_steps: int,
+     progress=gr.Progress(track_tqdm=True)
+ ) -> Tuple[Optional[Image.Image], Optional[Image.Image]]:
+     if not image_input:
+         gr.Info("Please upload an image.")
+         return None, None
+
+     if not segmentation_text:
+         gr.Info("Please enter a text prompt for segmentation.")
+         return None, None
+
+     if not inpaint_text:
+         gr.Info("Please enter a text prompt for inpainting.")
+         return None, None
+
+     # Florence-SAM segmentation
+     _, result = run_florence_inference(
+         model=FLORENCE_MODEL,
+         processor=FLORENCE_PROCESSOR,
+         device=DEVICE,
+         image=image_input,
+         task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
+         text=segmentation_text
+     )
+     detections = sv.Detections.from_lmm(
+         lmm=sv.LMM.FLORENCE_2,
+         result=result,
+         resolution_wh=image_input.size
+     )
+     detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
+
+     if len(detections) == 0:
+         gr.Info("No objects detected.")
+         return None, None
+
+     mask = Image.fromarray(detections.mask[0].astype("uint8") * 255)
+
+     # Resize images for FLUX
+     width, height = resize_image_dimensions(original_resolution_wh=image_input.size)
+     resized_image = image_input.resize((width, height), Image.LANCZOS)
+     resized_mask = mask.resize((width, height), Image.NEAREST)
+
+     # FLUX inpainting
+     if randomize_seed:
+         seed_slicer = random.randint(0, MAX_SEED)
+     generator = torch.Generator().manual_seed(seed_slicer)
+
+     result = FLUX_PIPE(
+         prompt=inpaint_text,
+         image=resized_image,
+         mask_image=resized_mask,
+         width=width,
+         height=height,
+         strength=strength,
+         generator=generator,
+         num_inference_steps=num_inference_steps
+     ).images[0]
+
+     return result, resized_mask
+
+ # Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# MonsterAPI Prompt Guided Inpainting")
+     with gr.Row():
+         with gr.Column():
+             image_input = gr.Image(
+                 label='Upload image',
+                 type='pil',
+                 image_mode='RGB',
+             )
+             segmentation_text = gr.Textbox(
+                 label='Segmentation text prompt',
+                 placeholder='Enter text for segmentation'
+             )
+             inpaint_text = gr.Textbox(
+                 label='Inpainting text prompt',
+                 placeholder='Enter text for inpainting'
+             )
+             with gr.Accordion("Advanced Settings", open=False):
+                 seed_slicer = gr.Slider(
+                     label="Seed",
+                     minimum=0,
+                     maximum=MAX_SEED,
+                     step=1,
+                     value=42,
+                 )
+                 randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
+                 strength = gr.Slider(
+                     label="Strength",
+                     minimum=0,
+                     maximum=1,
+                     step=0.01,
+                     value=0.75,
+                 )
+                 num_inference_steps = gr.Slider(
+                     label="Number of inference steps",
+                     minimum=1,
+                     maximum=50,
+                     step=1,
+                     value=20,
+                 )
+             submit_button = gr.Button(value='Process', variant='primary')
+         with gr.Column():
+             output_image = gr.Image(label='Output image')
+             with gr.Accordion("Generated Mask", open=False):
+                 output_mask = gr.Image(label='Segmentation mask')
+
+     submit_button.click(
+         fn=process_image,
+         inputs=[
+             image_input,
+             segmentation_text,
+             inpaint_text,
+             seed_slicer,
+             randomize_seed,
+             strength,
+             num_inference_steps
+         ],
+         outputs=[output_image, output_mask]
+     )
+
+ demo.launch(debug=True, show_error=True, server_name="0.0.0.0", share=True)
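
Before inpainting, app.py only hands FLUX dimensions that are capped at 2048 px and rounded down to multiples of 32. The following standalone sketch restates that rounding logic in condensed form so it can be sanity-checked without any of the heavy dependencies; the function name and example sizes are illustrative, not part of the Space.

```python
# Minimal sketch of the multiple-of-32 rounding app.py applies before inpainting.
# Behaviourally equivalent to resize_image_dimensions above; inputs are made up.
from typing import Tuple


def rounded_dimensions(
    original_resolution_wh: Tuple[int, int],
    maximum_dimension: int = 2048,
) -> Tuple[int, int]:
    width, height = original_resolution_wh
    if width > maximum_dimension or height > maximum_dimension:
        # Shrink the longer side down to the cap, preserving aspect ratio.
        scale = maximum_dimension / max(width, height)
        width, height = int(width * scale), int(height * scale)
    # Round both sides down to the nearest multiple of 32.
    return width - (width % 32), height - (height % 32)


print(rounded_dimensions((4096, 2048)))  # (2048, 1024): halved, already multiples of 32
print(rounded_dimensions((1023, 700)))   # (992, 672): no scaling, only rounding
```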
app_old.py ADDED
@@ -0,0 +1,92 @@
+ from typing import Optional
+
+ import gradio as gr
+ import spaces
+ import supervision as sv
+ import torch
+ from PIL import Image
+
+ from utils.florence import load_florence_model, run_florence_inference, \
+     FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
+ from utils.sam import load_sam_image_model, run_sam_inference
+
+ DEVICE = torch.device("cuda")
+ # DEVICE = torch.device("cpu")
+
+ torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
+ if torch.cuda.get_device_properties(0).major >= 8:
+     torch.backends.cuda.matmul.allow_tf32 = True
+     torch.backends.cudnn.allow_tf32 = True
+
+
+ FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
+ SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
+
+
+ @spaces.GPU(duration=20)
+ @torch.inference_mode()
+ @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
+ def process_image(image_input, text_input) -> Optional[Image.Image]:
+     if not image_input:
+         gr.Info("Please upload an image.")
+         return None
+
+     if not text_input:
+         gr.Info("Please enter a text prompt.")
+         return None
+
+     _, result = run_florence_inference(
+         model=FLORENCE_MODEL,
+         processor=FLORENCE_PROCESSOR,
+         device=DEVICE,
+         image=image_input,
+         task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
+         text=text_input
+     )
+     detections = sv.Detections.from_lmm(
+         lmm=sv.LMM.FLORENCE_2,
+         result=result,
+         resolution_wh=image_input.size
+     )
+     detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
+     if len(detections) == 0:
+         gr.Info("No objects detected.")
+         return None
+     return Image.fromarray(detections.mask[0].astype("uint8") * 255)
+
+
+ with gr.Blocks() as demo:
+     with gr.Row():
+         with gr.Column():
+             image_input_component = gr.Image(
+                 type='pil', label='Upload image')
+             text_input_component = gr.Textbox(
+                 label='Text prompt',
+                 placeholder='Enter text prompts')
+             submit_button_component = gr.Button(
+                 value='Submit', variant='primary')
+         with gr.Column():
+             image_output_component = gr.Image(label='Output mask')
+
+     submit_button_component.click(
+         fn=process_image,
+         inputs=[
+             image_input_component,
+             text_input_component
+         ],
+         outputs=[
+             image_output_component,
+         ]
+     )
+     text_input_component.submit(
+         fn=process_image,
+         inputs=[
+             image_input_component,
+             text_input_component
+         ],
+         outputs=[
+             image_output_component,
+         ]
+     )
+
+ demo.launch(debug=False, show_error=True)
checkpoints/sam2_hiera_base_plus.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0bb7f236400a49669ffdd1be617959a8b1d1065081789d7bbff88eded3a8071
+ size 323493298
checkpoints/sam2_hiera_large.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7442e4e9b732a508f80e141e7c2913437a3610ee0c77381a66658c3a445df87b
+ size 897952466
checkpoints/sam2_hiera_small.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:95949964d4e548409021d47b22712d5f1abf2564cc0c3c765ba599a24ac7dce3
+ size 184309650
checkpoints/sam2_hiera_tiny.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:65b50056e05bcb13694174f51bb6da89c894b57b75ccdf0ba6352c597c5d1125
+ size 155906050
configs/__init__.py ADDED
@@ -0,0 +1,5 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
configs/sam2_hiera_b+.yaml ADDED
@@ -0,0 +1,113 @@
+ # @package _global_
+
+ # Model
+ model:
+   _target_: sam2.modeling.sam2_base.SAM2Base
+   image_encoder:
+     _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+     scalp: 1
+     trunk:
+       _target_: sam2.modeling.backbones.hieradet.Hiera
+       embed_dim: 112
+       num_heads: 2
+     neck:
+       _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+       position_encoding:
+         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+         num_pos_feats: 256
+         normalize: true
+         scale: null
+         temperature: 10000
+       d_model: 256
+       backbone_channel_list: [896, 448, 224, 112]
+       fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
+       fpn_interp_model: nearest
+
+   memory_attention:
+     _target_: sam2.modeling.memory_attention.MemoryAttention
+     d_model: 256
+     pos_enc_at_input: true
+     layer:
+       _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+       activation: relu
+       dim_feedforward: 2048
+       dropout: 0.1
+       pos_enc_at_attn: false
+       self_attention:
+         _target_: sam2.modeling.sam.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+       d_model: 256
+       pos_enc_at_cross_attn_keys: true
+       pos_enc_at_cross_attn_queries: false
+       cross_attention:
+         _target_: sam2.modeling.sam.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         rope_k_repeat: True
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+         kv_in_dim: 64
+     num_layers: 4
+
+   memory_encoder:
+     _target_: sam2.modeling.memory_encoder.MemoryEncoder
+     out_dim: 64
+     position_encoding:
+       _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+       num_pos_feats: 64
+       normalize: true
+       scale: null
+       temperature: 10000
+     mask_downsampler:
+       _target_: sam2.modeling.memory_encoder.MaskDownSampler
+       kernel_size: 3
+       stride: 2
+       padding: 1
+     fuser:
+       _target_: sam2.modeling.memory_encoder.Fuser
+       layer:
+         _target_: sam2.modeling.memory_encoder.CXBlock
+         dim: 256
+         kernel_size: 7
+         padding: 3
+         layer_scale_init_value: 1e-6
+         use_dwconv: True  # depth-wise convs
+       num_layers: 2
+
+   num_maskmem: 7
+   image_size: 1024
+   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+   sigmoid_scale_for_mem_enc: 20.0
+   sigmoid_bias_for_mem_enc: -10.0
+   use_mask_input_as_output_without_sam: true
+   # Memory
+   directly_add_no_mem_embed: true
+   # use high-resolution feature map in the SAM mask decoder
+   use_high_res_features_in_sam: true
+   # output 3 masks on the first click on initial conditioning frames
+   multimask_output_in_sam: true
+   # SAM heads
+   iou_prediction_use_sigmoid: True
+   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+   use_obj_ptrs_in_encoder: true
+   add_tpos_enc_to_obj_ptrs: false
+   only_obj_ptrs_in_the_past_for_eval: true
+   # object occlusion prediction
+   pred_obj_scores: true
+   pred_obj_scores_mlp: true
+   fixed_no_obj_ptr: true
+   # multimask tracking settings
+   multimask_output_for_tracking: true
+   use_multimask_token_for_obj_ptr: true
+   multimask_min_pt_num: 0
+   multimask_max_pt_num: 1
+   use_mlp_for_obj_ptr_proj: true
+   # Compilation flag
+   compile_image_encoder: False
configs/sam2_hiera_l.yaml ADDED
@@ -0,0 +1,117 @@
+ # @package _global_
+
+ # Model
+ model:
+   _target_: sam2.modeling.sam2_base.SAM2Base
+   image_encoder:
+     _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+     scalp: 1
+     trunk:
+       _target_: sam2.modeling.backbones.hieradet.Hiera
+       embed_dim: 144
+       num_heads: 2
+       stages: [2, 6, 36, 4]
+       global_att_blocks: [23, 33, 43]
+       window_pos_embed_bkg_spatial_size: [7, 7]
+       window_spec: [8, 4, 16, 8]
+     neck:
+       _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+       position_encoding:
+         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+         num_pos_feats: 256
+         normalize: true
+         scale: null
+         temperature: 10000
+       d_model: 256
+       backbone_channel_list: [1152, 576, 288, 144]
+       fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
+       fpn_interp_model: nearest
+
+   memory_attention:
+     _target_: sam2.modeling.memory_attention.MemoryAttention
+     d_model: 256
+     pos_enc_at_input: true
+     layer:
+       _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+       activation: relu
+       dim_feedforward: 2048
+       dropout: 0.1
+       pos_enc_at_attn: false
+       self_attention:
+         _target_: sam2.modeling.sam.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+       d_model: 256
+       pos_enc_at_cross_attn_keys: true
+       pos_enc_at_cross_attn_queries: false
+       cross_attention:
+         _target_: sam2.modeling.sam.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         rope_k_repeat: True
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+         kv_in_dim: 64
+     num_layers: 4
+
+   memory_encoder:
+     _target_: sam2.modeling.memory_encoder.MemoryEncoder
+     out_dim: 64
+     position_encoding:
+       _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+       num_pos_feats: 64
+       normalize: true
+       scale: null
+       temperature: 10000
+     mask_downsampler:
+       _target_: sam2.modeling.memory_encoder.MaskDownSampler
+       kernel_size: 3
+       stride: 2
+       padding: 1
+     fuser:
+       _target_: sam2.modeling.memory_encoder.Fuser
+       layer:
+         _target_: sam2.modeling.memory_encoder.CXBlock
+         dim: 256
+         kernel_size: 7
+         padding: 3
+         layer_scale_init_value: 1e-6
+         use_dwconv: True  # depth-wise convs
+       num_layers: 2
+
+   num_maskmem: 7
+   image_size: 1024
+   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+   sigmoid_scale_for_mem_enc: 20.0
+   sigmoid_bias_for_mem_enc: -10.0
+   use_mask_input_as_output_without_sam: true
+   # Memory
+   directly_add_no_mem_embed: true
+   # use high-resolution feature map in the SAM mask decoder
+   use_high_res_features_in_sam: true
+   # output 3 masks on the first click on initial conditioning frames
+   multimask_output_in_sam: true
+   # SAM heads
+   iou_prediction_use_sigmoid: True
+   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+   use_obj_ptrs_in_encoder: true
+   add_tpos_enc_to_obj_ptrs: false
+   only_obj_ptrs_in_the_past_for_eval: true
+   # object occlusion prediction
+   pred_obj_scores: true
+   pred_obj_scores_mlp: true
+   fixed_no_obj_ptr: true
+   # multimask tracking settings
+   multimask_output_for_tracking: true
+   use_multimask_token_for_obj_ptr: true
+   multimask_min_pt_num: 0
+   multimask_max_pt_num: 1
+   use_mlp_for_obj_ptr_proj: true
+   # Compilation flag
+   compile_image_encoder: False
configs/sam2_hiera_s.yaml ADDED
@@ -0,0 +1,116 @@
+ # @package _global_
+
+ # Model
+ model:
+   _target_: sam2.modeling.sam2_base.SAM2Base
+   image_encoder:
+     _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+     scalp: 1
+     trunk:
+       _target_: sam2.modeling.backbones.hieradet.Hiera
+       embed_dim: 96
+       num_heads: 1
+       stages: [1, 2, 11, 2]
+       global_att_blocks: [7, 10, 13]
+       window_pos_embed_bkg_spatial_size: [7, 7]
+     neck:
+       _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+       position_encoding:
+         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+         num_pos_feats: 256
+         normalize: true
+         scale: null
+         temperature: 10000
+       d_model: 256
+       backbone_channel_list: [768, 384, 192, 96]
+       fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
+       fpn_interp_model: nearest
+
+   memory_attention:
+     _target_: sam2.modeling.memory_attention.MemoryAttention
+     d_model: 256
+     pos_enc_at_input: true
+     layer:
+       _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+       activation: relu
+       dim_feedforward: 2048
+       dropout: 0.1
+       pos_enc_at_attn: false
+       self_attention:
+         _target_: sam2.modeling.sam.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+       d_model: 256
+       pos_enc_at_cross_attn_keys: true
+       pos_enc_at_cross_attn_queries: false
+       cross_attention:
+         _target_: sam2.modeling.sam.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         rope_k_repeat: True
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+         kv_in_dim: 64
+     num_layers: 4
+
+   memory_encoder:
+     _target_: sam2.modeling.memory_encoder.MemoryEncoder
+     out_dim: 64
+     position_encoding:
+       _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+       num_pos_feats: 64
+       normalize: true
+       scale: null
+       temperature: 10000
+     mask_downsampler:
+       _target_: sam2.modeling.memory_encoder.MaskDownSampler
+       kernel_size: 3
+       stride: 2
+       padding: 1
+     fuser:
+       _target_: sam2.modeling.memory_encoder.Fuser
+       layer:
+         _target_: sam2.modeling.memory_encoder.CXBlock
+         dim: 256
+         kernel_size: 7
+         padding: 3
+         layer_scale_init_value: 1e-6
+         use_dwconv: True  # depth-wise convs
+       num_layers: 2
+
+   num_maskmem: 7
+   image_size: 1024
+   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+   sigmoid_scale_for_mem_enc: 20.0
+   sigmoid_bias_for_mem_enc: -10.0
+   use_mask_input_as_output_without_sam: true
+   # Memory
+   directly_add_no_mem_embed: true
+   # use high-resolution feature map in the SAM mask decoder
+   use_high_res_features_in_sam: true
+   # output 3 masks on the first click on initial conditioning frames
+   multimask_output_in_sam: true
+   # SAM heads
+   iou_prediction_use_sigmoid: True
+   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+   use_obj_ptrs_in_encoder: true
+   add_tpos_enc_to_obj_ptrs: false
+   only_obj_ptrs_in_the_past_for_eval: true
+   # object occlusion prediction
+   pred_obj_scores: true
+   pred_obj_scores_mlp: true
+   fixed_no_obj_ptr: true
+   # multimask tracking settings
+   multimask_output_for_tracking: true
+   use_multimask_token_for_obj_ptr: true
+   multimask_min_pt_num: 0
+   multimask_max_pt_num: 1
+   use_mlp_for_obj_ptr_proj: true
+   # Compilation flag
+   compile_image_encoder: False
configs/sam2_hiera_t.yaml ADDED
@@ -0,0 +1,118 @@
+ # @package _global_
+
+ # Model
+ model:
+   _target_: sam2.modeling.sam2_base.SAM2Base
+   image_encoder:
+     _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+     scalp: 1
+     trunk:
+       _target_: sam2.modeling.backbones.hieradet.Hiera
+       embed_dim: 96
+       num_heads: 1
+       stages: [1, 2, 7, 2]
+       global_att_blocks: [5, 7, 9]
+       window_pos_embed_bkg_spatial_size: [7, 7]
+     neck:
+       _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+       position_encoding:
+         _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+         num_pos_feats: 256
+         normalize: true
+         scale: null
+         temperature: 10000
+       d_model: 256
+       backbone_channel_list: [768, 384, 192, 96]
+       fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
+       fpn_interp_model: nearest
+
+   memory_attention:
+     _target_: sam2.modeling.memory_attention.MemoryAttention
+     d_model: 256
+     pos_enc_at_input: true
+     layer:
+       _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+       activation: relu
+       dim_feedforward: 2048
+       dropout: 0.1
+       pos_enc_at_attn: false
+       self_attention:
+         _target_: sam2.modeling.sam.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+       d_model: 256
+       pos_enc_at_cross_attn_keys: true
+       pos_enc_at_cross_attn_queries: false
+       cross_attention:
+         _target_: sam2.modeling.sam.transformer.RoPEAttention
+         rope_theta: 10000.0
+         feat_sizes: [32, 32]
+         rope_k_repeat: True
+         embedding_dim: 256
+         num_heads: 1
+         downsample_rate: 1
+         dropout: 0.1
+         kv_in_dim: 64
+     num_layers: 4
+
+   memory_encoder:
+     _target_: sam2.modeling.memory_encoder.MemoryEncoder
+     out_dim: 64
+     position_encoding:
+       _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+       num_pos_feats: 64
+       normalize: true
+       scale: null
+       temperature: 10000
+     mask_downsampler:
+       _target_: sam2.modeling.memory_encoder.MaskDownSampler
+       kernel_size: 3
+       stride: 2
+       padding: 1
+     fuser:
+       _target_: sam2.modeling.memory_encoder.Fuser
+       layer:
+         _target_: sam2.modeling.memory_encoder.CXBlock
+         dim: 256
+         kernel_size: 7
+         padding: 3
+         layer_scale_init_value: 1e-6
+         use_dwconv: True  # depth-wise convs
+       num_layers: 2
+
+   num_maskmem: 7
+   image_size: 1024
+   # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+   # SAM decoder
+   sigmoid_scale_for_mem_enc: 20.0
+   sigmoid_bias_for_mem_enc: -10.0
+   use_mask_input_as_output_without_sam: true
+   # Memory
+   directly_add_no_mem_embed: true
+   # use high-resolution feature map in the SAM mask decoder
+   use_high_res_features_in_sam: true
+   # output 3 masks on the first click on initial conditioning frames
+   multimask_output_in_sam: true
+   # SAM heads
+   iou_prediction_use_sigmoid: True
+   # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+   use_obj_ptrs_in_encoder: true
+   add_tpos_enc_to_obj_ptrs: false
+   only_obj_ptrs_in_the_past_for_eval: true
+   # object occlusion prediction
+   pred_obj_scores: true
+   pred_obj_scores_mlp: true
+   fixed_no_obj_ptr: true
+   # multimask tracking settings
+   multimask_output_for_tracking: true
+   use_multimask_token_for_obj_ptr: true
+   multimask_min_pt_num: 0
+   multimask_max_pt_num: 1
+   use_mlp_for_obj_ptr_proj: true
+   # Compilation flag
+   # HieraT does not currently support compilation, should always be set to False
+   compile_image_encoder: False
gradio_cached_examples/19/log.csv ADDED
@@ -0,0 +1,2 @@
+ Output image,Segmentation mask,flag,username,timestamp
+ ,,,,2024-09-04 10:42:00.000310
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ tqdm
+ einops
+ spaces
+ timm
+ samv2
+ gradio
+ supervision
+ opencv-python
+ pytest
+ accelerate
+ transformers==4.42.4
+ sentencepiece
+ git+https://github.com/Gothos/diffusers.git@flux-inpaint
utils/__init__.py ADDED
File without changes
utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (165 Bytes).
 
utils/__pycache__/florence.cpython-311.pyc ADDED
Binary file (3.62 kB).
 
utils/__pycache__/sam.cpython-311.pyc ADDED
Binary file (2.22 kB).
 
utils/florence.py ADDED
@@ -0,0 +1,59 @@
+ import os
+ from typing import Union, Any, Tuple, Dict
+ from unittest.mock import patch
+
+ import torch
+ from PIL import Image
+ from transformers import AutoModelForCausalLM, AutoProcessor
+ from transformers.dynamic_module_utils import get_imports
+
+ # FLORENCE_CHECKPOINT = "microsoft/Florence-2-base"
+ FLORENCE_CHECKPOINT = "microsoft/Florence-2-large"
+ FLORENCE_OBJECT_DETECTION_TASK = '<OD>'
+ FLORENCE_DETAILED_CAPTION_TASK = '<MORE_DETAILED_CAPTION>'
+ FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK = '<CAPTION_TO_PHRASE_GROUNDING>'
+ FLORENCE_OPEN_VOCABULARY_DETECTION_TASK = '<OPEN_VOCABULARY_DETECTION>'
+ FLORENCE_DENSE_REGION_CAPTION_TASK = '<DENSE_REGION_CAPTION>'
+
+
+ def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
+     """Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
+     if not str(filename).endswith("/modeling_florence2.py"):
+         return get_imports(filename)
+     imports = get_imports(filename)
+     imports.remove("flash_attn")
+     return imports
+
+
+ def load_florence_model(
+     device: torch.device, checkpoint: str = FLORENCE_CHECKPOINT
+ ) -> Tuple[Any, Any]:
+     with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
+         model = AutoModelForCausalLM.from_pretrained(
+             checkpoint, trust_remote_code=True).to(device).eval()
+         processor = AutoProcessor.from_pretrained(
+             checkpoint, trust_remote_code=True)
+         return model, processor
+
+
+ def run_florence_inference(
+     model: Any,
+     processor: Any,
+     device: torch.device,
+     image: Image.Image,
+     task: str,
+     text: str = ""
+ ) -> Tuple[str, Dict]:
+     prompt = task + text
+     inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
+     generated_ids = model.generate(
+         input_ids=inputs["input_ids"],
+         pixel_values=inputs["pixel_values"],
+         max_new_tokens=1024,
+         num_beams=3
+     )
+     generated_text = processor.batch_decode(
+         generated_ids, skip_special_tokens=False)[0]
+     response = processor.post_process_generation(
+         generated_text, task=task, image_size=image.size)
+     return generated_text, response
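
For reference, a hedged usage sketch of the helpers in utils/florence.py outside the Gradio app. It assumes the repo's utils package is importable from the working directory, that downloading microsoft/Florence-2-large is acceptable, and that example.jpg and the prompt "a dog" are placeholders; CPU works but is slow.

```python
# Sketch: open-vocabulary detection with the helpers above (paths/prompts are illustrative).
import torch
from PIL import Image

from utils.florence import (
    FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
    load_florence_model,
    run_florence_inference,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, processor = load_florence_model(device=device)

image = Image.open("example.jpg").convert("RGB")  # hypothetical local image
generated_text, response = run_florence_inference(
    model=model,
    processor=processor,
    device=device,
    image=image,
    task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
    text="a dog",  # the phrase to ground
)
print(response)  # task-keyed dict returned by processor.post_process_generation
```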
utils/sam.py ADDED
@@ -0,0 +1,47 @@
+ from typing import Any
+
+ import numpy as np
+ import supervision as sv
+ import torch
+ from PIL import Image
+ from sam2.build_sam import build_sam2, build_sam2_video_predictor
+ from sam2.sam2_image_predictor import SAM2ImagePredictor
+
+ # SAM_CHECKPOINT = "checkpoints/sam2_hiera_small.pt"
+ # SAM_CONFIG = "sam2_hiera_s.yaml"
+ SAM_CHECKPOINT = "checkpoints/sam2_hiera_large.pt"
+ SAM_CONFIG = "sam2_hiera_l.yaml"
+
+
+ def load_sam_image_model(
+     device: torch.device,
+     config: str = SAM_CONFIG,
+     checkpoint: str = SAM_CHECKPOINT
+ ) -> SAM2ImagePredictor:
+     model = build_sam2(config, checkpoint, device=device)
+     return SAM2ImagePredictor(sam_model=model)
+
+
+ def load_sam_video_model(
+     device: torch.device,
+     config: str = SAM_CONFIG,
+     checkpoint: str = SAM_CHECKPOINT
+ ) -> Any:
+     return build_sam2_video_predictor(config, checkpoint, device=device)
+
+
+ def run_sam_inference(
+     model: Any,
+     image: Image.Image,
+     detections: sv.Detections
+ ) -> sv.Detections:
+     image = np.array(image.convert("RGB"))
+     model.set_image(image)
+     mask, score, _ = model.predict(box=detections.xyxy, multimask_output=False)
+
+     # dirty fix; remove this later
+     if len(mask.shape) == 4:
+         mask = np.squeeze(mask)
+
+     detections.mask = mask.astype(bool)
+     return detections
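
And a corresponding sketch for utils/sam.py: instead of Florence-2 detections, it feeds run_sam_inference a hand-written box, which is enough to get a binary mask back. It assumes the SAM2 checkpoint and config referenced above are present (as they are in this commit); example.jpg and the box coordinates are placeholders.

```python
# Sketch: segment a manually specified box with the SAM2 helpers above.
import numpy as np
import supervision as sv
import torch
from PIL import Image

from utils.sam import load_sam_image_model, run_sam_inference

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sam_predictor = load_sam_image_model(device=device)

image = Image.open("example.jpg").convert("RGB")  # hypothetical local image
box = np.array([[50.0, 60.0, 300.0, 400.0]])      # illustrative xyxy box
detections = sv.Detections(xyxy=box)

detections = run_sam_inference(sam_predictor, image, detections)
Image.fromarray(detections.mask[0].astype("uint8") * 255).save("mask.png")
```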