added repo code
- README.md +5 -4
- app.py +188 -0
- app_old.py +92 -0
- checkpoints/sam2_hiera_base_plus.pt +3 -0
- checkpoints/sam2_hiera_large.pt +3 -0
- checkpoints/sam2_hiera_small.pt +3 -0
- checkpoints/sam2_hiera_tiny.pt +3 -0
- configs/__init__.py +5 -0
- configs/sam2_hiera_b+.yaml +113 -0
- configs/sam2_hiera_l.yaml +117 -0
- configs/sam2_hiera_s.yaml +116 -0
- configs/sam2_hiera_t.yaml +118 -0
- gradio_cached_examples/19/log.csv +2 -0
- requirements.txt +13 -0
- utils/__init__.py +0 -0
- utils/__pycache__/__init__.cpython-311.pyc +0 -0
- utils/__pycache__/florence.cpython-311.pyc +0 -0
- utils/__pycache__/sam.cpython-311.pyc +0 -0
- utils/florence.py +59 -0
- utils/sam.py +47 -0
README.md
CHANGED
@@ -1,13 +1,14 @@
 ---
-title:
-emoji:
+title: Florence2 + SAM2 Masking
+emoji: 😷
 colorFrom: purple
-colorTo:
+colorTo: green
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.40.0
 app_file: app.py
 pinned: false
 license: apache-2.0
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# florence-2-segment-flux-inpainting
app.py
ADDED
@@ -0,0 +1,188 @@
import os
import random
import numpy as np
import gradio as gr
import spaces
import torch
import supervision as sv
from PIL import Image
from typing import Optional, Tuple
from diffusers import FluxInpaintPipeline

from utils.florence import load_florence_model, run_florence_inference, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
from utils.sam import load_sam_image_model, run_sam_inference

# Set up device and environment
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HF_TOKEN = os.environ.get("HF_TOKEN", None)
MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 2048

# Load models
FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
FLUX_PIPE = FluxInpaintPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16).to(DEVICE)

# Set up CUDA optimizations
torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8:
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True


def resize_image_dimensions(
    original_resolution_wh: Tuple[int, int],
    maximum_dimension: int = 2048
) -> Tuple[int, int]:
    width, height = original_resolution_wh

    if width <= maximum_dimension and height <= maximum_dimension:
        width = width - (width % 32)
        height = height - (height % 32)
        return width, height

    if width > height:
        scaling_factor = maximum_dimension / width
    else:
        scaling_factor = maximum_dimension / height

    new_width = int(width * scaling_factor)
    new_height = int(height * scaling_factor)

    new_width = new_width - (new_width % 32)
    new_height = new_height - (new_height % 32)

    return new_width, new_height


@spaces.GPU(duration=150)
@torch.inference_mode()
@torch.autocast(device_type="cuda", dtype=torch.bfloat16)
def process_image(
    image_input,
    segmentation_text,
    inpaint_text,
    seed_slicer: int,
    randomize_seed: bool,
    strength: float,
    num_inference_steps: int,
    progress=gr.Progress(track_tqdm=True)
) -> Tuple[Optional[Image.Image], Optional[Image.Image]]:
    if not image_input:
        gr.Info("Please upload an image.")
        return None, None

    if not segmentation_text:
        gr.Info("Please enter a text prompt for segmentation.")
        return None, None

    if not inpaint_text:
        gr.Info("Please enter a text prompt for inpainting.")
        return None, None

    # Florence-SAM segmentation
    _, result = run_florence_inference(
        model=FLORENCE_MODEL,
        processor=FLORENCE_PROCESSOR,
        device=DEVICE,
        image=image_input,
        task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
        text=segmentation_text
    )
    detections = sv.Detections.from_lmm(
        lmm=sv.LMM.FLORENCE_2,
        result=result,
        resolution_wh=image_input.size
    )
    detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)

    if len(detections) == 0:
        gr.Info("No objects detected.")
        return None, None

    mask = Image.fromarray(detections.mask[0].astype("uint8") * 255)

    # Resize images for FLUX
    width, height = resize_image_dimensions(original_resolution_wh=image_input.size)
    resized_image = image_input.resize((width, height), Image.LANCZOS)
    resized_mask = mask.resize((width, height), Image.NEAREST)

    # FLUX inpainting
    if randomize_seed:
        seed_slicer = random.randint(0, MAX_SEED)
    generator = torch.Generator().manual_seed(seed_slicer)

    result = FLUX_PIPE(
        prompt=inpaint_text,
        image=resized_image,
        mask_image=resized_mask,
        width=width,
        height=height,
        strength=strength,
        generator=generator,
        num_inference_steps=num_inference_steps
    ).images[0]

    return result, resized_mask


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# MonsterAPI Prompt Guided Inpainting")
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(
                label='Upload image',
                type='pil',
                image_mode='RGB',
            )
            segmentation_text = gr.Textbox(
                label='Segmentation text prompt',
                placeholder='Enter text for segmentation'
            )
            inpaint_text = gr.Textbox(
                label='Inpainting text prompt',
                placeholder='Enter text for inpainting'
            )
            with gr.Accordion("Advanced Settings", open=False):
                seed_slicer = gr.Slider(
                    label="Seed",
                    minimum=0,
                    maximum=MAX_SEED,
                    step=1,
                    value=42,
                )
                randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
                strength = gr.Slider(
                    label="Strength",
                    minimum=0,
                    maximum=1,
                    step=0.01,
                    value=0.75,
                )
                num_inference_steps = gr.Slider(
                    label="Number of inference steps",
                    minimum=1,
                    maximum=50,
                    step=1,
                    value=20,
                )
            submit_button = gr.Button(value='Process', variant='primary')
        with gr.Column():
            output_image = gr.Image(label='Output image')
            with gr.Accordion("Generated Mask", open=False):
                output_mask = gr.Image(label='Segmentation mask')

    submit_button.click(
        fn=process_image,
        inputs=[
            image_input,
            segmentation_text,
            inpaint_text,
            seed_slicer,
            randomize_seed,
            strength,
            num_inference_steps
        ],
        outputs=[output_image, output_mask]
    )

demo.launch(debug=True, show_error=True, server_name="0.0.0.0", share=True)
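Note on resize_image_dimensions above: both sides are floored to multiples of 32, presumably so the resized image aligns with the FLUX pipeline's latent grid. A quick worked example of the rounding (the input sizes are hypothetical; importing app.py also loads the models, so in practice you would copy the helper into a REPL):

# Oversized input: the longer side is scaled to 2048, then both sides
# are floored to a multiple of 32.
#   scaling_factor = 2048 / 3000
#   int(3000 * sf) = 2048 (already /32), int(2000 * sf) = 1365 -> 1344
assert resize_image_dimensions((3000, 2000)) == (2048, 1344)

# Input already within bounds: only the flooring applies.
assert resize_image_dimensions((1000, 700)) == (992, 672)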
app_old.py
ADDED
@@ -0,0 +1,92 @@
from typing import Optional

import gradio as gr
import spaces
import supervision as sv
import torch
from PIL import Image

from utils.florence import load_florence_model, run_florence_inference, \
    FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
from utils.sam import load_sam_image_model, run_sam_inference

DEVICE = torch.device("cuda")
# DEVICE = torch.device("cpu")

torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
if torch.cuda.get_device_properties(0).major >= 8:
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True


FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)


@spaces.GPU(duration=20)
@torch.inference_mode()
@torch.autocast(device_type="cuda", dtype=torch.bfloat16)
def process_image(image_input, text_input) -> Optional[Image.Image]:
    if not image_input:
        gr.Info("Please upload an image.")
        return None

    if not text_input:
        gr.Info("Please enter a text prompt.")
        return None

    _, result = run_florence_inference(
        model=FLORENCE_MODEL,
        processor=FLORENCE_PROCESSOR,
        device=DEVICE,
        image=image_input,
        task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
        text=text_input
    )
    detections = sv.Detections.from_lmm(
        lmm=sv.LMM.FLORENCE_2,
        result=result,
        resolution_wh=image_input.size
    )
    detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
    if len(detections) == 0:
        gr.Info("No objects detected.")
        return None
    return Image.fromarray(detections.mask[0].astype("uint8") * 255)


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            image_input_component = gr.Image(
                type='pil', label='Upload image')
            text_input_component = gr.Textbox(
                label='Text prompt',
                placeholder='Enter text prompts')
            submit_button_component = gr.Button(
                value='Submit', variant='primary')
        with gr.Column():
            image_output_component = gr.Image(label='Output mask')

    submit_button_component.click(
        fn=process_image,
        inputs=[
            image_input_component,
            text_input_component
        ],
        outputs=[
            image_output_component,
        ]
    )
    text_input_component.submit(
        fn=process_image,
        inputs=[
            image_input_component,
            text_input_component
        ],
        outputs=[
            image_output_component,
        ]
    )

demo.launch(debug=False, show_error=True)
|
checkpoints/sam2_hiera_base_plus.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d0bb7f236400a49669ffdd1be617959a8b1d1065081789d7bbff88eded3a8071
size 323493298
checkpoints/sam2_hiera_large.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7442e4e9b732a508f80e141e7c2913437a3610ee0c77381a66658c3a445df87b
size 897952466
checkpoints/sam2_hiera_small.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:95949964d4e548409021d47b22712d5f1abf2564cc0c3c765ba599a24ac7dce3
size 184309650
checkpoints/sam2_hiera_tiny.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:65b50056e05bcb13694174f51bb6da89c894b57b75ccdf0ba6352c597c5d1125
size 155906050
configs/__init__.py
ADDED
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
configs/sam2_hiera_b+.yaml
ADDED
@@ -0,0 +1,113 @@
# @package _global_

# Model
model:
  _target_: sam2.modeling.sam2_base.SAM2Base
  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    scalp: 1
    trunk:
      _target_: sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 112
      num_heads: 2
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      d_model: 256
      backbone_channel_list: [896, 448, 224, 112]
      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
      fpn_interp_model: nearest

  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        rope_k_repeat: True
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        kv_in_dim: 64
    num_layers: 4

  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        layer_scale_init_value: 1e-6
        use_dwconv: True  # depth-wise convs
      num_layers: 2

  num_maskmem: 7
  image_size: 1024
  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true
  # Memory
  directly_add_no_mem_embed: true
  # use high-resolution feature map in the SAM mask decoder
  use_high_res_features_in_sam: true
  # output 3 masks on the first click on initial conditioning frames
  multimask_output_in_sam: true
  # SAM heads
  iou_prediction_use_sigmoid: True
  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
  use_obj_ptrs_in_encoder: true
  add_tpos_enc_to_obj_ptrs: false
  only_obj_ptrs_in_the_past_for_eval: true
  # object occlusion prediction
  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true
  # multimask tracking settings
  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  use_mlp_for_obj_ptr_proj: true
  # Compilation flag
  compile_image_encoder: False
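This YAML (and the three variants that follow) are Hydra configs: every `_target_` key is a dotted import path, and `build_sam2(SAM_CONFIG, SAM_CHECKPOINT)` in utils/sam.py composes the file and instantiates the object tree. A minimal sketch of the mechanism on a single node, assuming hydra-core and omegaconf are installed (illustrative only, not repo code):

from hydra.utils import instantiate
from omegaconf import OmegaConf

# One node lifted from the config above: `_target_` names the class,
# the remaining keys become its keyword arguments.
node = OmegaConf.create({
    "_target_": "sam2.modeling.position_encoding.PositionEmbeddingSine",
    "num_pos_feats": 256,
    "normalize": True,
    "scale": None,
    "temperature": 10000,
})
pos_enc = instantiate(node)  # imports the class and calls it with the kwargs
print(type(pos_enc).__name__)  # PositionEmbeddingSine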
configs/sam2_hiera_l.yaml
ADDED
@@ -0,0 +1,117 @@
# @package _global_

# Model
model:
  _target_: sam2.modeling.sam2_base.SAM2Base
  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    scalp: 1
    trunk:
      _target_: sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 144
      num_heads: 2
      stages: [2, 6, 36, 4]
      global_att_blocks: [23, 33, 43]
      window_pos_embed_bkg_spatial_size: [7, 7]
      window_spec: [8, 4, 16, 8]
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      d_model: 256
      backbone_channel_list: [1152, 576, 288, 144]
      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
      fpn_interp_model: nearest

  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        rope_k_repeat: True
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        kv_in_dim: 64
    num_layers: 4

  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        layer_scale_init_value: 1e-6
        use_dwconv: True  # depth-wise convs
      num_layers: 2

  num_maskmem: 7
  image_size: 1024
  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true
  # Memory
  directly_add_no_mem_embed: true
  # use high-resolution feature map in the SAM mask decoder
  use_high_res_features_in_sam: true
  # output 3 masks on the first click on initial conditioning frames
  multimask_output_in_sam: true
  # SAM heads
  iou_prediction_use_sigmoid: True
  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
  use_obj_ptrs_in_encoder: true
  add_tpos_enc_to_obj_ptrs: false
  only_obj_ptrs_in_the_past_for_eval: true
  # object occlusion prediction
  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true
  # multimask tracking settings
  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  use_mlp_for_obj_ptr_proj: true
  # Compilation flag
  compile_image_encoder: False
configs/sam2_hiera_s.yaml
ADDED
@@ -0,0 +1,116 @@
# @package _global_

# Model
model:
  _target_: sam2.modeling.sam2_base.SAM2Base
  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    scalp: 1
    trunk:
      _target_: sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 96
      num_heads: 1
      stages: [1, 2, 11, 2]
      global_att_blocks: [7, 10, 13]
      window_pos_embed_bkg_spatial_size: [7, 7]
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      d_model: 256
      backbone_channel_list: [768, 384, 192, 96]
      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
      fpn_interp_model: nearest

  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        rope_k_repeat: True
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        kv_in_dim: 64
    num_layers: 4

  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        layer_scale_init_value: 1e-6
        use_dwconv: True  # depth-wise convs
      num_layers: 2

  num_maskmem: 7
  image_size: 1024
  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true
  # Memory
  directly_add_no_mem_embed: true
  # use high-resolution feature map in the SAM mask decoder
  use_high_res_features_in_sam: true
  # output 3 masks on the first click on initial conditioning frames
  multimask_output_in_sam: true
  # SAM heads
  iou_prediction_use_sigmoid: True
  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
  use_obj_ptrs_in_encoder: true
  add_tpos_enc_to_obj_ptrs: false
  only_obj_ptrs_in_the_past_for_eval: true
  # object occlusion prediction
  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true
  # multimask tracking settings
  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  use_mlp_for_obj_ptr_proj: true
  # Compilation flag
  compile_image_encoder: False
configs/sam2_hiera_t.yaml
ADDED
@@ -0,0 +1,118 @@
# @package _global_

# Model
model:
  _target_: sam2.modeling.sam2_base.SAM2Base
  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    scalp: 1
    trunk:
      _target_: sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 96
      num_heads: 1
      stages: [1, 2, 7, 2]
      global_att_blocks: [5, 7, 9]
      window_pos_embed_bkg_spatial_size: [7, 7]
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      d_model: 256
      backbone_channel_list: [768, 384, 192, 96]
      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
      fpn_interp_model: nearest

  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        rope_k_repeat: True
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        kv_in_dim: 64
    num_layers: 4

  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        layer_scale_init_value: 1e-6
        use_dwconv: True  # depth-wise convs
      num_layers: 2

  num_maskmem: 7
  image_size: 1024
  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
  # SAM decoder
  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true
  # Memory
  directly_add_no_mem_embed: true
  # use high-resolution feature map in the SAM mask decoder
  use_high_res_features_in_sam: true
  # output 3 masks on the first click on initial conditioning frames
  multimask_output_in_sam: true
  # SAM heads
  iou_prediction_use_sigmoid: True
  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
  use_obj_ptrs_in_encoder: true
  add_tpos_enc_to_obj_ptrs: false
  only_obj_ptrs_in_the_past_for_eval: true
  # object occlusion prediction
  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true
  # multimask tracking settings
  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  use_mlp_for_obj_ptr_proj: true
  # Compilation flag
  # HieraT does not currently support compilation, should always be set to False
  compile_image_encoder: False
gradio_cached_examples/19/log.csv
ADDED
@@ -0,0 +1,2 @@
Output image,Segmentation mask,flag,username,timestamp
,,,,2024-09-04 10:42:00.000310
requirements.txt
ADDED
@@ -0,0 +1,13 @@
tqdm
einops
spaces
timm
samv2
gradio
supervision
opencv-python
pytest
accelerate
transformers==4.42.4
sentencepiece
git+https://github.com/Gothos/diffusers.git@flux-inpaint
utils/__init__.py
ADDED
File without changes
utils/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (165 Bytes).
utils/__pycache__/florence.cpython-311.pyc
ADDED
Binary file (3.62 kB).
utils/__pycache__/sam.cpython-311.pyc
ADDED
Binary file (2.22 kB).
utils/florence.py
ADDED
@@ -0,0 +1,59 @@
import os
from typing import Union, Any, Tuple, Dict
from unittest.mock import patch

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor
from transformers.dynamic_module_utils import get_imports

# FLORENCE_CHECKPOINT = "microsoft/Florence-2-base"
FLORENCE_CHECKPOINT = "microsoft/Florence-2-large"
FLORENCE_OBJECT_DETECTION_TASK = '<OD>'
FLORENCE_DETAILED_CAPTION_TASK = '<MORE_DETAILED_CAPTION>'
FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK = '<CAPTION_TO_PHRASE_GROUNDING>'
FLORENCE_OPEN_VOCABULARY_DETECTION_TASK = '<OPEN_VOCABULARY_DETECTION>'
FLORENCE_DENSE_REGION_CAPTION_TASK = '<DENSE_REGION_CAPTION>'


def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
    """Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
    if not str(filename).endswith("/modeling_florence2.py"):
        return get_imports(filename)
    imports = get_imports(filename)
    imports.remove("flash_attn")
    return imports


def load_florence_model(
    device: torch.device, checkpoint: str = FLORENCE_CHECKPOINT
) -> Tuple[Any, Any]:
    with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
        model = AutoModelForCausalLM.from_pretrained(
            checkpoint, trust_remote_code=True).to(device).eval()
        processor = AutoProcessor.from_pretrained(
            checkpoint, trust_remote_code=True)
        return model, processor


def run_florence_inference(
    model: Any,
    processor: Any,
    device: torch.device,
    image: Image.Image,
    task: str,
    text: str = ""
) -> Tuple[str, Dict]:
    prompt = task + text
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3
    )
    generated_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False)[0]
    response = processor.post_process_generation(
        generated_text, task=task, image_size=image.size)
    return generated_text, response
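A standalone usage sketch for the helpers above (the image path and prompt are hypothetical; `response` is a dict keyed by the task token, which is what app.py hands to `sv.Detections.from_lmm`):

import torch
from PIL import Image
from utils.florence import (
    load_florence_model,
    run_florence_inference,
    FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, processor = load_florence_model(device=device)

image = Image.open("example.jpg")  # hypothetical input image
generated_text, response = run_florence_inference(
    model=model,
    processor=processor,
    device=device,
    image=image,
    task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
    text="a dog",  # open-vocabulary phrase to detect
)
# The boxes and labels live under the task token key.
print(response[FLORENCE_OPEN_VOCABULARY_DETECTION_TASK])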
utils/sam.py
ADDED
@@ -0,0 +1,47 @@
from typing import Any

import numpy as np
import supervision as sv
import torch
from PIL import Image
from sam2.build_sam import build_sam2, build_sam2_video_predictor
from sam2.sam2_image_predictor import SAM2ImagePredictor

# SAM_CHECKPOINT = "checkpoints/sam2_hiera_small.pt"
# SAM_CONFIG = "sam2_hiera_s.yaml"
SAM_CHECKPOINT = "checkpoints/sam2_hiera_large.pt"
SAM_CONFIG = "sam2_hiera_l.yaml"


def load_sam_image_model(
    device: torch.device,
    config: str = SAM_CONFIG,
    checkpoint: str = SAM_CHECKPOINT
) -> SAM2ImagePredictor:
    model = build_sam2(config, checkpoint, device=device)
    return SAM2ImagePredictor(sam_model=model)


def load_sam_video_model(
    device: torch.device,
    config: str = SAM_CONFIG,
    checkpoint: str = SAM_CHECKPOINT
) -> Any:
    return build_sam2_video_predictor(config, checkpoint, device=device)


def run_sam_inference(
    model: Any,
    image: Image.Image,
    detections: sv.Detections
) -> sv.Detections:
    image = np.array(image.convert("RGB"))
    model.set_image(image)
    mask, score, _ = model.predict(box=detections.xyxy, multimask_output=False)

    # dirty fix; remove this later
    if len(mask.shape) == 4:
        mask = np.squeeze(mask)

    detections.mask = mask.astype(bool)
    return detections
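The squeeze flagged as a "dirty fix" above appears to handle `SAM2ImagePredictor.predict` returning masks shaped `(num_boxes, 1, H, W)` for batched box prompts, while supervision expects `(num_boxes, H, W)`. A blanket `np.squeeze` would also drop the leading axis if a 4-D result ever came back with a single box; a more targeted variant (a sketch, not repo code) collapses only the mask axis:

# Safer alternative: collapse only axis 1 so a single detection
# keeps its leading batch dimension.
if mask.ndim == 4:
    mask = mask.squeeze(1)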