Alexander McKinney committed · Commit b4542eb · Parent(s): 04bf3ab

interface example

need to change to blocks, so we can compute segmentation once, diffusion
once. Only repeated components are on CPU.
unsure how to resolve onclick canvas, need to check what canvas can do.
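As a rough illustration of the Blocks refactor the message describes (segment once, then inpaint repeatedly against the cached result), a minimal sketch might look like the following. This is hypothetical code, not part of the commit; `segment_fn` and `inpaint_fn` are placeholder stubs standing in for the segmentation and inpainting steps:

import gradio as gr

def segment_fn(image):
    # placeholder: the real version would run DETR panoptic segmentation once
    # and return the panoptic id map plus the detected class names
    return None, "no classes (stub)"

def inpaint_fn(seg_result, image, prompt, indices):
    # placeholder: the real version would build a mask from the cached
    # segmentation and call StableDiffusionInpaintPipeline
    return image

with gr.Blocks() as demo:
    seg_state = gr.State()                      # caches the segmentation between steps

    input_image = gr.Image(type="pil")
    segment_btn = gr.Button("Segment")
    class_box = gr.Textbox(interactive=False)   # detected classes, one per line

    prompt = gr.Textbox(label="Inpainting prompt")
    indices = gr.Textbox(label="Comma-separated segment indices")
    inpaint_btn = gr.Button("Inpaint")
    output_image = gr.Image(type="pil")

    # segmentation runs once; its result is stored in seg_state
    segment_btn.click(segment_fn, inputs=[input_image], outputs=[seg_state, class_box])
    # diffusion reuses the cached segmentation instead of recomputing it
    inpaint_btn.click(inpaint_fn,
                      inputs=[seg_state, input_image, prompt, indices],
                      outputs=[output_image])

demo.launch()

The point of `gr.State` in this sketch is that the expensive DETR forward pass runs only on the Segment click, while the diffusion step can be re-run with different prompts and indices against the cached result.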
app.py
CHANGED
@@ -12,6 +12,18 @@ from transformers.models.detr.feature_extraction_detr import rgb_to_id
 
 from diffusers import StableDiffusionInpaintPipeline
 
+# TODO: maybe need to port to `Blocks` system
+# allegedly provides:
+# Have multi-step interfaces, in which the output of one model becomes the
+# input to the next model, or have more flexible data flows in general.
+
+# and:
+# Change a component’s properties (for example, the choices in a dropdown) or its visibility based on user input
+# https://huggingface.co/course/chapter9/7?fw=pt
+
+torch.inference_mode()
+torch.no_grad()
+
 def load_segmentation_models(model_name: str = 'facebook/detr-resnet-50-panoptic'):
     feature_extractor = DetrFeatureExtractor.from_pretrained(model_name)
     model = DetrForSegmentation.from_pretrained(model_name)
@@ -29,9 +41,6 @@ def load_diffusion_pipeline(model_name: str = 'runwayml/stable-diffusion-inpaint
 def get_device(try_cuda=True):
     return torch.device('cuda' if try_cuda and torch.cuda.is_available() else 'cpu')
 
-def greet(name):
-    return "Hello " + name + "!"
-
 def min_pool(x: torch.Tensor, kernel_size: int):
     pad_size = (kernel_size - 1) // 2
     return -torch.nn.functional.max_pool2d(-x, kernel_size, (1, 1), padding=pad_size)
@@ -47,55 +56,105 @@ def clean_mask(mask, min_kernel: int = 5, max_kernel: int = 23):
     mask = mask.bool().squeeze().numpy()
     return mask
 
-# iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-# iface.launch()
 device = get_device()
 
 feature_extractor, segmentation_model, segmentation_cfg = load_segmentation_models()
-
-
-url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-image = Image.open(requests.get(url, stream=True).raw)
-
-# prepare image for the model
-inputs = feature_extractor(images=image, return_tensors="pt").to(device)
-
-# forward pass
-outputs = segmentation_model(**inputs)
-
-processed_sizes = torch.as_tensor(inputs["pixel_values"].shape[-2:]).unsqueeze(0)
-result = feature_extractor.post_process_panoptic(outputs, processed_sizes)[0]
-
-panoptic_seg = Image.open(io.BytesIO(result["png_string"])).resize((image.width, image.height))
-panoptic_seg = np.array(panoptic_seg, dtype=np.uint8)
-
-panoptic_seg_id = rgb_to_id(panoptic_seg)
-
-print(result['segments_info'])
-
-# cat_mask = (panoptic_seg_id == 1) | (panoptic_seg_id == 5)
-cat_mask = (panoptic_seg_id == 5)
-cat_mask = clean_mask(cat_mask)
-
-masked_image = np.array(image).copy()
-masked_image[cat_mask] = 0
-
-masked_image = Image.fromarray(masked_image)
-masked_image.save('masked_cat.png')
+# segmentation_model = segmentation_model.to(device)
 
 pipe = load_diffusion_pipeline()
 pipe = pipe.to(device)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+# TODO: potentially use `gr.Gallery` to display different masks
+def fn_segmentation_diffusion(prompt, mask_indices, image, max_kernel, min_kernel, num_diffusion_steps):
+    mask_indices = [int(i) for i in mask_indices.split(',')]
+    inputs = feature_extractor(images=image, return_tensors="pt")
+    outputs = segmentation_model(**inputs)
+
+    processed_sizes = torch.as_tensor(inputs["pixel_values"].shape[-2:]).unsqueeze(0)
+    result = feature_extractor.post_process_panoptic(outputs, processed_sizes)[0]
+
+    panoptic_seg = Image.open(io.BytesIO(result["png_string"])).resize((image.width, image.height))
+    panoptic_seg = np.array(panoptic_seg, dtype=np.uint8)
+
+    class_str = '\n'.join(segmentation_cfg.id2label[s['category_id']] for s in result['segments_info'])
+
+    panoptic_seg_id = rgb_to_id(panoptic_seg)
+
+    if len(mask_indices) > 0:
+        mask = (panoptic_seg_id == mask_indices[0])
+        for idx in mask_indices[1:]:
+            mask = mask | (panoptic_seg_id == idx)
+        mask = clean_mask(mask, min_kernel=min_kernel, max_kernel=max_kernel)
+
+    masked_image = np.array(image).copy()
+    masked_image[mask] = 0
+
+    masked_image = Image.fromarray(masked_image).resize(image.size)
+    mask = Image.fromarray(mask.astype(np.uint8) * 255).resize(image.size)
+
+    if num_diffusion_steps == 0:
+        return masked_image, masked_image, class_str
+
+    STABLE_DIFFUSION_SMALL_EDGE = 512
+
+    assert masked_image.size == mask.size
+    w, h = masked_image.size
+    is_width_larger = w > h
+    resize_ratio = STABLE_DIFFUSION_SMALL_EDGE / (h if is_width_larger else w)
+
+    new_width = int(w * resize_ratio) if is_width_larger else STABLE_DIFFUSION_SMALL_EDGE
+    new_height = STABLE_DIFFUSION_SMALL_EDGE if is_width_larger else int(h * resize_ratio)
+
+    new_width += 8 - (new_width % 8) if is_width_larger else 0
+    new_height += 0 if is_width_larger else 8 - (new_height % 8)
+
+    mask = mask.convert("RGB").resize((new_width, new_height))
+    masked_image = masked_image.convert("RGB").resize((new_width, new_height))
+
+    inpainted_image = pipe(
+        height=new_height,
+        width=new_width,
+        prompt=prompt,
+        image=masked_image,
+        mask_image=mask,
+        num_inference_steps=num_diffusion_steps
+    ).images[0]
+
+    return masked_image, inpainted_image, class_str
+
+
+# iface_segmentation = gr.Interface(
+#     fn=fn_segmentation,
+#     inputs=[
+#         "text",
+#         "text",
+#         gr.Image(value="http://images.cocodataset.org/val2017/000000039769.jpg"),
+#         gr.Slider(minimum=1, maximum=99, value=23, step=2),
+#         gr.Slider(minimum=1, maximum=99, value=5, step=2),
+#         gr.Slider(minimum=0, maximum=100, value=50, step=1),
+#     ],
+#     outputs=["text", gr.Image(type="pil"), gr.Image(type="pil"), "number", "text"]
+# )
+
+# iface_diffusion = gr.Interface(
+#     fn=fn_diffusion,
+#     inputs=["text", gr.Image(type='pil'), gr.Image(type='pil'), "number", "text"],
+#     outputs=[gr.Image(), gr.Image(), gr.Textbox()]
+# )
+
+# iface = gr.Series(
+#     iface_segmentation, iface_diffusion,
+iface = gr.Interface(
+    fn=fn_segmentation_diffusion,
+    inputs=[
+        "text",
+        "text",
+        gr.Image(value="http://images.cocodataset.org/val2017/000000039769.jpg", type='pil'),
+        gr.Slider(minimum=1, maximum=99, value=23, step=2),
+        gr.Slider(minimum=1, maximum=99, value=5, step=2),
+        gr.Slider(minimum=0, maximum=100, value=50, step=1),
+    ],
+    outputs=[gr.Image(), gr.Image(), gr.Textbox(interactive=False)]
+)
+
+iface.launch()
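A note on the added `torch.inference_mode()` / `torch.no_grad()` lines: both return context managers (also usable as decorators), so calling them as bare module-level statements and discarding the result does not disable gradient tracking for the code that follows. A minimal sketch of the usual patterns, assuming the intent was to run both models without autograd:

import torch

# Used as a decorator, autograd is disabled for every call to the function:
@torch.inference_mode()
def run_segmentation(model, inputs):
    return model(**inputs)

# Used as a context manager, autograd is disabled only inside the block:
# with torch.no_grad():
#     outputs = segmentation_model(**inputs)

# A process-wide switch, closest to what the bare calls appear to intend:
# torch.set_grad_enabled(False)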
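The `min_pool` helper visible in the second hunk builds a minimum filter by negating the input around `max_pool2d`; on a 0/1 mask that is a morphological erosion, and `clean_mask` presumably pairs it with a plain max-pool (dilation) to remove speckle, though only its last two lines appear in this diff. A small self-contained check of the erosion behaviour:

import torch

# min_pool as defined in app.py: a minimum filter built from max_pool2d
def min_pool(x: torch.Tensor, kernel_size: int):
    pad_size = (kernel_size - 1) // 2
    return -torch.nn.functional.max_pool2d(-x, kernel_size, (1, 1), padding=pad_size)

# On a float 0/1 mask, min-pooling shrinks the foreground:
mask = torch.zeros(1, 1, 5, 5)
mask[:, :, 1:4, 1:4] = 1.0     # a 3x3 blob
eroded = min_pool(mask, 3)
print(eroded[0, 0])            # only the centre pixel of the blob stays 1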
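The resizing block in `fn_segmentation_diffusion` scales the image so its shorter edge is 512 and then rounds the longer edge up to a multiple of 8, since Stable Diffusion's VAE/UNet operate on latents downsampled by a factor of 8. As written, `8 - (x % 8)` adds a full 8 pixels even when the edge is already divisible by 8. A worked example for a hypothetical 640x480 input:

STABLE_DIFFUSION_SMALL_EDGE = 512
w, h = 640, 480                                   # width is the larger edge
resize_ratio = STABLE_DIFFUSION_SMALL_EDGE / h    # 512 / 480, about 1.067
new_width = int(w * resize_ratio)                 # 682
new_width += 8 - (new_width % 8)                  # 688, the next multiple of 8
new_height = STABLE_DIFFUSION_SMALL_EDGE          # 512
print(new_width, new_height)                      # 688 512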