Spaces:

Ryukijano
/

Control_net_on_surface_normals

Runtime error

App Files Files Community

Ryukijano committed on Dec 27, 2024

Commit

eafa330

verified ·

1 Parent(s): 4ee90cf

Upload 5 files

Browse files

Files changed (5) hide show

README.md +11 -13
app.py +136 -0
app_mast3r.py +206 -0
catmlp_dpt_head.py +94 -0
requirements.txt +15 -0

README.md CHANGED Viewed

@@ -1,13 +1,11 @@
----
-title: Control Net On Surface Normals
-emoji: 🏃
-colorFrom: yellow
-colorTo: green
-sdk: gradio
-sdk_version: 5.9.1
-app_file: app.py
-pinned: false
-license: mit
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: ControlNet_on_human_surface_normals
+emoji: ⚡
+colorFrom: purple
+colorTo: pink
+sdk: gradio
+sdk_version: 4.26.0
+app_file: app.py
+pinned: false
+license: mit
+---

app.py ADDED Viewed

	@@ -0,0 +1,136 @@

+import spaces
+from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, AutoencoderKL, EulerAncestralDiscreteScheduler
+from diffusers.utils import load_image
+from PIL import Image
+import torch
+import numpy as np
+import cv2
+import gradio as gr
+from torchvision import transforms
+import fire
+import os
+# Module-level model setup: everything below runs at import time, loads the
+# weights in float16 and moves them to the CUDA device.
+# ControlNet weights from the "geyongtao/HumanWild" repository (the model the
+# demo text further down refers to as the HumanWild surface-normal ControlNet).
+controlnet = ControlNetModel.from_pretrained(
+    "geyongtao/HumanWild",
+    torch_dtype=torch.float16
+).to('cuda')
+# Separate VAE checkpoint passed to the pipeline instead of the base model's own
+# (presumably chosen for float16 stability, going by the repository name -- confirm).
+vae = AutoencoderKL.from_pretrained(
+    "madebyollin/sdxl-vae-fp16-fix",
+    torch_dtype=torch.float16).to("cuda")
+# SDXL base pipeline wired to the ControlNet and VAE loaded above.
+pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    controlnet=controlnet,
+    vae=vae,
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+    low_cpu_mem_usage=True,
+    offload_state_dict=True,
+).to('cuda')
+# Store the ControlNet weights in channels-last memory format.
+pipe.controlnet.to(memory_format=torch.channels_last)
+# pipe.enable_xformers_memory_efficient_attention()
+# Overrides the pipeline's force_zeros_for_empty_prompt setting.
+pipe.force_zeros_for_empty_prompt = False
+def resize_image(image):
+    image = image.convert('RGB')
+    current_size = image.size
+    if current_size[0] > current_size[1]:
+        center_cropped_image = transforms.functional.center_crop(image, (current_size[1], current_size[1]))
+    else:
+        center_cropped_image = transforms.functional.center_crop(image, (current_size[0], current_size[0]))
+    resized_image = transforms.functional.resize(center_cropped_image, (1024, 1024))
+    return resized_image
+def get_normal_map(image):
+    image = feature_extractor(images=image, return_tensors="pt").pixel_values.to("cuda")
+    with torch.no_grad(), torch.autocast("cuda"):
+        depth_map = depth_estimator(image).predicted_depth
+    image = transforms.functional.center_crop(image, min(image.shape[-2:]))
+    depth_map = torch.nn.functional.interpolate(
+        depth_map.unsqueeze(1),
+        size=(1024, 1024),
+        mode="bicubic",
+        align_corners=False,
+    )
+    depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
+    depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
+    depth_map = (depth_map - depth_min) / (depth_max - depth_min)
+    image = torch.cat([depth_map] * 3, dim=1)
+    image = image.permute(0, 2, 3, 1).cpu().numpy()[0]
+    image = Image.fromarray((image * 255.0).clip(0, 255).astype(np.uint8))
+    return image
+@spaces.GPU
+def generate_(prompt, negative_prompt, normal_image, num_steps, controlnet_conditioning_scale, seed):
+    generator = torch.Generator("cuda").manual_seed(seed)
+    images = pipe(
+        prompt,
+        negative_prompt=negative_prompt,
+        image=normal_image,
+        num_inference_steps=num_steps,
+        controlnet_conditioning_scale=float(controlnet_conditioning_scale),
+        num_images_per_prompt=2,
+        generator=generator,
+    ).images
+    return images
+@spaces.GPU
+def process(normal_image, prompt, negative_prompt, num_steps, controlnet_conditioning_scale, seed):
+    # resize input_image to 1024x1024
+    normal_image = resize_image(normal_image)
+    # depth_image = get_depth_map(input_image)
+    images = generate_(prompt, negative_prompt, normal_image, num_steps, controlnet_conditioning_scale, seed)
+    return [images[0], images[1]]
+def run_demo():
+    _TITLE = '''3D Human Reconstruction in the Wild with Synthetic Data Using Generative Models'''
+    block = gr.Blocks().queue()
+    with block:
+        gr.Markdown("# 3D Human Reconstruction in the Wild with Synthetic Data Using Generative Models ")
+        gr.HTML('''
+          <p style="margin-bottom: 10px; font-size: 94%">
+            This is a demo for Surface Normal ControlNet that using
+            <a href="https://huggingface.co/geyongtao/HumanWild" target="_blank"> HumanWild model</a> pretrained weight.
+            <a style="display:inline-block; margin-left: .5em" href='https://github.com/YongtaoGe/WildHuman/'><img src='https://img.shields.io/github/stars/YongtaoGe/WildHuman?style=social' /></a>
+          </p>
+        ''')
+        with gr.Row():
+            with gr.Column():
+                input_image = gr.Image(sources=None, type="pil") # None for upload, ctrl+v and webcam
+                example_folder = os.path.join(os.path.dirname(__file__), "./assets")
+                example_fns = [os.path.join(example_folder, example) for example in os.listdir(example_folder)]
+                gr.Examples(
+                    examples=example_fns,
+                    inputs=[input_image],
+                    cache_examples=False,
+                    label='Examples (click one of the images below to start)',
+                    examples_per_page=30
+                )
+                prompt = gr.Textbox(label="Prompt", value="a person, in the wild")
+                negative_prompt = gr.Textbox(visible=False, label="Negative prompt", value="Logo,Watermark,Text,Ugly,Morbid,Extra fingers,Poorly drawn hands,Mutation,Blurry,Extra limbs,Gross proportions,Missing arms,Mutated hands,Long neck,Duplicate,Mutilated,Mutilated hands,Poorly drawn face,Deformed,Bad anatomy,Cloned face,Malformed limbs,Missing legs,Too many fingers")
+                num_steps = gr.Slider(label="Number of steps", minimum=25, maximum=50, value=30, step=1)
+                controlnet_conditioning_scale = gr.Slider(label="ControlNet conditioning scale", minimum=0.1, maximum=1.0, value=0.95, step=0.05)
+                seed = gr.Slider(label="Seed", minimum=0, maximum=2147483647, step=1, randomize=True,)
+                run_button = gr.Button(value="Run")
+            with gr.Column():
+                result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery", columns=[2], height='auto')
+        ips = [input_image, prompt, negative_prompt, num_steps, controlnet_conditioning_scale, seed]
+        run_button.click(fn=process, inputs=ips, outputs=[result_gallery])
+    block.launch(debug = True)
+if __name__ == '__main__':
+    fire.Fire(run_demo)

app_mast3r.py ADDED Viewed

	@@ -0,0 +1,206 @@

+import gradio as gr
+import spaces
+import torch
+from gradio_rerun import Rerun
+import rerun as rr
+import rerun.blueprint as rrb
+from pathlib import Path
+import uuid
+from mini_dust3r.api import OptimizedResult, inferece_dust3r, log_optimized_result
+from mini_dust3r.model import AsymmetricCroCo3DStereo
+from mini_dust3r.utils.misc import (
+    fill_default_args,
+    freeze_all_params,
+    is_symmetrized,
+    interleave,
+    transpose_to_landscape,
+)
+import os
+from mini_dust3r.model import load_model
+from catmlp_dpt_head import Cat_MLP_LocalFeatures_DPT_Pts3d, postprocess
+DEVICE = "cuda" if torch.cuda.is_available() else "CPU"
+# model = AsymmetricCroCo3DStereo.from_pretrained(
+#    "naver/DUSt3R_ViTLarge_BaseDecoder_512_dpt"
+# ).to(DEVICE)
+from mini_dust3r.heads.linear_head import LinearPts3d
+from mini_dust3r.heads.dpt_head import create_dpt_head
+def head_factory(head_type, output_mode, net, has_conf=False):
+    """" build a prediction head for the decoder
+    """
+    if head_type == 'linear' and output_mode == 'pts3d':
+        return LinearPts3d(net, has_conf)
+    elif head_type == 'dpt' and output_mode == 'pts3d':
+        return create_dpt_head(net, has_conf=has_conf)
+    if head_type == 'catmlp+dpt' and output_mode.startswith('pts3d+desc'):
+        local_feat_dim = int(output_mode[10:])
+        assert net.dec_depth > 9
+        l2 = net.dec_depth
+        feature_dim = 256
+        last_dim = feature_dim // 2
+        out_nchan = 3
+        ed = net.enc_embed_dim
+        dd = net.dec_embed_dim
+        return Cat_MLP_LocalFeatures_DPT_Pts3d(net, local_feat_dim=local_feat_dim, has_conf=has_conf,
+                                               num_channels=out_nchan + has_conf,
+                                               feature_dim=feature_dim,
+                                               last_dim=last_dim,
+                                               hooks_idx=[0, l2 * 2 // 4, l2 * 3 // 4, l2],
+                                               dim_tokens=[ed, dd, dd, dd],
+                                               postprocess=postprocess,
+                                               depth_mode=net.depth_mode,
+                                               conf_mode=net.conf_mode,
+                                               head_type='regression')
+    else:
+        raise NotImplementedError(f"unexpected {head_type=} and {output_mode=}")
+class AsymmetricMASt3R(AsymmetricCroCo3DStereo):
+    def __init__(self, desc_mode=('norm'), two_confs=False, desc_conf_mode=None, **kwargs):
+        self.desc_mode = desc_mode
+        self.two_confs = two_confs
+        self.desc_conf_mode = desc_conf_mode
+        super().__init__(**kwargs)
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kw):
+        if os.path.isfile(pretrained_model_name_or_path):
+            return load_model(pretrained_model_name_or_path, device='cpu')
+        else:
+            return super(AsymmetricMASt3R, cls).from_pretrained(pretrained_model_name_or_path, **kw)
+    def set_downstream_head(self, output_mode, head_type, landscape_only, depth_mode, conf_mode, patch_size, img_size, **kw):
+        assert img_size[0] % patch_size == 0 and img_size[
+            1] % patch_size == 0, f'{img_size=} must be multiple of {patch_size=}'
+        self.output_mode = output_mode
+        self.head_type = head_type
+        self.depth_mode = depth_mode
+        self.conf_mode = conf_mode
+        if self.desc_conf_mode is None:
+            self.desc_conf_mode = conf_mode
+        # allocate heads
+        self.downstream_head1 = head_factory(head_type, output_mode, self, has_conf=bool(conf_mode))
+        self.downstream_head2 = head_factory(head_type, output_mode, self, has_conf=bool(conf_mode))
+        # magic wrapper
+        self.head1 = transpose_to_landscape(self.downstream_head1, activate=landscape_only)
+        self.head2 = transpose_to_landscape(self.downstream_head2, activate=landscape_only)
+# Load the pretrained MASt3R checkpoint once at import time and move it to DEVICE;
+# predict() below reuses this module-level instance for every request.
+model = AsymmetricMASt3R.from_pretrained(
+    "naver/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric").to(DEVICE)
+def create_blueprint(image_name_list: list[str], log_path: Path) -> rrb.Blueprint:
+    # dont show 2d views if there are more than 4 images as to not clutter the view
+    if len(image_name_list) > 4:
+        blueprint = rrb.Blueprint(
+            rrb.Horizontal(
+                rrb.Spatial3DView(origin=f"{log_path}"),
+            ),
+            collapse_panels=True,
+        )
+    else:
+        blueprint = rrb.Blueprint(
+            rrb.Horizontal(
+                contents=[
+                    rrb.Spatial3DView(origin=f"{log_path}"),
+                    rrb.Vertical(
+                        contents=[
+                            rrb.Spatial2DView(
+                                origin=f"{log_path}/camera_{i}/pinhole/",
+                                contents=[
+                                    "+ $origin/**",
+                                ],
+                            )
+                            for i in range(len(image_name_list))
+                        ]
+                    ),
+                ],
+                column_shares=[3, 1],
+            ),
+            collapse_panels=True,
+        )
+    return blueprint
+@spaces.GPU
+def predict(image_name_list: list[str] | str):
+    # check if is list or string and if not raise error
+    if not isinstance(image_name_list, list) and not isinstance(image_name_list, str):
+        raise gr.Error(
+            f"Input must be a list of strings or a string, got: {type(image_name_list)}"
+        )
+    uuid_str = str(uuid.uuid4())
+    filename = Path(f"/tmp/gradio/{uuid_str}.rrd")
+    rr.init(f"{uuid_str}")
+    log_path = Path("world")
+    if isinstance(image_name_list, str):
+        image_name_list = [image_name_list]
+    optimized_results: OptimizedResult = inferece_dust3r(
+        image_dir_or_list=image_name_list,
+        model=model,
+        device=DEVICE,
+        batch_size=1,
+    )
+    blueprint: rrb.Blueprint = create_blueprint(image_name_list, log_path)
+    rr.send_blueprint(blueprint)
+    rr.set_time_sequence("sequence", 0)
+    log_optimized_result(optimized_results, log_path)
+    rr.save(filename.as_posix())
+    return filename.as_posix()
+# Top-level Gradio UI: it is built and launched when this module is executed
+# (there is no __main__ guard around demo.launch()).
+with gr.Blocks(
+    css=""".gradio-container {margin: 0 !important; min-width: 100%};""",
+    title="Mini-DUSt3R Demo",
+) as demo:
+    # scene state is saved so that you can change conf_thr, cam_size... without rerunning the inference
+    # NOTE(review): no state component is created in this block, so the comment
+    # above looks stale -- confirm.
+    gr.HTML('<h2 style="text-align: center;">Mini-DUSt3R Demo</h2>')
+    gr.HTML(
+        '<p style="text-align: center;">Unofficial DUSt3R demo using the mini-dust3r pip package</p>'
+    )
+    gr.HTML(
+        '<p style="text-align: center;">More info <a href="https://github.com/pablovela5620/mini-dust3r">here</a></p>'
+    )
+    # Tab 1: one image in, passed to predict() as a file path; the returned
+    # recording path is shown in the Rerun viewer.
+    with gr.Tab(label="Single Image"):
+        with gr.Column():
+            single_image = gr.Image(type="filepath", height=300)
+            run_btn_single = gr.Button("Run")
+            rerun_viewer_single = Rerun(height=900)
+            run_btn_single.click(
+                fn=predict, inputs=[single_image], outputs=[rerun_viewer_single]
+            )
+            # Example images are the PNG files in examples/single_image, resolved
+            # relative to the current working directory.
+            example_single_dir = Path("examples/single_image")
+            example_single_files = sorted(example_single_dir.glob("*.png"))
+            examples_single = gr.Examples(
+                examples=example_single_files,
+                inputs=[single_image],
+                outputs=[rerun_viewer_single],
+                fn=predict,
+                cache_examples="lazy",
+            )
+    # Tab 2: several uploaded files handled by the same predict() function.
+    with gr.Tab(label="Multi Image"):
+        with gr.Column():
+            multi_files = gr.File(file_count="multiple")
+            run_btn_multi = gr.Button("Run")
+            rerun_viewer_multi = Rerun(height=900)
+            run_btn_multi.click(
+                fn=predict, inputs=[multi_files], outputs=[rerun_viewer_multi]
+            )
+demo.launch()

catmlp_dpt_head.py ADDED Viewed

	@@ -0,0 +1,94 @@

+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# MASt3R heads
+# --------------------------------------------------------
+import torch
+import torch.nn.functional as F
+from mini_dust3r.heads.postprocess import reg_dense_depth, reg_dense_conf  # noqa
+from mini_dust3r.heads.dpt_head import PixelwiseTaskWithDPT  # noqa
+from mini_dust3r.croco.blocks import Mlp  # noqa
+def reg_desc(desc, mode):
+    if 'norm' in mode:
+        desc = desc / desc.norm(dim=-1, keepdim=True)
+    else:
+        raise ValueError(f"Unknown desc mode {mode}")
+    return desc
+def postprocess(out, depth_mode, conf_mode, desc_dim=None, desc_mode='norm', two_confs=False, desc_conf_mode=None):
+    if desc_conf_mode is None:
+        desc_conf_mode = conf_mode
+    fmap = out.permute(0, 2, 3, 1)  # B,H,W,D
+    res = dict(pts3d=reg_dense_depth(fmap[..., 0:3], mode=depth_mode))
+    if conf_mode is not None:
+        res['conf'] = reg_dense_conf(fmap[..., 3], mode=conf_mode)
+    if desc_dim is not None:
+        start = 3 + int(conf_mode is not None)
+        res['desc'] = reg_desc(fmap[..., start:start + desc_dim], mode=desc_mode)
+        if two_confs:
+            res['desc_conf'] = reg_dense_conf(fmap[..., start + desc_dim], mode=desc_conf_mode)
+        else:
+            res['desc_conf'] = res['conf'].clone()
+    return res
+class Cat_MLP_LocalFeatures_DPT_Pts3d(PixelwiseTaskWithDPT):
+    """ Mixture between MLP and DPT head that outputs 3d points and local features (with MLP).
+    The input for both heads is a concatenation of Encoder and Decoder outputs
+
+    In this implementation the DPT part receives the full ``decout`` list,
+    while the MLP part receives the first and last entries of ``decout``
+    concatenated along the last dimension.
+    """
+    def __init__(self, net, has_conf=False, local_feat_dim=16, hidden_dim_factor=4., hooks_idx=None, dim_tokens=None,
+                 num_channels=1, postprocess=None, feature_dim=256, last_dim=32, depth_mode=None, conf_mode=None, head_type="regression", **kwargs):
+        """Set up the DPT head (via the base class) and the local-feature MLP.
+
+        ``net`` supplies the patch size, the encoder/decoder embedding sizes
+        and the descriptor settings (``desc_mode``, ``two_confs``,
+        ``desc_conf_mode``). Extra ``**kwargs`` are accepted but not forwarded.
+        """
+        super().__init__(num_channels=num_channels, feature_dim=feature_dim, last_dim=last_dim, hooks_idx=hooks_idx,
+                         dim_tokens=dim_tokens, depth_mode=depth_mode, postprocess=postprocess, conf_mode=conf_mode, head_type=head_type)
+        self.local_feat_dim = local_feat_dim
+        # The patch size may be an int or an (int, int) tuple; only square patches are handled.
+        patch_size = net.patch_embed.patch_size
+        if isinstance(patch_size, tuple):
+            assert len(patch_size) == 2 and isinstance(patch_size[0], int) and isinstance(
+                patch_size[1], int), "What is your patchsize format? Expected a single int or a tuple of two ints."
+            assert patch_size[0] == patch_size[1], "Error, non square patches not managed"
+            patch_size = patch_size[0]
+        self.patch_size = patch_size
+        self.desc_mode = net.desc_mode
+        self.has_conf = has_conf
+        self.two_confs = net.two_confs  # independent confs for 3D regr and descs
+        self.desc_conf_mode = net.desc_conf_mode
+        # The MLP input is the encoder and decoder embeddings concatenated.
+        idim = net.enc_embed_dim + net.dec_embed_dim
+        # Per token the MLP emits patch_size**2 values for every output channel:
+        # local_feat_dim descriptor channels, plus one more when two_confs is true.
+        self.head_local_features = Mlp(in_features=idim,
+                                       hidden_features=int(hidden_dim_factor * idim),
+                                       out_features=(self.local_feat_dim + self.two_confs) * self.patch_size**2)
+    def forward(self, decout, img_shape):
+        """Compute 3D points and local features for one view.
+
+        ``decout`` is indexed as a sequence whose first and last entries are used
+        as encoder and decoder outputs; ``img_shape`` is unpacked as ``(H, W)``.
+        Returns the result of ``self.postprocess`` when one is set, otherwise the
+        raw tensor with point and local-feature channels concatenated.
+        """
+        # pass through the heads
+        # self.dpt is not assigned in this class (presumably set up by the base class -- confirm).
+        pts3d = self.dpt(decout, image_size=(img_shape[0], img_shape[1]))
+        # recover encoder and decoder outputs
+        enc_output, dec_output = decout[0], decout[-1]
+        cat_output = torch.cat([enc_output, dec_output], dim=-1)  # concatenate
+        H, W = img_shape
+        B, S, D = cat_output.shape
+        # extract local_features
+        local_features = self.head_local_features(cat_output)  # B,S,D
+        # Move channels before tokens and lay the tokens out on the patch grid,
+        # then let pixel_shuffle expand every patch to full resolution.
+        local_features = local_features.transpose(-1, -2).view(B, -1, H // self.patch_size, W // self.patch_size)
+        local_features = F.pixel_shuffle(local_features, self.patch_size)  # B,d,H,W
+        # post process 3D pts, descriptors and confidences
+        out = torch.cat([pts3d, local_features], dim=1)
+        if self.postprocess:
+            out = self.postprocess(out,
+                                   depth_mode=self.depth_mode,
+                                   conf_mode=self.conf_mode,
+                                   desc_dim=self.local_feat_dim,
+                                   desc_mode=self.desc_mode,
+                                   two_confs=self.two_confs,
+                                   desc_conf_mode=self.desc_conf_mode)
+        return out

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+#mini-dust3r==0.1.1
+#pillow-heif
+#rerun-sdk==0.15.1
+accelerate
+spaces
+transformers
+safetensors
+opencv-python
+diffusers
+gradio
+torch
+torchvision
+xformers
+fire