Add Adaptive Strength Diffusion

Files changed:
- app.py (+2, -11)
- pipeline_objectclear.py (+13, -2)
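In short: this commit removes the user-facing Strength radio from the Gradio demo and raises the pipeline's `strength` default to 1.0. In its place, a first-step latent blend is added to the attention-guided-fusion branch, re-anchoring the unmasked region to the (re-noised) input image; this appears to recover the background and color preservation that `strength=0.99` previously provided, while now sampling at full strength.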
 
    	
app.py
CHANGED

@@ -186,7 +186,7 @@ pipe = ObjectClearPipeline.from_pretrained_with_custom_modules(
 pipe.to(device)
 
 @spaces.GPU
-def process(image_state, interactive_state, mask_dropdown, guidance_scale, seed, num_inference_steps, strength
+def process(image_state, interactive_state, mask_dropdown, guidance_scale, seed, num_inference_steps
             ):
     generator = torch.Generator(device="cuda").manual_seed(seed)
     image_np = image_state["origin_image"]

@@ -219,7 +219,6 @@ def process(image_state, interactive_state, mask_dropdown, guidance_scale, seed,
         mask_image=mask,
         generator=generator,
         num_inference_steps=num_inference_steps,
-        strength=strength,
         guidance_scale=guidance_scale,
         height=h,
         width=w,

@@ -432,13 +431,6 @@ with gr.Blocks(css=custom_css) as demo:
             )
 
             with gr.Accordion('ObjectClear Settings', open=True):
-                strength = gr.Radio(
-                    choices=[0.99, 1.0],
-                    value=0.99,
-                    label="Strength",
-                    info="0.99 better preserves the background and color; use 1.0 if object/shadow is not fully removed (default: 0.99)"
-                )
-
                 guidance_scale = gr.Slider(
                     minimum=1, maximum=10, step=0.5, value=2.5,
                     label="Guidance Scale",

@@ -517,8 +509,7 @@ with gr.Blocks(css=custom_css) as demo:
             mask_dropdown,
             guidance_scale,
             seed,
-            num_inference_steps,
-            strength
+            num_inference_steps
         ],
         outputs=[
             output_image_component, output_compare_image_component
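Note that with the radio removed, `strength` is no longer threaded through the event handler: it leaves the `process(...)` signature, the `pipe(...)` call, and the Gradio `inputs` list, so the pipeline-side default (changed to 1.0 below) now applies.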
    	
pipeline_objectclear.py
CHANGED

@@ -1352,7 +1352,7 @@ class ObjectClearPipeline(
         height: Optional[int] = None,
         width: Optional[int] = None,
         padding_mask_crop: Optional[int] = None,
-        strength: float = 0.99,
+        strength: float = 1.0,
         num_inference_steps: int = 50,
         timesteps: List[int] = None,
         sigmas: List[float] = None,

@@ -1426,7 +1426,7 @@ class ObjectClearPipeline(
                 on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
                 resizing to the original image size for inpainting. This is useful when the masked area is small while
                 the image is large and contain information irrelevant for inpainting, such as background.
-            strength (`float`, *optional*, defaults to 0.99):
+            strength (`float`, *optional*, defaults to 1.0):
                 Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
                 between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
                 `strength`. The number of denoising steps depends on the amount of noise initially added. When

@@ -1914,6 +1914,17 @@ class ObjectClearPipeline(
                 # progressive attention mask blending
                 fuse_index = 5
                 if self.config.apply_attention_guided_fusion:
+                    if i == 0:
+                        init_latents_proper = image_latents
+                        init_mask = mask[0:1]
+
+                        noise_timestep = timesteps[i + 1]
+                        init_latents_proper = self.scheduler.add_noise(
+                            init_latents_proper, noise, torch.tensor([noise_timestep])
+                        )
+
+                        latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+
                     if i == len(timesteps) - 1:
                         attn_key, attn_map = next(iter(self.cross_attention_scores.items()))
                         attn_map = self.resize_attn_map_divide2(attn_map, mask, fuse_index)
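For readers, a minimal sketch (not part of the commit) of what the new `if i == 0:` block does: at the first denoising step, the latents of the input image are re-noised to the next timestep and composited into the working latents everywhere outside the mask, so the background stays pinned to the source image while only the masked region is synthesized. This is the same per-step blend convention diffusers' StableDiffusionInpaintPipeline uses; the scheduler choice and tensor shapes below are stand-ins, not the pipeline's real configuration.

import torch
from diffusers import DDPMScheduler  # stand-in; the pipeline uses its own configured scheduler

scheduler = DDPMScheduler(num_train_timesteps=1000)
scheduler.set_timesteps(50)
timesteps = scheduler.timesteps

# Toy tensors (shapes are illustrative only).
image_latents = torch.randn(1, 4, 8, 8)  # encoded input image
latents = torch.randn(1, 4, 8, 8)        # working latents at step i
noise = torch.randn_like(image_latents)  # noise tensor shared with sampling init
mask = torch.zeros(1, 1, 8, 8)
mask[..., 2:6, 2:6] = 1.0                # 1 = region to inpaint, 0 = background to keep

i = 0  # the commit applies the blend only on the first step
noise_timestep = timesteps[i + 1]        # next (less noisy) timestep, as in the diff
init_latents_proper = scheduler.add_noise(
    image_latents, noise, torch.tensor([noise_timestep])
)

# Outside the mask, keep the re-noised original image; inside, keep the
# denoiser's working latents.
latents = (1 - mask) * init_latents_proper + mask * latents

Noising to `timesteps[i + 1]` rather than `timesteps[i]` matches the noise level the working latents carry after step `i`, so both terms of the blend sit at the same point on the noise schedule.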