Andranik Sargsyan committed · Commit 736e88e · Parent(s): 08504da

refactor code

Files changed:
- app.py +70 -38
- lib/methods/rasg.py +24 -6
- lib/methods/sd.py +14 -10
- lib/methods/sr.py +42 -37
app.py CHANGED

@@ -75,8 +75,12 @@ def set_model_from_name(inp_model_name):
     inp_model = inpainting_models[inp_model_name]
 
 
-def rasg_run(
-    use_painta, prompt, input, seed, eta, negative_prompt, positive_prompt, ddim_steps, guidance_scale=7.5, batch_size=4):
+def rasg_run(
+    use_painta, prompt, input, seed, eta,
+    negative_prompt, positive_prompt, ddim_steps,
+    guidance_scale=7.5,
+    batch_size=1
+):
     torch.cuda.empty_cache()
 
     seed = int(seed)
@@ -87,35 +91,44 @@ guidance_scale=7.5, batch_size=4):
 
     method = ['rasg']
     if use_painta: method.append('painta')
+    method = '-'.join(method)
 
     inpainted_images = []
     blended_images = []
     for i in range(batch_size):
+        seed = seed + i * 1000
+
         inpainted_image = rasg.run(
-            ddim = inp_model,
-            method = method,
-            prompt = prompt,
-            image = image,
-            mask = mask,
-            seed = seed,
-            eta = eta,
-            negative_prompt = negative_prompt,
-            positive_prompt = positive_prompt,
-            num_steps = ddim_steps,
-            ...
-            guidance_scale = guidance_scale
+            ddim=inp_model,
+            method=method,
+            prompt=prompt,
+            image=image,
+            mask=mask,
+            seed=seed,
+            eta=eta,
+            negative_prompt=negative_prompt,
+            positive_prompt=positive_prompt,
+            num_steps=ddim_steps,
+            guidance_scale=guidance_scale
         ).crop(image.size)
-        blended_image = poisson_blend(orig_img = image.data[0], fake_img = inpainted_image.data[0],
-            mask = mask.data[0], dilation = 12)
 
+        blended_image = poisson_blend(
+            orig_img=image.data[0],
+            fake_img=inpainted_image.data[0],
+            mask=mask.data[0],
+            dilation=12
+        )
         blended_images.append(blended_image)
         inpainted_images.append(inpainted_image.numpy()[0])
 
     return blended_images, inpainted_images
 
 
-def sd_run(use_painta, prompt, input, seed, eta,
-    negative_prompt, positive_prompt, ddim_steps, guidance_scale=7.5, batch_size=4):
+def sd_run(use_painta, prompt, input, seed, eta,
+    negative_prompt, positive_prompt, ddim_steps,
+    guidance_scale=7.5,
+    batch_size=1
+):
     torch.cuda.empty_cache()
 
     seed = int(seed)
@@ -126,28 +139,33 @@ guidance_scale=7.5, batch_size=4):
 
     method = ['default']
     if use_painta: method.append('painta')
+    method = '-'.join(method)
 
     inpainted_images = []
     blended_images = []
     for i in range(batch_size):
+        seed = seed + i * 1000
+
         inpainted_image = sd.run(
-            ddim = inp_model,
-            method = method,
-            prompt = prompt,
-            image = image,
-            mask = mask,
-            seed = seed,
-            eta = eta,
-            negative_prompt = negative_prompt,
-            positive_prompt = positive_prompt,
-            num_steps = ddim_steps,
-            ...
-            guidance_scale = guidance_scale
+            ddim=inp_model,
+            method=method,
+            prompt=prompt,
+            image=image,
+            mask=mask,
+            seed=seed,
+            eta=eta,
+            negative_prompt=negative_prompt,
+            positive_prompt=positive_prompt,
+            num_steps=ddim_steps,
+            guidance_scale=guidance_scale
         ).crop(image.size)
 
-        blended_image = poisson_blend(
-            orig_img = image.data[0], fake_img = inpainted_image.data[0],
-            mask = mask.data[0], dilation = 12)
+        blended_image = poisson_blend(
+            orig_img=image.data[0],
+            fake_img=inpainted_image.data[0],
+            mask=mask.data[0],
+            dilation=12
+        )
         blended_images.append(blended_image)
         inpainted_images.append(inpainted_image.numpy()[0])
 
@@ -156,7 +174,9 @@ guidance_scale=7.5, batch_size=4):
 
 def upscale_run(
     prompt, input, ddim_steps, seed, use_sam_mask, gallery, img_index,
-    negative_prompt='', positive_prompt=', high resolution professional photo'):
+    negative_prompt='',
+    positive_prompt=', high resolution professional photo'
+):
     torch.cuda.empty_cache()
 
     seed = int(seed)
@@ -169,10 +189,22 @@ negative_prompt='', positive_prompt=', high resolution professional photo'):
     lr_image = IImage(inpainted_image)
     hr_image = IImage(input['image']).resize(2048)
     hr_mask = IImage(input['mask']).resize(2048)
-    output_image = sr.run(
-        ...
+    output_image = sr.run(
+        sr_model,
+        sam_predictor,
+        lr_image,
+        hr_image,
+        hr_mask,
+        prompt=prompt + positive_prompt,
+        noise_level=20,
+        blend_trick=True,
+        blend_output=True,
+        negative_prompt=negative_prompt,
+        seed=seed,
+        use_sam_mask=use_sam_mask
+    )
+    output_image.info = input['image'].info  # save metadata
+    return output_image, output_image
 
 
 def switch_run(use_rasg, model_name, *args):
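Beyond reformatting the calls to keyword-per-line style, this refactor makes two behavioral changes in both rasg_run and sd_run: batch_size drops from 4 to 1, and each batch element now gets a deterministic per-sample seed offset, with the method flags joined into a single string such as 'rasg-painta'. A minimal sketch of that pattern in isolation; run_inpainting is a hypothetical stand-in for rasg.run / sd.run:

    # Hypothetical sketch of the per-sample seeding pattern introduced above;
    # `run_inpainting` stands in for rasg.run / sd.run.
    def generate_batch(run_inpainting, base_seed, batch_size=1,
                       use_rasg=True, use_painta=True):
        method = ['rasg' if use_rasg else 'default']
        if use_painta:
            method.append('painta')
        method = '-'.join(method)  # e.g. 'rasg-painta'

        results = []
        for i in range(batch_size):
            # A fixed stride keeps runs reproducible for a given base seed
            # while still decorrelating the batch elements.
            results.append(run_inpainting(method=method, seed=base_seed + i * 1000))
        return results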
lib/methods/rasg.py CHANGED

@@ -23,12 +23,28 @@ def init_guidance():
     router.attention_forward = attentionpatch.default.forward_and_save
     router.basic_transformer_forward = transformerpatch.default.forward
 
-def run(ddim, method, prompt, image, mask, seed, eta, prefix, negative_prompt, positive_prompt, ...):
+def run(
+    ddim,
+    method,
+    prompt,
+    image,
+    mask,
+    seed=0,
+    eta=0.1,
+    negative_prompt='',
+    positive_prompt='',
+    num_steps=50,
+    guidance_scale=7.5
+):
+    image = image.padx(64)
+    mask = mask.alpha().padx(64)
+    full_prompt = f'{prompt}, {positive_prompt}'
+    dt = 1000 // num_steps
+
     # Text condition
-    context = ddim.encoder.encode([negative_prompt,
-    ...
-    token_idx = list(range(1, tokenize(prompt).index('<end_of_text>')))
-    token_idx += [tokenize(prompt + positive_prompt).index('<end_of_text>')]
+    context = ddim.encoder.encode([negative_prompt, full_prompt])
+    token_idx = list(range(1, tokenize(prompt).index('<end_of_text>')))
+    token_idx += [tokenize(full_prompt).index('<end_of_text>')]
 
     # Initialize painta
     if 'painta' in method: init_painta(token_idx)
@@ -84,7 +100,9 @@ def run(ddim, method, prompt, image, mask, seed, eta, prefix, negative_prompt, p
     grad -= grad.mean()
     grad /= grad.std()
 
-    zt = share.schedule.sqrt_alphas[share.timestep - dt] * z0 + torch.sqrt(1 - share.schedule.alphas[share.timestep - dt] - sigma ** 2) * eps + eta * sigma * grad
+    zt = share.schedule.sqrt_alphas[share.timestep - dt] * z0 + \
+        torch.sqrt(1 - share.schedule.alphas[share.timestep - dt] - sigma ** 2) * eps + \
+        eta * sigma * grad
 
     with torch.no_grad():
         output_image = IImage(ddim.vae.decode(z0 / ddim.config.scale_factor))
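The reflowed zt statement is easier to read as math. Assuming sigma is the DDIM noise scale at the current timestep and grad the guidance gradient (both defined in the surrounding code), the update is a DDIM step whose stochastic noise term is replaced by the standardized gradient:

    z_{t - dt} = \sqrt{\alpha_{t - dt}}\, z_0
               + \sqrt{1 - \alpha_{t - dt} - \sigma_t^2}\, \epsilon
               + \eta\, \sigma_t\, \frac{\nabla - \mathrm{mean}(\nabla)}{\mathrm{std}(\nabla)}

where the normalization of the gradient is the `grad -= grad.mean(); grad /= grad.std()` pair just above the update.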
lib/methods/sd.py CHANGED

@@ -24,18 +24,22 @@ def run(
     prompt,
     image,
     mask,
-    seed,
-    eta,
-    negative_prompt,
-    positive_prompt,
-    num_steps,
-    ...
-    guidance_scale
+    seed=0,
+    eta=0.1,
+    negative_prompt='',
+    positive_prompt='',
+    num_steps=50,
+    guidance_scale=7.5
 ):
+    image = image.padx(64)
+    mask = mask.alpha().padx(64)
+    full_prompt = f'{prompt}, {positive_prompt}'
+    dt = 1000 // num_steps
+
     # Text condition
-    context = ddim.encoder.encode([negative_prompt,
-    token_idx = list(range(1, tokenize(prompt).index('<end_of_text>')))
-    token_idx += [tokenize(prompt + positive_prompt).index('<end_of_text>')]
+    context = ddim.encoder.encode([negative_prompt, full_prompt])
+    token_idx = list(range(1, tokenize(prompt).index('<end_of_text>')))
+    token_idx += [tokenize(full_prompt).index('<end_of_text>')]
 
     # Setup painta if needed
     if 'painta' in method: init_painta(token_idx)
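rasg.py and sd.py now share the same text-conditioning setup: token_idx collects the positions of the user's prompt tokens (everything between <start_of_text> and the prompt's <end_of_text>) plus the <end_of_text> position of the padded full prompt. A toy illustration, with a whitespace tokenizer standing in for the real CLIP tokenize:

    # Toy stand-in for the real tokenize (which returns CLIP BPE tokens).
    def tokenize(text):
        return ['<start_of_text>'] + text.split() + ['<end_of_text>']

    prompt = 'a red sports car'
    positive_prompt = 'high resolution photo'
    full_prompt = f'{prompt}, {positive_prompt}'

    # Positions 1..4: the user-prompt tokens (index 0 is <start_of_text>).
    token_idx = list(range(1, tokenize(prompt).index('<end_of_text>')))
    # Plus the <end_of_text> position of the padded prompt.
    token_idx += [tokenize(full_prompt).index('<end_of_text>')]

    print(token_idx)  # [1, 2, 3, 4, 8]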
lib/methods/sr.py CHANGED

@@ -57,9 +57,22 @@ def refine_mask(hr_image, hr_mask, lr_image, sam_predictor):
     return new_mask
 
 
-def run(
-    ...
-    dt = 50, seed = 1, guidance_scale = 7.5, negative_prompt = '', use_sam_mask = False):
+def run(
+    ddim,
+    sam_predictor,
+    lr_image,
+    hr_image,
+    hr_mask,
+    prompt = 'high resolution professional photo',
+    noise_level=20,
+    blend_output = True,
+    blend_trick = True,
+    dt = 50,
+    seed = 1,
+    guidance_scale = 7.5,
+    negative_prompt = '',
+    use_sam_mask = False
+):
     torch.manual_seed(seed)
     dtype = ddim.vae.encoder.conv_in.weight.dtype
     device = ddim.vae.encoder.conv_in.weight.device
@@ -67,6 +80,9 @@ dt = 50, seed = 1, guidance_scale = 7.5, negative_prompt = '', use_sam_mask = False):
     router.attention_forward = attentionpatch.default.forward_xformers
     router.basic_transformer_forward = transformerpatch.default.forward
 
+    hr_image_orig = hr_image
+    hr_mask_orig = hr_mask
+
     if use_sam_mask:
         with torch.no_grad():
             hr_mask = refine_mask(hr_image, hr_mask, lr_image, sam_predictor)
@@ -74,70 +90,59 @@ dt = 50, seed = 1, guidance_scale = 7.5, negative_prompt = '', use_sam_mask = False):
     orig_h, orig_w = hr_image.torch().shape[2], hr_image.torch().shape[3]
     hr_image = hr_image.padx(256, padding_mode='reflect')
     hr_mask = hr_mask.padx(256, padding_mode='reflect').dilate(19)
 
-    lr_image = lr_image.padx(64, padding_mode='reflect')
-    lr_mask = hr_mask.resize((lr_image.
+    lr_image = lr_image.padx(64, padding_mode='reflect').torch()
+    lr_mask = hr_mask.resize((lr_image.shape[2:]), resample = Image.BICUBIC)
+    lr_mask = lr_mask.alpha().torch(vmin=0).to(device)
     lr_mask = TvF.gaussian_blur(lr_mask, kernel_size=19)
 
-    if no_superres:
-        output_tensor = lr_image.resize((hr_image.torch().shape[2], hr_image.torch().shape[3]), resample = Image.BICUBIC).torch().cuda()
-        output_tensor = (255*((output_tensor.clip(-1, 1) + 1) / 2)).to(torch.uint8)
-        output_tensor = poisson_blend(
-            orig_img=hr_image.data[0][:orig_h, :orig_w, :],
-            fake_img=output_tensor.cpu().permute(0, 2, 3, 1)[0].numpy()[:orig_h, :orig_w, :],
-            mask=hr_mask_orig.alpha().data[0][:orig_h, :orig_w, :]
-        )
-        return IImage(output_tensor[:orig_h, :orig_w, :])
 
     # encode hr image
     with torch.no_grad():
-        ...
+        hr_image = hr_image.torch().to(dtype=dtype, device=device)
+        hr_z0 = ddim.vae.encode(hr_image).mean * ddim.config.scale_factor
 
-    assert hr_z0.shape[2] == lr_image.
-    assert hr_z0.shape[3] == lr_image.
+    assert hr_z0.shape[2] == lr_image.shape[2]
+    assert hr_z0.shape[3] == lr_image.shape[3]
 
-    unet_condition = lr_image.cuda().torch().to(memory_format=torch.contiguous_format).to(dtype=dtype, device=device)
-    zT = torch.randn((1,4,unet_condition.shape[2], unet_condition.shape[3])).cuda().to(dtype=dtype, device=device)
-
     with torch.no_grad():
         context = ddim.encoder.encode([negative_prompt, prompt])
 
     noise_level = torch.Tensor(1 * [noise_level]).to(device=device).long()
+    unet_condition = lr_image.to(dtype=dtype, device=device, memory_format=torch.contiguous_format)
     unet_condition, noise_level = ddim.low_scale_model(unet_condition, noise_level=noise_level)
 
     with torch.autocast('cuda'), torch.no_grad():
-        zt = zT
+        zt = torch.randn((1,4,unet_condition.shape[2], unet_condition.shape[3]))
+        zt = zt.cuda().to(dtype=dtype, device=device)
        for index,t in enumerate(range(999, 0, -dt)):
             _zt = zt if unet_condition is None else torch.cat([zt, unet_condition], 1)
             eps_uncond, eps = ddim.unet(
                 torch.cat([_zt, _zt]).to(dtype=dtype, device=device),
                 timesteps = torch.tensor([t, t]).to(device=device),
                 context = context,
                 y=torch.cat([noise_level]*2)
             ).chunk(2)
             ts = torch.full((zt.shape[0],), t, device=device, dtype=torch.long)
             model_output = (eps_uncond + guidance_scale * (eps - eps_uncond))
             eps = predict_eps_from_z_and_v(ddim.schedule, zt, ts, model_output).to(dtype)
             z0 = predict_start_from_z_and_v(ddim.schedule, zt, ts, model_output).to(dtype)
             if blend_trick:
                 z0 = z0 * lr_mask + hr_z0 * (1-lr_mask)
             zt = ddim.schedule.sqrt_alphas[t - dt] * z0 + ddim.schedule.sqrt_one_minus_alphas[t - dt] * eps
 
     with torch.no_grad():
-        ...
+        hr_result = ddim.vae.decode(z0.to(dtype) / ddim.config.scale_factor)
+        # postprocess
+        hr_result = (255 * ((hr_result + 1) / 2).clip(0, 1)).to(torch.uint8)
+        hr_result = hr_result.cpu().permute(0, 2, 3, 1)[0].numpy()
+        hr_result = hr_result[:orig_h, :orig_w, :]
 
     if blend_output:
-        ...
-            mask=hr_mask_orig.alpha().data[0][:orig_h, :orig_w, :]
+        hr_result = poisson_blend(
+            orig_img=hr_image_orig.data[0],
+            fake_img=hr_result,
+            mask=hr_mask_orig.alpha().data[0]
         )
-        return
+        return Image.fromarray(hr_result)
     else:
-        return
+        return Image.fromarray(hr_result)
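Inside the denoising loop, sr.py combines standard classifier-free guidance (model_output = eps_uncond + guidance_scale * (eps - eps_uncond)) with the blend_trick line, which pins the predicted clean latent to the encoding of the known high-resolution image everywhere outside the mask. A minimal sketch of the blend step alone, with assumed latent shapes:

    import torch

    # Sketch of the blend trick: z0 is the model's predicted clean latent,
    # hr_z0 the VAE encoding of the known HR image, lr_mask a soft mask
    # in [0, 1] at latent resolution.
    def blend_latents(z0, hr_z0, lr_mask):
        # Keep the prediction inside the mask; outside it, anchor the
        # latent to the encoding of the known HR image.
        return z0 * lr_mask + hr_z0 * (1 - lr_mask)

    z0 = torch.randn(1, 4, 256, 256)
    hr_z0 = torch.randn(1, 4, 256, 256)
    lr_mask = torch.rand(1, 1, 256, 256)  # broadcasts over the 4 channels
    blended = blend_latents(z0, hr_z0, lr_mask)

Because lr_mask was Gaussian-blurred upstream, the transition between generated and preserved regions stays soft rather than forming a hard seam.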