File size: 2,966 Bytes
59be1d1
b294284
5287b40
855a558
5287b40
 
d32fde8
214f5df
59be1d1
 
855a558
 
 
 
 
5287b40
b294284
855a558
5287b40
 
b294284
d32fde8
855a558
5287b40
 
b294284
5287b40
 
b294284
5287b40
 
b294284
5287b40
214f5df
5287b40
 
b294284
855a558
5287b40
 
 
855a558
5287b40
b294284
855a558
5287b40
b294284
855a558
b294284
855a558
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b294284
59be1d1
 
855a558
b294284
 
5287b40
b294284
 
855a558
 
214f5df
b294284
855a558
59be1d1
a3a7ec2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import gradio as gr
import torch
import numpy as np
from diffusers import DiffusionPipeline
from PIL import Image, ImageDraw
from transformers import DetrImageProcessor, DetrForObjectDetection
import spaces

# Pick the compute device once: CUDA when torch reports it, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Image-generation pipeline. detect_and_replace() calls it with text prompts
# only (no init image or mask is passed), so it acts as a text-to-image
# generator. Weights are requested in float16 on CUDA and float32 on CPU.
pipe = DiffusionPipeline.from_pretrained(
    "SG161222/RealVisXL_V4.0",
    use_safetensors=True,
    torch_dtype=torch.float32 if device != "cuda" else torch.float16,
)
pipe = pipe.to(device)

# DETR object detector plus its matching image processor; detect_and_replace()
# keeps only the detections labelled "person".
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
detector = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
detector = detector.to(device)

@spaces.GPU
def detect_and_replace(input_image, prompt, negative_prompt=""):
    """Overwrite every detected person in ``input_image`` with a generated one.

    People are located with the module-level DETR ``detector``. For each
    bounding box a new image is generated by ``pipe`` from ``prompt``, resized
    to the box, and pasted over that region of a copy of the input image.

    Args:
        input_image: PIL image from the Gradio image input, or ``None`` when
            nothing was uploaded.
        prompt: Text describing the person to generate.
        negative_prompt: Optional text describing what to avoid.

    Returns:
        A new PIL image with each detected person region replaced, or ``None``
        when no image or no (non-blank) prompt was supplied.

    Raises:
        gr.Error: If no person is detected. The output component is an image,
            so a plain status string cannot be returned to it.
    """
    if input_image is None or not prompt or not prompt.strip():
        return None

    img_width, img_height = input_image.size

    # Detect on an RGB copy so uploads with an alpha channel, a palette or a
    # single channel are presented to the processor as ordinary 3-channel input.
    detection_image = input_image.convert("RGB")
    inputs = processor(images=detection_image, return_tensors="pt").to(device)

    # Inference only: no autograd graph is needed for the detector pass.
    with torch.no_grad():
        outputs = detector(**inputs)

    # post_process_object_detection expects (height, width) per image.
    target_sizes = torch.tensor([[img_height, img_width]]).to(device)
    results = processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=0.9
    )[0]

    boxes = []
    for label, box in zip(results["labels"], results["boxes"]):
        if detector.config.id2label[label.item()] != "person":
            continue
        x1, y1, x2, y2 = (int(v) for v in box.tolist())
        # Clamp to the image and drop degenerate boxes: resizing to a zero
        # width or height would raise.
        x1, x2 = max(0, x1), min(img_width, x2)
        y1, y2 = max(0, y1), min(img_height, y2)
        if x2 > x1 and y2 > y1:
            boxes.append((x1, y1, x2, y2))

    if not boxes:
        raise gr.Error("No human detected.")

    output_image = input_image.copy()

    for x1, y1, x2, y2 in boxes:
        # One generation per box so each detected person gets a distinct
        # replacement.
        generated_image = pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=512,
            height=768,
            guidance_scale=7.5,
            num_inference_steps=30,
            output_type="pil",
        ).images[0]

        # Fit the generated image to the detected box and paste it at the
        # box's top-left corner.
        resized_generated = generated_image.resize((x2 - x1, y2 - y1))
        output_image.paste(resized_generated, (x1, y1))

    return output_image

# Gradio front end: an input image, prompt fields and a button that runs
# detect_and_replace and shows its result in the output image.
with gr.Blocks() as demo:
    gr.Markdown(value="## Replace Bride and Groom with Imaginary Realistic Characters")

    # Source photo and result side by side; both exchange PIL images.
    with gr.Row():
        input_image = gr.Image(label="Input Image", type="pil")
        output_image = gr.Image(label="Output Image", type="pil")

    prompt_text = gr.Textbox(
        placeholder="Describe the imaginary bride/groom",
        label="Prompt",
    )
    negative_prompt_text = gr.Textbox(
        placeholder="Optional negative prompt",
        label="Negative Prompt",
    )
    submit = gr.Button(value="Submit")

    # Inputs are passed positionally in this order to detect_and_replace.
    submit.click(
        fn=detect_and_replace,
        inputs=[input_image, prompt_text, negative_prompt_text],
        outputs=output_image,
    )

demo.launch()