File size: 2,955 Bytes
2571a09
 
 
 
 
9fc4651
305c867
2571a09
42a1e5c
b40804f
 
68bba7b
b40804f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42a1e5c
a826a95
 
68bba7b
2571a09
42a1e5c
 
b40804f
2571a09
 
 
 
 
 
 
 
10ca3e2
 
 
 
 
305c867
a826a95
10ca3e2
305c867
a68f6ce
f32daf2
 
2571a09
b40804f
2571a09
 
8279f18
2571a09
a826a95
2571a09
 
 
f32daf2
 
 
2ad848e
68bba7b
f32daf2
2571a09
 
 
 
 
 
 
42a1e5c
2571a09
 
b40804f
305c867
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import gradio as gr
import torch
from PIL import Image
from diffusers import AutoPipelineForText2Image, DDIMScheduler
import numpy as np
from torchvision import transforms
import spaces

# Build the SDXL base text-to-image pipeline in half precision. It is created
# once at import time and moved to the GPU per request inside transform_image.
pipeline = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16
)

# Replace the default scheduler with DDIM, reusing the existing scheduler config.
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)

# Load two IP-Adapters: the general "plus" adapter (used for the style image)
# and the "plus-face" adapter (used for the face image).
# Both checkpoints are "vit-h" variants, so they must be paired with the ViT-H
# image encoder stored under `models/image_encoder` in the h94/IP-Adapter repo.
# Without `image_encoder_folder`, the encoder next to the weights
# (`sdxl_models/image_encoder`) is loaded instead and its embeddings do not
# match what these adapters expect.
pipeline.load_ip_adapter(
    "h94/IP-Adapter",
    subfolder="sdxl_models",
    weight_name=[
        "ip-adapter-plus_sdxl_vit-h.safetensors",
        "ip-adapter-plus-face_sdxl_vit-h.safetensors"
    ],
    image_encoder_folder="models/image_encoder",
)
# One scale per adapter, in the same order as `weight_name` above.
pipeline.set_ip_adapter_scale([0.7, 0.5])

# Target (width, height) that every input image is resized to before inference.
desired_size = (1024, 1024)

def _prepare_image(image):
    """Return *image* as an RGB PIL image resized to ``desired_size``.

    Accepts a PIL image or a NumPy array (the two forms Gradio's image
    component can hand over). Any non-RGB mode (grayscale, palette, RGBA, ...)
    is converted to RGB so the image encoder always sees three channels.

    Raises:
        ValueError: if *image* is ``None`` or of an unsupported type.
    """
    if image is None:
        raise ValueError("No image provided")
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)
    elif not isinstance(image, Image.Image):
        raise ValueError("Unsupported image format")

    if image.mode != "RGB":
        image = image.convert("RGB")
    return image.resize(desired_size, Image.LANCZOS)


@spaces.GPU
def transform_image(face_image):
    """Generate a "soyjak" rendition of *face_image*.

    The face image and a fixed local style image are fed to the two loaded
    IP-Adapters (style first, face second, matching the adapter load order).

    Args:
        face_image: the uploaded face, as a PIL image or NumPy array.

    Returns:
        The generated PIL image (the pipeline's default output type).

    Raises:
        ValueError: if *face_image* is missing or of an unsupported type.
    """
    # Validate and normalise inputs before touching the GPU, so a bad upload
    # fails fast and never leaves the pipeline parked on the device.
    processed_face_image = _prepare_image(face_image)

    # Load the style reference; the context manager closes the file handle,
    # and _prepare_image returns an independent resized copy.
    style_image_path = "examples/soyjak2.jpg"  # Ensure this path is correct
    with Image.open(style_image_path) as style_file:
        style_image = _prepare_image(style_file)

    # Move the pipeline to the GPU inside the function
    pipeline.to("cuda")
    try:
        # Fixed seed so the same input yields the same output.
        generator = torch.Generator(device="cuda").manual_seed(0)

        # Both adapter inputs are passed as PIL images so the pipeline applies
        # its own image preprocessing; raw tensors would bypass it.
        image = pipeline(
            prompt="soyjak",
            ip_adapter_image=[style_image, processed_face_image],
            negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
            num_inference_steps=30,
            generator=generator,
        ).images[0]
    finally:
        # Always release the GPU, even when inference raises.
        pipeline.to("cpu")

    # `.images[0]` is already a PIL image, so no tensor conversion is needed.
    return image

# Assemble the Gradio UI: a single image upload wired to transform_image,
# with a single image as the result.
_face_input = gr.Image(label="Upload your face image")
_soyjak_output = gr.Image(label="Your Soyjak")

demo = gr.Interface(
    fn=transform_image,
    inputs=_face_input,
    outputs=_soyjak_output,
    title="InstaSoyjak - turn anyone into a Soyjak",
    description="All you need to do is upload an image. Please use responsibly.",
)

# Enable request queuing with max_size=20, then start the app.
demo.queue(max_size=20)
demo.launch()