import gradio as gr
import spaces

import argparse
import inspect
import os
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import matplotlib.pyplot as plt
from PIL import Image

import torch
import torch.nn.functional as F
import numpy as np
import random
import warnings
from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
from utils import *
import hashlib

from diffusers.image_processor import VaeImageProcessor
from diffusers.loaders import (
    FromSingleFileMixin,
    LoraLoaderMixin,
    TextualInversionLoaderMixin,
)
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.models.attention_processor import (
    AttnProcessor2_0,
    LoRAAttnProcessor2_0,
    LoRAXFormersAttnProcessor,
    XFormersAttnProcessor,
)
from diffusers.models.lora import adjust_lora_scale_text_encoder
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import (
    is_accelerate_available,
    is_accelerate_version,
    is_invisible_watermark_available,
    logging,
    replace_example_docstring,
)
from diffusers.utils.torch_utils import randn_tensor
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
from accelerate.utils import set_seed
from tqdm import tqdm

if is_invisible_watermark_available():
    # Absolute import so this also works when the file is run as a standalone
    # script; the relative `from .watermark import ...` requires a package context.
    from diffusers.pipelines.stable_diffusion_xl.watermark import StableDiffusionXLWatermarker


logger = logging.get_logger(__name__)


EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import StableDiffusionXLPipeline

        >>> pipe = StableDiffusionXLPipeline.from_pretrained(
        ...     "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
        ... )
        >>> pipe = pipe.to("cuda")

        >>> prompt = "a photo of an astronaut riding a horse on mars"
        >>> image = pipe(prompt).images[0]
        ```
"""


def gaussian_kernel(kernel_size=3, sigma=1.0, channels=3):
    # Normalized 1D Gaussian -> outer product -> one depthwise kernel per channel.
    x_coord = torch.arange(kernel_size)
    gaussian_1d = torch.exp(-(x_coord - (kernel_size - 1) / 2) ** 2 / (2 * sigma ** 2))
    gaussian_1d = gaussian_1d / gaussian_1d.sum()
    gaussian_2d = gaussian_1d[:, None] * gaussian_1d[None, :]
    kernel = gaussian_2d[None, None, :, :].repeat(channels, 1, 1, 1)

    return kernel


def gaussian_filter(latents, kernel_size=3, sigma=1.0):
    # Depthwise convolution (groups=channels) blurs each latent channel
    # independently; the padding keeps spatial size unchanged for odd kernels.
    channels = latents.shape[1]
    kernel = gaussian_kernel(kernel_size, sigma, channels).to(latents.device, latents.dtype)
    blurred_latents = F.conv2d(latents, kernel, padding=kernel_size // 2, groups=channels)
    return blurred_latents
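

# Illustrative sketch (an addition; not called anywhere in this file): a quick
# sanity check of the two helpers above on SDXL-style 4-channel latents.
def _demo_gaussian_filter():
    latents = torch.randn(1, 4, 128, 128)
    blurred = gaussian_filter(latents, kernel_size=3, sigma=1.0)
    # Each per-channel kernel sums to 1, so the filter is a pure low-pass: it
    # attenuates high frequencies while (away from the zero-padded borders)
    # roughly preserving local means.
    assert blurred.shape == latents.shape
    return blurred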


def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    """
    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4.
    """
    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)

    # Rescale the guided noise toward the std of the text-conditioned
    # prediction, then blend with the original by `guidance_rescale`.
    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)

    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
    return noise_cfg
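

# Illustrative sketch (an addition; not called anywhere in this file): how
# `rescale_noise_cfg` slots into a standard classifier-free-guidance step.
# The tensors are arbitrary stand-ins for real noise predictions.
def _demo_rescale_noise_cfg():
    noise_pred_uncond = torch.randn(2, 4, 64, 64)
    noise_pred_text = torch.randn(2, 4, 64, 64)
    guidance_scale = 5.0
    # Standard CFG combination...
    noise_cfg = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
    # ...followed by the Section 3.4 rescale; 0.0 is a no-op, and the paper
    # reports values around 0.7 working well.
    return rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.7)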


class AccDiffusionSDXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin):
    """
    Pipeline for text-to-image generation using Stable Diffusion XL.

    [Class description omitted …]
    """

    model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"

    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        text_encoder_2: CLIPTextModelWithProjection,
        tokenizer: CLIPTokenizer,
        tokenizer_2: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        force_zeros_for_empty_prompt: bool = True,
        add_watermarker: Optional[bool] = None,
    ):
        super().__init__()

        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            text_encoder_2=text_encoder_2,
            tokenizer=tokenizer,
            tokenizer_2=tokenizer_2,
            unet=unet,
            scheduler=scheduler,
        )
        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
        self.default_sample_size = self.unet.config.sample_size

        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()

        if add_watermarker:
            self.watermark = StableDiffusionXLWatermarker()
        else:
            self.watermark = None

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        prompt_2: Optional[Union[str, List[str]]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        denoising_end: Optional[float] = None,
        guidance_scale: float = 5.0,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        negative_prompt_2: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = False,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        guidance_rescale: float = 0.0,
        original_size: Optional[Tuple[int, int]] = None,
        crops_coords_top_left: Tuple[int, int] = (0, 0),
        target_size: Optional[Tuple[int, int]] = None,
        negative_original_size: Optional[Tuple[int, int]] = None,
        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
        negative_target_size: Optional[Tuple[int, int]] = None,
        # AccDiffusion-specific options:
        image_lr: Optional[torch.FloatTensor] = None,
        view_batch_size: int = 16,
        multi_decoder: bool = True,
        stride: Optional[int] = 64,
        cosine_scale_1: Optional[float] = 3.0,
        cosine_scale_2: Optional[float] = 1.0,
        cosine_scale_3: Optional[float] = 1.0,
        sigma: Optional[float] = 1.0,
        lowvram: bool = False,
        multi_guidance_scale: Optional[float] = 7.5,
        use_guassian: bool = True,
        upscale_mode: Union[str, List[str]] = "bicubic_latent",
        use_multidiffusion: bool = True,
        use_dilated_sampling: bool = True,
        use_skip_residual: bool = True,
        use_progressive_upscaling: bool = True,
        shuffle: bool = False,
        result_path: str = "./outputs/AccDiffusion",
        debug: bool = False,
        use_md_prompt: bool = False,
        attn_res=None,
        save_attention_map: bool = False,
        seed: Optional[int] = None,
        c: Optional[float] = 0.3,
    ):
r""" |
|
|
[ํจ์ ์ค๋ช
์๋ต โฆ] |
|
|
""" |
|
|
|
|
|
|
|
|
output_images = [] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return output_images |
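
    # Illustrative usage sketch for `__call__` (an addition, not part of the
    # pipeline; assumes a CUDA device and the SDXL base checkpoint, mirroring
    # the CLI defaults below):
    #
    #   pipe = AccDiffusionSDXLPipeline.from_pretrained(
    #       "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
    #   ).to("cuda")
    #   images = pipe(
    #       "Astronaut on Mars During sunset.",
    #       height=2048,
    #       width=2048,
    #       num_inference_steps=50,
    #       use_multidiffusion=True,
    #       use_dilated_sampling=True,
    #       use_skip_residual=True,
    #   )
    #
    # With `return_dict=False` (the default here) the call returns the plain
    # `output_images` list rather than a `StableDiffusionXLPipelineOutput`.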


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument('--model_ckpt', default='stabilityai/stable-diffusion-xl-base-1.0')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--prompt', default="Astronaut on Mars During sunset.")
    parser.add_argument('--negative_prompt', default="blurry, ugly, duplicate, poorly drawn, deformed, mosaic")
    parser.add_argument('--cosine_scale_1', default=3, type=float, help="cosine scale 1")
    parser.add_argument('--cosine_scale_2', default=1, type=float, help="cosine scale 2")
    parser.add_argument('--cosine_scale_3', default=1, type=float, help="cosine scale 3")
    parser.add_argument('--sigma', default=0.8, type=float, help="sigma")
    # `type=bool` would treat any non-empty string (even "False") as True,
    # so parse the value explicitly.
    parser.add_argument('--multi_decoder', default=True, type=lambda x: x.lower() == 'true',
                        help="use multi decoder or not")
    parser.add_argument('--num_inference_steps', default=50, type=int, help="num inference steps")
    parser.add_argument('--resolution', default='1024,1024', help="target resolution, e.g. '2048,2048'")
    parser.add_argument('--use_multidiffusion', default=False, action='store_true', help="use multidiffusion or not")
    parser.add_argument('--use_guassian', default=False, action='store_true', help="use gaussian filtering or not")
    parser.add_argument('--use_dilated_sampling', default=True, action='store_true', help="use dilated sampling or not")
    parser.add_argument('--use_progressive_upscaling', default=False, action='store_true', help="use progressive upscaling or not")
    parser.add_argument('--shuffle', default=False, action='store_true', help="shuffle or not")
    parser.add_argument('--use_skip_residual', default=False, action='store_true', help="use skip residual or not")
    parser.add_argument('--save_attention_map', default=False, action='store_true', help="save attention map or not")
    parser.add_argument('--multi_guidance_scale', default=7.5, type=float, help="multi guidance scale")
    parser.add_argument('--upscale_mode', default="bicubic_latent", help="'bicubic_image' or 'bicubic_latent'")
    parser.add_argument('--use_md_prompt', default=False, action='store_true', help="use md prompt or not")
    parser.add_argument('--view_batch_size', default=16, type=int, help="view batch size")
    parser.add_argument('--stride', default=64, type=int, help="stride")
    parser.add_argument('--c', default=0.3, type=float, help="threshold")

    parser.add_argument('--debug', default=False, action='store_true')
    parser.add_argument('--experiment_name', default="AccDiffusion")

    args = parser.parse_args()
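
    # Example invocation (the filename `app.py` is an assumption for this sketch):
    #   python app.py --prompt "A surreal landscape" --resolution 2048,2048 \
    #       --num_inference_steps 50 --use_multidiffusion --use_skip_residual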

    pipe = AccDiffusionSDXLPipeline.from_pretrained(args.model_ckpt, torch_dtype=torch.float16).to("cuda")
css = """ |
|
|
body { |
|
|
background: linear-gradient(135deg, #2c3e50, #4ca1af); |
|
|
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; |
|
|
color: #ffffff; |
|
|
} |
|
|
#col-container { |
|
|
margin: 20px auto; |
|
|
padding: 20px; |
|
|
max-width: 900px; |
|
|
background-color: rgba(0, 0, 0, 0.5); |
|
|
border-radius: 12px; |
|
|
box-shadow: 0 4px 12px rgba(0,0,0,0.5); |
|
|
} |
|
|
h1, h2 { |
|
|
text-align: center; |
|
|
margin-bottom: 10px; |
|
|
} |
|
|
footer { |
|
|
visibility: hidden; |
|
|
} |
|
|
""" |

    @spaces.GPU(duration=200)
    def infer(prompt, resolution, num_inference_steps, guidance_scale, seed,
              use_multidiffusion, use_skip_residual, use_dilated_sampling,
              use_progressive_upscaling, shuffle, use_md_prompt,
              progress=gr.Progress(track_tqdm=True)):
        seed = int(seed)  # gr.Slider may deliver a float; seeding needs an int
        set_seed(seed)
        width, height = list(map(int, resolution.split(',')))
        cross_attention_kwargs = {
            "edit_type": "visualize",
            "n_self_replace": 0.4,
            "n_cross_replace": {"default_": 1.0, "confetti": 0.8},
        }
        generator = torch.Generator(device='cuda').manual_seed(seed)

        print(f"Prompt: {prompt}")
        md5_hash = hashlib.md5(prompt.encode()).hexdigest()
        result_path = f"./output/{args.experiment_name}/{md5_hash}/{width}_{height}_{seed}/"

        images = pipe(
            prompt,
            negative_prompt=args.negative_prompt,
            generator=generator,
            width=width,
            height=height,
            view_batch_size=args.view_batch_size,
            stride=args.stride,
            cross_attention_kwargs=cross_attention_kwargs,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            multi_guidance_scale=args.multi_guidance_scale,
            cosine_scale_1=args.cosine_scale_1,
            cosine_scale_2=args.cosine_scale_2,
            cosine_scale_3=args.cosine_scale_3,
            sigma=args.sigma,
            use_guassian=args.use_guassian,
            multi_decoder=args.multi_decoder,
            upscale_mode=args.upscale_mode,
            use_multidiffusion=use_multidiffusion,
            use_skip_residual=use_skip_residual,
            use_progressive_upscaling=use_progressive_upscaling,
            use_dilated_sampling=use_dilated_sampling,
            shuffle=shuffle,
            result_path=result_path,
            debug=args.debug,
            save_attention_map=args.save_attention_map,
            use_md_prompt=use_md_prompt,
            c=args.c,
        )
        print(images)

        return images

    MAX_SEED = np.iinfo(np.int32).max

    with gr.Blocks(css=css) as demo:
        with gr.Column(elem_id="col-container"):
            gr.Markdown("<h1>AccDiffusion: Advanced AI Art Generator</h1>")
            gr.Markdown(
                "Enter a creative prompt for the image you want to generate. "
                "The model applies the latest AccDiffusion technique to produce "
                "artwork in a wide range of styles and resolutions."
            )
            with gr.Row():
                prompt = gr.Textbox(label="Prompt", placeholder="e.g., A surreal landscape with floating islands and vibrant colors.", lines=2, scale=4)
                submit_btn = gr.Button("Generate", scale=1)

            with gr.Accordion("Advanced Settings", open=False):
                with gr.Row():
                    resolution = gr.Radio(
                        label="Resolution",
                        choices=[
                            "1024,1024", "2048,2048", "2048,1024", "1536,3072", "3072,3072", "4096,4096", "4096,2048"
                        ],
                        value="1024,1024",
                        interactive=True
                    )
                with gr.Column():
                    num_inference_steps = gr.Slider(label="Inference Steps", minimum=2, maximum=50, step=1, value=30, info="Number of denoising steps")
                    guidance_scale = gr.Slider(label="Guidance Scale", minimum=1, maximum=51.0, step=0.1, value=7.5, info="Higher values increase adherence to the prompt")
                    seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, info="Set a seed for reproducibility")
                with gr.Row():
                    use_multidiffusion = gr.Checkbox(label="Use MultiDiffusion", value=True)
                    use_skip_residual = gr.Checkbox(label="Use Skip Residual", value=True)
                    use_dilated_sampling = gr.Checkbox(label="Use Dilated Sampling", value=True)
                with gr.Row():
                    use_progressive_upscaling = gr.Checkbox(label="Use Progressive Upscaling", value=False)
                    shuffle = gr.Checkbox(label="Shuffle", value=False)
                    use_md_prompt = gr.Checkbox(label="Use MD Prompt", value=False)

            # Gradio 4.x dropped `.style()`; `columns` and `height` are now
            # constructor arguments.
            output_images = gr.Gallery(label="Output Images", format="png", columns=2, height="auto")

            gr.Markdown("### Example Prompts")
            gr.Examples(
                examples=[
                    ["A surreal landscape with floating islands and vibrant colors."],
                    ["Cyberpunk cityscape at night with neon lights and futuristic architecture."],
                    ["A majestic dragon soaring over a medieval castle amidst stormy skies."],
                    ["Futuristic robot exploring an alien planet with mysterious flora."],
                    ["Abstract geometric patterns in vivid, pulsating colors."],
                    ["A mystical forest illuminated by bioluminescent plants under a starry sky."]
                ],
                inputs=[prompt],
                label="Click an example to populate the prompt box."
            )

            submit_btn.click(
                fn=infer,
                inputs=[prompt, resolution, num_inference_steps, guidance_scale, seed,
                        use_multidiffusion, use_skip_residual, use_dilated_sampling,
                        use_progressive_upscaling, shuffle, use_md_prompt],
                outputs=[output_images],
                show_api=False
            )

    demo.launch(show_api=False, show_error=True)