diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..90e94b8de914481355ec5c234278b62cd4d95e9e --- /dev/null +++ b/app.py @@ -0,0 +1,554 @@ +import json +from argparse import ArgumentParser + +from PIL import Image + +import constants +from backend.controlnet import controlnet_settings_from_dict +from backend.device import get_device_name +from backend.models.gen_images import ImageFormat +from backend.models.lcmdiffusion_setting import DiffusionTask +from backend.upscale.tiled_upscale import generate_upscaled_image +from constants import APP_VERSION, DEVICE +from frontend.webui.image_variations_ui import generate_image_variations +from models.interface_types import InterfaceType +from paths import FastStableDiffusionPaths, ensure_path +from state import get_context, get_settings +from utils import show_system_info + +parser = ArgumentParser(description=f"FAST SD CPU {constants.APP_VERSION}") +parser.add_argument( + "-s", + "--share", + action="store_true", + help="Create sharable link(Web UI)", + required=False, +) +group = parser.add_mutually_exclusive_group(required=False) +group.add_argument( + "-g", + "--gui", + action="store_true", + help="Start desktop GUI", +) +group.add_argument( + "-w", + "--webui", + action="store_true", + help="Start Web UI", +) +group.add_argument( + "-a", + "--api", + action="store_true", + help="Start Web API server", +) +group.add_argument( + "-m", + "--mcp", + action="store_true", + help="Start MCP(Model Context Protocol) server", +) +group.add_argument( + "-r", + "--realtime", + action="store_true", + help="Start realtime inference UI(experimental)", +) +group.add_argument( + "-v", + "--version", + action="store_true", + help="Version", +) + +parser.add_argument( + "-b", + "--benchmark", + action="store_true", + help="Run inference benchmark on the selected device", +) +parser.add_argument( + "--lcm_model_id", + type=str, + help="Model ID or path,Default stabilityai/sd-turbo", + default="stabilityai/sd-turbo", +) +parser.add_argument( + "--openvino_lcm_model_id", + type=str, + help="OpenVINO Model ID or path,Default rupeshs/sd-turbo-openvino", + default="rupeshs/sd-turbo-openvino", +) +parser.add_argument( + "--prompt", + type=str, + help="Describe the image you want to generate", + default="", +) +parser.add_argument( + "--negative_prompt", + type=str, + help="Describe what you want to exclude from the generation", + default="", +) +parser.add_argument( + "--image_height", + type=int, + help="Height of the image", + default=512, +) +parser.add_argument( + "--image_width", + type=int, + help="Width of the image", + default=512, +) +parser.add_argument( + "--inference_steps", + type=int, + help="Number of steps,default : 1", + default=1, +) +parser.add_argument( + "--guidance_scale", + type=float, + help="Guidance scale,default : 1.0", + default=1.0, +) + +parser.add_argument( + "--number_of_images", + type=int, + help="Number of images to generate ,default : 1", + default=1, +) +parser.add_argument( + "--seed", + type=int, + help="Seed,default : -1 (disabled) ", + default=-1, +) +parser.add_argument( + "--use_openvino", + action="store_true", + help="Use OpenVINO model", +) + +parser.add_argument( + "--use_offline_model", + action="store_true", + help="Use offline model", +) +parser.add_argument( + "--clip_skip", + type=int, + help="CLIP Skip (1-12), default : 1 
(disabled) ", + default=1, +) +parser.add_argument( + "--token_merging", + type=float, + help="Token merging scale, 0.0 - 1.0, default : 0.0", + default=0.0, +) + +parser.add_argument( + "--use_safety_checker", + action="store_true", + help="Use safety checker", +) +parser.add_argument( + "--use_lcm_lora", + action="store_true", + help="Use LCM-LoRA", +) +parser.add_argument( + "--base_model_id", + type=str, + help="LCM LoRA base model ID,Default Lykon/dreamshaper-8", + default="Lykon/dreamshaper-8", +) +parser.add_argument( + "--lcm_lora_id", + type=str, + help="LCM LoRA model ID,Default latent-consistency/lcm-lora-sdv1-5", + default="latent-consistency/lcm-lora-sdv1-5", +) +parser.add_argument( + "-i", + "--interactive", + action="store_true", + help="Interactive CLI mode", +) +parser.add_argument( + "-t", + "--use_tiny_auto_encoder", + action="store_true", + help="Use tiny auto encoder for SD (TAESD)", +) +parser.add_argument( + "-f", + "--file", + type=str, + help="Input image for img2img mode", + default="", +) +parser.add_argument( + "--img2img", + action="store_true", + help="img2img mode; requires input file via -f argument", +) +parser.add_argument( + "--batch_count", + type=int, + help="Number of sequential generations", + default=1, +) +parser.add_argument( + "--strength", + type=float, + help="Denoising strength for img2img and Image variations", + default=0.3, +) +parser.add_argument( + "--sdupscale", + action="store_true", + help="Tiled SD upscale,works only for the resolution 512x512,(2x upscale)", +) +parser.add_argument( + "--upscale", + action="store_true", + help="EDSR SD upscale ", +) +parser.add_argument( + "--custom_settings", + type=str, + help="JSON file containing custom generation settings", + default=None, +) +parser.add_argument( + "--usejpeg", + action="store_true", + help="Images will be saved as JPEG format", +) +parser.add_argument( + "--noimagesave", + action="store_true", + help="Disable image saving", +) +parser.add_argument( + "--imagequality", type=int, help="Output image quality [0 to 100]", default=90 +) +parser.add_argument( + "--lora", + type=str, + help="LoRA model full path e.g D:\lora_models\CuteCartoon15V-LiberteRedmodModel-Cartoon-CuteCartoonAF.safetensors", + default=None, +) +parser.add_argument( + "--lora_weight", + type=float, + help="LoRA adapter weight [0 to 1.0]", + default=0.5, +) +parser.add_argument( + "--port", + type=int, + help="Web server port", + default=8000, +) + +args = parser.parse_args() + +if args.version: + print(APP_VERSION) + exit() + +# parser.print_help() +print("FastSD CPU - ", APP_VERSION) +show_system_info() +print(f"Using device : {constants.DEVICE}") + + +if args.webui: + app_settings = get_settings() +else: + app_settings = get_settings() + +print(f"Output path : {app_settings.settings.generated_images.path}") +ensure_path(app_settings.settings.generated_images.path) + +print(f"Found {len(app_settings.lcm_models)} LCM models in config/lcm-models.txt") +print( + f"Found {len(app_settings.stable_diffsuion_models)} stable diffusion models in config/stable-diffusion-models.txt" +) +print( + f"Found {len(app_settings.lcm_lora_models)} LCM-LoRA models in config/lcm-lora-models.txt" +) +print( + f"Found {len(app_settings.openvino_lcm_models)} OpenVINO LCM models in config/openvino-lcm-models.txt" +) + +if args.noimagesave: + app_settings.settings.generated_images.save_image = False +else: + app_settings.settings.generated_images.save_image = True + +app_settings.settings.generated_images.save_image_quality = 
args.imagequality + +if not args.realtime: + # To minimize realtime mode dependencies + from backend.upscale.upscaler import upscale_image + from frontend.cli_interactive import interactive_mode + +if args.gui: + from frontend.gui.ui import start_gui + + print("Starting desktop GUI mode(Qt)") + start_gui( + [], + app_settings, + ) +elif args.webui: + from frontend.webui.ui import start_webui + + print("Starting web UI mode") + start_webui( + args.share, + ) +elif args.realtime: + from frontend.webui.realtime_ui import start_realtime_text_to_image + + print("Starting realtime text to image(EXPERIMENTAL)") + start_realtime_text_to_image(args.share) +elif args.api: + from backend.api.web import start_web_server + + start_web_server(args.port) +elif args.mcp: + from backend.api.mcp_server import start_mcp_server + + start_mcp_server(args.port) +else: + context = get_context(InterfaceType.CLI) + config = app_settings.settings + + if args.use_openvino: + config.lcm_diffusion_setting.openvino_lcm_model_id = args.openvino_lcm_model_id + else: + config.lcm_diffusion_setting.lcm_model_id = args.lcm_model_id + + config.lcm_diffusion_setting.prompt = args.prompt + config.lcm_diffusion_setting.negative_prompt = args.negative_prompt + config.lcm_diffusion_setting.image_height = args.image_height + config.lcm_diffusion_setting.image_width = args.image_width + config.lcm_diffusion_setting.guidance_scale = args.guidance_scale + config.lcm_diffusion_setting.number_of_images = args.number_of_images + config.lcm_diffusion_setting.inference_steps = args.inference_steps + config.lcm_diffusion_setting.strength = args.strength + config.lcm_diffusion_setting.seed = args.seed + config.lcm_diffusion_setting.use_openvino = args.use_openvino + config.lcm_diffusion_setting.use_tiny_auto_encoder = args.use_tiny_auto_encoder + config.lcm_diffusion_setting.use_lcm_lora = args.use_lcm_lora + config.lcm_diffusion_setting.lcm_lora.base_model_id = args.base_model_id + config.lcm_diffusion_setting.lcm_lora.lcm_lora_id = args.lcm_lora_id + config.lcm_diffusion_setting.diffusion_task = DiffusionTask.text_to_image.value + config.lcm_diffusion_setting.lora.enabled = False + config.lcm_diffusion_setting.lora.path = args.lora + config.lcm_diffusion_setting.lora.weight = args.lora_weight + config.lcm_diffusion_setting.lora.fuse = True + if config.lcm_diffusion_setting.lora.path: + config.lcm_diffusion_setting.lora.enabled = True + if args.usejpeg: + config.generated_images.format = ImageFormat.JPEG.value.upper() + if args.seed > -1: + config.lcm_diffusion_setting.use_seed = True + else: + config.lcm_diffusion_setting.use_seed = False + config.lcm_diffusion_setting.use_offline_model = args.use_offline_model + config.lcm_diffusion_setting.clip_skip = args.clip_skip + config.lcm_diffusion_setting.token_merging = args.token_merging + config.lcm_diffusion_setting.use_safety_checker = args.use_safety_checker + + # Read custom settings from JSON file + custom_settings = {} + if args.custom_settings: + with open(args.custom_settings) as f: + custom_settings = json.load(f) + + # Basic ControlNet settings; if ControlNet is enabled, an image is + # required even in txt2img mode + config.lcm_diffusion_setting.controlnet = None + controlnet_settings_from_dict( + config.lcm_diffusion_setting, + custom_settings, + ) + + # Interactive mode + if args.interactive: + # wrapper(interactive_mode, config, context) + config.lcm_diffusion_setting.lora.fuse = False + interactive_mode(config, context) + + # Start of non-interactive CLI image generation + if 
args.img2img and args.file != "": + config.lcm_diffusion_setting.init_image = Image.open(args.file) + config.lcm_diffusion_setting.diffusion_task = DiffusionTask.image_to_image.value + elif args.img2img and args.file == "": + print("Error : You need to specify a file in img2img mode") + exit() + elif args.upscale and args.file == "" and args.custom_settings == None: + print("Error : You need to specify a file in SD upscale mode") + exit() + elif ( + args.prompt == "" + and args.file == "" + and args.custom_settings == None + and not args.benchmark + ): + print("Error : You need to provide a prompt") + exit() + + if args.upscale: + # image = Image.open(args.file) + output_path = FastStableDiffusionPaths.get_upscale_filepath( + args.file, + 2, + config.generated_images.format, + ) + result = upscale_image( + context, + args.file, + output_path, + 2, + ) + # Perform Tiled SD upscale (EXPERIMENTAL) + elif args.sdupscale: + if args.use_openvino: + config.lcm_diffusion_setting.strength = 0.3 + upscale_settings = None + if custom_settings != {}: + upscale_settings = custom_settings + filepath = args.file + output_format = config.generated_images.format + if upscale_settings: + filepath = upscale_settings["source_file"] + output_format = upscale_settings["output_format"].upper() + output_path = FastStableDiffusionPaths.get_upscale_filepath( + filepath, + 2, + output_format, + ) + + generate_upscaled_image( + config, + filepath, + config.lcm_diffusion_setting.strength, + upscale_settings=upscale_settings, + context=context, + tile_overlap=32 if config.lcm_diffusion_setting.use_openvino else 16, + output_path=output_path, + image_format=output_format, + ) + exit() + # If img2img argument is set and prompt is empty, use image variations mode + elif args.img2img and args.prompt == "": + for i in range(0, args.batch_count): + generate_image_variations( + config.lcm_diffusion_setting.init_image, args.strength + ) + else: + if args.benchmark: + print("Initializing benchmark...") + bench_lcm_setting = config.lcm_diffusion_setting + bench_lcm_setting.prompt = "a cat" + bench_lcm_setting.use_tiny_auto_encoder = False + context.generate_text_to_image( + settings=config, + device=DEVICE, + ) + + latencies = [] + + print("Starting benchmark please wait...") + for _ in range(3): + context.generate_text_to_image( + settings=config, + device=DEVICE, + ) + latencies.append(context.latency) + + avg_latency = sum(latencies) / 3 + + bench_lcm_setting.use_tiny_auto_encoder = True + + context.generate_text_to_image( + settings=config, + device=DEVICE, + ) + latencies = [] + for _ in range(3): + context.generate_text_to_image( + settings=config, + device=DEVICE, + ) + latencies.append(context.latency) + + avg_latency_taesd = sum(latencies) / 3 + + benchmark_name = "" + + if config.lcm_diffusion_setting.use_openvino: + benchmark_name = "OpenVINO" + else: + benchmark_name = "PyTorch" + + bench_model_id = "" + if bench_lcm_setting.use_openvino: + bench_model_id = bench_lcm_setting.openvino_lcm_model_id + elif bench_lcm_setting.use_lcm_lora: + bench_model_id = bench_lcm_setting.lcm_lora.base_model_id + else: + bench_model_id = bench_lcm_setting.lcm_model_id + + benchmark_result = [ + ["Device", f"{DEVICE.upper()},{get_device_name()}"], + ["Stable Diffusion Model", bench_model_id], + [ + "Image Size ", + f"{bench_lcm_setting.image_width}x{bench_lcm_setting.image_height}", + ], + [ + "Inference Steps", + f"{bench_lcm_setting.inference_steps}", + ], + [ + "Benchmark Passes", + 3, + ], + [ + "Average Latency", + 
f"{round(avg_latency, 3)} sec", + ], + [ + "Average Latency(TAESD* enabled)", + f"{round(avg_latency_taesd, 3)} sec", + ], + ] + print() + print( + f" FastSD Benchmark - {benchmark_name:8} " + ) + print(f"-" * 80) + for benchmark in benchmark_result: + print(f"{benchmark[0]:35} - {benchmark[1]}") + print(f"-" * 80) + print("*TAESD - Tiny AutoEncoder for Stable Diffusion") + + else: + for i in range(0, args.batch_count): + context.generate_text_to_image( + settings=config, + device=DEVICE, + ) diff --git a/app_settings.py b/app_settings.py new file mode 100644 index 0000000000000000000000000000000000000000..8a35193a5182c65ab058f9a3173ef26b52804de4 --- /dev/null +++ b/app_settings.py @@ -0,0 +1,124 @@ +from copy import deepcopy +from os import makedirs, path + +import yaml +from constants import ( + LCM_LORA_MODELS_FILE, + LCM_MODELS_FILE, + OPENVINO_LCM_MODELS_FILE, + SD_MODELS_FILE, +) +from paths import FastStableDiffusionPaths, join_paths +from utils import get_files_in_dir, get_models_from_text_file + +from models.settings import Settings + + +class AppSettings: + def __init__(self): + self.config_path = FastStableDiffusionPaths().get_app_settings_path() + self._stable_diffsuion_models = get_models_from_text_file( + FastStableDiffusionPaths().get_models_config_path(SD_MODELS_FILE) + ) + self._lcm_lora_models = get_models_from_text_file( + FastStableDiffusionPaths().get_models_config_path(LCM_LORA_MODELS_FILE) + ) + self._openvino_lcm_models = get_models_from_text_file( + FastStableDiffusionPaths().get_models_config_path(OPENVINO_LCM_MODELS_FILE) + ) + self._lcm_models = get_models_from_text_file( + FastStableDiffusionPaths().get_models_config_path(LCM_MODELS_FILE) + ) + self._gguf_diffusion_models = get_files_in_dir( + join_paths(FastStableDiffusionPaths().get_gguf_models_path(), "diffusion") + ) + self._gguf_clip_models = get_files_in_dir( + join_paths(FastStableDiffusionPaths().get_gguf_models_path(), "clip") + ) + self._gguf_vae_models = get_files_in_dir( + join_paths(FastStableDiffusionPaths().get_gguf_models_path(), "vae") + ) + self._gguf_t5xxl_models = get_files_in_dir( + join_paths(FastStableDiffusionPaths().get_gguf_models_path(), "t5xxl") + ) + self._config = None + + @property + def settings(self): + return self._config + + @property + def stable_diffsuion_models(self): + return self._stable_diffsuion_models + + @property + def openvino_lcm_models(self): + return self._openvino_lcm_models + + @property + def lcm_models(self): + return self._lcm_models + + @property + def lcm_lora_models(self): + return self._lcm_lora_models + + @property + def gguf_diffusion_models(self): + return self._gguf_diffusion_models + + @property + def gguf_clip_models(self): + return self._gguf_clip_models + + @property + def gguf_vae_models(self): + return self._gguf_vae_models + + @property + def gguf_t5xxl_models(self): + return self._gguf_t5xxl_models + + def load(self, skip_file=False): + if skip_file: + print("Skipping config file") + settings_dict = self._load_default() + self._config = Settings.model_validate(settings_dict) + else: + if not path.exists(self.config_path): + base_dir = path.dirname(self.config_path) + if not path.exists(base_dir): + makedirs(base_dir) + try: + print("Settings not found creating default settings") + with open(self.config_path, "w") as file: + yaml.dump( + self._load_default(), + file, + ) + except Exception as ex: + print(f"Error in creating settings : {ex}") + exit() + try: + with open(self.config_path) as file: + settings_dict = yaml.safe_load(file) + 
self._config = Settings.model_validate(settings_dict) + except Exception as ex: + print(f"Error in loading settings : {ex}") + + def save(self): + try: + with open(self.config_path, "w") as file: + tmp_cfg = deepcopy(self._config) + tmp_cfg.lcm_diffusion_setting.init_image = None + configurations = tmp_cfg.model_dump( + exclude=["init_image"], + ) + if configurations: + yaml.dump(configurations, file) + except Exception as ex: + print(f"Error in saving settings : {ex}") + + def _load_default(self) -> dict: + default_config = Settings() + return default_config.model_dump() diff --git a/backend/__init__.py b/backend/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/backend/annotators/canny_control.py b/backend/annotators/canny_control.py new file mode 100644 index 0000000000000000000000000000000000000000..a9cd68d6c35180cac6e63c394add2cfac04ca283 --- /dev/null +++ b/backend/annotators/canny_control.py @@ -0,0 +1,15 @@ +import numpy as np +from backend.annotators.control_interface import ControlInterface +from cv2 import Canny +from PIL import Image + + +class CannyControl(ControlInterface): + def get_control_image(self, image: Image) -> Image: + low_threshold = 100 + high_threshold = 200 + image = np.array(image) + image = Canny(image, low_threshold, high_threshold) + image = image[:, :, None] + image = np.concatenate([image, image, image], axis=2) + return Image.fromarray(image) diff --git a/backend/annotators/control_interface.py b/backend/annotators/control_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..fc5caa62d9a1a938b11b2dc900331a2d2604c5f9 --- /dev/null +++ b/backend/annotators/control_interface.py @@ -0,0 +1,12 @@ +from abc import ABC, abstractmethod + +from PIL import Image + + +class ControlInterface(ABC): + @abstractmethod + def get_control_image( + self, + image: Image, + ) -> Image: + pass diff --git a/backend/annotators/depth_control.py b/backend/annotators/depth_control.py new file mode 100644 index 0000000000000000000000000000000000000000..cccba88810c9523872784c2372fca154334e1ad5 --- /dev/null +++ b/backend/annotators/depth_control.py @@ -0,0 +1,15 @@ +import numpy as np +from backend.annotators.control_interface import ControlInterface +from PIL import Image +from transformers import pipeline + + +class DepthControl(ControlInterface): + def get_control_image(self, image: Image) -> Image: + depth_estimator = pipeline("depth-estimation") + image = depth_estimator(image)["depth"] + image = np.array(image) + image = image[:, :, None] + image = np.concatenate([image, image, image], axis=2) + image = Image.fromarray(image) + return image diff --git a/backend/annotators/image_control_factory.py b/backend/annotators/image_control_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..4b2da4920974aa62e76f0a4d841478dedaf0d9b4 --- /dev/null +++ b/backend/annotators/image_control_factory.py @@ -0,0 +1,31 @@ +from backend.annotators.canny_control import CannyControl +from backend.annotators.depth_control import DepthControl +from backend.annotators.lineart_control import LineArtControl +from backend.annotators.mlsd_control import MlsdControl +from backend.annotators.normal_control import NormalControl +from backend.annotators.pose_control import PoseControl +from backend.annotators.shuffle_control import ShuffleControl +from backend.annotators.softedge_control import SoftEdgeControl + + +class ImageControlFactory: + def create_control(self, 
controlnet_type: str): + if controlnet_type == "Canny": + return CannyControl() + elif controlnet_type == "Pose": + return PoseControl() + elif controlnet_type == "MLSD": + return MlsdControl() + elif controlnet_type == "Depth": + return DepthControl() + elif controlnet_type == "LineArt": + return LineArtControl() + elif controlnet_type == "Shuffle": + return ShuffleControl() + elif controlnet_type == "NormalBAE": + return NormalControl() + elif controlnet_type == "SoftEdge": + return SoftEdgeControl() + else: + print("Error: Control type not implemented!") + raise Exception("Error: Control type not implemented!") diff --git a/backend/annotators/lineart_control.py b/backend/annotators/lineart_control.py new file mode 100644 index 0000000000000000000000000000000000000000..c6775b71f0a48decd66e732dd58763b198e593af --- /dev/null +++ b/backend/annotators/lineart_control.py @@ -0,0 +1,11 @@ +import numpy as np +from backend.annotators.control_interface import ControlInterface +from controlnet_aux import LineartDetector +from PIL import Image + + +class LineArtControl(ControlInterface): + def get_control_image(self, image: Image) -> Image: + processor = LineartDetector.from_pretrained("lllyasviel/Annotators") + control_image = processor(image) + return control_image diff --git a/backend/annotators/mlsd_control.py b/backend/annotators/mlsd_control.py new file mode 100644 index 0000000000000000000000000000000000000000..80c0debe0bf5b45011bd8d2b751abae5c1d53071 --- /dev/null +++ b/backend/annotators/mlsd_control.py @@ -0,0 +1,10 @@ +from backend.annotators.control_interface import ControlInterface +from controlnet_aux import MLSDdetector +from PIL import Image + + +class MlsdControl(ControlInterface): + def get_control_image(self, image: Image) -> Image: + mlsd = MLSDdetector.from_pretrained("lllyasviel/ControlNet") + image = mlsd(image) + return image diff --git a/backend/annotators/normal_control.py b/backend/annotators/normal_control.py new file mode 100644 index 0000000000000000000000000000000000000000..7f22ed68360c5cda458be0b64a0bfcc18cd7acc2 --- /dev/null +++ b/backend/annotators/normal_control.py @@ -0,0 +1,10 @@ +from backend.annotators.control_interface import ControlInterface +from controlnet_aux import NormalBaeDetector +from PIL import Image + + +class NormalControl(ControlInterface): + def get_control_image(self, image: Image) -> Image: + processor = NormalBaeDetector.from_pretrained("lllyasviel/Annotators") + control_image = processor(image) + return control_image diff --git a/backend/annotators/pose_control.py b/backend/annotators/pose_control.py new file mode 100644 index 0000000000000000000000000000000000000000..87ca92f2a029bbc6c7187c6eaa5a65bac298677a --- /dev/null +++ b/backend/annotators/pose_control.py @@ -0,0 +1,10 @@ +from backend.annotators.control_interface import ControlInterface +from controlnet_aux import OpenposeDetector +from PIL import Image + + +class PoseControl(ControlInterface): + def get_control_image(self, image: Image) -> Image: + openpose = OpenposeDetector.from_pretrained("lllyasviel/ControlNet") + image = openpose(image) + return image diff --git a/backend/annotators/shuffle_control.py b/backend/annotators/shuffle_control.py new file mode 100644 index 0000000000000000000000000000000000000000..20c6e3dabedb17f22c8a38bd5b855d9b0591a6c1 --- /dev/null +++ b/backend/annotators/shuffle_control.py @@ -0,0 +1,10 @@ +from backend.annotators.control_interface import ControlInterface +from controlnet_aux import ContentShuffleDetector +from PIL import Image + + +class 
ShuffleControl(ControlInterface): + def get_control_image(self, image: Image) -> Image: + shuffle_processor = ContentShuffleDetector() + image = shuffle_processor(image) + return image diff --git a/backend/annotators/softedge_control.py b/backend/annotators/softedge_control.py new file mode 100644 index 0000000000000000000000000000000000000000..d11965712472588979b76932080a74b54c72fb14 --- /dev/null +++ b/backend/annotators/softedge_control.py @@ -0,0 +1,10 @@ +from backend.annotators.control_interface import ControlInterface +from controlnet_aux import PidiNetDetector +from PIL import Image + + +class SoftEdgeControl(ControlInterface): + def get_control_image(self, image: Image) -> Image: + processor = PidiNetDetector.from_pretrained("lllyasviel/Annotators") + control_image = processor(image) + return control_image diff --git a/backend/api/mcp_server.py b/backend/api/mcp_server.py new file mode 100644 index 0000000000000000000000000000000000000000..093b58a25fd71ad35a25d9b286f5cf53cd62fb68 --- /dev/null +++ b/backend/api/mcp_server.py @@ -0,0 +1,97 @@ +import platform + +import uvicorn +from backend.device import get_device_name +from backend.models.device import DeviceInfo +from constants import APP_VERSION, DEVICE +from context import Context +from fastapi import FastAPI, Request +from fastapi_mcp import FastApiMCP +from state import get_settings +from fastapi.middleware.cors import CORSMiddleware +from models.interface_types import InterfaceType +from fastapi.staticfiles import StaticFiles + +app_settings = get_settings() +app = FastAPI( + title="FastSD CPU", + description="Fast stable diffusion on CPU", + version=APP_VERSION, + license_info={ + "name": "MIT", + "identifier": "MIT", + }, + describe_all_responses=True, + describe_full_response_schema=True, +) +origins = ["*"] + +app.add_middleware( + CORSMiddleware, + allow_origins=origins, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) +print(app_settings.settings.lcm_diffusion_setting) + +context = Context(InterfaceType.API_SERVER) +app.mount("/results", StaticFiles(directory="results"), name="results") + + +@app.get( + "/info", + description="Get system information", + summary="Get system information", + operation_id="get_system_info", +) +async def info() -> dict: + device_info = DeviceInfo( + device_type=DEVICE, + device_name=get_device_name(), + os=platform.system(), + platform=platform.platform(), + processor=platform.processor(), + ) + return device_info.model_dump() + + +@app.post( + "/generate", + description="Generate image from text prompt", + summary="Text to image generation", + operation_id="generate", +) +async def generate( + prompt: str, + request: Request, +) -> str: + """ + Returns URL of the generated image for text prompt + """ + + app_settings.settings.lcm_diffusion_setting.prompt = prompt + images = context.generate_text_to_image(app_settings.settings) + image_names = context.save_images( + images, + app_settings.settings, + ) + url = request.url_for("results", path=image_names[0]) + image_url = f"The generated image available at the URL {url}" + return image_url + + +def start_mcp_server(port: int = 8000): + mcp = FastApiMCP( + app, + name="FastSDCPU MCP", + description="MCP server for FastSD CPU API", + base_url=f"http://localhost:{port}", + ) + + mcp.mount() + uvicorn.run( + app, + host="0.0.0.0", + port=port, + ) diff --git a/backend/api/models/response.py b/backend/api/models/response.py new file mode 100644 index 
0000000000000000000000000000000000000000..41b76726d60d749ce9cb78ffcf583c213168d83a --- /dev/null +++ b/backend/api/models/response.py @@ -0,0 +1,16 @@ +from typing import List + +from pydantic import BaseModel + + +class StableDiffusionResponse(BaseModel): + """ + Stable diffusion response model + + Attributes: + images (List[str]): List of JPEG image as base64 encoded + latency (float): Latency in seconds + """ + + images: List[str] + latency: float diff --git a/backend/api/web.py b/backend/api/web.py new file mode 100644 index 0000000000000000000000000000000000000000..26147c36c24d3b06e447eabfe303dd06f1a8bb9b --- /dev/null +++ b/backend/api/web.py @@ -0,0 +1,112 @@ +import platform + +import uvicorn +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware + +from backend.api.models.response import StableDiffusionResponse +from backend.base64_image import base64_image_to_pil, pil_image_to_base64_str +from backend.device import get_device_name +from backend.models.device import DeviceInfo +from backend.models.lcmdiffusion_setting import DiffusionTask, LCMDiffusionSetting +from constants import APP_VERSION, DEVICE +from context import Context +from models.interface_types import InterfaceType +from state import get_settings + +app_settings = get_settings() +app = FastAPI( + title="FastSD CPU", + description="Fast stable diffusion on CPU", + version=APP_VERSION, + license_info={ + "name": "MIT", + "identifier": "MIT", + }, + docs_url="/api/docs", + redoc_url="/api/redoc", + openapi_url="/api/openapi.json", +) +print(app_settings.settings.lcm_diffusion_setting) +origins = ["*"] +app.add_middleware( + CORSMiddleware, + allow_origins=origins, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) +context = Context(InterfaceType.API_SERVER) + + +@app.get("/api/") +async def root(): + return {"message": "Welcome to FastSD CPU API"} + + +@app.get( + "/api/info", + description="Get system information", + summary="Get system information", +) +async def info(): + device_info = DeviceInfo( + device_type=DEVICE, + device_name=get_device_name(), + os=platform.system(), + platform=platform.platform(), + processor=platform.processor(), + ) + return device_info.model_dump() + + +@app.get( + "/api/config", + description="Get current configuration", + summary="Get configurations", +) +async def config(): + return app_settings.settings + + +@app.get( + "/api/models", + description="Get available models", + summary="Get available models", +) +async def models(): + return { + "lcm_lora_models": app_settings.lcm_lora_models, + "stable_diffusion": app_settings.stable_diffsuion_models, + "openvino_models": app_settings.openvino_lcm_models, + "lcm_models": app_settings.lcm_models, + } + + +@app.post( + "/api/generate", + description="Generate image(Text to image,Image to Image)", + summary="Generate image(Text to image,Image to Image)", +) +async def generate(diffusion_config: LCMDiffusionSetting) -> StableDiffusionResponse: + app_settings.settings.lcm_diffusion_setting = diffusion_config + if diffusion_config.diffusion_task == DiffusionTask.image_to_image: + app_settings.settings.lcm_diffusion_setting.init_image = base64_image_to_pil( + diffusion_config.init_image + ) + + images = context.generate_text_to_image(app_settings.settings) + + images_base64 = [pil_image_to_base64_str(img) for img in images] + return StableDiffusionResponse( + latency=round(context.latency, 2), + images=images_base64, + ) + + +def start_web_server(port: int = 8000): + uvicorn.run( + app, + 
host="0.0.0.0", + port=port, + ) diff --git a/backend/base64_image.py b/backend/base64_image.py new file mode 100644 index 0000000000000000000000000000000000000000..597f75808d02e1f6aa40bea9d4fad7ce1933cf84 --- /dev/null +++ b/backend/base64_image.py @@ -0,0 +1,21 @@ +from io import BytesIO +from base64 import b64encode, b64decode +from PIL import Image + + +def pil_image_to_base64_str( + image: Image, + format: str = "JPEG", +) -> str: + buffer = BytesIO() + image.save(buffer, format=format) + buffer.seek(0) + img_base64 = b64encode(buffer.getvalue()).decode("utf-8") + return img_base64 + + +def base64_image_to_pil(base64_str) -> Image: + image_data = b64decode(base64_str) + image_buffer = BytesIO(image_data) + image = Image.open(image_buffer) + return image diff --git a/backend/controlnet.py b/backend/controlnet.py new file mode 100644 index 0000000000000000000000000000000000000000..34f961cec88db5a4a17b700180c66d3e79b043d7 --- /dev/null +++ b/backend/controlnet.py @@ -0,0 +1,90 @@ +import logging +from PIL import Image +from diffusers import ControlNetModel +from backend.models.lcmdiffusion_setting import ( + DiffusionTask, + ControlNetSetting, +) + + +# Prepares ControlNet adapters for use with FastSD CPU +# +# This function loads the ControlNet adapters defined by the +# _lcm_diffusion_setting.controlnet_ object and returns a dictionary +# with the pipeline arguments required to use the loaded adapters +def load_controlnet_adapters(lcm_diffusion_setting) -> dict: + controlnet_args = {} + if ( + lcm_diffusion_setting.controlnet is None + or not lcm_diffusion_setting.controlnet.enabled + ): + return controlnet_args + + logging.info("Loading ControlNet adapter") + controlnet_adapter = ControlNetModel.from_single_file( + lcm_diffusion_setting.controlnet.adapter_path, + # local_files_only=True, + use_safetensors=True, + ) + controlnet_args["controlnet"] = controlnet_adapter + return controlnet_args + + +# Updates the ControlNet pipeline arguments to use for image generation +# +# This function uses the contents of the _lcm_diffusion_setting.controlnet_ +# object to generate a dictionary with the corresponding pipeline arguments +# to be used for image generation; in particular, it sets the ControlNet control +# image and conditioning scale +def update_controlnet_arguments(lcm_diffusion_setting) -> dict: + controlnet_args = {} + if ( + lcm_diffusion_setting.controlnet is None + or not lcm_diffusion_setting.controlnet.enabled + ): + return controlnet_args + + controlnet_args["controlnet_conditioning_scale"] = ( + lcm_diffusion_setting.controlnet.conditioning_scale + ) + if lcm_diffusion_setting.diffusion_task == DiffusionTask.text_to_image.value: + controlnet_args["image"] = lcm_diffusion_setting.controlnet._control_image + elif lcm_diffusion_setting.diffusion_task == DiffusionTask.image_to_image.value: + controlnet_args["control_image"] = ( + lcm_diffusion_setting.controlnet._control_image + ) + return controlnet_args + + +# Helper function to adjust ControlNet settings from a dictionary +def controlnet_settings_from_dict( + lcm_diffusion_setting, + dictionary, +) -> None: + if lcm_diffusion_setting is None or dictionary is None: + logging.error("Invalid arguments!") + return + if ( + "controlnet" not in dictionary + or dictionary["controlnet"] is None + or len(dictionary["controlnet"]) == 0 + ): + logging.warning("ControlNet settings not found, ControlNet will be disabled") + lcm_diffusion_setting.controlnet = None + return + + controlnet = ControlNetSetting() + controlnet.enabled = 
dictionary["controlnet"][0]["enabled"] + controlnet.conditioning_scale = dictionary["controlnet"][0]["conditioning_scale"] + controlnet.adapter_path = dictionary["controlnet"][0]["adapter_path"] + controlnet._control_image = None + image_path = dictionary["controlnet"][0]["control_image"] + if controlnet.enabled: + try: + controlnet._control_image = Image.open(image_path) + except (AttributeError, FileNotFoundError) as err: + print(err) + if controlnet._control_image is None: + logging.error("Wrong ControlNet control image! Disabling ControlNet") + controlnet.enabled = False + lcm_diffusion_setting.controlnet = controlnet diff --git a/backend/device.py b/backend/device.py new file mode 100644 index 0000000000000000000000000000000000000000..cacb2a5197eae85eb2ec7e8bf1df25f6fe62202c --- /dev/null +++ b/backend/device.py @@ -0,0 +1,23 @@ +import platform +from constants import DEVICE +import torch +import openvino as ov + +core = ov.Core() + + +def is_openvino_device() -> bool: + if DEVICE.lower() == "cpu" or DEVICE.lower()[0] == "g" or DEVICE.lower()[0] == "n": + return True + else: + return False + + +def get_device_name() -> str: + if DEVICE == "cuda" or DEVICE == "mps": + default_gpu_index = torch.cuda.current_device() + return torch.cuda.get_device_name(default_gpu_index) + elif platform.system().lower() == "darwin": + return platform.processor() + elif is_openvino_device(): + return core.get_property(DEVICE.upper(), "FULL_DEVICE_NAME") diff --git a/backend/gguf/gguf_diffusion.py b/backend/gguf/gguf_diffusion.py new file mode 100644 index 0000000000000000000000000000000000000000..9060ddc8a29ae36586c354f8219b1a024b0932ba --- /dev/null +++ b/backend/gguf/gguf_diffusion.py @@ -0,0 +1,319 @@ +""" +Wrapper class to call the stablediffusion.cpp shared library for GGUF support +""" + +import ctypes +import platform +from ctypes import ( + POINTER, + c_bool, + c_char_p, + c_float, + c_int, + c_int64, + c_void_p, +) +from dataclasses import dataclass +from os import path +from typing import List, Any + +import numpy as np +from PIL import Image + +from backend.gguf.sdcpp_types import ( + RngType, + SampleMethod, + Schedule, + SDCPPLogLevel, + SDImage, + SdType, +) + + +@dataclass +class ModelConfig: + model_path: str = "" + clip_l_path: str = "" + t5xxl_path: str = "" + diffusion_model_path: str = "" + vae_path: str = "" + taesd_path: str = "" + control_net_path: str = "" + lora_model_dir: str = "" + embed_dir: str = "" + stacked_id_embed_dir: str = "" + vae_decode_only: bool = True + vae_tiling: bool = False + free_params_immediately: bool = False + n_threads: int = 4 + wtype: SdType = SdType.SD_TYPE_Q4_0 + rng_type: RngType = RngType.CUDA_RNG + schedule: Schedule = Schedule.DEFAULT + keep_clip_on_cpu: bool = False + keep_control_net_cpu: bool = False + keep_vae_on_cpu: bool = False + + +@dataclass +class Txt2ImgConfig: + prompt: str = "a man wearing sun glasses, highly detailed" + negative_prompt: str = "" + clip_skip: int = -1 + cfg_scale: float = 2.0 + guidance: float = 3.5 + width: int = 512 + height: int = 512 + sample_method: SampleMethod = SampleMethod.EULER_A + sample_steps: int = 1 + seed: int = -1 + batch_count: int = 2 + control_cond: Image = None + control_strength: float = 0.90 + style_strength: float = 0.5 + normalize_input: bool = False + input_id_images_path: bytes = b"" + + +class GGUFDiffusion: + """GGUF Diffusion + To support GGUF diffusion model based on stablediffusion.cpp + https://github.com/ggerganov/ggml/blob/master/docs/gguf.md + Implmented based on stablediffusion.h + 
""" + + def __init__( + self, + libpath: str, + config: ModelConfig, + logging_enabled: bool = False, + ): + sdcpp_shared_lib_path = self._get_sdcpp_shared_lib_path(libpath) + try: + self.libsdcpp = ctypes.CDLL(sdcpp_shared_lib_path) + except OSError as e: + print(f"Failed to load library {sdcpp_shared_lib_path}") + raise ValueError(f"Error: {e}") + + if not config.clip_l_path or not path.exists(config.clip_l_path): + raise ValueError( + "CLIP model file not found,please check readme.md for GGUF model usage" + ) + + if not config.t5xxl_path or not path.exists(config.t5xxl_path): + raise ValueError( + "T5XXL model file not found,please check readme.md for GGUF model usage" + ) + + if not config.diffusion_model_path or not path.exists( + config.diffusion_model_path + ): + raise ValueError( + "Diffusion model file not found,please check readme.md for GGUF model usage" + ) + + if not config.vae_path or not path.exists(config.vae_path): + raise ValueError( + "VAE model file not found,please check readme.md for GGUF model usage" + ) + + self.model_config = config + + self.libsdcpp.new_sd_ctx.argtypes = [ + c_char_p, # const char* model_path + c_char_p, # const char* clip_l_path + c_char_p, # const char* t5xxl_path + c_char_p, # const char* diffusion_model_path + c_char_p, # const char* vae_path + c_char_p, # const char* taesd_path + c_char_p, # const char* control_net_path_c_str + c_char_p, # const char* lora_model_dir + c_char_p, # const char* embed_dir_c_str + c_char_p, # const char* stacked_id_embed_dir_c_str + c_bool, # bool vae_decode_only + c_bool, # bool vae_tiling + c_bool, # bool free_params_immediately + c_int, # int n_threads + SdType, # enum sd_type_t wtype + RngType, # enum rng_type_t rng_type + Schedule, # enum schedule_t s + c_bool, # bool keep_clip_on_cpu + c_bool, # bool keep_control_net_cpu + c_bool, # bool keep_vae_on_cpu + ] + + self.libsdcpp.new_sd_ctx.restype = POINTER(c_void_p) + + self.sd_ctx = self.libsdcpp.new_sd_ctx( + self._str_to_bytes(self.model_config.model_path), + self._str_to_bytes(self.model_config.clip_l_path), + self._str_to_bytes(self.model_config.t5xxl_path), + self._str_to_bytes(self.model_config.diffusion_model_path), + self._str_to_bytes(self.model_config.vae_path), + self._str_to_bytes(self.model_config.taesd_path), + self._str_to_bytes(self.model_config.control_net_path), + self._str_to_bytes(self.model_config.lora_model_dir), + self._str_to_bytes(self.model_config.embed_dir), + self._str_to_bytes(self.model_config.stacked_id_embed_dir), + self.model_config.vae_decode_only, + self.model_config.vae_tiling, + self.model_config.free_params_immediately, + self.model_config.n_threads, + self.model_config.wtype, + self.model_config.rng_type, + self.model_config.schedule, + self.model_config.keep_clip_on_cpu, + self.model_config.keep_control_net_cpu, + self.model_config.keep_vae_on_cpu, + ) + + if logging_enabled: + self._set_logcallback() + + def _set_logcallback(self): + print("Setting logging callback") + # Define function callback + SdLogCallbackType = ctypes.CFUNCTYPE( + None, + SDCPPLogLevel, + ctypes.c_char_p, + ctypes.c_void_p, + ) + + self.libsdcpp.sd_set_log_callback.argtypes = [ + SdLogCallbackType, + ctypes.c_void_p, + ] + self.libsdcpp.sd_set_log_callback.restype = None + # Convert the Python callback to a C func pointer + self.c_log_callback = SdLogCallbackType( + self.log_callback + ) # prevent GC,keep callback as member variable + self.libsdcpp.sd_set_log_callback(self.c_log_callback, None) + + def _get_sdcpp_shared_lib_path( + self, + 
root_path: str, + ) -> str: + system_name = platform.system() + print(f"GGUF Diffusion on {system_name}") + lib_name = "stable-diffusion.dll" + sdcpp_lib_path = "" + + if system_name == "Windows": + sdcpp_lib_path = path.join(root_path, lib_name) + elif system_name == "Linux": + lib_name = "libstable-diffusion.so" + sdcpp_lib_path = path.join(root_path, lib_name) + elif system_name == "Darwin": + lib_name = "libstable-diffusion.dylib" + sdcpp_lib_path = path.join(root_path, lib_name) + else: + print("Unknown platform.") + + return sdcpp_lib_path + + @staticmethod + def log_callback( + level, + text, + data, + ): + print(f"{text.decode('utf-8')}", end="") + + def _str_to_bytes(self, in_str: str, encoding: str = "utf-8") -> bytes: + if in_str: + return in_str.encode(encoding) + else: + return b"" + + def generate_text2mg(self, txt2img_cfg: Txt2ImgConfig) -> List[Any]: + self.libsdcpp.txt2img.restype = POINTER(SDImage) + self.libsdcpp.txt2img.argtypes = [ + c_void_p, # sd_ctx_t* sd_ctx (pointer to context object) + c_char_p, # const char* prompt + c_char_p, # const char* negative_prompt + c_int, # int clip_skip + c_float, # float cfg_scale + c_float, # float guidance + c_int, # int width + c_int, # int height + SampleMethod, # enum sample_method_t sample_method + c_int, # int sample_steps + c_int64, # int64_t seed + c_int, # int batch_count + POINTER(SDImage), # const sd_image_t* control_cond (pointer to SDImage) + c_float, # float control_strength + c_float, # float style_strength + c_bool, # bool normalize_input + c_char_p, # const char* input_id_images_path + ] + + image_buffer = self.libsdcpp.txt2img( + self.sd_ctx, + self._str_to_bytes(txt2img_cfg.prompt), + self._str_to_bytes(txt2img_cfg.negative_prompt), + txt2img_cfg.clip_skip, + txt2img_cfg.cfg_scale, + txt2img_cfg.guidance, + txt2img_cfg.width, + txt2img_cfg.height, + txt2img_cfg.sample_method, + txt2img_cfg.sample_steps, + txt2img_cfg.seed, + txt2img_cfg.batch_count, + txt2img_cfg.control_cond, + txt2img_cfg.control_strength, + txt2img_cfg.style_strength, + txt2img_cfg.normalize_input, + txt2img_cfg.input_id_images_path, + ) + + images = self._get_sd_images_from_buffer( + image_buffer, + txt2img_cfg.batch_count, + ) + + return images + + def _get_sd_images_from_buffer( + self, + image_buffer: Any, + batch_count: int, + ) -> List[Any]: + images = [] + if image_buffer: + for i in range(batch_count): + image = image_buffer[i] + print( + f"Generated image: {image.width}x{image.height} with {image.channel} channels" + ) + + width = image.width + height = image.height + channels = image.channel + pixel_data = np.ctypeslib.as_array( + image.data, shape=(height, width, channels) + ) + + if channels == 1: + pil_image = Image.fromarray(pixel_data.squeeze(), mode="L") + elif channels == 3: + pil_image = Image.fromarray(pixel_data, mode="RGB") + elif channels == 4: + pil_image = Image.fromarray(pixel_data, mode="RGBA") + else: + raise ValueError(f"Unsupported number of channels: {channels}") + + images.append(pil_image) + return images + + def terminate(self): + if self.libsdcpp: + if self.sd_ctx: + self.libsdcpp.free_sd_ctx.argtypes = [c_void_p] + self.libsdcpp.free_sd_ctx.restype = None + self.libsdcpp.free_sd_ctx(self.sd_ctx) + del self.sd_ctx + self.sd_ctx = None + del self.libsdcpp + self.libsdcpp = None diff --git a/backend/gguf/sdcpp_types.py b/backend/gguf/sdcpp_types.py new file mode 100644 index 0000000000000000000000000000000000000000..e8cc81bdf45cd7ec6d41ae6403e14391e2eff361 --- /dev/null +++ b/backend/gguf/sdcpp_types.py @@ -0,0 
+1,104 @@ +""" +Ctypes for stablediffusion.cpp shared library +This is as per the stablediffusion.h file +""" + +from enum import IntEnum +from ctypes import ( + c_int, + c_uint32, + c_uint8, + POINTER, + Structure, +) + + +class CtypesEnum(IntEnum): + """A ctypes-compatible IntEnum superclass.""" + + @classmethod + def from_param(cls, obj): + return int(obj) + + +class RngType(CtypesEnum): + STD_DEFAULT_RNG = 0 + CUDA_RNG = 1 + + +class SampleMethod(CtypesEnum): + EULER_A = 0 + EULER = 1 + HEUN = 2 + DPM2 = 3 + DPMPP2S_A = 4 + DPMPP2M = 5 + DPMPP2Mv2 = 6 + IPNDM = 7 + IPNDM_V = 7 + LCM = 8 + N_SAMPLE_METHODS = 9 + + +class Schedule(CtypesEnum): + DEFAULT = 0 + DISCRETE = 1 + KARRAS = 2 + EXPONENTIAL = 3 + AYS = 4 + GITS = 5 + N_SCHEDULES = 5 + + +class SdType(CtypesEnum): + SD_TYPE_F32 = 0 + SD_TYPE_F16 = 1 + SD_TYPE_Q4_0 = 2 + SD_TYPE_Q4_1 = 3 + # SD_TYPE_Q4_2 = 4, support has been removed + # SD_TYPE_Q4_3 = 5, support has been removed + SD_TYPE_Q5_0 = 6 + SD_TYPE_Q5_1 = 7 + SD_TYPE_Q8_0 = 8 + SD_TYPE_Q8_1 = 9 + SD_TYPE_Q2_K = 10 + SD_TYPE_Q3_K = 11 + SD_TYPE_Q4_K = 12 + SD_TYPE_Q5_K = 13 + SD_TYPE_Q6_K = 14 + SD_TYPE_Q8_K = 15 + SD_TYPE_IQ2_XXS = 16 + SD_TYPE_IQ2_XS = 17 + SD_TYPE_IQ3_XXS = 18 + SD_TYPE_IQ1_S = 19 + SD_TYPE_IQ4_NL = 20 + SD_TYPE_IQ3_S = 21 + SD_TYPE_IQ2_S = 22 + SD_TYPE_IQ4_XS = 23 + SD_TYPE_I8 = 24 + SD_TYPE_I16 = 25 + SD_TYPE_I32 = 26 + SD_TYPE_I64 = 27 + SD_TYPE_F64 = 28 + SD_TYPE_IQ1_M = 29 + SD_TYPE_BF16 = 30 + SD_TYPE_Q4_0_4_4 = 31 + SD_TYPE_Q4_0_4_8 = 32 + SD_TYPE_Q4_0_8_8 = 33 + SD_TYPE_COUNT = 34 + + +class SDImage(Structure): + _fields_ = [ + ("width", c_uint32), + ("height", c_uint32), + ("channel", c_uint32), + ("data", POINTER(c_uint8)), + ] + + +class SDCPPLogLevel(c_int): + SD_LOG_LEVEL_DEBUG = 0 + SD_LOG_LEVEL_INFO = 1 + SD_LOG_LEVEL_WARNING = 2 + SD_LOG_LEVEL_ERROR = 3 diff --git a/backend/image_saver.py b/backend/image_saver.py new file mode 100644 index 0000000000000000000000000000000000000000..8c3e10504ac4cbeb8a72d29e80bf7d6b9f8531cb --- /dev/null +++ b/backend/image_saver.py @@ -0,0 +1,75 @@ +import json +from os import path, mkdir +from typing import Any +from uuid import uuid4 +from backend.models.lcmdiffusion_setting import LCMDiffusionSetting +from utils import get_image_file_extension + + +def get_exclude_keys(): + exclude_keys = { + "init_image": True, + "generated_images": True, + "lora": { + "models_dir": True, + "path": True, + }, + "dirs": True, + "controlnet": { + "adapter_path": True, + }, + } + return exclude_keys + + +class ImageSaver: + @staticmethod + def save_images( + output_path: str, + images: Any, + folder_name: str = "", + format: str = "PNG", + jpeg_quality: int = 90, + lcm_diffusion_setting: LCMDiffusionSetting = None, + ) -> list[str]: + gen_id = uuid4() + image_ids = [] + + if images: + image_seeds = [] + + for index, image in enumerate(images): + + image_seed = image.info.get('image_seed') + if image_seed is not None: + image_seeds.append(image_seed) + + if not path.exists(output_path): + mkdir(output_path) + + if folder_name: + out_path = path.join( + output_path, + folder_name, + ) + else: + out_path = output_path + + if not path.exists(out_path): + mkdir(out_path) + image_extension = get_image_file_extension(format) + image_file_name = f"{gen_id}-{index+1}{image_extension}" + image_ids.append(image_file_name) + image.save(path.join(out_path, image_file_name), quality = jpeg_quality) + if lcm_diffusion_setting: + data = lcm_diffusion_setting.model_dump(exclude=get_exclude_keys()) + if image_seeds: + data['image_seeds'] = 
image_seeds + with open(path.join(out_path, f"{gen_id}.json"), "w") as json_file: + json.dump( + data, + json_file, + indent=4, + ) + return image_ids + diff --git a/backend/lcm_text_to_image.py b/backend/lcm_text_to_image.py new file mode 100644 index 0000000000000000000000000000000000000000..3ac68f24597a638429eb98d3cb347c6207e62859 --- /dev/null +++ b/backend/lcm_text_to_image.py @@ -0,0 +1,577 @@ +import gc +from math import ceil +from typing import Any, List +import random + +import numpy as np +import torch +from backend.device import is_openvino_device +from backend.controlnet import ( + load_controlnet_adapters, + update_controlnet_arguments, +) +from backend.models.lcmdiffusion_setting import ( + DiffusionTask, + LCMDiffusionSetting, + LCMLora, +) +from backend.openvino.pipelines import ( + get_ov_image_to_image_pipeline, + get_ov_text_to_image_pipeline, + ov_load_taesd, +) +from backend.pipelines.lcm import ( + get_image_to_image_pipeline, + get_lcm_model_pipeline, + load_taesd, +) +from backend.pipelines.lcm_lora import get_lcm_lora_pipeline +from constants import DEVICE, GGUF_THREADS +from diffusers import LCMScheduler +from image_ops import resize_pil_image +from backend.openvino.flux_pipeline import get_flux_pipeline +from backend.openvino.ov_hc_stablediffusion_pipeline import OvHcLatentConsistency +from backend.gguf.gguf_diffusion import ( + GGUFDiffusion, + ModelConfig, + Txt2ImgConfig, + SampleMethod, +) +from paths import get_app_path +from pprint import pprint + +try: + # support for token merging; keeping it optional for now + import tomesd +except ImportError: + print("tomesd library unavailable; disabling token merging support") + tomesd = None + + +class LCMTextToImage: + def __init__( + self, + device: str = "cpu", + ) -> None: + self.pipeline = None + self.use_openvino = False + self.device = "" + self.previous_model_id = None + self.previous_use_tae_sd = False + self.previous_use_lcm_lora = False + self.previous_ov_model_id = "" + self.previous_token_merging = 0.0 + self.previous_safety_checker = False + self.previous_use_openvino = False + self.img_to_img_pipeline = None + self.is_openvino_init = False + self.previous_lora = None + self.task_type = DiffusionTask.text_to_image + self.previous_use_gguf_model = False + self.previous_gguf_model = None + self.torch_data_type = ( + torch.float32 if is_openvino_device() or DEVICE == "mps" else torch.float16 + ) + self.ov_model_id = None + print(f"Torch datatype : {self.torch_data_type}") + + def _pipeline_to_device(self): + print(f"Pipeline device : {DEVICE}") + print(f"Pipeline dtype : {self.torch_data_type}") + self.pipeline.to( + torch_device=DEVICE, + torch_dtype=self.torch_data_type, + ) + + def _add_freeu(self): + pipeline_class = self.pipeline.__class__.__name__ + if isinstance(self.pipeline.scheduler, LCMScheduler): + if pipeline_class == "StableDiffusionPipeline": + print("Add FreeU - SD") + self.pipeline.enable_freeu( + s1=0.9, + s2=0.2, + b1=1.2, + b2=1.4, + ) + elif pipeline_class == "StableDiffusionXLPipeline": + print("Add FreeU - SDXL") + self.pipeline.enable_freeu( + s1=0.6, + s2=0.4, + b1=1.1, + b2=1.2, + ) + + def _enable_vae_tiling(self): + self.pipeline.vae.enable_tiling() + + def _update_lcm_scheduler_params(self): + if isinstance(self.pipeline.scheduler, LCMScheduler): + self.pipeline.scheduler = LCMScheduler.from_config( + self.pipeline.scheduler.config, + beta_start=0.001, + beta_end=0.01, + ) + + def _is_hetero_pipeline(self) -> bool: + return "square" in self.ov_model_id.lower() + + def 
_load_ov_hetero_pipeline(self): + print("Loading Heterogeneous Compute pipeline") + if DEVICE.upper() == "NPU": + device = ["NPU", "NPU", "NPU"] + self.pipeline = OvHcLatentConsistency(self.ov_model_id, device) + else: + self.pipeline = OvHcLatentConsistency(self.ov_model_id) + + def _generate_images_hetero_compute( + self, + lcm_diffusion_setting: LCMDiffusionSetting, + ): + print("Using OpenVINO ") + if lcm_diffusion_setting.diffusion_task == DiffusionTask.text_to_image.value: + return [ + self.pipeline.generate( + prompt=lcm_diffusion_setting.prompt, + neg_prompt=lcm_diffusion_setting.negative_prompt, + init_image=None, + strength=1.0, + num_inference_steps=lcm_diffusion_setting.inference_steps, + ) + ] + else: + return [ + self.pipeline.generate( + prompt=lcm_diffusion_setting.prompt, + neg_prompt=lcm_diffusion_setting.negative_prompt, + init_image=lcm_diffusion_setting.init_image, + strength=lcm_diffusion_setting.strength, + num_inference_steps=lcm_diffusion_setting.inference_steps, + ) + ] + + def _is_valid_mode( + self, + modes: List, + ) -> bool: + return modes.count(True) == 1 or modes.count(False) == 3 + + def _validate_mode( + self, + modes: List, + ) -> None: + if not self._is_valid_mode(modes): + raise ValueError("Invalid mode,delete configs/settings.yaml and retry!") + + def init( + self, + device: str = "cpu", + lcm_diffusion_setting: LCMDiffusionSetting = LCMDiffusionSetting(), + ) -> None: + # Mode validation either LCM LoRA or OpenVINO or GGUF + + modes = [ + lcm_diffusion_setting.use_gguf_model, + lcm_diffusion_setting.use_openvino, + lcm_diffusion_setting.use_lcm_lora, + ] + self._validate_mode(modes) + self.device = device + self.use_openvino = lcm_diffusion_setting.use_openvino + model_id = lcm_diffusion_setting.lcm_model_id + use_local_model = lcm_diffusion_setting.use_offline_model + use_tiny_auto_encoder = lcm_diffusion_setting.use_tiny_auto_encoder + use_lora = lcm_diffusion_setting.use_lcm_lora + lcm_lora: LCMLora = lcm_diffusion_setting.lcm_lora + token_merging = lcm_diffusion_setting.token_merging + self.ov_model_id = lcm_diffusion_setting.openvino_lcm_model_id + + if lcm_diffusion_setting.diffusion_task == DiffusionTask.image_to_image.value: + lcm_diffusion_setting.init_image = resize_pil_image( + lcm_diffusion_setting.init_image, + lcm_diffusion_setting.image_width, + lcm_diffusion_setting.image_height, + ) + + if ( + self.pipeline is None + or self.previous_model_id != model_id + or self.previous_use_tae_sd != use_tiny_auto_encoder + or self.previous_lcm_lora_base_id != lcm_lora.base_model_id + or self.previous_lcm_lora_id != lcm_lora.lcm_lora_id + or self.previous_use_lcm_lora != use_lora + or self.previous_ov_model_id != self.ov_model_id + or self.previous_token_merging != token_merging + or self.previous_safety_checker != lcm_diffusion_setting.use_safety_checker + or self.previous_use_openvino != lcm_diffusion_setting.use_openvino + or self.previous_use_gguf_model != lcm_diffusion_setting.use_gguf_model + or self.previous_gguf_model != lcm_diffusion_setting.gguf_model + or ( + self.use_openvino + and ( + self.previous_task_type != lcm_diffusion_setting.diffusion_task + or self.previous_lora != lcm_diffusion_setting.lora + ) + ) + or lcm_diffusion_setting.rebuild_pipeline + ): + if self.use_openvino and is_openvino_device(): + if self.pipeline: + del self.pipeline + self.pipeline = None + gc.collect() + self.is_openvino_init = True + if ( + lcm_diffusion_setting.diffusion_task + == DiffusionTask.text_to_image.value + ): + print( + f"***** Init Text to 
image (OpenVINO) - {self.ov_model_id} *****" + ) + if "flux" in self.ov_model_id.lower(): + print("Loading OpenVINO Flux pipeline") + self.pipeline = get_flux_pipeline( + self.ov_model_id, + lcm_diffusion_setting.use_tiny_auto_encoder, + ) + elif self._is_hetero_pipeline(): + self._load_ov_hetero_pipeline() + else: + self.pipeline = get_ov_text_to_image_pipeline( + self.ov_model_id, + use_local_model, + ) + elif ( + lcm_diffusion_setting.diffusion_task + == DiffusionTask.image_to_image.value + ): + if not self.pipeline and self._is_hetero_pipeline(): + self._load_ov_hetero_pipeline() + else: + print( + f"***** Image to image (OpenVINO) - {self.ov_model_id} *****" + ) + self.pipeline = get_ov_image_to_image_pipeline( + self.ov_model_id, + use_local_model, + ) + elif lcm_diffusion_setting.use_gguf_model: + model = lcm_diffusion_setting.gguf_model.diffusion_path + print(f"***** Init Text to image (GGUF) - {model} *****") + # if self.pipeline: + # self.pipeline.terminate() + # del self.pipeline + # self.pipeline = None + self._init_gguf_diffusion(lcm_diffusion_setting) + else: + if self.pipeline or self.img_to_img_pipeline: + self.pipeline = None + self.img_to_img_pipeline = None + gc.collect() + + controlnet_args = load_controlnet_adapters(lcm_diffusion_setting) + if use_lora: + print( + f"***** Init LCM-LoRA pipeline - {lcm_lora.base_model_id} *****" + ) + self.pipeline = get_lcm_lora_pipeline( + lcm_lora.base_model_id, + lcm_lora.lcm_lora_id, + use_local_model, + torch_data_type=self.torch_data_type, + pipeline_args=controlnet_args, + ) + + else: + print(f"***** Init LCM Model pipeline - {model_id} *****") + self.pipeline = get_lcm_model_pipeline( + model_id, + use_local_model, + controlnet_args, + ) + + self.img_to_img_pipeline = get_image_to_image_pipeline(self.pipeline) + + if tomesd and token_merging > 0.001: + print(f"***** Token Merging: {token_merging} *****") + tomesd.apply_patch(self.pipeline, ratio=token_merging) + tomesd.apply_patch(self.img_to_img_pipeline, ratio=token_merging) + + if use_tiny_auto_encoder: + if self.use_openvino and is_openvino_device(): + if self.pipeline.__class__.__name__ != "OVFluxPipeline": + print("Using Tiny Auto Encoder (OpenVINO)") + ov_load_taesd( + self.pipeline, + use_local_model, + ) + else: + print("Using Tiny Auto Encoder") + load_taesd( + self.pipeline, + use_local_model, + self.torch_data_type, + ) + load_taesd( + self.img_to_img_pipeline, + use_local_model, + self.torch_data_type, + ) + + if not self.use_openvino and not is_openvino_device(): + self._pipeline_to_device() + + if not self._is_hetero_pipeline(): + if ( + lcm_diffusion_setting.diffusion_task + == DiffusionTask.image_to_image.value + and lcm_diffusion_setting.use_openvino + ): + self.pipeline.scheduler = LCMScheduler.from_config( + self.pipeline.scheduler.config, + ) + else: + if not lcm_diffusion_setting.use_gguf_model: + self._update_lcm_scheduler_params() + + if use_lora: + self._add_freeu() + + self.previous_model_id = model_id + self.previous_ov_model_id = self.ov_model_id + self.previous_use_tae_sd = use_tiny_auto_encoder + self.previous_lcm_lora_base_id = lcm_lora.base_model_id + self.previous_lcm_lora_id = lcm_lora.lcm_lora_id + self.previous_use_lcm_lora = use_lora + self.previous_token_merging = lcm_diffusion_setting.token_merging + self.previous_safety_checker = lcm_diffusion_setting.use_safety_checker + self.previous_use_openvino = lcm_diffusion_setting.use_openvino + self.previous_task_type = lcm_diffusion_setting.diffusion_task + self.previous_lora = 
lcm_diffusion_setting.lora.model_copy(deep=True) + self.previous_use_gguf_model = lcm_diffusion_setting.use_gguf_model + self.previous_gguf_model = lcm_diffusion_setting.gguf_model.model_copy( + deep=True + ) + lcm_diffusion_setting.rebuild_pipeline = False + if ( + lcm_diffusion_setting.diffusion_task + == DiffusionTask.text_to_image.value + ): + print(f"Pipeline : {self.pipeline}") + elif ( + lcm_diffusion_setting.diffusion_task + == DiffusionTask.image_to_image.value + ): + if self.use_openvino and is_openvino_device(): + print(f"Pipeline : {self.pipeline}") + else: + print(f"Pipeline : {self.img_to_img_pipeline}") + if self.use_openvino: + if lcm_diffusion_setting.lora.enabled: + print("Warning: Lora models not supported on OpenVINO mode") + elif not lcm_diffusion_setting.use_gguf_model: + adapters = self.pipeline.get_active_adapters() + print(f"Active adapters : {adapters}") + + def _get_timesteps(self): + time_steps = self.pipeline.scheduler.config.get("timesteps") + time_steps_value = [int(time_steps)] if time_steps else None + return time_steps_value + + def generate( + self, + lcm_diffusion_setting: LCMDiffusionSetting, + reshape: bool = False, + ) -> Any: + guidance_scale = lcm_diffusion_setting.guidance_scale + img_to_img_inference_steps = lcm_diffusion_setting.inference_steps + check_step_value = int( + lcm_diffusion_setting.inference_steps * lcm_diffusion_setting.strength + ) + if ( + lcm_diffusion_setting.diffusion_task == DiffusionTask.image_to_image.value + and check_step_value < 1 + ): + img_to_img_inference_steps = ceil(1 / lcm_diffusion_setting.strength) + print( + f"Strength: {lcm_diffusion_setting.strength},{img_to_img_inference_steps}" + ) + + pipeline_extra_args = {} + + if lcm_diffusion_setting.use_seed: + cur_seed = lcm_diffusion_setting.seed + # for multiple images with a fixed seed, use sequential seeds + seeds = [ + (cur_seed + i) for i in range(lcm_diffusion_setting.number_of_images) + ] + else: + seeds = [ + random.randint(0, 999999999) + for i in range(lcm_diffusion_setting.number_of_images) + ] + + if self.use_openvino: + # no support for generators; try at least to ensure reproducible results for single images + np.random.seed(seeds[0]) + if self._is_hetero_pipeline(): + torch.manual_seed(seeds[0]) + lcm_diffusion_setting.seed = seeds[0] + else: + pipeline_extra_args["generator"] = [ + torch.Generator(device=self.device).manual_seed(s) for s in seeds + ] + + is_openvino_pipe = lcm_diffusion_setting.use_openvino and is_openvino_device() + if is_openvino_pipe and not self._is_hetero_pipeline(): + print("Using OpenVINO") + if reshape and not self.is_openvino_init: + print("Reshape and compile") + self.pipeline.reshape( + batch_size=-1, + height=lcm_diffusion_setting.image_height, + width=lcm_diffusion_setting.image_width, + num_images_per_prompt=lcm_diffusion_setting.number_of_images, + ) + self.pipeline.compile() + + if self.is_openvino_init: + self.is_openvino_init = False + + if is_openvino_pipe and self._is_hetero_pipeline(): + return self._generate_images_hetero_compute(lcm_diffusion_setting) + elif lcm_diffusion_setting.use_gguf_model: + return self._generate_images_gguf(lcm_diffusion_setting) + + if lcm_diffusion_setting.clip_skip > 1: + # We follow the convention that "CLIP Skip == 2" means "skip + # the last layer", so "CLIP Skip == 1" means "no skipping" + pipeline_extra_args["clip_skip"] = lcm_diffusion_setting.clip_skip - 1 + + if not lcm_diffusion_setting.use_safety_checker: + self.pipeline.safety_checker = None + if ( + 
lcm_diffusion_setting.diffusion_task + == DiffusionTask.image_to_image.value + and not is_openvino_pipe + ): + self.img_to_img_pipeline.safety_checker = None + + if ( + not lcm_diffusion_setting.use_lcm_lora + and not lcm_diffusion_setting.use_openvino + and lcm_diffusion_setting.guidance_scale != 1.0 + ): + print("Not using LCM-LoRA so setting guidance_scale 1.0") + guidance_scale = 1.0 + + controlnet_args = update_controlnet_arguments(lcm_diffusion_setting) + if lcm_diffusion_setting.use_openvino: + if ( + lcm_diffusion_setting.diffusion_task + == DiffusionTask.text_to_image.value + ): + result_images = self.pipeline( + prompt=lcm_diffusion_setting.prompt, + negative_prompt=lcm_diffusion_setting.negative_prompt, + num_inference_steps=lcm_diffusion_setting.inference_steps, + guidance_scale=guidance_scale, + width=lcm_diffusion_setting.image_width, + height=lcm_diffusion_setting.image_height, + num_images_per_prompt=lcm_diffusion_setting.number_of_images, + ).images + elif ( + lcm_diffusion_setting.diffusion_task + == DiffusionTask.image_to_image.value + ): + result_images = self.pipeline( + image=lcm_diffusion_setting.init_image, + strength=lcm_diffusion_setting.strength, + prompt=lcm_diffusion_setting.prompt, + negative_prompt=lcm_diffusion_setting.negative_prompt, + num_inference_steps=img_to_img_inference_steps * 3, + guidance_scale=guidance_scale, + num_images_per_prompt=lcm_diffusion_setting.number_of_images, + ).images + + else: + if ( + lcm_diffusion_setting.diffusion_task + == DiffusionTask.text_to_image.value + ): + result_images = self.pipeline( + prompt=lcm_diffusion_setting.prompt, + negative_prompt=lcm_diffusion_setting.negative_prompt, + num_inference_steps=lcm_diffusion_setting.inference_steps, + guidance_scale=guidance_scale, + width=lcm_diffusion_setting.image_width, + height=lcm_diffusion_setting.image_height, + num_images_per_prompt=lcm_diffusion_setting.number_of_images, + timesteps=self._get_timesteps(), + **pipeline_extra_args, + **controlnet_args, + ).images + + elif ( + lcm_diffusion_setting.diffusion_task + == DiffusionTask.image_to_image.value + ): + result_images = self.img_to_img_pipeline( + image=lcm_diffusion_setting.init_image, + strength=lcm_diffusion_setting.strength, + prompt=lcm_diffusion_setting.prompt, + negative_prompt=lcm_diffusion_setting.negative_prompt, + num_inference_steps=img_to_img_inference_steps, + guidance_scale=guidance_scale, + width=lcm_diffusion_setting.image_width, + height=lcm_diffusion_setting.image_height, + num_images_per_prompt=lcm_diffusion_setting.number_of_images, + **pipeline_extra_args, + **controlnet_args, + ).images + + for i, seed in enumerate(seeds): + result_images[i].info["image_seed"] = seed + + return result_images + + def _init_gguf_diffusion( + self, + lcm_diffusion_setting: LCMDiffusionSetting, + ): + config = ModelConfig() + config.model_path = lcm_diffusion_setting.gguf_model.diffusion_path + config.diffusion_model_path = lcm_diffusion_setting.gguf_model.diffusion_path + config.clip_l_path = lcm_diffusion_setting.gguf_model.clip_path + config.t5xxl_path = lcm_diffusion_setting.gguf_model.t5xxl_path + config.vae_path = lcm_diffusion_setting.gguf_model.vae_path + config.n_threads = GGUF_THREADS + print(f"GGUF Threads : {GGUF_THREADS} ") + print("GGUF - Model config") + pprint(lcm_diffusion_setting.gguf_model.model_dump()) + self.pipeline = GGUFDiffusion( + get_app_path(), # Place DLL in fastsdcpu folder + config, + True, + ) + + def _generate_images_gguf( + self, + lcm_diffusion_setting: LCMDiffusionSetting, + ): + 
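+ # Descriptive note (added): map the LCMDiffusionSetting fields onto the GGUF
+ # backend's Txt2ImgConfig; only the text-to-image task is handled here, and a
+ # seed of -1 signals that no fixed seed was requested.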
if lcm_diffusion_setting.diffusion_task == DiffusionTask.text_to_image.value: + t2iconfig = Txt2ImgConfig() + t2iconfig.prompt = lcm_diffusion_setting.prompt + t2iconfig.batch_count = lcm_diffusion_setting.number_of_images + t2iconfig.cfg_scale = lcm_diffusion_setting.guidance_scale + t2iconfig.height = lcm_diffusion_setting.image_height + t2iconfig.width = lcm_diffusion_setting.image_width + t2iconfig.sample_steps = lcm_diffusion_setting.inference_steps + t2iconfig.sample_method = SampleMethod.EULER + if lcm_diffusion_setting.use_seed: + t2iconfig.seed = lcm_diffusion_setting.seed + else: + t2iconfig.seed = -1 + + return self.pipeline.generate_text2mg(t2iconfig) diff --git a/backend/lora.py b/backend/lora.py new file mode 100644 index 0000000000000000000000000000000000000000..369f54f9577c391222331770093f3531b25258ae --- /dev/null +++ b/backend/lora.py @@ -0,0 +1,136 @@ +import glob +from os import path +from paths import get_file_name, FastStableDiffusionPaths +from pathlib import Path + + +# A basic class to keep track of the currently loaded LoRAs and +# their weights; the diffusers function \c get_active_adapters() +# returns a list of adapter names but not their weights so we need +# a way to keep track of the current LoRA weights to set whenever +# a new LoRA is loaded +class _lora_info: + def __init__( + self, + path: str, + weight: float, + ): + self.path = path + self.adapter_name = get_file_name(path) + self.weight = weight + + def __del__(self): + self.path = None + self.adapter_name = None + + +_loaded_loras = [] +_current_pipeline = None + + +# This function loads a LoRA from the LoRA path setting, so it's +# possible to load multiple LoRAs by calling this function more than +# once with a different LoRA path setting; note that if you plan to +# load multiple LoRAs and dynamically change their weights, you +# might want to set the LoRA fuse option to False +def load_lora_weight( + pipeline, + lcm_diffusion_setting, +): + if not lcm_diffusion_setting.lora.path: + raise Exception("Empty lora model path") + + if not path.exists(lcm_diffusion_setting.lora.path): + raise Exception("Lora model path is invalid") + + # If the pipeline has been rebuilt since the last call, remove all + # references to previously loaded LoRAs and store the new pipeline + global _loaded_loras + global _current_pipeline + if pipeline != _current_pipeline: + for lora in _loaded_loras: + del lora + del _loaded_loras + _loaded_loras = [] + _current_pipeline = pipeline + + current_lora = _lora_info( + lcm_diffusion_setting.lora.path, + lcm_diffusion_setting.lora.weight, + ) + _loaded_loras.append(current_lora) + + if lcm_diffusion_setting.lora.enabled: + print(f"LoRA adapter name : {current_lora.adapter_name}") + pipeline.load_lora_weights( + FastStableDiffusionPaths.get_lora_models_path(), + weight_name=Path(lcm_diffusion_setting.lora.path).name, + local_files_only=True, + adapter_name=current_lora.adapter_name, + ) + update_lora_weights( + pipeline, + lcm_diffusion_setting, + ) + + if lcm_diffusion_setting.lora.fuse: + pipeline.fuse_lora() + + +def get_lora_models(root_dir: str): + lora_models = glob.glob(f"{root_dir}/**/*.safetensors", recursive=True) + lora_models_map = {} + for file_path in lora_models: + lora_name = get_file_name(file_path) + if lora_name is not None: + lora_models_map[lora_name] = file_path + return lora_models_map + + +# This function returns a list of (adapter_name, weight) tuples for the +# currently loaded LoRAs +def get_active_lora_weights(): + active_loras = [] + for lora_info in 
_loaded_loras: + active_loras.append( + ( + lora_info.adapter_name, + lora_info.weight, + ) + ) + return active_loras + + +# This function receives a pipeline, an lcm_diffusion_setting object and +# an optional list of updated (adapter_name, weight) tuples +def update_lora_weights( + pipeline, + lcm_diffusion_setting, + lora_weights=None, +): + global _loaded_loras + global _current_pipeline + if pipeline != _current_pipeline: + print("Wrong pipeline when trying to update LoRA weights") + return + if lora_weights: + for idx, lora in enumerate(lora_weights): + if _loaded_loras[idx].adapter_name != lora[0]: + print("Wrong adapter name in LoRA enumeration!") + continue + _loaded_loras[idx].weight = lora[1] + + adapter_names = [] + adapter_weights = [] + if lcm_diffusion_setting.use_lcm_lora: + adapter_names.append("lcm") + adapter_weights.append(1.0) + for lora in _loaded_loras: + adapter_names.append(lora.adapter_name) + adapter_weights.append(lora.weight) + pipeline.set_adapters( + adapter_names, + adapter_weights=adapter_weights, + ) + adapter_weights = zip(adapter_names, adapter_weights) + print(f"Adapters: {list(adapter_weights)}") diff --git a/backend/models/device.py b/backend/models/device.py new file mode 100644 index 0000000000000000000000000000000000000000..5951c732e485eeace4dc6d9f289ddeb973ea3f2d --- /dev/null +++ b/backend/models/device.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel + + +class DeviceInfo(BaseModel): + device_type: str + device_name: str + os: str + platform: str + processor: str diff --git a/backend/models/gen_images.py b/backend/models/gen_images.py new file mode 100644 index 0000000000000000000000000000000000000000..098c61dd7aa955062a13c92a438f12299e4e3a42 --- /dev/null +++ b/backend/models/gen_images.py @@ -0,0 +1,17 @@ +from pydantic import BaseModel +from enum import Enum +from paths import FastStableDiffusionPaths + + +class ImageFormat(str, Enum): + """Image format""" + + JPEG = "jpeg" + PNG = "png" + + +class GeneratedImages(BaseModel): + path: str = FastStableDiffusionPaths.get_results_path() + format: str = ImageFormat.PNG.value.upper() + save_image: bool = True + save_image_quality: int = 90 diff --git a/backend/models/lcmdiffusion_setting.py b/backend/models/lcmdiffusion_setting.py new file mode 100644 index 0000000000000000000000000000000000000000..71db4e6621b0dd6d887eec586bea4836312df01f --- /dev/null +++ b/backend/models/lcmdiffusion_setting.py @@ -0,0 +1,76 @@ +from enum import Enum +from PIL import Image +from typing import Any, Optional, Union + +from constants import LCM_DEFAULT_MODEL, LCM_DEFAULT_MODEL_OPENVINO +from paths import FastStableDiffusionPaths +from pydantic import BaseModel + + +class LCMLora(BaseModel): + base_model_id: str = "Lykon/dreamshaper-8" + lcm_lora_id: str = "latent-consistency/lcm-lora-sdv1-5" + + +class DiffusionTask(str, Enum): + """Diffusion task types""" + + text_to_image = "text_to_image" + image_to_image = "image_to_image" + + +class Lora(BaseModel): + models_dir: str = FastStableDiffusionPaths.get_lora_models_path() + path: Optional[Any] = None + weight: Optional[float] = 0.5 + fuse: bool = True + enabled: bool = False + + +class ControlNetSetting(BaseModel): + adapter_path: Optional[str] = None # ControlNet adapter path + conditioning_scale: float = 0.5 + enabled: bool = False + _control_image: Image = None # Control image, PIL image + + +class GGUFModel(BaseModel): + gguf_models: str = FastStableDiffusionPaths.get_gguf_models_path() + diffusion_path: Optional[str] = None + clip_path: Optional[str] = None + 
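+ # Descriptive note (added): the optional component paths below (T5-XXL text
+ # encoder and VAE) are read by _init_gguf_diffusion when it builds the GGUF
+ # pipeline ModelConfig.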
t5xxl_path: Optional[str] = None + vae_path: Optional[str] = None + + +class LCMDiffusionSetting(BaseModel): + lcm_model_id: str = LCM_DEFAULT_MODEL + openvino_lcm_model_id: str = LCM_DEFAULT_MODEL_OPENVINO + use_offline_model: bool = False + use_lcm_lora: bool = False + lcm_lora: Optional[LCMLora] = LCMLora() + use_tiny_auto_encoder: bool = False + use_openvino: bool = False + prompt: str = "" + negative_prompt: str = "" + init_image: Any = None + strength: Optional[float] = 0.6 + image_height: Optional[int] = 512 + image_width: Optional[int] = 512 + inference_steps: Optional[int] = 1 + guidance_scale: Optional[float] = 1 + clip_skip: Optional[int] = 1 + token_merging: Optional[float] = 0 + number_of_images: Optional[int] = 1 + seed: Optional[int] = 123123 + use_seed: bool = False + use_safety_checker: bool = False + diffusion_task: str = DiffusionTask.text_to_image.value + lora: Optional[Lora] = Lora() + controlnet: Optional[Union[ControlNetSetting, list[ControlNetSetting]]] = None + dirs: dict = { + "controlnet": FastStableDiffusionPaths.get_controlnet_models_path(), + "lora": FastStableDiffusionPaths.get_lora_models_path(), + } + rebuild_pipeline: bool = False + use_gguf_model: bool = False + gguf_model: Optional[GGUFModel] = GGUFModel() diff --git a/backend/models/upscale.py b/backend/models/upscale.py new file mode 100644 index 0000000000000000000000000000000000000000..e065fed0ebb3719236f3881a54dff21ff3f0b7b2 --- /dev/null +++ b/backend/models/upscale.py @@ -0,0 +1,9 @@ +from enum import Enum + + +class UpscaleMode(str, Enum): + """Diffusion task types""" + + normal = "normal" + sd_upscale = "sd_upscale" + aura_sr = "aura_sr" diff --git a/backend/openvino/custom_ov_model_vae_decoder.py b/backend/openvino/custom_ov_model_vae_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..ef83fb079f9956c80043cab04a65e114f7e56c66 --- /dev/null +++ b/backend/openvino/custom_ov_model_vae_decoder.py @@ -0,0 +1,21 @@ +from backend.device import is_openvino_device + +if is_openvino_device(): + from optimum.intel.openvino.modeling_diffusion import OVModelVaeDecoder + + +class CustomOVModelVaeDecoder(OVModelVaeDecoder): + def __init__( + self, + model, + parent_model, + ov_config=None, + model_dir=None, + ): + super(OVModelVaeDecoder, self).__init__( + model, + parent_model, + ov_config, + "vae_decoder", + model_dir, + ) diff --git a/backend/openvino/flux_pipeline.py b/backend/openvino/flux_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..5e725dbae43418914919e3539c3d2cc30b048abd --- /dev/null +++ b/backend/openvino/flux_pipeline.py @@ -0,0 +1,36 @@ +from pathlib import Path + +from constants import DEVICE, LCM_DEFAULT_MODEL_OPENVINO, TAEF1_MODEL_OPENVINO +from huggingface_hub import snapshot_download + +from backend.openvino.ovflux import ( + TEXT_ENCODER_2_PATH, + TEXT_ENCODER_PATH, + TRANSFORMER_PATH, + VAE_DECODER_PATH, + init_pipeline, +) + + +def get_flux_pipeline( + model_id: str = LCM_DEFAULT_MODEL_OPENVINO, + use_taef1: bool = False, + taef1_path: str = TAEF1_MODEL_OPENVINO, +): + model_dir = Path(snapshot_download(model_id)) + vae_dir = Path(snapshot_download(taef1_path)) if use_taef1 else model_dir + + model_dict = { + "transformer": model_dir / TRANSFORMER_PATH, + "text_encoder": model_dir / TEXT_ENCODER_PATH, + "text_encoder_2": model_dir / TEXT_ENCODER_2_PATH, + "vae": vae_dir / VAE_DECODER_PATH, + } + ov_pipe = init_pipeline( + model_dir, + model_dict, + device=DEVICE.upper(), + use_taef1=use_taef1, + ) + + return ov_pipe diff --git 
a/backend/openvino/ov_hc_stablediffusion_pipeline.py b/backend/openvino/ov_hc_stablediffusion_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..79f196e09658df2a2aa88d4b843140320cd1da89 --- /dev/null +++ b/backend/openvino/ov_hc_stablediffusion_pipeline.py @@ -0,0 +1,93 @@ +"""This is an experimental pipeline used to test AI PC NPU and GPU""" + +from pathlib import Path + +from diffusers import EulerDiscreteScheduler,LCMScheduler +from huggingface_hub import snapshot_download +from PIL import Image +from backend.openvino.stable_diffusion_engine import ( + StableDiffusionEngineAdvanced, + LatentConsistencyEngineAdvanced +) + + +class OvHcStableDiffusion: + "OpenVINO Heterogeneous compute Stablediffusion" + + def __init__( + self, + model_path, + device: list = ["GPU", "NPU", "GPU", "GPU"], + ): + model_dir = Path(snapshot_download(model_path)) + self.scheduler = EulerDiscreteScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + ) + self.ov_sd_pipleline = StableDiffusionEngineAdvanced( + model=model_dir, + device=device, + ) + + def generate( + self, + prompt: str, + neg_prompt: str, + init_image: Image = None, + strength: float = 1.0, + ): + image = self.ov_sd_pipleline( + prompt=prompt, + negative_prompt=neg_prompt, + init_image=init_image, + strength=strength, + num_inference_steps=25, + scheduler=self.scheduler, + ) + image_rgb = image[..., ::-1] + return Image.fromarray(image_rgb) + + +class OvHcLatentConsistency: + """ + OpenVINO Heterogeneous compute Latent consistency models + For the current Intel Cor Ultra, the Text Encoder and Unet can run on NPU + Supports following - Text to image , Image to image and image variations + """ + + def __init__( + self, + model_path, + device: list = ["NPU", "NPU", "GPU"], + ): + + model_dir = Path(snapshot_download(model_path)) + + self.scheduler = LCMScheduler( + beta_start=0.001, + beta_end=0.01, + ) + self.ov_sd_pipleline = LatentConsistencyEngineAdvanced( + model=model_dir, + device=device, + ) + + def generate( + self, + prompt: str, + neg_prompt: str, + init_image: Image = None, + num_inference_steps=4, + strength: float = 0.5, + ): + image = self.ov_sd_pipleline( + prompt=prompt, + init_image = init_image, + strength = strength, + num_inference_steps=num_inference_steps, + scheduler=self.scheduler, + seed=None, + ) + + return image diff --git a/backend/openvino/ovflux.py b/backend/openvino/ovflux.py new file mode 100644 index 0000000000000000000000000000000000000000..b30dfbee5b4da0ee8c188bb36b3138deddfa75c4 --- /dev/null +++ b/backend/openvino/ovflux.py @@ -0,0 +1,675 @@ +"""Based on https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/flux.1-image-generation/flux_helper.py""" + +import inspect +import json +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +import numpy as np +import openvino as ov +import torch +from diffusers.image_processor import VaeImageProcessor +from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.schedulers import FlowMatchEulerDiscreteScheduler +from diffusers.utils.torch_utils import randn_tensor +from transformers import AutoTokenizer + +TRANSFORMER_PATH = Path("transformer/transformer.xml") +VAE_DECODER_PATH = Path("vae/vae_decoder.xml") +TEXT_ENCODER_PATH = Path("text_encoder/text_encoder.xml") +TEXT_ENCODER_2_PATH = Path("text_encoder_2/text_encoder_2.xml") + + +def 
cleanup_torchscript_cache(): + """ + Helper for removing cached model representation + """ + torch._C._jit_clear_class_registry() + torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() + torch.jit._state._clear_class_state() + + +def _prepare_latent_image_ids( + batch_size, height, width, device=torch.device("cpu"), dtype=torch.float32 +): + latent_image_ids = torch.zeros(height // 2, width // 2, 3) + latent_image_ids[..., 1] = ( + latent_image_ids[..., 1] + torch.arange(height // 2)[:, None] + ) + latent_image_ids[..., 2] = ( + latent_image_ids[..., 2] + torch.arange(width // 2)[None, :] + ) + + latent_image_id_height, latent_image_id_width, latent_image_id_channels = ( + latent_image_ids.shape + ) + + latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1) + latent_image_ids = latent_image_ids.reshape( + batch_size, + latent_image_id_height * latent_image_id_width, + latent_image_id_channels, + ) + + return latent_image_ids.to(device=device, dtype=dtype) + + +def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor: + assert dim % 2 == 0, "The dimension must be even." + + scale = torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device) / dim + omega = 1.0 / (theta**scale) + + batch_size, seq_length = pos.shape + out = pos.unsqueeze(-1) * omega.unsqueeze(0).unsqueeze(0) + cos_out = torch.cos(out) + sin_out = torch.sin(out) + + stacked_out = torch.stack([cos_out, -sin_out, sin_out, cos_out], dim=-1) + out = stacked_out.view(batch_size, -1, dim // 2, 2, 2) + return out.float() + + +def calculate_shift( + image_seq_len, + base_seq_len: int = 256, + max_seq_len: int = 4096, + base_shift: float = 0.5, + max_shift: float = 1.16, +): + m = (max_shift - base_shift) / (max_seq_len - base_seq_len) + b = base_shift - m * base_seq_len + mu = image_seq_len * m + b + return mu + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError( + "Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values" + ) + if timesteps is not None: + accepts_timesteps = "timesteps" in set( + inspect.signature(scheduler.set_timesteps).parameters.keys() + ) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set( + inspect.signature(scheduler.set_timesteps).parameters.keys() + ) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class OVFluxPipeline(DiffusionPipeline): + def __init__( + self, + scheduler, + transformer, + vae, + text_encoder, + text_encoder_2, + tokenizer, + tokenizer_2, + transformer_config, + vae_config, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + transformer=transformer, + scheduler=scheduler, + ) + self.vae_config = vae_config + self.transformer_config = transformer_config + self.vae_scale_factor = 2 ** ( + len(self.vae_config.get("block_out_channels", [0] * 16)) + if hasattr(self, "vae") and self.vae is not None + else 16 + ) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.tokenizer_max_length = ( + self.tokenizer.model_max_length + if hasattr(self, "tokenizer") and self.tokenizer is not None + else 77 + ) + self.default_sample_size = 64 + + def _get_t5_prompt_embeds( + self, + prompt: Union[str, List[str]] = None, + num_images_per_prompt: int = 1, + max_sequence_length: int = 512, + ): + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + + text_inputs = self.tokenizer_2( + prompt, + padding="max_length", + max_length=max_sequence_length, + truncation=True, + return_length=False, + return_overflowing_tokens=False, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + prompt_embeds = torch.from_numpy(self.text_encoder_2(text_input_ids)[0]) + + _, seq_len, _ = prompt_embeds.shape + + # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + + return prompt_embeds + + def _get_clip_prompt_embeds( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: int = 1, + ): + + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer_max_length, + truncation=True, + return_overflowing_tokens=False, + return_length=False, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + prompt_embeds = torch.from_numpy(self.text_encoder(text_input_ids)[1]) + + # duplicate text embeddings for each 
generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1) + + return prompt_embeds + + def encode_prompt( + self, + prompt: Union[str, List[str]], + prompt_2: Union[str, List[str]], + num_images_per_prompt: int = 1, + prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + max_sequence_length: int = 512, + ): + r""" + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in all text-encoders + num_images_per_prompt (`int`): + number of images that should be generated per prompt + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + """ + + prompt = [prompt] if isinstance(prompt, str) else prompt + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 + + # We only use the pooled prompt output from the CLIPTextModel + pooled_prompt_embeds = self._get_clip_prompt_embeds( + prompt=prompt, + num_images_per_prompt=num_images_per_prompt, + ) + prompt_embeds = self._get_t5_prompt_embeds( + prompt=prompt_2, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + ) + text_ids = torch.zeros(batch_size, prompt_embeds.shape[1], 3) + text_ids = text_ids.repeat(num_images_per_prompt, 1, 1) + + return prompt_embeds, pooled_prompt_embeds, text_ids + + def check_inputs( + self, + prompt, + prompt_2, + height, + width, + prompt_embeds=None, + pooled_prompt_embeds=None, + max_sequence_length=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError( + f"`height` and `width` have to be divisible by 8 but are {height} and {width}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
+ ) + elif prompt is not None and ( + not isinstance(prompt, str) and not isinstance(prompt, list) + ): + raise ValueError( + f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" + ) + elif prompt_2 is not None and ( + not isinstance(prompt_2, str) and not isinstance(prompt_2, list) + ): + raise ValueError( + f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}" + ) + + if prompt_embeds is not None and pooled_prompt_embeds is None: + raise ValueError( + "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." + ) + + if max_sequence_length is not None and max_sequence_length > 512: + raise ValueError( + f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}" + ) + + @staticmethod + def _prepare_latent_image_ids(batch_size, height, width): + return _prepare_latent_image_ids(batch_size, height, width) + + @staticmethod + def _pack_latents(latents, batch_size, num_channels_latents, height, width): + latents = latents.view( + batch_size, num_channels_latents, height // 2, 2, width // 2, 2 + ) + latents = latents.permute(0, 2, 4, 1, 3, 5) + latents = latents.reshape( + batch_size, (height // 2) * (width // 2), num_channels_latents * 4 + ) + + return latents + + @staticmethod + def _unpack_latents(latents, height, width, vae_scale_factor): + batch_size, num_patches, channels = latents.shape + + height = height // vae_scale_factor + width = width // vae_scale_factor + + latents = latents.view(batch_size, height, width, channels // 4, 2, 2) + latents = latents.permute(0, 3, 1, 4, 2, 5) + + latents = latents.reshape( + batch_size, channels // (2 * 2), height * 2, width * 2 + ) + + return latents + + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + generator, + latents=None, + ): + height = 2 * (int(height) // self.vae_scale_factor) + width = 2 * (int(width) // self.vae_scale_factor) + + shape = (batch_size, num_channels_latents, height, width) + + if latents is not None: + latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width) + return latents, latent_image_ids + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + latents = randn_tensor(shape, generator=generator) + latents = self._pack_latents( + latents, batch_size, num_channels_latents, height, width + ) + + latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width) + + return latents, latent_image_ids + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + def __call__( + self, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + negative_prompt: str = None, + num_inference_steps: int = 28, + timesteps: List[int] = None, + guidance_scale: float = 7.0, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + max_sequence_length: int = 512, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + will be used instead + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. This is set to 1024 by default for the best results. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. This is set to 1024 by default for the best results. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple. + max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`. + Returns: + [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict` + is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated + images. + """ + + height = height or self.default_sample_size * self.vae_scale_factor + width = width or self.default_sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + prompt_2, + height, + width, + prompt_embeds=prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + max_sequence_length=max_sequence_length, + ) + + self._guidance_scale = guidance_scale + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + ( + prompt_embeds, + pooled_prompt_embeds, + text_ids, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + prompt_embeds=prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + ) + + # 4. Prepare latent variables + num_channels_latents = self.transformer_config.get("in_channels", 64) // 4 + latents, latent_image_ids = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + generator, + latents, + ) + + # 5. Prepare timesteps + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) + image_seq_len = latents.shape[1] + mu = calculate_shift( + image_seq_len, + self.scheduler.config.base_image_seq_len, + self.scheduler.config.max_image_seq_len, + self.scheduler.config.base_shift, + self.scheduler.config.max_shift, + ) + timesteps, num_inference_steps = retrieve_timesteps( + scheduler=self.scheduler, + num_inference_steps=num_inference_steps, + timesteps=timesteps, + sigmas=sigmas, + mu=mu, + ) + num_warmup_steps = max( + len(timesteps) - num_inference_steps * self.scheduler.order, 0 + ) + self._num_timesteps = len(timesteps) + + # 6. 
Denoising loop + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand(latents.shape[0]).to(latents.dtype) + + # handle guidance + if self.transformer_config.get("guidance_embeds"): + guidance = torch.tensor([guidance_scale]) + guidance = guidance.expand(latents.shape[0]) + else: + guidance = None + + transformer_input = { + "hidden_states": latents, + "timestep": timestep / 1000, + "pooled_projections": pooled_prompt_embeds, + "encoder_hidden_states": prompt_embeds, + "txt_ids": text_ids, + "img_ids": latent_image_ids, + } + if guidance is not None: + transformer_input["guidance"] = guidance + + noise_pred = torch.from_numpy(self.transformer(transformer_input)[0]) + + latents = self.scheduler.step( + noise_pred, t, latents, return_dict=False + )[0] + + # call the callback, if provided + if i == len(timesteps) - 1 or ( + (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 + ): + progress_bar.update() + + if output_type == "latent": + image = latents + + else: + latents = self._unpack_latents( + latents, height, width, self.vae_scale_factor + ) + latents = latents / self.vae_config.get( + "scaling_factor" + ) + self.vae_config.get("shift_factor") + image = self.vae(latents)[0] + image = self.image_processor.postprocess( + torch.from_numpy(image), output_type=output_type + ) + + if not return_dict: + return (image,) + + return FluxPipelineOutput(images=image) + + +def init_pipeline( + model_dir, + models_dict: Dict[str, Any], + device: str, + use_taef1: bool = False, +): + pipeline_args = {} + + print("OpenVINO FLUX Model compilation") + core = ov.Core() + for model_name, model_path in models_dict.items(): + pipeline_args[model_name] = core.compile_model(model_path, device) + if model_name == "vae" and use_taef1: + print(f"✅ VAE(TAEF1) - Done!") + else: + print(f"✅ {model_name} - Done!") + + transformer_path = models_dict["transformer"] + transformer_config_path = transformer_path.parent / "config.json" + with transformer_config_path.open("r") as f: + transformer_config = json.load(f) + vae_path = models_dict["vae"] + vae_config_path = vae_path.parent / "config.json" + with vae_config_path.open("r") as f: + vae_config = json.load(f) + + pipeline_args["vae_config"] = vae_config + pipeline_args["transformer_config"] = transformer_config + + scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(model_dir / "scheduler") + + tokenizer = AutoTokenizer.from_pretrained(model_dir / "tokenizer") + tokenizer_2 = AutoTokenizer.from_pretrained(model_dir / "tokenizer_2") + + pipeline_args["scheduler"] = scheduler + pipeline_args["tokenizer"] = tokenizer + pipeline_args["tokenizer_2"] = tokenizer_2 + ov_pipe = OVFluxPipeline(**pipeline_args) + return ov_pipe diff --git a/backend/openvino/pipelines.py b/backend/openvino/pipelines.py new file mode 100644 index 0000000000000000000000000000000000000000..62d936dd7426bbe1dd7f43376bbfa61089cf0a8a --- /dev/null +++ b/backend/openvino/pipelines.py @@ -0,0 +1,75 @@ +from constants import DEVICE, LCM_DEFAULT_MODEL_OPENVINO +from backend.tiny_decoder import get_tiny_decoder_vae_model +from typing import Any +from backend.device import is_openvino_device +from paths import get_base_folder_name + +if is_openvino_device(): + from huggingface_hub import snapshot_download + from optimum.intel.openvino.modeling_diffusion import OVBaseModel + + from 
optimum.intel.openvino.modeling_diffusion import ( + OVStableDiffusionPipeline, + OVStableDiffusionImg2ImgPipeline, + OVStableDiffusionXLPipeline, + OVStableDiffusionXLImg2ImgPipeline, + ) + from backend.openvino.custom_ov_model_vae_decoder import CustomOVModelVaeDecoder + + +def ov_load_taesd( + pipeline: Any, + use_local_model: bool = False, +): + taesd_dir = snapshot_download( + repo_id=get_tiny_decoder_vae_model(pipeline.__class__.__name__), + local_files_only=use_local_model, + ) + pipeline.vae_decoder = CustomOVModelVaeDecoder( + model=OVBaseModel.load_model(f"{taesd_dir}/vae_decoder/openvino_model.xml"), + parent_model=pipeline, + model_dir=taesd_dir, + ) + + +def get_ov_text_to_image_pipeline( + model_id: str = LCM_DEFAULT_MODEL_OPENVINO, + use_local_model: bool = False, +) -> Any: + if "xl" in get_base_folder_name(model_id).lower(): + pipeline = OVStableDiffusionXLPipeline.from_pretrained( + model_id, + local_files_only=use_local_model, + ov_config={"CACHE_DIR": ""}, + device=DEVICE.upper(), + ) + else: + pipeline = OVStableDiffusionPipeline.from_pretrained( + model_id, + local_files_only=use_local_model, + ov_config={"CACHE_DIR": ""}, + device=DEVICE.upper(), + ) + + return pipeline + + +def get_ov_image_to_image_pipeline( + model_id: str = LCM_DEFAULT_MODEL_OPENVINO, + use_local_model: bool = False, +) -> Any: + if "xl" in get_base_folder_name(model_id).lower(): + pipeline = OVStableDiffusionXLImg2ImgPipeline.from_pretrained( + model_id, + local_files_only=use_local_model, + ov_config={"CACHE_DIR": ""}, + device=DEVICE.upper(), + ) + else: + pipeline = OVStableDiffusionImg2ImgPipeline.from_pretrained( + model_id, + local_files_only=use_local_model, + ov_config={"CACHE_DIR": ""}, + device=DEVICE.upper(), + ) + return pipeline diff --git a/backend/openvino/stable_diffusion_engine.py b/backend/openvino/stable_diffusion_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..3546db24dddaeaf78eb1162ad066bb0169de9ca7 --- /dev/null +++ b/backend/openvino/stable_diffusion_engine.py @@ -0,0 +1,1817 @@ +""" +Copyright(C) 2022-2023 Intel Corporation +SPDX - License - Identifier: Apache - 2.0 + +""" +import inspect +from typing import Union, Optional, Any, List, Dict +import numpy as np +# openvino +from openvino.runtime import Core +# tokenizer +from transformers import CLIPTokenizer +import torch +import random + +from diffusers import DiffusionPipeline +from diffusers.schedulers import (DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + EulerDiscreteScheduler, + EulerAncestralDiscreteScheduler) + + +from diffusers.image_processor import VaeImageProcessor +from diffusers.utils.torch_utils import randn_tensor +from diffusers.utils import PIL_INTERPOLATION + +import cv2 +import os +import sys + +# for multithreading +import concurrent.futures + +#For GIF +import PIL +from PIL import Image +import glob +import json +import time + +def scale_fit_to_window(dst_width:int, dst_height:int, image_width:int, image_height:int): + """ + Preprocessing helper function for calculating image size for resize with peserving original aspect ratio + and fitting image to specific window size + + Parameters: + dst_width (int): destination window width + dst_height (int): destination window height + image_width (int): source image width + image_height (int): source image height + Returns: + result_width (int): calculated width for resize + result_height (int): calculated height for resize + """ + im_scale = min(dst_height / image_height, dst_width / image_width) + return 
int(im_scale * image_width), int(im_scale * image_height) + +def preprocess(image: PIL.Image.Image, ht=512, wt=512): + """ + Image preprocessing function. Takes image in PIL.Image format, resizes it to keep aspect ration and fits to model input window 512x512, + then converts it to np.ndarray and adds padding with zeros on right or bottom side of image (depends from aspect ratio), after that + converts data to float32 data type and change range of values from [0, 255] to [-1, 1], finally, converts data layout from planar NHWC to NCHW. + The function returns preprocessed input tensor and padding size, which can be used in postprocessing. + + Parameters: + image (PIL.Image.Image): input image + Returns: + image (np.ndarray): preprocessed image tensor + meta (Dict): dictionary with preprocessing metadata info + """ + + src_width, src_height = image.size + image = image.convert('RGB') + dst_width, dst_height = scale_fit_to_window( + wt, ht, src_width, src_height) + image = np.array(image.resize((dst_width, dst_height), + resample=PIL.Image.Resampling.LANCZOS))[None, :] + + pad_width = wt - dst_width + pad_height = ht - dst_height + pad = ((0, 0), (0, pad_height), (0, pad_width), (0, 0)) + image = np.pad(image, pad, mode="constant") + image = image.astype(np.float32) / 255.0 + image = 2.0 * image - 1.0 + image = image.transpose(0, 3, 1, 2) + + return image, {"padding": pad, "src_width": src_width, "src_height": src_height} + +def try_enable_npu_turbo(device, core): + import platform + if "windows" in platform.system().lower(): + if "NPU" in device and "3720" not in core.get_property('NPU', 'DEVICE_ARCHITECTURE'): + try: + core.set_property(properties={'NPU_TURBO': 'YES'},device_name='NPU') + except: + print(f"Failed loading NPU_TURBO for device {device}. Skipping... ") + else: + print_npu_turbo_art() + else: + print(f"Skipping NPU_TURBO for device {device}") + elif "linux" in platform.system().lower(): + if os.path.isfile('/sys/module/intel_vpu/parameters/test_mode'): + with open('/sys/module/intel_vpu/version', 'r') as f: + version = f.readline().split()[0] + if tuple(map(int, version.split('.'))) < tuple(map(int, '1.9.0'.split('.'))): + print(f"The driver intel_vpu-1.9.0 (or later) needs to be loaded for NPU Turbo (currently {version}). Skipping...") + else: + with open('/sys/module/intel_vpu/parameters/test_mode', 'r') as tm_file: + test_mode = int(tm_file.readline().split()[0]) + if test_mode == 512: + print_npu_turbo_art() + else: + print("The driver >=intel_vpu-1.9.0 was must be loaded with " + "\"modprobe intel_vpu test_mode=512\" to enable NPU_TURBO " + f"(currently test_mode={test_mode}). Skipping...") + else: + print(f"The driver >=intel_vpu-1.9.0 must be loaded with \"modprobe intel_vpu test_mode=512\" to enable NPU_TURBO. Skipping...") + else: + print(f"This platform ({platform.system()}) does not support NPU Turbo") + +def result(var): + return next(iter(var.values())) + +class StableDiffusionEngineAdvanced(DiffusionPipeline): + def __init__(self, model="runwayml/stable-diffusion-v1-5", + tokenizer="openai/clip-vit-large-patch14", + device=["CPU", "CPU", "CPU", "CPU"]): + try: + self.tokenizer = CLIPTokenizer.from_pretrained(model, local_files_only=True) + except: + self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer) + self.tokenizer.save_pretrained(model) + + self.core = Core() + self.core.set_property({'CACHE_DIR': os.path.join(model, 'cache')}) + try_enable_npu_turbo(device, self.core) + + print("Loading models... 
") + + + + with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor: + futures = { + "unet_time_proj": executor.submit(self.core.compile_model, os.path.join(model, "unet_time_proj.xml"), device[0]), + "text": executor.submit(self.load_model, model, "text_encoder", device[0]), + "unet": executor.submit(self.load_model, model, "unet_int8", device[1]), + "unet_neg": executor.submit(self.load_model, model, "unet_int8", device[2]) if device[1] != device[2] else None, + "vae_decoder": executor.submit(self.load_model, model, "vae_decoder", device[3]), + "vae_encoder": executor.submit(self.load_model, model, "vae_encoder", device[3]) + } + + self.unet_time_proj = futures["unet_time_proj"].result() + self.text_encoder = futures["text"].result() + self.unet = futures["unet"].result() + self.unet_neg = futures["unet_neg"].result() if futures["unet_neg"] else self.unet + self.vae_decoder = futures["vae_decoder"].result() + self.vae_encoder = futures["vae_encoder"].result() + print("Text Device:", device[0]) + print("unet Device:", device[1]) + print("unet-neg Device:", device[2]) + print("VAE Device:", device[3]) + + self._text_encoder_output = self.text_encoder.output(0) + self._vae_d_output = self.vae_decoder.output(0) + self._vae_e_output = self.vae_encoder.output(0) if self.vae_encoder else None + + self.set_dimensions() + self.infer_request_neg = self.unet_neg.create_infer_request() + self.infer_request = self.unet.create_infer_request() + self.infer_request_time_proj = self.unet_time_proj.create_infer_request() + self.time_proj_constants = np.load(os.path.join(model, "time_proj_constants.npy")) + + def load_model(self, model, model_name, device): + if "NPU" in device: + with open(os.path.join(model, f"{model_name}.blob"), "rb") as f: + return self.core.import_model(f.read(), device) + return self.core.compile_model(os.path.join(model, f"{model_name}.xml"), device) + + def set_dimensions(self): + latent_shape = self.unet.input("latent_model_input").shape + if latent_shape[1] == 4: + self.height = latent_shape[2] * 8 + self.width = latent_shape[3] * 8 + else: + self.height = latent_shape[1] * 8 + self.width = latent_shape[2] * 8 + + def __call__( + self, + prompt, + init_image = None, + negative_prompt=None, + scheduler=None, + strength = 0.5, + num_inference_steps = 32, + guidance_scale = 7.5, + eta = 0.0, + create_gif = False, + model = None, + callback = None, + callback_userdata = None + ): + + # extract condition + text_input = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="np", + ) + text_embeddings = self.text_encoder(text_input.input_ids)[self._text_encoder_output] + + # do classifier free guidance + do_classifier_free_guidance = guidance_scale > 1.0 + if do_classifier_free_guidance: + + if negative_prompt is None: + uncond_tokens = [""] + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + else: + uncond_tokens = negative_prompt + + tokens_uncond = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.tokenizer.model_max_length, #truncation=True, + return_tensors="np" + ) + uncond_embeddings = self.text_encoder(tokens_uncond.input_ids)[self._text_encoder_output] + text_embeddings = np.concatenate([uncond_embeddings, text_embeddings]) + + # set timesteps + accepts_offset = "offset" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + extra_set_kwargs = {} + + if accepts_offset: + extra_set_kwargs["offset"] = 1 + + 
scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) + + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, scheduler) + latent_timestep = timesteps[:1] + + # get the initial random noise unless the user supplied it + latents, meta = self.prepare_latents(init_image, latent_timestep, scheduler) + + + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + accepts_eta = "eta" in set(inspect.signature(scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + if create_gif: + frames = [] + + for i, t in enumerate(self.progress_bar(timesteps)): + if callback: + callback(i, callback_userdata) + + # expand the latents if we are doing classifier free guidance + noise_pred = [] + latent_model_input = latents + latent_model_input = scheduler.scale_model_input(latent_model_input, t) + + latent_model_input_neg = latent_model_input + if self.unet.input("latent_model_input").shape[1] != 4: + #print("In transpose") + try: + latent_model_input = latent_model_input.permute(0,2,3,1) + except: + latent_model_input = latent_model_input.transpose(0,2,3,1) + + if self.unet_neg.input("latent_model_input").shape[1] != 4: + #print("In transpose") + try: + latent_model_input_neg = latent_model_input_neg.permute(0,2,3,1) + except: + latent_model_input_neg = latent_model_input_neg.transpose(0,2,3,1) + + + time_proj_constants_fp16 = np.float16(self.time_proj_constants) + t_scaled_fp16 = time_proj_constants_fp16 * np.float16(t) + cosine_t_fp16 = np.cos(t_scaled_fp16) + sine_t_fp16 = np.sin(t_scaled_fp16) + + t_scaled = self.time_proj_constants * np.float32(t) + + cosine_t = np.cos(t_scaled) + sine_t = np.sin(t_scaled) + + time_proj_dict = {"sine_t" : np.float32(sine_t), "cosine_t" : np.float32(cosine_t)} + self.infer_request_time_proj.start_async(time_proj_dict) + self.infer_request_time_proj.wait() + time_proj = self.infer_request_time_proj.get_output_tensor(0).data.astype(np.float32) + + input_tens_neg_dict = {"time_proj": np.float32(time_proj), "latent_model_input":latent_model_input_neg, "encoder_hidden_states": np.expand_dims(text_embeddings[0], axis=0)} + input_tens_dict = {"time_proj": np.float32(time_proj), "latent_model_input":latent_model_input, "encoder_hidden_states": np.expand_dims(text_embeddings[1], axis=0)} + + self.infer_request_neg.start_async(input_tens_neg_dict) + self.infer_request.start_async(input_tens_dict) + self.infer_request_neg.wait() + self.infer_request.wait() + + noise_pred_neg = self.infer_request_neg.get_output_tensor(0) + noise_pred_pos = self.infer_request.get_output_tensor(0) + + noise_pred.append(noise_pred_neg.data.astype(np.float32)) + noise_pred.append(noise_pred_pos.data.astype(np.float32)) + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1] + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = scheduler.step(torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs)["prev_sample"].numpy() + + if create_gif: + frames.append(latents) + + if callback: + callback(num_inference_steps, callback_userdata) + + # scale and decode the image latents with vae + 
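+ # Descriptive note (added): 0.18215 is the Stable Diffusion 1.x VAE scaling
+ # factor; latents are rescaled by its inverse before being passed to the VAE
+ # decoder.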
latents = 1 / 0.18215 * latents + + start = time.time() + image = self.vae_decoder(latents)[self._vae_d_output] + print("Decoder ended:",time.time() - start) + + image = self.postprocess_image(image, meta) + + if create_gif: + gif_folder=os.path.join(model,"../../../gif") + print("gif_folder:",gif_folder) + if not os.path.exists(gif_folder): + os.makedirs(gif_folder) + for i in range(0,len(frames)): + image = self.vae_decoder(frames[i]*(1/0.18215))[self._vae_d_output] + image = self.postprocess_image(image, meta) + output = gif_folder + "/" + str(i).zfill(3) +".png" + cv2.imwrite(output, image) + with open(os.path.join(gif_folder, "prompt.json"), "w") as file: + json.dump({"prompt": prompt}, file) + frames_image = [Image.open(image) for image in glob.glob(f"{gif_folder}/*.png")] + frame_one = frames_image[0] + gif_file=os.path.join(gif_folder,"stable_diffusion.gif") + frame_one.save(gif_file, format="GIF", append_images=frames_image, save_all=True, duration=100, loop=0) + + return image + + def prepare_latents(self, image:PIL.Image.Image = None, latent_timestep:torch.Tensor = None, scheduler = LMSDiscreteScheduler): + """ + Function for getting initial latents for starting generation + + Parameters: + image (PIL.Image.Image, *optional*, None): + Input image for generation, if not provided randon noise will be used as starting point + latent_timestep (torch.Tensor, *optional*, None): + Predicted by scheduler initial step for image generation, required for latent image mixing with nosie + Returns: + latents (np.ndarray): + Image encoded in latent space + """ + latents_shape = (1, 4, self.height // 8, self.width // 8) + + noise = np.random.randn(*latents_shape).astype(np.float32) + if image is None: + ##print("Image is NONE") + # if we use LMSDiscreteScheduler, let's make sure latents are mulitplied by sigmas + if isinstance(scheduler, LMSDiscreteScheduler): + + noise = noise * scheduler.sigmas[0].numpy() + return noise, {} + elif isinstance(scheduler, EulerDiscreteScheduler) or isinstance(scheduler,EulerAncestralDiscreteScheduler): + + noise = noise * scheduler.sigmas.max().numpy() + return noise, {} + else: + return noise, {} + input_image, meta = preprocess(image,self.height,self.width) + + moments = self.vae_encoder(input_image)[self._vae_e_output] + + mean, logvar = np.split(moments, 2, axis=1) + + std = np.exp(logvar * 0.5) + latents = (mean + std * np.random.randn(*mean.shape)) * 0.18215 + + + latents = scheduler.add_noise(torch.from_numpy(latents), torch.from_numpy(noise), latent_timestep).numpy() + return latents, meta + + def postprocess_image(self, image:np.ndarray, meta:Dict): + """ + Postprocessing for decoded image. Takes generated image decoded by VAE decoder, unpad it to initial image size (if required), + normalize and convert to [0, 255] pixels range. 
Optionally, convertes it from np.ndarray to PIL.Image format + + Parameters: + image (np.ndarray): + Generated image + meta (Dict): + Metadata obtained on latents preparing step, can be empty + output_type (str, *optional*, pil): + Output format for result, can be pil or numpy + Returns: + image (List of np.ndarray or PIL.Image.Image): + Postprocessed images + + if "src_height" in meta: + orig_height, orig_width = meta["src_height"], meta["src_width"] + image = [cv2.resize(img, (orig_width, orig_height)) + for img in image] + + return image + """ + if "padding" in meta: + pad = meta["padding"] + (_, end_h), (_, end_w) = pad[1:3] + h, w = image.shape[2:] + #print("image shape",image.shape[2:]) + unpad_h = h - end_h + unpad_w = w - end_w + image = image[:, :, :unpad_h, :unpad_w] + image = np.clip(image / 2 + 0.5, 0, 1) + image = (image[0].transpose(1, 2, 0)[:, :, ::-1] * 255).astype(np.uint8) + + + + if "src_height" in meta: + orig_height, orig_width = meta["src_height"], meta["src_width"] + image = cv2.resize(image, (orig_width, orig_height)) + + return image + + + + + def get_timesteps(self, num_inference_steps:int, strength:float, scheduler): + """ + Helper function for getting scheduler timesteps for generation + In case of image-to-image generation, it updates number of steps according to strength + + Parameters: + num_inference_steps (int): + number of inference steps for generation + strength (float): + value between 0.0 and 1.0, that controls the amount of noise that is added to the input image. + Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input. + """ + # get the original timestep using init_timestep + + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + +class StableDiffusionEngine(DiffusionPipeline): + def __init__( + self, + model="bes-dev/stable-diffusion-v1-4-openvino", + tokenizer="openai/clip-vit-large-patch14", + device=["CPU","CPU","CPU","CPU"]): + + self.core = Core() + self.core.set_property({'CACHE_DIR': os.path.join(model, 'cache')}) + + self.batch_size = 2 if device[1] == device[2] and device[1] == "GPU" else 1 + try_enable_npu_turbo(device, self.core) + + try: + self.tokenizer = CLIPTokenizer.from_pretrained(model, local_files_only=True) + except Exception as e: + print("Local tokenizer not found. Attempting to download...") + self.tokenizer = self.download_tokenizer(tokenizer, model) + + print("Loading models... 
") + + with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor: + text_future = executor.submit(self.load_model, model, "text_encoder", device[0]) + vae_de_future = executor.submit(self.load_model, model, "vae_decoder", device[3]) + vae_en_future = executor.submit(self.load_model, model, "vae_encoder", device[3]) + + if self.batch_size == 1: + if "int8" not in model: + unet_future = executor.submit(self.load_model, model, "unet_bs1", device[1]) + unet_neg_future = executor.submit(self.load_model, model, "unet_bs1", device[2]) if device[1] != device[2] else None + else: + unet_future = executor.submit(self.load_model, model, "unet_int8a16", device[1]) + unet_neg_future = executor.submit(self.load_model, model, "unet_int8a16", device[2]) if device[1] != device[2] else None + else: + unet_future = executor.submit(self.load_model, model, "unet", device[1]) + unet_neg_future = None + + self.unet = unet_future.result() + self.unet_neg = unet_neg_future.result() if unet_neg_future else self.unet + self.text_encoder = text_future.result() + self.vae_decoder = vae_de_future.result() + self.vae_encoder = vae_en_future.result() + print("Text Device:", device[0]) + print("unet Device:", device[1]) + print("unet-neg Device:", device[2]) + print("VAE Device:", device[3]) + + self._text_encoder_output = self.text_encoder.output(0) + self._unet_output = self.unet.output(0) + self._vae_d_output = self.vae_decoder.output(0) + self._vae_e_output = self.vae_encoder.output(0) if self.vae_encoder else None + + self.unet_input_tensor_name = "sample" if 'sample' in self.unet.input(0).names else "latent_model_input" + + if self.batch_size == 1: + self.infer_request = self.unet.create_infer_request() + self.infer_request_neg = self.unet_neg.create_infer_request() + self._unet_neg_output = self.unet_neg.output(0) + else: + self.infer_request = None + self.infer_request_neg = None + self._unet_neg_output = None + + self.set_dimensions() + + + + def load_model(self, model, model_name, device): + if "NPU" in device: + with open(os.path.join(model, f"{model_name}.blob"), "rb") as f: + return self.core.import_model(f.read(), device) + return self.core.compile_model(os.path.join(model, f"{model_name}.xml"), device) + + def set_dimensions(self): + latent_shape = self.unet.input(self.unet_input_tensor_name).shape + if latent_shape[1] == 4: + self.height = latent_shape[2] * 8 + self.width = latent_shape[3] * 8 + else: + self.height = latent_shape[1] * 8 + self.width = latent_shape[2] * 8 + + def __call__( + self, + prompt, + init_image=None, + negative_prompt=None, + scheduler=None, + strength=0.5, + num_inference_steps=32, + guidance_scale=7.5, + eta=0.0, + create_gif=False, + model=None, + callback=None, + callback_userdata=None + ): + # extract condition + text_input = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="np", + ) + text_embeddings = self.text_encoder(text_input.input_ids)[self._text_encoder_output] + + + # do classifier free guidance + do_classifier_free_guidance = guidance_scale > 1.0 + if do_classifier_free_guidance: + if negative_prompt is None: + uncond_tokens = [""] + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + else: + uncond_tokens = negative_prompt + + tokens_uncond = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.tokenizer.model_max_length, # truncation=True, + return_tensors="np" + ) + uncond_embeddings = 
self.text_encoder(tokens_uncond.input_ids)[self._text_encoder_output] + text_embeddings = np.concatenate([uncond_embeddings, text_embeddings]) + + # set timesteps + accepts_offset = "offset" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + extra_set_kwargs = {} + + if accepts_offset: + extra_set_kwargs["offset"] = 1 + + scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) + + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, scheduler) + latent_timestep = timesteps[:1] + + # get the initial random noise unless the user supplied it + latents, meta = self.prepare_latents(init_image, latent_timestep, scheduler,model) + + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + accepts_eta = "eta" in set(inspect.signature(scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + if create_gif: + frames = [] + + for i, t in enumerate(self.progress_bar(timesteps)): + if callback: + callback(i, callback_userdata) + + if self.batch_size == 1: + # expand the latents if we are doing classifier free guidance + noise_pred = [] + latent_model_input = latents + + #Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. + latent_model_input = scheduler.scale_model_input(latent_model_input, t) + latent_model_input_pos = latent_model_input + latent_model_input_neg = latent_model_input + + if self.unet.input(self.unet_input_tensor_name).shape[1] != 4: + try: + latent_model_input_pos = latent_model_input_pos.permute(0,2,3,1) + except: + latent_model_input_pos = latent_model_input_pos.transpose(0,2,3,1) + + if self.unet_neg.input(self.unet_input_tensor_name).shape[1] != 4: + try: + latent_model_input_neg = latent_model_input_neg.permute(0,2,3,1) + except: + latent_model_input_neg = latent_model_input_neg.transpose(0,2,3,1) + + if "sample" in self.unet_input_tensor_name: + input_tens_neg_dict = {"sample" : latent_model_input_neg, "encoder_hidden_states": np.expand_dims(text_embeddings[0], axis=0), "timestep": np.expand_dims(np.float32(t), axis=0)} + input_tens_pos_dict = {"sample" : latent_model_input_pos, "encoder_hidden_states": np.expand_dims(text_embeddings[1], axis=0), "timestep": np.expand_dims(np.float32(t), axis=0)} + else: + input_tens_neg_dict = {"latent_model_input" : latent_model_input_neg, "encoder_hidden_states": np.expand_dims(text_embeddings[0], axis=0), "t": np.expand_dims(np.float32(t), axis=0)} + input_tens_pos_dict = {"latent_model_input" : latent_model_input_pos, "encoder_hidden_states": np.expand_dims(text_embeddings[1], axis=0), "t": np.expand_dims(np.float32(t), axis=0)} + + self.infer_request_neg.start_async(input_tens_neg_dict) + self.infer_request.start_async(input_tens_pos_dict) + + self.infer_request_neg.wait() + self.infer_request.wait() + + noise_pred_neg = self.infer_request_neg.get_output_tensor(0) + noise_pred_pos = self.infer_request.get_output_tensor(0) + + noise_pred.append(noise_pred_neg.data.astype(np.float32)) + noise_pred.append(noise_pred_pos.data.astype(np.float32)) + else: + latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = scheduler.scale_model_input(latent_model_input, t) + noise_pred = 
self.unet([latent_model_input, np.array(t, dtype=np.float32), text_embeddings])[self._unet_output] + + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1] + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = scheduler.step(torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs)["prev_sample"].numpy() + + if create_gif: + frames.append(latents) + + if callback: + callback(num_inference_steps, callback_userdata) + + # scale and decode the image latents with vae + #if self.height == 512 and self.width == 512: + latents = 1 / 0.18215 * latents + image = self.vae_decoder(latents)[self._vae_d_output] + image = self.postprocess_image(image, meta) + + return image + + def prepare_latents(self, image: PIL.Image.Image = None, latent_timestep: torch.Tensor = None, + scheduler=LMSDiscreteScheduler,model=None): + """ + Function for getting initial latents for starting generation + + Parameters: + image (PIL.Image.Image, *optional*, None): + Input image for generation, if not provided randon noise will be used as starting point + latent_timestep (torch.Tensor, *optional*, None): + Predicted by scheduler initial step for image generation, required for latent image mixing with nosie + Returns: + latents (np.ndarray): + Image encoded in latent space + """ + latents_shape = (1, 4, self.height // 8, self.width // 8) + + noise = np.random.randn(*latents_shape).astype(np.float32) + if image is None: + #print("Image is NONE") + # if we use LMSDiscreteScheduler, let's make sure latents are mulitplied by sigmas + if isinstance(scheduler, LMSDiscreteScheduler): + + noise = noise * scheduler.sigmas[0].numpy() + return noise, {} + elif isinstance(scheduler, EulerDiscreteScheduler): + + noise = noise * scheduler.sigmas.max().numpy() + return noise, {} + else: + return noise, {} + input_image, meta = preprocess(image, self.height, self.width) + + moments = self.vae_encoder(input_image)[self._vae_e_output] + + if "sd_2.1" in model: + latents = moments * 0.18215 + + else: + + mean, logvar = np.split(moments, 2, axis=1) + + std = np.exp(logvar * 0.5) + latents = (mean + std * np.random.randn(*mean.shape)) * 0.18215 + + latents = scheduler.add_noise(torch.from_numpy(latents), torch.from_numpy(noise), latent_timestep).numpy() + return latents, meta + + + def postprocess_image(self, image: np.ndarray, meta: Dict): + """ + Postprocessing for decoded image. Takes generated image decoded by VAE decoder, unpad it to initila image size (if required), + normalize and convert to [0, 255] pixels range. 
Optionally, convertes it from np.ndarray to PIL.Image format + + Parameters: + image (np.ndarray): + Generated image + meta (Dict): + Metadata obtained on latents preparing step, can be empty + output_type (str, *optional*, pil): + Output format for result, can be pil or numpy + Returns: + image (List of np.ndarray or PIL.Image.Image): + Postprocessed images + + if "src_height" in meta: + orig_height, orig_width = meta["src_height"], meta["src_width"] + image = [cv2.resize(img, (orig_width, orig_height)) + for img in image] + + return image + """ + if "padding" in meta: + pad = meta["padding"] + (_, end_h), (_, end_w) = pad[1:3] + h, w = image.shape[2:] + # print("image shape",image.shape[2:]) + unpad_h = h - end_h + unpad_w = w - end_w + image = image[:, :, :unpad_h, :unpad_w] + image = np.clip(image / 2 + 0.5, 0, 1) + image = (image[0].transpose(1, 2, 0)[:, :, ::-1] * 255).astype(np.uint8) + + if "src_height" in meta: + orig_height, orig_width = meta["src_height"], meta["src_width"] + image = cv2.resize(image, (orig_width, orig_height)) + + return image + + # image = (image / 2 + 0.5).clip(0, 1) + # image = (image[0].transpose(1, 2, 0)[:, :, ::-1] * 255).astype(np.uint8) + + def get_timesteps(self, num_inference_steps: int, strength: float, scheduler): + """ + Helper function for getting scheduler timesteps for generation + In case of image-to-image generation, it updates number of steps according to strength + + Parameters: + num_inference_steps (int): + number of inference steps for generation + strength (float): + value between 0.0 and 1.0, that controls the amount of noise that is added to the input image. + Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input. + """ + # get the original timestep using init_timestep + + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + +class LatentConsistencyEngine(DiffusionPipeline): + def __init__( + self, + model="SimianLuo/LCM_Dreamshaper_v7", + tokenizer="openai/clip-vit-large-patch14", + device=["CPU", "CPU", "CPU"], + ): + super().__init__() + try: + self.tokenizer = CLIPTokenizer.from_pretrained(model, local_files_only=True) + except: + self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer) + self.tokenizer.save_pretrained(model) + + self.core = Core() + self.core.set_property({'CACHE_DIR': os.path.join(model, 'cache')}) # adding caching to reduce init time + try_enable_npu_turbo(device, self.core) + + + with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor: + text_future = executor.submit(self.load_model, model, "text_encoder", device[0]) + unet_future = executor.submit(self.load_model, model, "unet", device[1]) + vae_de_future = executor.submit(self.load_model, model, "vae_decoder", device[2]) + + print("Text Device:", device[0]) + self.text_encoder = text_future.result() + self._text_encoder_output = self.text_encoder.output(0) + + print("Unet Device:", device[1]) + self.unet = unet_future.result() + self._unet_output = self.unet.output(0) + self.infer_request = self.unet.create_infer_request() + + print(f"VAE Device: {device[2]}") + self.vae_decoder = vae_de_future.result() + self.infer_request_vae = self.vae_decoder.create_infer_request() + self.safety_checker = None #pipe.safety_checker + self.feature_extractor = None #pipe.feature_extractor + 
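# A brief sketch of the geometry assumed just below: with a VAE scale factor of
# 2 ** 3 == 8, an H x W image corresponds to a (1, 4, H // 8, W // 8) latent, and
# set_dimensions-style code recovers H and W by multiplying the latent spatial
# dimensions by 8. The 4-channel latent layout is the SD 1.5 convention.
def latent_shape(height: int, width: int, vae_scale_factor: int = 8, channels: int = 4):
    return (1, channels, height // vae_scale_factor, width // vae_scale_factor)


if __name__ == "__main__":
    shape = latent_shape(512, 512)
    print(shape)                        # (1, 4, 64, 64)
    print(shape[2] * 8, shape[3] * 8)   # 512 512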
self.vae_scale_factor = 2 ** 3 + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + def load_model(self, model, model_name, device): + if "NPU" in device: + with open(os.path.join(model, f"{model_name}.blob"), "rb") as f: + return self.core.import_model(f.read(), device) + return self.core.compile_model(os.path.join(model, f"{model_name}.xml"), device) + + def _encode_prompt( + self, + prompt, + num_images_per_prompt, + prompt_embeds: None, + ): + r""" + Encodes the prompt into text encoder hidden states. + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + num_images_per_prompt (`int`): + number of images that should be generated per prompt + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + """ + + if prompt_embeds is None: + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer( + prompt, padding="longest", return_tensors="pt" + ).input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[ + -1 + ] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder(text_input_ids, share_inputs=True, share_outputs=True) + prompt_embeds = torch.from_numpy(prompt_embeds[0]) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view( + bs_embed * num_images_per_prompt, seq_len, -1 + ) + + # Don't need to get uncond prompt embedding because of LCM Guided Distillation + return prompt_embeds + + def run_safety_checker(self, image, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess( + image, output_type="pil" + ) + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor( + feature_extractor_input, return_tensors="pt" + ) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + def prepare_latents( + self, batch_size, num_channels_latents, height, width, dtype, latents=None + ): + shape = ( + batch_size, + num_channels_latents, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if latents is None: + latents = torch.randn(shape, dtype=dtype) + # scale the initial noise by the standard deviation required by the scheduler + return latents + + def get_w_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + see https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + Args: + timesteps: torch.Tensor: generate embedding vectors at these timesteps + embedding_dim: int: dimension of the embeddings to generate + dtype: data type of the generated embeddings 
+ Returns: + embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = 512, + width: Optional[int] = 512, + guidance_scale: float = 7.5, + scheduler = None, + num_images_per_prompt: Optional[int] = 1, + latents: Optional[torch.FloatTensor] = None, + num_inference_steps: int = 4, + lcm_origin_steps: int = 50, + prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + model: Optional[Dict[str, any]] = None, + seed: Optional[int] = 1234567, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + callback = None, + callback_userdata = None + ): + + # 1. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if seed is not None: + torch.manual_seed(seed) + + #print("After Step 1: batch size is ", batch_size) + # do_classifier_free_guidance = guidance_scale > 0.0 + # In LCM Implementation: cfg_noise = noise_cond + cfg_scale * (noise_cond - noise_uncond) , (cfg_scale > 0.0 using CFG) + + # 2. Encode input prompt + prompt_embeds = self._encode_prompt( + prompt, + num_images_per_prompt, + prompt_embeds=prompt_embeds, + ) + #print("After Step 2: prompt embeds is ", prompt_embeds) + #print("After Step 2: scheduler is ", scheduler ) + # 3. Prepare timesteps + scheduler.set_timesteps(num_inference_steps, original_inference_steps=lcm_origin_steps) + timesteps = scheduler.timesteps + + #print("After Step 3: timesteps is ", timesteps) + + # 4. Prepare latent variable + num_channels_latents = 4 + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + latents, + ) + latents = latents * scheduler.init_noise_sigma + + #print("After Step 4: ") + bs = batch_size * num_images_per_prompt + + # 5. Get Guidance Scale Embedding + w = torch.tensor(guidance_scale).repeat(bs) + w_embedding = self.get_w_embedding(w, embedding_dim=256) + #print("After Step 5: ") + # 6. 
LCM MultiStep Sampling Loop: + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if callback: + callback(i+1, callback_userdata) + + ts = torch.full((bs,), t, dtype=torch.long) + + # model prediction (v-prediction, eps, x) + model_pred = self.unet([latents, ts, prompt_embeds, w_embedding],share_inputs=True, share_outputs=True)[0] + + # compute the previous noisy sample x_t -> x_t-1 + latents, denoised = scheduler.step( + torch.from_numpy(model_pred), t, latents, return_dict=False + ) + progress_bar.update() + + #print("After Step 6: ") + + vae_start = time.time() + + if not output_type == "latent": + image = torch.from_numpy(self.vae_decoder(denoised / 0.18215, share_inputs=True, share_outputs=True)[0]) + else: + image = denoised + + print("Decoder Ended: ", time.time() - vae_start) + #post_start = time.time() + + #if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + #else: + # do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + #print ("After do_denormalize: image is ", image) + + image = self.image_processor.postprocess( + image, output_type=output_type, do_denormalize=do_denormalize + ) + + return image[0] + +class LatentConsistencyEngineAdvanced(DiffusionPipeline): + def __init__( + self, + model="SimianLuo/LCM_Dreamshaper_v7", + tokenizer="openai/clip-vit-large-patch14", + device=["CPU", "CPU", "CPU"], + ): + super().__init__() + try: + self.tokenizer = CLIPTokenizer.from_pretrained(model, local_files_only=True) + except: + self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer) + self.tokenizer.save_pretrained(model) + + self.core = Core() + self.core.set_property({'CACHE_DIR': os.path.join(model, 'cache')}) # adding caching to reduce init time + #try_enable_npu_turbo(device, self.core) + + + with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor: + text_future = executor.submit(self.load_model, model, "text_encoder", device[0]) + unet_future = executor.submit(self.load_model, model, "unet", device[1]) + vae_de_future = executor.submit(self.load_model, model, "vae_decoder", device[2]) + vae_encoder_future = executor.submit(self.load_model, model, "vae_encoder", device[2]) + + + print("Text Device:", device[0]) + self.text_encoder = text_future.result() + self._text_encoder_output = self.text_encoder.output(0) + + print("Unet Device:", device[1]) + self.unet = unet_future.result() + self._unet_output = self.unet.output(0) + self.infer_request = self.unet.create_infer_request() + + print(f"VAE Device: {device[2]}") + self.vae_decoder = vae_de_future.result() + self.vae_encoder = vae_encoder_future.result() + self._vae_e_output = self.vae_encoder.output(0) if self.vae_encoder else None + + self.infer_request_vae = self.vae_decoder.create_infer_request() + self.safety_checker = None #pipe.safety_checker + self.feature_extractor = None #pipe.feature_extractor + self.vae_scale_factor = 2 ** 3 + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + def load_model(self, model, model_name, device): + print(f"Compiling the {model_name} to {device} ...") + return self.core.compile_model(os.path.join(model, f"{model_name}.xml"), device) + + def get_timesteps(self, num_inference_steps:int, strength:float, scheduler): + """ + Helper function for getting scheduler timesteps for generation + In case of image-to-image generation, it updates number of steps according to strength + + Parameters: + num_inference_steps (int): + number of inference steps for 
generation + strength (float): + value between 0.0 and 1.0, that controls the amount of noise that is added to the input image. + Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input. + """ + # get the original timestep using init_timestep + + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + def _encode_prompt( + self, + prompt, + num_images_per_prompt, + prompt_embeds: None, + ): + r""" + Encodes the prompt into text encoder hidden states. + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + num_images_per_prompt (`int`): + number of images that should be generated per prompt + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + """ + + if prompt_embeds is None: + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer( + prompt, padding="longest", return_tensors="pt" + ).input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[ + -1 + ] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder(text_input_ids, share_inputs=True, share_outputs=True) + prompt_embeds = torch.from_numpy(prompt_embeds[0]) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view( + bs_embed * num_images_per_prompt, seq_len, -1 + ) + + # Don't need to get uncond prompt embedding because of LCM Guided Distillation + return prompt_embeds + + def run_safety_checker(self, image, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess( + image, output_type="pil" + ) + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor( + feature_extractor_input, return_tensors="pt" + ) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concep + + def prepare_latents( + self,image,timestep,batch_size, num_channels_latents, height, width, dtype, scheduler,latents=None, + ): + shape = ( + batch_size, + num_channels_latents, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if image: + #latents_shape = (1, 4, 512, 512 // 8) + #input_image, meta = preprocess(image,512,512) + latents_shape = (1, 4, 512 // 8, 512 // 8) + noise = np.random.randn(*latents_shape).astype(np.float32) + input_image,meta = preprocess(image,512,512) + moments = self.vae_encoder(input_image)[self._vae_e_output] + mean, logvar = 
np.split(moments, 2, axis=1) + std = np.exp(logvar * 0.5) + latents = (mean + std * np.random.randn(*mean.shape)) * 0.18215 + noise = torch.randn(shape, dtype=dtype) + #latents = scheduler.add_noise(init_latents, noise, timestep) + latents = scheduler.add_noise(torch.from_numpy(latents), noise, timestep) + + else: + latents = torch.randn(shape, dtype=dtype) + # scale the initial noise by the standard deviation required by the scheduler + return latents + + def get_w_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + see https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + Args: + timesteps: torch.Tensor: generate embedding vectors at these timesteps + embedding_dim: int: dimension of the embeddings to generate + dtype: data type of the generated embeddings + Returns: + embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + init_image: Optional[PIL.Image.Image] = None, + strength: Optional[float] = 0.8, + height: Optional[int] = 512, + width: Optional[int] = 512, + guidance_scale: float = 7.5, + scheduler = None, + num_images_per_prompt: Optional[int] = 1, + latents: Optional[torch.FloatTensor] = None, + num_inference_steps: int = 4, + lcm_origin_steps: int = 50, + prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + model: Optional[Dict[str, any]] = None, + seed: Optional[int] = 1234567, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + callback = None, + callback_userdata = None + ): + + # 1. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if seed is not None: + torch.manual_seed(seed) + + #print("After Step 1: batch size is ", batch_size) + # do_classifier_free_guidance = guidance_scale > 0.0 + # In LCM Implementation: cfg_noise = noise_cond + cfg_scale * (noise_cond - noise_uncond) , (cfg_scale > 0.0 using CFG) + + # 2. Encode input prompt + prompt_embeds = self._encode_prompt( + prompt, + num_images_per_prompt, + prompt_embeds=prompt_embeds, + ) + #print("After Step 2: prompt embeds is ", prompt_embeds) + #print("After Step 2: scheduler is ", scheduler ) + # 3. 
Prepare timesteps + #scheduler.set_timesteps(num_inference_steps, original_inference_steps=lcm_origin_steps) + latent_timestep = None + if init_image: + scheduler.set_timesteps(num_inference_steps, original_inference_steps=lcm_origin_steps) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, scheduler) + latent_timestep = timesteps[:1] + else: + scheduler.set_timesteps(num_inference_steps, original_inference_steps=lcm_origin_steps) + timesteps = scheduler.timesteps + #timesteps = scheduler.timesteps + #latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + #print("timesteps: ", latent_timestep) + + #print("After Step 3: timesteps is ", timesteps) + + # 4. Prepare latent variable + num_channels_latents = 4 + latents = self.prepare_latents( + init_image, + latent_timestep, + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + scheduler, + latents, + ) + + latents = latents * scheduler.init_noise_sigma + + #print("After Step 4: ") + bs = batch_size * num_images_per_prompt + + # 5. Get Guidance Scale Embedding + w = torch.tensor(guidance_scale).repeat(bs) + w_embedding = self.get_w_embedding(w, embedding_dim=256) + #print("After Step 5: ") + # 6. LCM MultiStep Sampling Loop: + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if callback: + callback(i+1, callback_userdata) + + ts = torch.full((bs,), t, dtype=torch.long) + + # model prediction (v-prediction, eps, x) + model_pred = self.unet([latents, ts, prompt_embeds, w_embedding],share_inputs=True, share_outputs=True)[0] + + # compute the previous noisy sample x_t -> x_t-1 + latents, denoised = scheduler.step( + torch.from_numpy(model_pred), t, latents, return_dict=False + ) + progress_bar.update() + + #print("After Step 6: ") + + vae_start = time.time() + + if not output_type == "latent": + image = torch.from_numpy(self.vae_decoder(denoised / 0.18215, share_inputs=True, share_outputs=True)[0]) + else: + image = denoised + + print("Decoder Ended: ", time.time() - vae_start) + #post_start = time.time() + + #if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + #else: + # do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + #print ("After do_denormalize: image is ", image) + + image = self.image_processor.postprocess( + image, output_type=output_type, do_denormalize=do_denormalize + ) + + return image[0] + +class StableDiffusionEngineReferenceOnly(DiffusionPipeline): + def __init__( + self, + #scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + model="bes-dev/stable-diffusion-v1-4-openvino", + tokenizer="openai/clip-vit-large-patch14", + device=["CPU","CPU","CPU"] + ): + #self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer) + try: + self.tokenizer = CLIPTokenizer.from_pretrained(model,local_files_only=True) + except: + self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer) + self.tokenizer.save_pretrained(model) + + #self.scheduler = scheduler + # models + + self.core = Core() + self.core.set_property({'CACHE_DIR': os.path.join(model, 'cache')}) #adding caching to reduce init time + # text features + + print("Text Device:",device[0]) + self.text_encoder = self.core.compile_model(os.path.join(model, "text_encoder.xml"), device[0]) + + self._text_encoder_output = self.text_encoder.output(0) + + # diffusion + print("unet_w Device:",device[1]) + self.unet_w = self.core.compile_model(os.path.join(model, 
"unet_reference_write.xml"), device[1]) + self._unet_w_output = self.unet_w.output(0) + self.latent_shape = tuple(self.unet_w.inputs[0].shape)[1:] + + print("unet_r Device:",device[1]) + self.unet_r = self.core.compile_model(os.path.join(model, "unet_reference_read.xml"), device[1]) + self._unet_r_output = self.unet_r.output(0) + # decoder + print("Vae Device:",device[2]) + + self.vae_decoder = self.core.compile_model(os.path.join(model, "vae_decoder.xml"), device[2]) + + # encoder + + self.vae_encoder = self.core.compile_model(os.path.join(model, "vae_encoder.xml"), device[2]) + + self.init_image_shape = tuple(self.vae_encoder.inputs[0].shape)[2:] + + self._vae_d_output = self.vae_decoder.output(0) + self._vae_e_output = self.vae_encoder.output(0) if self.vae_encoder is not None else None + + self.height = self.unet_w.input(0).shape[2] * 8 + self.width = self.unet_w.input(0).shape[3] * 8 + + + + def __call__( + self, + prompt, + image = None, + negative_prompt=None, + scheduler=None, + strength = 1.0, + num_inference_steps = 32, + guidance_scale = 7.5, + eta = 0.0, + create_gif = False, + model = None, + callback = None, + callback_userdata = None + ): + # extract condition + text_input = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="np", + ) + text_embeddings = self.text_encoder(text_input.input_ids)[self._text_encoder_output] + + + # do classifier free guidance + do_classifier_free_guidance = guidance_scale > 1.0 + if do_classifier_free_guidance: + + if negative_prompt is None: + uncond_tokens = [""] + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + else: + uncond_tokens = negative_prompt + + tokens_uncond = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.tokenizer.model_max_length, #truncation=True, + return_tensors="np" + ) + uncond_embeddings = self.text_encoder(tokens_uncond.input_ids)[self._text_encoder_output] + text_embeddings = np.concatenate([uncond_embeddings, text_embeddings]) + + # set timesteps + accepts_offset = "offset" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + extra_set_kwargs = {} + + if accepts_offset: + extra_set_kwargs["offset"] = 1 + + scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) + + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, scheduler) + latent_timestep = timesteps[:1] + + ref_image = self.prepare_image( + image=image, + width=512, + height=512, + ) + # get the initial random noise unless the user supplied it + latents, meta = self.prepare_latents(None, latent_timestep, scheduler) + #ref_image_latents, _ = self.prepare_latents(init_image, latent_timestep, scheduler) + ref_image_latents = self.ov_prepare_ref_latents(ref_image) + + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + accepts_eta = "eta" in set(inspect.signature(scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + if create_gif: + frames = [] + + for i, t in enumerate(self.progress_bar(timesteps)): + if callback: + callback(i, callback_userdata) + + # expand the latents if we are doing classifier free guidance + latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = scheduler.scale_model_input(latent_model_input, t) + + # ref only part + noise = randn_tensor( + ref_image_latents.shape + ) + + ref_xt = scheduler.add_noise( + torch.from_numpy(ref_image_latents), + noise, + t.reshape( + 1, + ), + ).numpy() + ref_xt = np.concatenate([ref_xt] * 2) if do_classifier_free_guidance else ref_xt + ref_xt = scheduler.scale_model_input(ref_xt, t) + + # MODE = "write" + result_w_dict = self.unet_w([ + ref_xt, + t, + text_embeddings + ]) + down_0_attn0 = result_w_dict["/unet/down_blocks.0/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0"] + down_0_attn1 = result_w_dict["/unet/down_blocks.0/attentions.1/transformer_blocks.0/norm1/LayerNormalization_output_0"] + down_1_attn0 = result_w_dict["/unet/down_blocks.1/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0"] + down_1_attn1 = result_w_dict["/unet/down_blocks.1/attentions.1/transformer_blocks.0/norm1/LayerNormalization_output_0"] + down_2_attn0 = result_w_dict["/unet/down_blocks.2/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0"] + down_2_attn1 = result_w_dict["/unet/down_blocks.2/attentions.1/transformer_blocks.0/norm1/LayerNormalization_output_0"] + mid_attn0 = result_w_dict["/unet/mid_block/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0"] + up_1_attn0 = result_w_dict["/unet/up_blocks.1/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0"] + up_1_attn1 = result_w_dict["/unet/up_blocks.1/attentions.1/transformer_blocks.0/norm1/LayerNormalization_output_0"] + up_1_attn2 = result_w_dict["/unet/up_blocks.1/attentions.2/transformer_blocks.0/norm1/LayerNormalization_output_0"] + up_2_attn0 = result_w_dict["/unet/up_blocks.2/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0"] + up_2_attn1 = result_w_dict["/unet/up_blocks.2/attentions.1/transformer_blocks.0/norm1/LayerNormalization_output_0"] + up_2_attn2 = result_w_dict["/unet/up_blocks.2/attentions.2/transformer_blocks.0/norm1/LayerNormalization_output_0"] + up_3_attn0 = result_w_dict["/unet/up_blocks.3/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0"] + up_3_attn1 = result_w_dict["/unet/up_blocks.3/attentions.1/transformer_blocks.0/norm1/LayerNormalization_output_0"] + up_3_attn2 = result_w_dict["/unet/up_blocks.3/attentions.2/transformer_blocks.0/norm1/LayerNormalization_output_0"] + + # MODE = "read" + noise_pred = self.unet_r([ + latent_model_input, t, text_embeddings, down_0_attn0, down_0_attn1, down_1_attn0, + down_1_attn1, down_2_attn0, down_2_attn1, mid_attn0, up_1_attn0, up_1_attn1, up_1_attn2, + up_2_attn0, up_2_attn1, up_2_attn2, up_3_attn0, up_3_attn1, up_3_attn2 + ])[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1] + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = 
scheduler.step(torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs)["prev_sample"].numpy() + + if create_gif: + frames.append(latents) + + if callback: + callback(num_inference_steps, callback_userdata) + + # scale and decode the image latents with vae + + image = self.vae_decoder(latents)[self._vae_d_output] + + image = self.postprocess_image(image, meta) + + if create_gif: + gif_folder=os.path.join(model,"../../../gif") + if not os.path.exists(gif_folder): + os.makedirs(gif_folder) + for i in range(0,len(frames)): + image = self.vae_decoder(frames[i])[self._vae_d_output] + image = self.postprocess_image(image, meta) + output = gif_folder + "/" + str(i).zfill(3) +".png" + cv2.imwrite(output, image) + with open(os.path.join(gif_folder, "prompt.json"), "w") as file: + json.dump({"prompt": prompt}, file) + frames_image = [Image.open(image) for image in glob.glob(f"{gif_folder}/*.png")] + frame_one = frames_image[0] + gif_file=os.path.join(gif_folder,"stable_diffusion.gif") + frame_one.save(gif_file, format="GIF", append_images=frames_image, save_all=True, duration=100, loop=0) + + return image + + def ov_prepare_ref_latents(self, refimage, vae_scaling_factor=0.18215): + #refimage = refimage.to(device=device, dtype=dtype) + + # encode the mask image into latents space so we can concatenate it to the latents + moments = self.vae_encoder(refimage)[0] + mean, logvar = np.split(moments, 2, axis=1) + std = np.exp(logvar * 0.5) + ref_image_latents = (mean + std * np.random.randn(*mean.shape)) + ref_image_latents = vae_scaling_factor * ref_image_latents + #ref_image_latents = scheduler.add_noise(torch.from_numpy(ref_image_latents), torch.from_numpy(noise), latent_timestep).numpy() + + # aligning device to prevent device errors when concating it with the latent model input + #ref_image_latents = ref_image_latents.to(device=device, dtype=dtype) + return ref_image_latents + + def prepare_latents(self, image:PIL.Image.Image = None, latent_timestep:torch.Tensor = None, scheduler = LMSDiscreteScheduler): + """ + Function for getting initial latents for starting generation + + Parameters: + image (PIL.Image.Image, *optional*, None): + Input image for generation, if not provided randon noise will be used as starting point + latent_timestep (torch.Tensor, *optional*, None): + Predicted by scheduler initial step for image generation, required for latent image mixing with nosie + Returns: + latents (np.ndarray): + Image encoded in latent space + """ + latents_shape = (1, 4, self.height // 8, self.width // 8) + + noise = np.random.randn(*latents_shape).astype(np.float32) + if image is None: + #print("Image is NONE") + # if we use LMSDiscreteScheduler, let's make sure latents are mulitplied by sigmas + if isinstance(scheduler, LMSDiscreteScheduler): + + noise = noise * scheduler.sigmas[0].numpy() + return noise, {} + elif isinstance(scheduler, EulerDiscreteScheduler): + + noise = noise * scheduler.sigmas.max().numpy() + return noise, {} + else: + return noise, {} + input_image, meta = preprocess(image,self.height,self.width) + + moments = self.vae_encoder(input_image)[self._vae_e_output] + + mean, logvar = np.split(moments, 2, axis=1) + + std = np.exp(logvar * 0.5) + latents = (mean + std * np.random.randn(*mean.shape)) * 0.18215 + + + latents = scheduler.add_noise(torch.from_numpy(latents), torch.from_numpy(noise), latent_timestep).numpy() + return latents, meta + + def postprocess_image(self, image:np.ndarray, meta:Dict): + """ + Postprocessing for decoded image. 
Takes generated image decoded by VAE decoder, unpad it to initila image size (if required), + normalize and convert to [0, 255] pixels range. Optionally, convertes it from np.ndarray to PIL.Image format + + Parameters: + image (np.ndarray): + Generated image + meta (Dict): + Metadata obtained on latents preparing step, can be empty + output_type (str, *optional*, pil): + Output format for result, can be pil or numpy + Returns: + image (List of np.ndarray or PIL.Image.Image): + Postprocessed images + + if "src_height" in meta: + orig_height, orig_width = meta["src_height"], meta["src_width"] + image = [cv2.resize(img, (orig_width, orig_height)) + for img in image] + + return image + """ + if "padding" in meta: + pad = meta["padding"] + (_, end_h), (_, end_w) = pad[1:3] + h, w = image.shape[2:] + #print("image shape",image.shape[2:]) + unpad_h = h - end_h + unpad_w = w - end_w + image = image[:, :, :unpad_h, :unpad_w] + image = np.clip(image / 2 + 0.5, 0, 1) + image = (image[0].transpose(1, 2, 0)[:, :, ::-1] * 255).astype(np.uint8) + + + + if "src_height" in meta: + orig_height, orig_width = meta["src_height"], meta["src_width"] + image = cv2.resize(image, (orig_width, orig_height)) + + return image + + + #image = (image / 2 + 0.5).clip(0, 1) + #image = (image[0].transpose(1, 2, 0)[:, :, ::-1] * 255).astype(np.uint8) + + + def get_timesteps(self, num_inference_steps:int, strength:float, scheduler): + """ + Helper function for getting scheduler timesteps for generation + In case of image-to-image generation, it updates number of steps according to strength + + Parameters: + num_inference_steps (int): + number of inference steps for generation + strength (float): + value between 0.0 and 1.0, that controls the amount of noise that is added to the input image. + Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input. 
+ """ + # get the original timestep using init_timestep + + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + def prepare_image( + self, + image, + width, + height, + do_classifier_free_guidance=False, + guess_mode=False, + ): + if not isinstance(image, np.ndarray): + if isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + images = [] + + for image_ in image: + image_ = image_.convert("RGB") + image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]) + image_ = np.array(image_) + image_ = image_[None, :] + images.append(image_) + + image = images + + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = (image - 0.5) / 0.5 + image = image.transpose(0, 3, 1, 2) + elif isinstance(image[0], np.ndarray): + image = np.concatenate(image, dim=0) + + if do_classifier_free_guidance and not guess_mode: + image = np.concatenate([image] * 2) + + return image + +def print_npu_turbo_art(): + random_number = random.randint(1, 3) + + if random_number == 1: + print(" ") + print(" ___ ___ ___ ___ ___ ___ ") + print(" /\ \ /\ \ /\ \ /\ \ /\ \ _____ /\ \ ") + print(" \:\ \ /::\ \ \:\ \ ___ \:\ \ /::\ \ /::\ \ /::\ \ ") + print(" \:\ \ /:/\:\__\ \:\ \ /\__\ \:\ \ /:/\:\__\ /:/\:\ \ /:/\:\ \ ") + print(" _____\:\ \ /:/ /:/ / ___ \:\ \ /:/ / ___ \:\ \ /:/ /:/ / /:/ /::\__\ /:/ \:\ \ ") + print(" /::::::::\__\ /:/_/:/ / /\ \ \:\__\ /:/__/ /\ \ \:\__\ /:/_/:/__/___ /:/_/:/\:|__| /:/__/ \:\__\ ") + print(" \:\~~\~~\/__/ \:\/:/ / \:\ \ /:/ / /::\ \ \:\ \ /:/ / \:\/:::::/ / \:\/:/ /:/ / \:\ \ /:/ / ") + print(" \:\ \ \::/__/ \:\ /:/ / /:/\:\ \ \:\ /:/ / \::/~~/~~~~ \::/_/:/ / \:\ /:/ / ") + print(" \:\ \ \:\ \ \:\/:/ / \/__\:\ \ \:\/:/ / \:\~~\ \:\/:/ / \:\/:/ / ") + print(" \:\__\ \:\__\ \::/ / \:\__\ \::/ / \:\__\ \::/ / \::/ / ") + print(" \/__/ \/__/ \/__/ \/__/ \/__/ \/__/ \/__/ \/__/ ") + print(" ") + elif random_number == 2: + print(" _ _ ____ _ _ _____ _ _ ____ ____ ___ ") + print("| \ | | | _ \ | | | | |_ _| | | | | | _ \ | __ ) / _ \ ") + print("| \| | | |_) | | | | | | | | | | | | |_) | | _ \ | | | |") + print("| |\ | | __/ | |_| | | | | |_| | | _ < | |_) | | |_| |") + print("|_| \_| |_| \___/ |_| \___/ |_| \_\ |____/ \___/ ") + print(" ") + else: + print("") + print(" ) ( ( ) ") + print(" ( /( )\ ) * ) )\ ) ( ( /( ") + print(" )\()) (()/( ( ` ) /( ( (()/( ( )\ )\()) ") + print("((_)\ /(_)) )\ ( )(_)) )\ /(_)) )((_) ((_)\ ") + print(" _((_) (_)) _ ((_) (_(_()) _ ((_) (_)) ((_)_ ((_) ") + print("| \| | | _ \ | | | | |_ _| | | | | | _ \ | _ ) / _ \ ") + print("| .` | | _/ | |_| | | | | |_| | | / | _ \ | (_) | ") + print("|_|\_| |_| \___/ |_| \___/ |_|_\ |___/ \___/ ") + print(" ") + + + diff --git a/backend/pipelines/lcm.py b/backend/pipelines/lcm.py new file mode 100644 index 0000000000000000000000000000000000000000..4fe428516822ede118980002370c45adcf74c0be --- /dev/null +++ b/backend/pipelines/lcm.py @@ -0,0 +1,122 @@ +from constants import LCM_DEFAULT_MODEL +from diffusers import ( + DiffusionPipeline, + AutoencoderTiny, + UNet2DConditionModel, + LCMScheduler, + StableDiffusionPipeline, +) +import torch +from backend.tiny_decoder import get_tiny_decoder_vae_model +from typing import Any +from diffusers import ( + LCMScheduler, + StableDiffusionImg2ImgPipeline, + StableDiffusionXLImg2ImgPipeline, + 
AutoPipelineForText2Image, + AutoPipelineForImage2Image, + StableDiffusionControlNetPipeline, +) +import pathlib + + +def _get_lcm_pipeline_from_base_model( + lcm_model_id: str, + base_model_id: str, + use_local_model: bool, +): + pipeline = None + unet = UNet2DConditionModel.from_pretrained( + lcm_model_id, + torch_dtype=torch.float32, + local_files_only=use_local_model, + resume_download=True, + ) + pipeline = DiffusionPipeline.from_pretrained( + base_model_id, + unet=unet, + torch_dtype=torch.float32, + local_files_only=use_local_model, + resume_download=True, + ) + pipeline.scheduler = LCMScheduler.from_config(pipeline.scheduler.config) + return pipeline + + +def load_taesd( + pipeline: Any, + use_local_model: bool = False, + torch_data_type: torch.dtype = torch.float32, +): + vae_model = get_tiny_decoder_vae_model(pipeline.__class__.__name__) + pipeline.vae = AutoencoderTiny.from_pretrained( + vae_model, + torch_dtype=torch_data_type, + local_files_only=use_local_model, + ) + + +def get_lcm_model_pipeline( + model_id: str = LCM_DEFAULT_MODEL, + use_local_model: bool = False, + pipeline_args={}, +): + pipeline = None + if model_id == "latent-consistency/lcm-sdxl": + pipeline = _get_lcm_pipeline_from_base_model( + model_id, + "stabilityai/stable-diffusion-xl-base-1.0", + use_local_model, + ) + + elif model_id == "latent-consistency/lcm-ssd-1b": + pipeline = _get_lcm_pipeline_from_base_model( + model_id, + "segmind/SSD-1B", + use_local_model, + ) + elif pathlib.Path(model_id).suffix == ".safetensors": + # When loading a .safetensors model, the pipeline has to be created + # with StableDiffusionPipeline() since it's the only class that + # defines the method from_single_file() + dummy_pipeline = StableDiffusionPipeline.from_single_file( + model_id, + safety_checker=None, + run_safety_checker=False, + load_safety_checker=False, + local_files_only=use_local_model, + use_safetensors=True, + ) + if 'lcm' in model_id.lower(): + dummy_pipeline.scheduler = LCMScheduler.from_config(dummy_pipeline.scheduler.config) + + pipeline = AutoPipelineForText2Image.from_pipe( + dummy_pipeline, + **pipeline_args, + ) + del dummy_pipeline + else: + # pipeline = DiffusionPipeline.from_pretrained( + pipeline = AutoPipelineForText2Image.from_pretrained( + model_id, + local_files_only=use_local_model, + **pipeline_args, + ) + + return pipeline + + +def get_image_to_image_pipeline(pipeline: Any) -> Any: + components = pipeline.components + pipeline_class = pipeline.__class__.__name__ + if ( + pipeline_class == "LatentConsistencyModelPipeline" + or pipeline_class == "StableDiffusionPipeline" + ): + return StableDiffusionImg2ImgPipeline(**components) + elif pipeline_class == "StableDiffusionControlNetPipeline": + return AutoPipelineForImage2Image.from_pipe(pipeline) + elif pipeline_class == "StableDiffusionXLPipeline": + return StableDiffusionXLImg2ImgPipeline(**components) + else: + raise Exception(f"Unknown pipeline {pipeline_class}") diff --git a/backend/pipelines/lcm_lora.py b/backend/pipelines/lcm_lora.py new file mode 100644 index 0000000000000000000000000000000000000000..1816f99ee90d732498c025f5047553bb9228c734 --- /dev/null +++ b/backend/pipelines/lcm_lora.py @@ -0,0 +1,81 @@ +import pathlib +from os import path + +import torch +from diffusers import ( + AutoPipelineForText2Image, + LCMScheduler, + StableDiffusionPipeline, +) + + +def load_lcm_weights( + pipeline, + use_local_model, + lcm_lora_id, +): + kwargs = { + "local_files_only": use_local_model, + "weight_name": "pytorch_lora_weights.safetensors", + } 
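# A hedged end-to-end usage sketch of the flow implemented in this module: load a
# base SD 1.5 pipeline, attach the LCM-LoRA weights under the "lcm" adapter name,
# and switch to LCMScheduler. The model IDs mirror the CLI defaults; downloading
# them (or having them cached locally) is assumed.
import torch
from diffusers import AutoPipelineForText2Image, LCMScheduler


def build_lcm_lora_pipeline(
    base_model_id: str = "Lykon/dreamshaper-8",
    lcm_lora_id: str = "latent-consistency/lcm-lora-sdv1-5",
):
    pipeline = AutoPipelineForText2Image.from_pretrained(
        base_model_id,
        torch_dtype=torch.float32,
    )
    pipeline.load_lora_weights(lcm_lora_id, adapter_name="lcm")
    pipeline.scheduler = LCMScheduler.from_config(pipeline.scheduler.config)
    return pipeline


if __name__ == "__main__":
    pipe = build_lcm_lora_pipeline()
    image = pipe(
        "a cup of coffee on a wooden table",
        num_inference_steps=4,
        guidance_scale=1.0,
    ).images[0]
    image.save("lcm_lora_sketch.png")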
+ pipeline.load_lora_weights( + lcm_lora_id, + **kwargs, + adapter_name="lcm", + ) + + +def get_lcm_lora_pipeline( + base_model_id: str, + lcm_lora_id: str, + use_local_model: bool, + torch_data_type: torch.dtype, + pipeline_args={}, +): + if pathlib.Path(base_model_id).suffix == ".safetensors": + # SD 1.5 models only + # When loading a .safetensors model, the pipeline has to be created + # with StableDiffusionPipeline() since it's the only class that + # defines the method from_single_file(); afterwards a new pipeline + # is created using AutoPipelineForText2Image() for ControlNet + # support, in case ControlNet is enabled + if not path.exists(base_model_id): + raise FileNotFoundError( + f"Model file not found,Please check your model path: {base_model_id}" + ) + print("Using single file Safetensors model (Supported models - SD 1.5 models)") + + dummy_pipeline = StableDiffusionPipeline.from_single_file( + base_model_id, + torch_dtype=torch_data_type, + safety_checker=None, + local_files_only=use_local_model, + use_safetensors=True, + ) + pipeline = AutoPipelineForText2Image.from_pipe( + dummy_pipeline, + **pipeline_args, + ) + del dummy_pipeline + else: + pipeline = AutoPipelineForText2Image.from_pretrained( + base_model_id, + torch_dtype=torch_data_type, + local_files_only=use_local_model, + **pipeline_args, + ) + + load_lcm_weights( + pipeline, + use_local_model, + lcm_lora_id, + ) + # Always fuse LCM-LoRA + # pipeline.fuse_lora() + + if "lcm" in lcm_lora_id.lower() or "hypersd" in lcm_lora_id.lower(): + print("LCM LoRA model detected so using recommended LCMScheduler") + pipeline.scheduler = LCMScheduler.from_config(pipeline.scheduler.config) + + # pipeline.unet.to(memory_format=torch.channels_last) + return pipeline diff --git a/backend/tiny_decoder.py b/backend/tiny_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..957cfcbff65cc22b38450462e052fba50e4d764f --- /dev/null +++ b/backend/tiny_decoder.py @@ -0,0 +1,32 @@ +from constants import ( + TAESD_MODEL, + TAESDXL_MODEL, + TAESD_MODEL_OPENVINO, + TAESDXL_MODEL_OPENVINO, +) + + +def get_tiny_decoder_vae_model(pipeline_class) -> str: + print(f"Pipeline class : {pipeline_class}") + if ( + pipeline_class == "LatentConsistencyModelPipeline" + or pipeline_class == "StableDiffusionPipeline" + or pipeline_class == "StableDiffusionImg2ImgPipeline" + or pipeline_class == "StableDiffusionControlNetPipeline" + or pipeline_class == "StableDiffusionControlNetImg2ImgPipeline" + ): + return TAESD_MODEL + elif ( + pipeline_class == "StableDiffusionXLPipeline" + or pipeline_class == "StableDiffusionXLImg2ImgPipeline" + ): + return TAESDXL_MODEL + elif ( + pipeline_class == "OVStableDiffusionPipeline" + or pipeline_class == "OVStableDiffusionImg2ImgPipeline" + ): + return TAESD_MODEL_OPENVINO + elif pipeline_class == "OVStableDiffusionXLPipeline": + return TAESDXL_MODEL_OPENVINO + else: + raise Exception("No valid pipeline class found!") diff --git a/backend/upscale/aura_sr.py b/backend/upscale/aura_sr.py new file mode 100644 index 0000000000000000000000000000000000000000..787a66fd4e34b7c1f38662e721ff622024e22df7 --- /dev/null +++ b/backend/upscale/aura_sr.py @@ -0,0 +1,1004 @@ +# AuraSR: GAN-based Super-Resolution for real-world, a reproduction of the GigaGAN* paper. Implementation is +# based on the unofficial lucidrains/gigagan-pytorch repository. Heavily modified from there. 
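+#
+# Within this project the class is consumed through
+# backend/upscale/aura_sr_upscale.py, roughly as follows (illustrative sketch;
+# the input path is a placeholder):
+#
+#     aura_sr = AuraSR.from_pretrained("fal/AuraSR-v2", device="cpu")
+#     upscaled = aura_sr.upscale_4x(Image.open("input.png"))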
+# +# https://mingukkang.github.io/GigaGAN/ +from math import log2, ceil +from functools import partial +from typing import Any, Optional, List, Iterable + +import torch +from torchvision import transforms +from PIL import Image +from torch import nn, einsum, Tensor +import torch.nn.functional as F + +from einops import rearrange, repeat, reduce +from einops.layers.torch import Rearrange +from torchvision.utils import save_image +import math + + +def get_same_padding(size, kernel, dilation, stride): + return ((size - 1) * (stride - 1) + dilation * (kernel - 1)) // 2 + + +class AdaptiveConv2DMod(nn.Module): + def __init__( + self, + dim, + dim_out, + kernel, + *, + demod=True, + stride=1, + dilation=1, + eps=1e-8, + num_conv_kernels=1, # set this to be greater than 1 for adaptive + ): + super().__init__() + self.eps = eps + + self.dim_out = dim_out + + self.kernel = kernel + self.stride = stride + self.dilation = dilation + self.adaptive = num_conv_kernels > 1 + + self.weights = nn.Parameter( + torch.randn((num_conv_kernels, dim_out, dim, kernel, kernel)) + ) + + self.demod = demod + + nn.init.kaiming_normal_( + self.weights, a=0, mode="fan_in", nonlinearity="leaky_relu" + ) + + def forward( + self, fmap, mod: Optional[Tensor] = None, kernel_mod: Optional[Tensor] = None + ): + """ + notation + + b - batch + n - convs + o - output + i - input + k - kernel + """ + + b, h = fmap.shape[0], fmap.shape[-2] + + # account for feature map that has been expanded by the scale in the first dimension + # due to multiscale inputs and outputs + + if mod.shape[0] != b: + mod = repeat(mod, "b ... -> (s b) ...", s=b // mod.shape[0]) + + if exists(kernel_mod): + kernel_mod_has_el = kernel_mod.numel() > 0 + + assert self.adaptive or not kernel_mod_has_el + + if kernel_mod_has_el and kernel_mod.shape[0] != b: + kernel_mod = repeat( + kernel_mod, "b ... -> (s b) ...", s=b // kernel_mod.shape[0] + ) + + # prepare weights for modulation + + weights = self.weights + + if self.adaptive: + weights = repeat(weights, "... -> b ...", b=b) + + # determine an adaptive weight and 'select' the kernel to use with softmax + + assert exists(kernel_mod) and kernel_mod.numel() > 0 + + kernel_attn = kernel_mod.softmax(dim=-1) + kernel_attn = rearrange(kernel_attn, "b n -> b n 1 1 1 1") + + weights = reduce(weights * kernel_attn, "b n ... -> b ...", "sum") + + # do the modulation, demodulation, as done in stylegan2 + + mod = rearrange(mod, "b i -> b 1 i 1 1") + + weights = weights * (mod + 1) + + if self.demod: + inv_norm = ( + reduce(weights**2, "b o i k1 k2 -> b o 1 1 1", "sum") + .clamp(min=self.eps) + .rsqrt() + ) + weights = weights * inv_norm + + fmap = rearrange(fmap, "b c h w -> 1 (b c) h w") + + weights = rearrange(weights, "b o ... -> (b o) ...") + + padding = get_same_padding(h, self.kernel, self.dilation, self.stride) + fmap = F.conv2d(fmap, weights, padding=padding, groups=b) + + return rearrange(fmap, "1 (b o) ... 
-> b o ...", b=b) + + +class Attend(nn.Module): + def __init__(self, dropout=0.0, flash=False): + super().__init__() + self.dropout = dropout + self.attn_dropout = nn.Dropout(dropout) + self.scale = nn.Parameter(torch.randn(1)) + self.flash = flash + + def flash_attn(self, q, k, v): + q, k, v = map(lambda t: t.contiguous(), (q, k, v)) + out = F.scaled_dot_product_attention( + q, k, v, dropout_p=self.dropout if self.training else 0.0 + ) + return out + + def forward(self, q, k, v): + if self.flash: + return self.flash_attn(q, k, v) + + scale = q.shape[-1] ** -0.5 + + # similarity + sim = einsum("b h i d, b h j d -> b h i j", q, k) * scale + + # attention + attn = sim.softmax(dim=-1) + attn = self.attn_dropout(attn) + + # aggregate values + out = einsum("b h i j, b h j d -> b h i d", attn, v) + + return out + + +def exists(x): + return x is not None + + +def default(val, d): + if exists(val): + return val + return d() if callable(d) else d + + +def cast_tuple(t, length=1): + if isinstance(t, tuple): + return t + return (t,) * length + + +def identity(t, *args, **kwargs): + return t + + +def is_power_of_two(n): + return log2(n).is_integer() + + +def null_iterator(): + while True: + yield None + + +def Downsample(dim, dim_out=None): + return nn.Sequential( + Rearrange("b c (h p1) (w p2) -> b (c p1 p2) h w", p1=2, p2=2), + nn.Conv2d(dim * 4, default(dim_out, dim), 1), + ) + + +class RMSNorm(nn.Module): + def __init__(self, dim): + super().__init__() + self.g = nn.Parameter(torch.ones(1, dim, 1, 1)) + self.eps = 1e-4 + + def forward(self, x): + return F.normalize(x, dim=1) * self.g * (x.shape[1] ** 0.5) + + +# building block modules + + +class Block(nn.Module): + def __init__(self, dim, dim_out, groups=8, num_conv_kernels=0): + super().__init__() + self.proj = AdaptiveConv2DMod( + dim, dim_out, kernel=3, num_conv_kernels=num_conv_kernels + ) + self.kernel = 3 + self.dilation = 1 + self.stride = 1 + + self.act = nn.SiLU() + + def forward(self, x, conv_mods_iter: Optional[Iterable] = None): + conv_mods_iter = default(conv_mods_iter, null_iterator()) + + x = self.proj(x, mod=next(conv_mods_iter), kernel_mod=next(conv_mods_iter)) + + x = self.act(x) + return x + + +class ResnetBlock(nn.Module): + def __init__( + self, dim, dim_out, *, groups=8, num_conv_kernels=0, style_dims: List = [] + ): + super().__init__() + style_dims.extend([dim, num_conv_kernels, dim_out, num_conv_kernels]) + + self.block1 = Block( + dim, dim_out, groups=groups, num_conv_kernels=num_conv_kernels + ) + self.block2 = Block( + dim_out, dim_out, groups=groups, num_conv_kernels=num_conv_kernels + ) + self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity() + + def forward(self, x, conv_mods_iter: Optional[Iterable] = None): + h = self.block1(x, conv_mods_iter=conv_mods_iter) + h = self.block2(h, conv_mods_iter=conv_mods_iter) + + return h + self.res_conv(x) + + +class LinearAttention(nn.Module): + def __init__(self, dim, heads=4, dim_head=32): + super().__init__() + self.scale = dim_head**-0.5 + self.heads = heads + hidden_dim = dim_head * heads + + self.norm = RMSNorm(dim) + self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False) + + self.to_out = nn.Sequential(nn.Conv2d(hidden_dim, dim, 1), RMSNorm(dim)) + + def forward(self, x): + b, c, h, w = x.shape + + x = self.norm(x) + + qkv = self.to_qkv(x).chunk(3, dim=1) + q, k, v = map( + lambda t: rearrange(t, "b (h c) x y -> b h c (x y)", h=self.heads), qkv + ) + + q = q.softmax(dim=-2) + k = k.softmax(dim=-1) + + q = q * self.scale + + context = 
torch.einsum("b h d n, b h e n -> b h d e", k, v) + + out = torch.einsum("b h d e, b h d n -> b h e n", context, q) + out = rearrange(out, "b h c (x y) -> b (h c) x y", h=self.heads, x=h, y=w) + return self.to_out(out) + + +class Attention(nn.Module): + def __init__(self, dim, heads=4, dim_head=32, flash=False): + super().__init__() + self.heads = heads + hidden_dim = dim_head * heads + + self.norm = RMSNorm(dim) + + self.attend = Attend(flash=flash) + self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False) + self.to_out = nn.Conv2d(hidden_dim, dim, 1) + + def forward(self, x): + b, c, h, w = x.shape + x = self.norm(x) + qkv = self.to_qkv(x).chunk(3, dim=1) + + q, k, v = map( + lambda t: rearrange(t, "b (h c) x y -> b h (x y) c", h=self.heads), qkv + ) + + out = self.attend(q, k, v) + out = rearrange(out, "b h (x y) d -> b (h d) x y", x=h, y=w) + + return self.to_out(out) + + +# feedforward +def FeedForward(dim, mult=4): + return nn.Sequential( + RMSNorm(dim), + nn.Conv2d(dim, dim * mult, 1), + nn.GELU(), + nn.Conv2d(dim * mult, dim, 1), + ) + + +# transformers +class Transformer(nn.Module): + def __init__(self, dim, dim_head=64, heads=8, depth=1, flash_attn=True, ff_mult=4): + super().__init__() + self.layers = nn.ModuleList([]) + + for _ in range(depth): + self.layers.append( + nn.ModuleList( + [ + Attention( + dim=dim, dim_head=dim_head, heads=heads, flash=flash_attn + ), + FeedForward(dim=dim, mult=ff_mult), + ] + ) + ) + + def forward(self, x): + for attn, ff in self.layers: + x = attn(x) + x + x = ff(x) + x + + return x + + +class LinearTransformer(nn.Module): + def __init__(self, dim, dim_head=64, heads=8, depth=1, ff_mult=4): + super().__init__() + self.layers = nn.ModuleList([]) + + for _ in range(depth): + self.layers.append( + nn.ModuleList( + [ + LinearAttention(dim=dim, dim_head=dim_head, heads=heads), + FeedForward(dim=dim, mult=ff_mult), + ] + ) + ) + + def forward(self, x): + for attn, ff in self.layers: + x = attn(x) + x + x = ff(x) + x + + return x + + +class NearestNeighborhoodUpsample(nn.Module): + def __init__(self, dim, dim_out=None): + super().__init__() + dim_out = default(dim_out, dim) + self.conv = nn.Conv2d(dim, dim_out, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + + if x.shape[0] >= 64: + x = x.contiguous() + + x = F.interpolate(x, scale_factor=2.0, mode="nearest") + x = self.conv(x) + + return x + + +class EqualLinear(nn.Module): + def __init__(self, dim, dim_out, lr_mul=1, bias=True): + super().__init__() + self.weight = nn.Parameter(torch.randn(dim_out, dim)) + if bias: + self.bias = nn.Parameter(torch.zeros(dim_out)) + + self.lr_mul = lr_mul + + def forward(self, input): + return F.linear(input, self.weight * self.lr_mul, bias=self.bias * self.lr_mul) + + +class StyleGanNetwork(nn.Module): + def __init__(self, dim_in=128, dim_out=512, depth=8, lr_mul=0.1, dim_text_latent=0): + super().__init__() + self.dim_in = dim_in + self.dim_out = dim_out + self.dim_text_latent = dim_text_latent + + layers = [] + for i in range(depth): + is_first = i == 0 + + if is_first: + dim_in_layer = dim_in + dim_text_latent + else: + dim_in_layer = dim_out + + dim_out_layer = dim_out + + layers.extend( + [EqualLinear(dim_in_layer, dim_out_layer, lr_mul), nn.LeakyReLU(0.2)] + ) + + self.net = nn.Sequential(*layers) + + def forward(self, x, text_latent=None): + x = F.normalize(x, dim=1) + if self.dim_text_latent > 0: + assert exists(text_latent) + x = torch.cat((x, text_latent), dim=-1) + return self.net(x) + + +class UnetUpsampler(torch.nn.Module): + + def 
__init__( + self, + dim: int, + *, + image_size: int, + input_image_size: int, + init_dim: Optional[int] = None, + out_dim: Optional[int] = None, + style_network: Optional[dict] = None, + up_dim_mults: tuple = (1, 2, 4, 8, 16), + down_dim_mults: tuple = (4, 8, 16), + channels: int = 3, + resnet_block_groups: int = 8, + full_attn: tuple = (False, False, False, True, True), + flash_attn: bool = True, + self_attn_dim_head: int = 64, + self_attn_heads: int = 8, + attn_depths: tuple = (2, 2, 2, 2, 4), + mid_attn_depth: int = 4, + num_conv_kernels: int = 4, + resize_mode: str = "bilinear", + unconditional: bool = True, + skip_connect_scale: Optional[float] = None, + ): + super().__init__() + self.style_network = style_network = StyleGanNetwork(**style_network) + self.unconditional = unconditional + assert not ( + unconditional + and exists(style_network) + and style_network.dim_text_latent > 0 + ) + + assert is_power_of_two(image_size) and is_power_of_two( + input_image_size + ), "both output image size and input image size must be power of 2" + assert ( + input_image_size < image_size + ), "input image size must be smaller than the output image size, thus upsampling" + + self.image_size = image_size + self.input_image_size = input_image_size + + style_embed_split_dims = [] + + self.channels = channels + input_channels = channels + + init_dim = default(init_dim, dim) + + up_dims = [init_dim, *map(lambda m: dim * m, up_dim_mults)] + init_down_dim = up_dims[len(up_dim_mults) - len(down_dim_mults)] + down_dims = [init_down_dim, *map(lambda m: dim * m, down_dim_mults)] + self.init_conv = nn.Conv2d(input_channels, init_down_dim, 7, padding=3) + + up_in_out = list(zip(up_dims[:-1], up_dims[1:])) + down_in_out = list(zip(down_dims[:-1], down_dims[1:])) + + block_klass = partial( + ResnetBlock, + groups=resnet_block_groups, + num_conv_kernels=num_conv_kernels, + style_dims=style_embed_split_dims, + ) + + FullAttention = partial(Transformer, flash_attn=flash_attn) + *_, mid_dim = up_dims + + self.skip_connect_scale = default(skip_connect_scale, 2**-0.5) + + self.downs = nn.ModuleList([]) + self.ups = nn.ModuleList([]) + + block_count = 6 + + for ind, ( + (dim_in, dim_out), + layer_full_attn, + layer_attn_depth, + ) in enumerate(zip(down_in_out, full_attn, attn_depths)): + attn_klass = FullAttention if layer_full_attn else LinearTransformer + + blocks = [] + for i in range(block_count): + blocks.append(block_klass(dim_in, dim_in)) + + self.downs.append( + nn.ModuleList( + [ + nn.ModuleList(blocks), + nn.ModuleList( + [ + ( + attn_klass( + dim_in, + dim_head=self_attn_dim_head, + heads=self_attn_heads, + depth=layer_attn_depth, + ) + if layer_full_attn + else None + ), + nn.Conv2d( + dim_in, dim_out, kernel_size=3, stride=2, padding=1 + ), + ] + ), + ] + ) + ) + + self.mid_block1 = block_klass(mid_dim, mid_dim) + self.mid_attn = FullAttention( + mid_dim, + dim_head=self_attn_dim_head, + heads=self_attn_heads, + depth=mid_attn_depth, + ) + self.mid_block2 = block_klass(mid_dim, mid_dim) + + *_, last_dim = up_dims + + for ind, ( + (dim_in, dim_out), + layer_full_attn, + layer_attn_depth, + ) in enumerate( + zip( + reversed(up_in_out), + reversed(full_attn), + reversed(attn_depths), + ) + ): + attn_klass = FullAttention if layer_full_attn else LinearTransformer + + blocks = [] + input_dim = dim_in * 2 if ind < len(down_in_out) else dim_in + for i in range(block_count): + blocks.append(block_klass(input_dim, dim_in)) + + self.ups.append( + nn.ModuleList( + [ + nn.ModuleList(blocks), + nn.ModuleList( + [ + 
NearestNeighborhoodUpsample( + last_dim if ind == 0 else dim_out, + dim_in, + ), + ( + attn_klass( + dim_in, + dim_head=self_attn_dim_head, + heads=self_attn_heads, + depth=layer_attn_depth, + ) + if layer_full_attn + else None + ), + ] + ), + ] + ) + ) + + self.out_dim = default(out_dim, channels) + self.final_res_block = block_klass(dim, dim) + self.final_to_rgb = nn.Conv2d(dim, channels, 1) + self.resize_mode = resize_mode + self.style_to_conv_modulations = nn.Linear( + style_network.dim_out, sum(style_embed_split_dims) + ) + self.style_embed_split_dims = style_embed_split_dims + + @property + def allowable_rgb_resolutions(self): + input_res_base = int(log2(self.input_image_size)) + output_res_base = int(log2(self.image_size)) + allowed_rgb_res_base = list(range(input_res_base, output_res_base)) + return [*map(lambda p: 2**p, allowed_rgb_res_base)] + + @property + def device(self): + return next(self.parameters()).device + + @property + def total_params(self): + return sum([p.numel() for p in self.parameters()]) + + def resize_image_to(self, x, size): + return F.interpolate(x, (size, size), mode=self.resize_mode) + + def forward( + self, + lowres_image: torch.Tensor, + styles: Optional[torch.Tensor] = None, + noise: Optional[torch.Tensor] = None, + global_text_tokens: Optional[torch.Tensor] = None, + return_all_rgbs: bool = False, + ): + x = lowres_image + + noise_scale = 0.001 # Adjust the scale of the noise as needed + noise_aug = torch.randn_like(x) * noise_scale + x = x + noise_aug + x = x.clamp(0, 1) + + shape = x.shape + batch_size = shape[0] + + assert shape[-2:] == ((self.input_image_size,) * 2) + + # styles + if not exists(styles): + assert exists(self.style_network) + + noise = default( + noise, + torch.randn( + (batch_size, self.style_network.dim_in), device=self.device + ), + ) + styles = self.style_network(noise, global_text_tokens) + + # project styles to conv modulations + conv_mods = self.style_to_conv_modulations(styles) + conv_mods = conv_mods.split(self.style_embed_split_dims, dim=-1) + conv_mods = iter(conv_mods) + + x = self.init_conv(x) + + h = [] + for blocks, (attn, downsample) in self.downs: + for block in blocks: + x = block(x, conv_mods_iter=conv_mods) + h.append(x) + + if attn is not None: + x = attn(x) + + x = downsample(x) + + x = self.mid_block1(x, conv_mods_iter=conv_mods) + x = self.mid_attn(x) + x = self.mid_block2(x, conv_mods_iter=conv_mods) + + for ( + blocks, + ( + upsample, + attn, + ), + ) in self.ups: + x = upsample(x) + for block in blocks: + if h != []: + res = h.pop() + res = res * self.skip_connect_scale + x = torch.cat((x, res), dim=1) + + x = block(x, conv_mods_iter=conv_mods) + + if attn is not None: + x = attn(x) + + x = self.final_res_block(x, conv_mods_iter=conv_mods) + rgb = self.final_to_rgb(x) + + if not return_all_rgbs: + return rgb + + return rgb, [] + + +def tile_image(image, chunk_size=64): + c, h, w = image.shape + h_chunks = ceil(h / chunk_size) + w_chunks = ceil(w / chunk_size) + tiles = [] + for i in range(h_chunks): + for j in range(w_chunks): + tile = image[ + :, + i * chunk_size : (i + 1) * chunk_size, + j * chunk_size : (j + 1) * chunk_size, + ] + tiles.append(tile) + return tiles, h_chunks, w_chunks + + +# This helps create a checkboard pattern with some edge blending +def create_checkerboard_weights(tile_size): + x = torch.linspace(-1, 1, tile_size) + y = torch.linspace(-1, 1, tile_size) + + x, y = torch.meshgrid(x, y, indexing="ij") + d = torch.sqrt(x * x + y * y) + sigma, mu = 0.5, 0.0 + weights = torch.exp(-((d - mu) 
** 2 / (2.0 * sigma**2))) + + # saturate the values to sure get high weights in the center + weights = weights**8 + + return weights / weights.max() # Normalize to [0, 1] + + +def repeat_weights(weights, image_size): + tile_size = weights.shape[0] + repeats = ( + math.ceil(image_size[0] / tile_size), + math.ceil(image_size[1] / tile_size), + ) + return weights.repeat(repeats)[: image_size[0], : image_size[1]] + + +def create_offset_weights(weights, image_size): + tile_size = weights.shape[0] + offset = tile_size // 2 + full_weights = repeat_weights( + weights, (image_size[0] + offset, image_size[1] + offset) + ) + return full_weights[offset:, offset:] + + +def merge_tiles(tiles, h_chunks, w_chunks, chunk_size=64): + # Determine the shape of the output tensor + c = tiles[0].shape[0] + h = h_chunks * chunk_size + w = w_chunks * chunk_size + + # Create an empty tensor to hold the merged image + merged = torch.zeros((c, h, w), dtype=tiles[0].dtype) + + # Iterate over the tiles and place them in the correct position + for idx, tile in enumerate(tiles): + i = idx // w_chunks + j = idx % w_chunks + + h_start = i * chunk_size + w_start = j * chunk_size + + tile_h, tile_w = tile.shape[1:] + merged[:, h_start : h_start + tile_h, w_start : w_start + tile_w] = tile + + return merged + + +class AuraSR: + def __init__(self, config: dict[str, Any], device: str = "cuda"): + self.upsampler = UnetUpsampler(**config).to(device) + self.input_image_size = config["input_image_size"] + + @classmethod + def from_pretrained( + cls, + model_id: str = "fal-ai/AuraSR", + use_safetensors: bool = True, + device: str = "cuda", + ): + import json + import torch + from pathlib import Path + from huggingface_hub import snapshot_download + + # Check if model_id is a local file + if Path(model_id).is_file(): + local_file = Path(model_id) + if local_file.suffix == ".safetensors": + use_safetensors = True + elif local_file.suffix == ".ckpt": + use_safetensors = False + else: + raise ValueError( + f"Unsupported file format: {local_file.suffix}. Please use .safetensors or .ckpt files." + ) + + # For local files, we need to provide the config separately + config_path = local_file.with_name("config.json") + if not config_path.exists(): + raise FileNotFoundError( + f"Config file not found: {config_path}. " + f"When loading from a local file, ensure that 'config.json' " + f"is present in the same directory as '{local_file.name}'. " + f"If you're trying to load a model from Hugging Face, " + f"please provide the model ID instead of a file path." + ) + + config = json.loads(config_path.read_text()) + hf_model_path = local_file.parent + else: + hf_model_path = Path( + snapshot_download(model_id, ignore_patterns=["*.ckpt"]) + ) + config = json.loads((hf_model_path / "config.json").read_text()) + + model = cls(config, device) + + if use_safetensors: + try: + from safetensors.torch import load_file + + checkpoint = load_file( + hf_model_path / "model.safetensors" + if not Path(model_id).is_file() + else model_id + ) + except ImportError: + raise ImportError( + "The safetensors library is not installed. " + "Please install it with `pip install safetensors` " + "or use `use_safetensors=False` to load the model with PyTorch." 
+ ) + else: + checkpoint = torch.load( + hf_model_path / "model.ckpt" + if not Path(model_id).is_file() + else model_id + ) + + model.upsampler.load_state_dict(checkpoint, strict=True) + return model + + @torch.no_grad() + def upscale_4x(self, image: Image.Image, max_batch_size=8) -> Image.Image: + tensor_transform = transforms.ToTensor() + device = self.upsampler.device + + image_tensor = tensor_transform(image).unsqueeze(0) + _, _, h, w = image_tensor.shape + pad_h = ( + self.input_image_size - h % self.input_image_size + ) % self.input_image_size + pad_w = ( + self.input_image_size - w % self.input_image_size + ) % self.input_image_size + + # Pad the image + image_tensor = torch.nn.functional.pad( + image_tensor, (0, pad_w, 0, pad_h), mode="reflect" + ).squeeze(0) + tiles, h_chunks, w_chunks = tile_image(image_tensor, self.input_image_size) + + # Batch processing of tiles + num_tiles = len(tiles) + batches = [ + tiles[i : i + max_batch_size] for i in range(0, num_tiles, max_batch_size) + ] + reconstructed_tiles = [] + + for batch in batches: + model_input = torch.stack(batch).to(device) + generator_output = self.upsampler( + lowres_image=model_input, + noise=torch.randn(model_input.shape[0], 128, device=device), + ) + reconstructed_tiles.extend( + list(generator_output.clamp_(0, 1).detach().cpu()) + ) + + merged_tensor = merge_tiles( + reconstructed_tiles, h_chunks, w_chunks, self.input_image_size * 4 + ) + unpadded = merged_tensor[:, : h * 4, : w * 4] + + to_pil = transforms.ToPILImage() + return to_pil(unpadded) + + # Tiled 4x upscaling with overlapping tiles to reduce seam artifacts + # weights options are 'checkboard' and 'constant' + @torch.no_grad() + def upscale_4x_overlapped(self, image, max_batch_size=8, weight_type="checkboard"): + tensor_transform = transforms.ToTensor() + device = self.upsampler.device + + image_tensor = tensor_transform(image).unsqueeze(0) + _, _, h, w = image_tensor.shape + + # Calculate paddings + pad_h = ( + self.input_image_size - h % self.input_image_size + ) % self.input_image_size + pad_w = ( + self.input_image_size - w % self.input_image_size + ) % self.input_image_size + + # Pad the image + image_tensor = torch.nn.functional.pad( + image_tensor, (0, pad_w, 0, pad_h), mode="reflect" + ).squeeze(0) + + # Function to process tiles + def process_tiles(tiles, h_chunks, w_chunks): + num_tiles = len(tiles) + batches = [ + tiles[i : i + max_batch_size] + for i in range(0, num_tiles, max_batch_size) + ] + reconstructed_tiles = [] + + for batch in batches: + model_input = torch.stack(batch).to(device) + generator_output = self.upsampler( + lowres_image=model_input, + noise=torch.randn(model_input.shape[0], 128, device=device), + ) + reconstructed_tiles.extend( + list(generator_output.clamp_(0, 1).detach().cpu()) + ) + + return merge_tiles( + reconstructed_tiles, h_chunks, w_chunks, self.input_image_size * 4 + ) + + # First pass + tiles1, h_chunks1, w_chunks1 = tile_image(image_tensor, self.input_image_size) + result1 = process_tiles(tiles1, h_chunks1, w_chunks1) + + # Second pass with offset + offset = self.input_image_size // 2 + image_tensor_offset = torch.nn.functional.pad( + image_tensor, (offset, offset, offset, offset), mode="reflect" + ).squeeze(0) + + tiles2, h_chunks2, w_chunks2 = tile_image( + image_tensor_offset, self.input_image_size + ) + result2 = process_tiles(tiles2, h_chunks2, w_chunks2) + + # unpad + offset_4x = offset * 4 + result2_interior = result2[:, offset_4x:-offset_4x, offset_4x:-offset_4x] + + if weight_type == "checkboard": + 
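+            # Blend the aligned pass (result1) and the half-tile-offset pass
+            # (result2) with complementary weight maps derived from a Gaussian
+            # "checkerboard" tile: weights_2 is tile-aligned, weights_1 is
+            # shifted by half a tile, and both are normalized so they sum to 1
+            # at every pixel, cross-fading the tile seams.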
weight_tile = create_checkerboard_weights(self.input_image_size * 4) + + weight_shape = result2_interior.shape[1:] + weights_1 = create_offset_weights(weight_tile, weight_shape) + weights_2 = repeat_weights(weight_tile, weight_shape) + + normalizer = weights_1 + weights_2 + weights_1 = weights_1 / normalizer + weights_2 = weights_2 / normalizer + + weights_1 = weights_1.unsqueeze(0).repeat(3, 1, 1) + weights_2 = weights_2.unsqueeze(0).repeat(3, 1, 1) + elif weight_type == "constant": + weights_1 = torch.ones_like(result2_interior) * 0.5 + weights_2 = weights_1 + else: + raise ValueError( + "weight_type should be either 'gaussian' or 'constant' but got", + weight_type, + ) + + result1 = result1 * weights_2 + result2 = result2_interior * weights_1 + + # Average the overlapping region + result1 = result1 + result2 + + # Remove padding + unpadded = result1[:, : h * 4, : w * 4] + + to_pil = transforms.ToPILImage() + return to_pil(unpadded) diff --git a/backend/upscale/aura_sr_upscale.py b/backend/upscale/aura_sr_upscale.py new file mode 100644 index 0000000000000000000000000000000000000000..5bebb1ce181c5f5bd9563abf01c7209c400ae9b6 --- /dev/null +++ b/backend/upscale/aura_sr_upscale.py @@ -0,0 +1,9 @@ +from backend.upscale.aura_sr import AuraSR +from PIL import Image + + +def upscale_aura_sr(image_path: str): + + aura_sr = AuraSR.from_pretrained("fal/AuraSR-v2", device="cpu") + image_in = Image.open(image_path) # .resize((256, 256)) + return aura_sr.upscale_4x(image_in) diff --git a/backend/upscale/edsr_upscale_onnx.py b/backend/upscale/edsr_upscale_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..f837d932b813edc1b5a215978fc1766150b7c436 --- /dev/null +++ b/backend/upscale/edsr_upscale_onnx.py @@ -0,0 +1,37 @@ +import numpy as np +import onnxruntime +from huggingface_hub import hf_hub_download +from PIL import Image + + +def upscale_edsr_2x(image_path: str): + input_image = Image.open(image_path).convert("RGB") + input_image = np.array(input_image).astype("float32") + input_image = np.transpose(input_image, (2, 0, 1)) + img_arr = np.expand_dims(input_image, axis=0) + + if np.max(img_arr) > 256: # 16-bit image + max_range = 65535 + else: + max_range = 255.0 + img = img_arr / max_range + + model_path = hf_hub_download( + repo_id="rupeshs/edsr-onnx", + filename="edsr_onnxsim_2x.onnx", + ) + sess = onnxruntime.InferenceSession(model_path) + + input_name = sess.get_inputs()[0].name + output_name = sess.get_outputs()[0].name + output = sess.run( + [output_name], + {input_name: img}, + )[0] + + result = output.squeeze() + result = result.clip(0, 1) + image_array = np.transpose(result, (1, 2, 0)) + image_array = np.uint8(image_array * 255) + upscaled_image = Image.fromarray(image_array) + return upscaled_image diff --git a/backend/upscale/tiled_upscale.py b/backend/upscale/tiled_upscale.py new file mode 100644 index 0000000000000000000000000000000000000000..735aacf8bf2f391e8c9486005ceb32867b300dc5 --- /dev/null +++ b/backend/upscale/tiled_upscale.py @@ -0,0 +1,237 @@ +import time +import math +import logging +from PIL import Image, ImageDraw, ImageFilter +from backend.models.lcmdiffusion_setting import DiffusionTask +from context import Context +from constants import DEVICE + + +def generate_upscaled_image( + config, + input_path=None, + strength=0.3, + scale_factor=2.0, + tile_overlap=16, + upscale_settings=None, + context: Context = None, + output_path=None, + image_format="PNG", +): + if config == None or ( + input_path == None or input_path == "" and upscale_settings == 
None + ): + logging.error("Wrong arguments in tiled upscale function call!") + return + + # Use the upscale_settings dict if provided; otherwise, build the + # upscale_settings dict using the function arguments and default values + if upscale_settings == None: + upscale_settings = { + "source_file": input_path, + "target_file": None, + "output_format": image_format, + "strength": strength, + "scale_factor": scale_factor, + "prompt": config.lcm_diffusion_setting.prompt, + "tile_overlap": tile_overlap, + "tile_size": 256, + "tiles": [], + } + source_image = Image.open(input_path) # PIL image + else: + source_image = Image.open(upscale_settings["source_file"]) + + upscale_settings["source_image"] = source_image + + if upscale_settings["target_file"]: + result = Image.open(upscale_settings["target_file"]) + else: + result = Image.new( + mode="RGBA", + size=( + source_image.size[0] * int(upscale_settings["scale_factor"]), + source_image.size[1] * int(upscale_settings["scale_factor"]), + ), + color=(0, 0, 0, 0), + ) + upscale_settings["target_image"] = result + + # If the custom tile definition array 'tiles' is empty, proceed with the + # default tiled upscale task by defining all the possible image tiles; note + # that the actual tile size is 'tile_size' + 'tile_overlap' and the target + # image width and height are no longer constrained to multiples of 256 but + # are instead multiples of the actual tile size + if len(upscale_settings["tiles"]) == 0: + tile_size = upscale_settings["tile_size"] + scale_factor = upscale_settings["scale_factor"] + tile_overlap = upscale_settings["tile_overlap"] + total_cols = math.ceil( + source_image.size[0] / tile_size + ) # Image width / tile size + total_rows = math.ceil( + source_image.size[1] / tile_size + ) # Image height / tile size + for y in range(0, total_rows): + y_offset = tile_overlap if y > 0 else 0 # Tile mask offset + for x in range(0, total_cols): + x_offset = tile_overlap if x > 0 else 0 # Tile mask offset + x1 = x * tile_size + y1 = y * tile_size + w = tile_size + (tile_overlap if x < total_cols - 1 else 0) + h = tile_size + (tile_overlap if y < total_rows - 1 else 0) + mask_box = ( # Default tile mask box definition + x_offset, + y_offset, + int(w * scale_factor), + int(h * scale_factor), + ) + upscale_settings["tiles"].append( + { + "x": x1, + "y": y1, + "w": w, + "h": h, + "mask_box": mask_box, + "prompt": upscale_settings["prompt"], # Use top level prompt if available + "scale_factor": scale_factor, + } + ) + + # Generate the output image tiles + for i in range(0, len(upscale_settings["tiles"])): + generate_upscaled_tile( + config, + i, + upscale_settings, + context=context, + ) + + # Save completed upscaled image + if upscale_settings["output_format"].upper() == "JPEG": + result_rgb = result.convert("RGB") + result.close() + result = result_rgb + result.save(output_path) + result.close() + source_image.close() + return + + +def get_current_tile( + config, + context, + strength, +): + config.lcm_diffusion_setting.strength = strength + config.lcm_diffusion_setting.diffusion_task = DiffusionTask.image_to_image.value + if ( + config.lcm_diffusion_setting.use_tiny_auto_encoder + and config.lcm_diffusion_setting.use_openvino + ): + config.lcm_diffusion_setting.use_tiny_auto_encoder = False + current_tile = context.generate_text_to_image( + settings=config, + reshape=True, + device=DEVICE, + save_config=False, + )[0] + return current_tile + + +# Generates a single tile from the source image as defined in the +# upscale_settings["tiles"] array 
with the corresponding index and pastes the +# generated tile into the target image using the corresponding mask and scale +# factor; note that scale factor for the target image and the individual tiles +# can be different, this function will adjust scale factors as needed +def generate_upscaled_tile( + config, + index, + upscale_settings, + context: Context = None, +): + if config == None or upscale_settings == None: + logging.error("Wrong arguments in tile creation function call!") + return + + x = upscale_settings["tiles"][index]["x"] + y = upscale_settings["tiles"][index]["y"] + w = upscale_settings["tiles"][index]["w"] + h = upscale_settings["tiles"][index]["h"] + tile_prompt = upscale_settings["tiles"][index]["prompt"] + scale_factor = upscale_settings["scale_factor"] + tile_scale_factor = upscale_settings["tiles"][index]["scale_factor"] + target_width = int(w * tile_scale_factor) + target_height = int(h * tile_scale_factor) + strength = upscale_settings["strength"] + source_image = upscale_settings["source_image"] + target_image = upscale_settings["target_image"] + mask_image = generate_tile_mask(config, index, upscale_settings) + + config.lcm_diffusion_setting.number_of_images = 1 + config.lcm_diffusion_setting.prompt = tile_prompt + config.lcm_diffusion_setting.image_width = target_width + config.lcm_diffusion_setting.image_height = target_height + config.lcm_diffusion_setting.init_image = source_image.crop((x, y, x + w, y + h)) + + current_tile = None + print(f"[SD Upscale] Generating tile {index + 1}/{len(upscale_settings['tiles'])} ") + if tile_prompt == None or tile_prompt == "": + config.lcm_diffusion_setting.prompt = "" + config.lcm_diffusion_setting.negative_prompt = "" + current_tile = get_current_tile(config, context, strength) + else: + # Attempt to use img2img with low denoising strength to + # generate the tiles with the extra aid of a prompt + # context = get_context(InterfaceType.CLI) + current_tile = get_current_tile(config, context, strength) + + if math.isclose(scale_factor, tile_scale_factor): + target_image.paste( + current_tile, (int(x * scale_factor), int(y * scale_factor)), mask_image + ) + else: + target_image.paste( + current_tile.resize((int(w * scale_factor), int(h * scale_factor))), + (int(x * scale_factor), int(y * scale_factor)), + mask_image.resize((int(w * scale_factor), int(h * scale_factor))), + ) + mask_image.close() + current_tile.close() + config.lcm_diffusion_setting.init_image.close() + + +# Generate tile mask using the box definition in the upscale_settings["tiles"] +# array with the corresponding index; note that tile masks for the default +# tiled upscale task can be reused but that would complicate the code, so +# new tile masks are instead created for each tile +def generate_tile_mask( + config, + index, + upscale_settings, +): + scale_factor = upscale_settings["scale_factor"] + tile_overlap = upscale_settings["tile_overlap"] + tile_scale_factor = upscale_settings["tiles"][index]["scale_factor"] + w = int(upscale_settings["tiles"][index]["w"] * tile_scale_factor) + h = int(upscale_settings["tiles"][index]["h"] * tile_scale_factor) + # The Stable Diffusion pipeline automatically adjusts the output size + # to multiples of 8 pixels; the mask must be created with the same + # size as the output tile + w = w - (w % 8) + h = h - (h % 8) + mask_box = upscale_settings["tiles"][index]["mask_box"] + if mask_box == None: + # Build a default solid mask with soft/transparent edges + mask_box = ( + tile_overlap, + tile_overlap, + w - tile_overlap, + h 
- tile_overlap, + ) + mask_image = Image.new(mode="RGBA", size=(w, h), color=(0, 0, 0, 0)) + mask_draw = ImageDraw.Draw(mask_image) + mask_draw.rectangle(tuple(mask_box), fill=(0, 0, 0)) + mask_blur = mask_image.filter(ImageFilter.BoxBlur(tile_overlap - 1)) + mask_image.close() + return mask_blur diff --git a/backend/upscale/upscaler.py b/backend/upscale/upscaler.py new file mode 100644 index 0000000000000000000000000000000000000000..fea3a1363e96d287ae769bce07375f3097f6ec0a --- /dev/null +++ b/backend/upscale/upscaler.py @@ -0,0 +1,52 @@ +from backend.models.lcmdiffusion_setting import DiffusionTask +from backend.models.upscale import UpscaleMode +from backend.upscale.edsr_upscale_onnx import upscale_edsr_2x +from backend.upscale.aura_sr_upscale import upscale_aura_sr +from backend.upscale.tiled_upscale import generate_upscaled_image +from context import Context +from PIL import Image +from state import get_settings + + +config = get_settings() + + +def upscale_image( + context: Context, + src_image_path: str, + dst_image_path: str, + scale_factor: int = 2, + upscale_mode: UpscaleMode = UpscaleMode.normal.value, + strength: float = 0.1, +): + if upscale_mode == UpscaleMode.normal.value: + upscaled_img = upscale_edsr_2x(src_image_path) + upscaled_img.save(dst_image_path) + print(f"Upscaled image saved {dst_image_path}") + elif upscale_mode == UpscaleMode.aura_sr.value: + upscaled_img = upscale_aura_sr(src_image_path) + upscaled_img.save(dst_image_path) + print(f"Upscaled image saved {dst_image_path}") + else: + config.settings.lcm_diffusion_setting.strength = ( + 0.3 if config.settings.lcm_diffusion_setting.use_openvino else strength + ) + config.settings.lcm_diffusion_setting.diffusion_task = ( + DiffusionTask.image_to_image.value + ) + + generate_upscaled_image( + config.settings, + src_image_path, + config.settings.lcm_diffusion_setting.strength, + upscale_settings=None, + context=context, + tile_overlap=( + 32 if config.settings.lcm_diffusion_setting.use_openvino else 16 + ), + output_path=dst_image_path, + image_format=config.settings.generated_images.format, + ) + print(f"Upscaled image saved {dst_image_path}") + + return [Image.open(dst_image_path)] diff --git a/configs/lcm-lora-models.txt b/configs/lcm-lora-models.txt new file mode 100644 index 0000000000000000000000000000000000000000..f252571ecfc0936d6374e83c3cdfd2f87508ff69 --- /dev/null +++ b/configs/lcm-lora-models.txt @@ -0,0 +1,4 @@ +latent-consistency/lcm-lora-sdv1-5 +latent-consistency/lcm-lora-sdxl +latent-consistency/lcm-lora-ssd-1b +rupeshs/hypersd-sd1-5-1-step-lora \ No newline at end of file diff --git a/configs/lcm-models.txt b/configs/lcm-models.txt new file mode 100644 index 0000000000000000000000000000000000000000..9721ed6f43a6ccc00d3cd456d44f6632674e359c --- /dev/null +++ b/configs/lcm-models.txt @@ -0,0 +1,8 @@ +stabilityai/sd-turbo +rupeshs/sdxs-512-0.9-orig-vae +rupeshs/hyper-sd-sdxl-1-step +rupeshs/SDXL-Lightning-2steps +stabilityai/sdxl-turbo +SimianLuo/LCM_Dreamshaper_v7 +latent-consistency/lcm-sdxl +latent-consistency/lcm-ssd-1b \ No newline at end of file diff --git a/configs/openvino-lcm-models.txt b/configs/openvino-lcm-models.txt new file mode 100644 index 0000000000000000000000000000000000000000..656096d1bbabe6472a65869773b14a6c5bb9ec62 --- /dev/null +++ b/configs/openvino-lcm-models.txt @@ -0,0 +1,9 @@ +rupeshs/sd-turbo-openvino +rupeshs/sdxs-512-0.9-openvino +rupeshs/hyper-sd-sdxl-1-step-openvino-int8 +rupeshs/SDXL-Lightning-2steps-openvino-int8 +rupeshs/sdxl-turbo-openvino-int8 
+rupeshs/LCM-dreamshaper-v7-openvino +Disty0/LCM_SoteMix +rupeshs/FLUX.1-schnell-openvino-int4 +rupeshs/sd15-lcm-square-openvino-int8 \ No newline at end of file diff --git a/configs/stable-diffusion-models.txt b/configs/stable-diffusion-models.txt new file mode 100644 index 0000000000000000000000000000000000000000..d5d21c9c5e64bb55c642243d27c230f04c6aab58 --- /dev/null +++ b/configs/stable-diffusion-models.txt @@ -0,0 +1,7 @@ +Lykon/dreamshaper-8 +Fictiverse/Stable_Diffusion_PaperCut_Model +stabilityai/stable-diffusion-xl-base-1.0 +runwayml/stable-diffusion-v1-5 +segmind/SSD-1B +stablediffusionapi/anything-v5 +prompthero/openjourney-v4 \ No newline at end of file diff --git a/constants.py b/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..ddde5e6c5b0e7b8828eb45936564b6acb50881f1 --- /dev/null +++ b/constants.py @@ -0,0 +1,25 @@ +from os import environ, cpu_count + +cpu_cores = cpu_count() +cpus = cpu_cores // 2 if cpu_cores else 0 +APP_VERSION = "v1.0.0 beta 200" +LCM_DEFAULT_MODEL = "stabilityai/sd-turbo" +LCM_DEFAULT_MODEL_OPENVINO = "rupeshs/sd-turbo-openvino" +APP_NAME = "FastSD CPU" +APP_SETTINGS_FILE = "settings.yaml" +RESULTS_DIRECTORY = "results" +CONFIG_DIRECTORY = "configs" +DEVICE = environ.get("DEVICE", "cpu") +SD_MODELS_FILE = "stable-diffusion-models.txt" +LCM_LORA_MODELS_FILE = "lcm-lora-models.txt" +OPENVINO_LCM_MODELS_FILE = "openvino-lcm-models.txt" +TAESD_MODEL = "madebyollin/taesd" +TAESDXL_MODEL = "madebyollin/taesdxl" +TAESD_MODEL_OPENVINO = "deinferno/taesd-openvino" +LCM_MODELS_FILE = "lcm-models.txt" +TAESDXL_MODEL_OPENVINO = "rupeshs/taesdxl-openvino" +LORA_DIRECTORY = "lora_models" +CONTROLNET_DIRECTORY = "controlnet_models" +MODELS_DIRECTORY = "models" +GGUF_THREADS = environ.get("GGUF_THREADS", cpus) +TAEF1_MODEL_OPENVINO = "rupeshs/taef1-openvino" diff --git a/context.py b/context.py new file mode 100644 index 0000000000000000000000000000000000000000..b836527f1ceb92397dd523a910e096087e0a7876 --- /dev/null +++ b/context.py @@ -0,0 +1,85 @@ +from typing import Any +from app_settings import Settings +from models.interface_types import InterfaceType +from backend.models.lcmdiffusion_setting import DiffusionTask +from backend.lcm_text_to_image import LCMTextToImage +from time import perf_counter +from backend.image_saver import ImageSaver +from pprint import pprint + + +class Context: + def __init__( + self, + interface_type: InterfaceType, + device="cpu", + ): + self.interface_type = interface_type.value + self.lcm_text_to_image = LCMTextToImage(device) + self._latency = 0 + + @property + def latency(self): + return self._latency + + def generate_text_to_image( + self, + settings: Settings, + reshape: bool = False, + device: str = "cpu", + save_config=True, + ) -> Any: + if ( + settings.lcm_diffusion_setting.use_tiny_auto_encoder + and settings.lcm_diffusion_setting.use_openvino + ): + print( + "WARNING: Tiny AutoEncoder is not supported in Image to image mode (OpenVINO)" + ) + tick = perf_counter() + from state import get_settings + + if ( + settings.lcm_diffusion_setting.diffusion_task + == DiffusionTask.text_to_image.value + ): + settings.lcm_diffusion_setting.init_image = None + + if save_config: + get_settings().save() + + pprint(settings.lcm_diffusion_setting.model_dump()) + if not settings.lcm_diffusion_setting.lcm_lora: + return None + self.lcm_text_to_image.init( + device, + settings.lcm_diffusion_setting, + ) + images = self.lcm_text_to_image.generate( + settings.lcm_diffusion_setting, + reshape, + ) + elapsed = 
perf_counter() - tick + self._latency = elapsed + print(f"Latency : {elapsed:.2f} seconds") + if settings.lcm_diffusion_setting.controlnet: + if settings.lcm_diffusion_setting.controlnet.enabled: + images.append(settings.lcm_diffusion_setting.controlnet._control_image) + return images + + + def save_images( + self, + images: Any, + settings: Settings, + ) -> list[str]: + saved_images = [] + if images and settings.generated_images.save_image: + saved_images = ImageSaver.save_images( + settings.generated_images.path, + images=images, + lcm_diffusion_setting=settings.lcm_diffusion_setting, + format=settings.generated_images.format, + jpeg_quality=settings.generated_images.save_image_quality, + ) + return saved_images \ No newline at end of file diff --git a/frontend/cli_interactive.py b/frontend/cli_interactive.py new file mode 100644 index 0000000000000000000000000000000000000000..5bda6655e5935fa2528e3fee727039859dfb7029 --- /dev/null +++ b/frontend/cli_interactive.py @@ -0,0 +1,661 @@ +from os import path +from PIL import Image +from typing import Any + +from constants import DEVICE +from paths import FastStableDiffusionPaths +from backend.upscale.upscaler import upscale_image +from backend.upscale.tiled_upscale import generate_upscaled_image +from frontend.webui.image_variations_ui import generate_image_variations +from backend.lora import ( + get_active_lora_weights, + update_lora_weights, + load_lora_weight, +) +from backend.models.lcmdiffusion_setting import ( + DiffusionTask, + ControlNetSetting, +) + + +_batch_count = 1 +_edit_lora_settings = False + + +def user_value( + value_type: type, + message: str, + default_value: Any, +) -> Any: + try: + value = value_type(input(message)) + except: + value = default_value + return value + + +def interactive_mode( + config, + context, +): + print("=============================================") + print("Welcome to FastSD CPU Interactive CLI") + print("=============================================") + while True: + print("> 1. Text to Image") + print("> 2. Image to Image") + print("> 3. Image Variations") + print("> 4. EDSR Upscale") + print("> 5. SD Upscale") + print("> 6. Edit default generation settings") + print("> 7. Edit LoRA settings") + print("> 8. Edit ControlNet settings") + print("> 9. Edit negative prompt") + print("> 10. 
Quit") + option = user_value( + int, + "Enter a Diffusion Task number (1): ", + 1, + ) + if option not in range(1, 11): + print("Wrong Diffusion Task number!") + exit() + + if option == 1: + interactive_txt2img( + config, + context, + ) + elif option == 2: + interactive_img2img( + config, + context, + ) + elif option == 3: + interactive_variations( + config, + context, + ) + elif option == 4: + interactive_edsr( + config, + context, + ) + elif option == 5: + interactive_sdupscale( + config, + context, + ) + elif option == 6: + interactive_settings( + config, + context, + ) + elif option == 7: + interactive_lora( + config, + context, + True, + ) + elif option == 8: + interactive_controlnet( + config, + context, + True, + ) + elif option == 9: + interactive_negative( + config, + context, + ) + elif option == 10: + exit() + + +def interactive_negative( + config, + context, +): + settings = config.lcm_diffusion_setting + print(f"Current negative prompt: '{settings.negative_prompt}'") + user_input = input("Write a negative prompt (set guidance > 1.0): ") + if user_input == "": + return + else: + settings.negative_prompt = user_input + + +def interactive_controlnet( + config, + context, + menu_flag=False, +): + """ + @param menu_flag: Indicates whether this function was called from the main + interactive CLI menu; _True_ if called from the main menu, _False_ otherwise + """ + settings = config.lcm_diffusion_setting + if not settings.controlnet: + settings.controlnet = ControlNetSetting() + + current_enabled = settings.controlnet.enabled + current_adapter_path = settings.controlnet.adapter_path + current_conditioning_scale = settings.controlnet.conditioning_scale + current_control_image = settings.controlnet._control_image + + option = input("Enable ControlNet? (y/N): ") + settings.controlnet.enabled = True if option.upper() == "Y" else False + if settings.controlnet.enabled: + option = input( + f"Enter ControlNet adapter path ({settings.controlnet.adapter_path}): " + ) + if option != "": + settings.controlnet.adapter_path = option + settings.controlnet.conditioning_scale = user_value( + float, + f"Enter ControlNet conditioning scale ({settings.controlnet.conditioning_scale}): ", + settings.controlnet.conditioning_scale, + ) + option = input( + f"Enter ControlNet control image path (Leave empty to reuse current): " + ) + if option != "": + try: + new_image = Image.open(option) + settings.controlnet._control_image = new_image + except (AttributeError, FileNotFoundError) as e: + settings.controlnet._control_image = None + if ( + not settings.controlnet.adapter_path + or not path.exists(settings.controlnet.adapter_path) + or not settings.controlnet._control_image + ): + print("Invalid ControlNet settings! Disabling ControlNet") + settings.controlnet.enabled = False + + if ( + settings.controlnet.enabled != current_enabled + or settings.controlnet.adapter_path != current_adapter_path + ): + settings.rebuild_pipeline = True + + +def interactive_lora( + config, + context, + menu_flag=False, +): + """ + @param menu_flag: Indicates whether this function was called from the main + interactive CLI menu; _True_ if called from the main menu, _False_ otherwise + """ + if context == None or context.lcm_text_to_image.pipeline == None: + print("Diffusion pipeline not initialized, please run a generation task first!") + return + + print("> 1. Change LoRA weights") + print("> 2. 
Load new LoRA model") + option = user_value( + int, + "Enter a LoRA option (1): ", + 1, + ) + if option not in range(1, 3): + print("Wrong LoRA option!") + return + + if option == 1: + update_weights = [] + active_weights = get_active_lora_weights() + for lora in active_weights: + weight = user_value( + float, + f"Enter a new LoRA weight for {lora[0]} ({lora[1]}): ", + lora[1], + ) + update_weights.append( + ( + lora[0], + weight, + ) + ) + if len(update_weights) > 0: + update_lora_weights( + context.lcm_text_to_image.pipeline, + config.lcm_diffusion_setting, + update_weights, + ) + elif option == 2: + # Load a new LoRA + settings = config.lcm_diffusion_setting + settings.lora.fuse = False + settings.lora.enabled = False + settings.lora.path = input("Enter LoRA model path: ") + settings.lora.weight = user_value( + float, + "Enter a LoRA weight (0.5): ", + 0.5, + ) + if not path.exists(settings.lora.path): + print("Invalid LoRA model path!") + return + settings.lora.enabled = True + load_lora_weight(context.lcm_text_to_image.pipeline, settings) + + if menu_flag: + global _edit_lora_settings + _edit_lora_settings = False + option = input("Edit LoRA settings after every generation? (y/N): ") + if option.upper() == "Y": + _edit_lora_settings = True + + +def interactive_settings( + config, + context, +): + global _batch_count + settings = config.lcm_diffusion_setting + print("Enter generation settings (leave empty to use current value)") + print("> 1. Use LCM") + print("> 2. Use LCM-Lora") + print("> 3. Use OpenVINO") + option = user_value( + int, + "Select inference model option (1): ", + 1, + ) + if option not in range(1, 4): + print("Wrong inference model option! Falling back to defaults") + return + + settings.use_lcm_lora = False + settings.use_openvino = False + if option == 1: + lcm_model_id = input(f"Enter LCM model ID ({settings.lcm_model_id}): ") + if lcm_model_id != "": + settings.lcm_model_id = lcm_model_id + elif option == 2: + settings.use_lcm_lora = True + lcm_lora_id = input( + f"Enter LCM-Lora model ID ({settings.lcm_lora.lcm_lora_id}): " + ) + if lcm_lora_id != "": + settings.lcm_lora.lcm_lora_id = lcm_lora_id + base_model_id = input( + f"Enter Base model ID ({settings.lcm_lora.base_model_id}): " + ) + if base_model_id != "": + settings.lcm_lora.base_model_id = base_model_id + elif option == 3: + settings.use_openvino = True + openvino_lcm_model_id = input( + f"Enter OpenVINO model ID ({settings.openvino_lcm_model_id}): " + ) + if openvino_lcm_model_id != "": + settings.openvino_lcm_model_id = openvino_lcm_model_id + + settings.use_offline_model = True + settings.use_tiny_auto_encoder = True + option = input("Work offline? (Y/n): ") + if option.upper() == "N": + settings.use_offline_model = False + option = input("Use Tiny Auto Encoder? 
(Y/n): ") + if option.upper() == "N": + settings.use_tiny_auto_encoder = False + + settings.image_width = user_value( + int, + f"Image width ({settings.image_width}): ", + settings.image_width, + ) + settings.image_height = user_value( + int, + f"Image height ({settings.image_height}): ", + settings.image_height, + ) + settings.inference_steps = user_value( + int, + f"Inference steps ({settings.inference_steps}): ", + settings.inference_steps, + ) + settings.guidance_scale = user_value( + float, + f"Guidance scale ({settings.guidance_scale}): ", + settings.guidance_scale, + ) + settings.number_of_images = user_value( + int, + f"Number of images per batch ({settings.number_of_images}): ", + settings.number_of_images, + ) + _batch_count = user_value( + int, + f"Batch count ({_batch_count}): ", + _batch_count, + ) + # output_format = user_value(int, f"Output format (PNG)", 1) + print(config.lcm_diffusion_setting) + + +def interactive_txt2img( + config, + context, +): + global _batch_count + config.lcm_diffusion_setting.diffusion_task = DiffusionTask.text_to_image.value + user_input = input("Write a prompt (write 'exit' to quit): ") + while True: + if user_input == "exit": + return + elif user_input == "": + user_input = config.lcm_diffusion_setting.prompt + config.lcm_diffusion_setting.prompt = user_input + for _ in range(0, _batch_count): + images = context.generate_text_to_image( + settings=config, + device=DEVICE, + ) + context.save_images( + images, + config, + ) + if _edit_lora_settings: + interactive_lora( + config, + context, + ) + user_input = input("Write a prompt: ") + + +def interactive_img2img( + config, + context, +): + global _batch_count + settings = config.lcm_diffusion_setting + settings.diffusion_task = DiffusionTask.image_to_image.value + steps = settings.inference_steps + source_path = input("Image path: ") + if source_path == "": + print("Error : You need to provide a file in img2img mode") + return + settings.strength = user_value( + float, + f"img2img strength ({settings.strength}): ", + settings.strength, + ) + settings.inference_steps = int(steps / settings.strength + 1) + user_input = input("Write a prompt (write 'exit' to quit): ") + while True: + if user_input == "exit": + settings.inference_steps = steps + return + settings.init_image = Image.open(source_path) + settings.prompt = user_input + for _ in range(0, _batch_count): + images = context.generate_text_to_image( + settings=config, + device=DEVICE, + ) + context.save_images( + images, + config, + ) + new_path = input(f"Image path ({source_path}): ") + if new_path != "": + source_path = new_path + settings.strength = user_value( + float, + f"img2img strength ({settings.strength}): ", + settings.strength, + ) + if _edit_lora_settings: + interactive_lora( + config, + context, + ) + settings.inference_steps = int(steps / settings.strength + 1) + user_input = input("Write a prompt: ") + + +def interactive_variations( + config, + context, +): + global _batch_count + settings = config.lcm_diffusion_setting + settings.diffusion_task = DiffusionTask.image_to_image.value + steps = settings.inference_steps + source_path = input("Image path: ") + if source_path == "": + print("Error : You need to provide a file in Image variations mode") + return + settings.strength = user_value( + float, + f"Image variations strength ({settings.strength}): ", + settings.strength, + ) + settings.inference_steps = int(steps / settings.strength + 1) + while True: + settings.init_image = Image.open(source_path) + settings.prompt = "" + for i 
in range(0, _batch_count): + generate_image_variations( + settings.init_image, + settings.strength, + ) + if _edit_lora_settings: + interactive_lora( + config, + context, + ) + user_input = input("Continue in Image variations mode? (Y/n): ") + if user_input.upper() == "N": + settings.inference_steps = steps + return + new_path = input(f"Image path ({source_path}): ") + if new_path != "": + source_path = new_path + settings.strength = user_value( + float, + f"Image variations strength ({settings.strength}): ", + settings.strength, + ) + settings.inference_steps = int(steps / settings.strength + 1) + + +def interactive_edsr( + config, + context, +): + source_path = input("Image path: ") + if source_path == "": + print("Error : You need to provide a file in EDSR mode") + return + while True: + output_path = FastStableDiffusionPaths.get_upscale_filepath( + source_path, + 2, + config.generated_images.format, + ) + result = upscale_image( + context, + source_path, + output_path, + 2, + ) + user_input = input("Continue in EDSR upscale mode? (Y/n): ") + if user_input.upper() == "N": + return + new_path = input(f"Image path ({source_path}): ") + if new_path != "": + source_path = new_path + + +def interactive_sdupscale_settings(config): + steps = config.lcm_diffusion_setting.inference_steps + custom_settings = {} + print("> 1. Upscale whole image") + print("> 2. Define custom tiles (advanced)") + option = user_value( + int, + "Select an SD Upscale option (1): ", + 1, + ) + if option not in range(1, 3): + print("Wrong SD Upscale option!") + return + + # custom_settings["source_file"] = args.file + custom_settings["source_file"] = "" + new_path = input(f"Input image path ({custom_settings['source_file']}): ") + if new_path != "": + custom_settings["source_file"] = new_path + if custom_settings["source_file"] == "": + print("Error : You need to provide a file in SD Upscale mode") + return + custom_settings["target_file"] = None + if option == 2: + custom_settings["target_file"] = input("Image to patch: ") + if custom_settings["target_file"] == "": + print("No target file provided, upscaling whole input image instead!") + custom_settings["target_file"] = None + option = 1 + custom_settings["output_format"] = config.generated_images.format + custom_settings["strength"] = user_value( + float, + f"SD Upscale strength ({config.lcm_diffusion_setting.strength}): ", + config.lcm_diffusion_setting.strength, + ) + config.lcm_diffusion_setting.inference_steps = int( + steps / custom_settings["strength"] + 1 + ) + if option == 1: + custom_settings["scale_factor"] = user_value( + float, + f"Scale factor (2.0): ", + 2.0, + ) + custom_settings["tile_size"] = user_value( + int, + f"Split input image into tiles of the following size, in pixels (256): ", + 256, + ) + custom_settings["tile_overlap"] = user_value( + int, + f"Tile overlap, in pixels (16): ", + 16, + ) + elif option == 2: + custom_settings["scale_factor"] = user_value( + float, + "Input image to Image-to-patch scale_factor (2.0): ", + 2.0, + ) + custom_settings["tile_size"] = 256 + custom_settings["tile_overlap"] = 16 + custom_settings["prompt"] = input( + "Write a prompt describing the input image (optional): " + ) + custom_settings["tiles"] = [] + if option == 2: + add_tile = True + while add_tile: + print("=== Define custom SD Upscale tile ===") + tile_x = user_value( + int, + "Enter tile's X position: ", + 0, + ) + tile_y = user_value( + int, + "Enter tile's Y position: ", + 0, + ) + tile_w = user_value( + int, + "Enter tile's width (256): ", + 256, + 
) + tile_h = user_value( + int, + "Enter tile's height (256): ", + 256, + ) + tile_scale = user_value( + float, + "Enter tile's scale factor (2.0): ", + 2.0, + ) + tile_prompt = input("Enter tile's prompt (optional): ") + custom_settings["tiles"].append( + { + "x": tile_x, + "y": tile_y, + "w": tile_w, + "h": tile_h, + "mask_box": None, + "prompt": tile_prompt, + "scale_factor": tile_scale, + } + ) + tile_option = input("Do you want to define another tile? (y/N): ") + if tile_option == "" or tile_option.upper() == "N": + add_tile = False + + return custom_settings + + +def interactive_sdupscale( + config, + context, +): + settings = config.lcm_diffusion_setting + settings.diffusion_task = DiffusionTask.image_to_image.value + settings.init_image = "" + source_path = "" + steps = settings.inference_steps + + while True: + custom_upscale_settings = None + option = input("Edit custom SD Upscale settings? (y/N): ") + if option.upper() == "Y": + config.lcm_diffusion_setting.inference_steps = steps + custom_upscale_settings = interactive_sdupscale_settings(config) + if not custom_upscale_settings: + return + source_path = custom_upscale_settings["source_file"] + else: + new_path = input(f"Image path ({source_path}): ") + if new_path != "": + source_path = new_path + if source_path == "": + print("Error : You need to provide a file in SD Upscale mode") + return + settings.strength = user_value( + float, + f"SD Upscale strength ({settings.strength}): ", + settings.strength, + ) + settings.inference_steps = int(steps / settings.strength + 1) + + output_path = FastStableDiffusionPaths.get_upscale_filepath( + source_path, + 2, + config.generated_images.format, + ) + generate_upscaled_image( + config, + source_path, + settings.strength, + upscale_settings=custom_upscale_settings, + context=context, + tile_overlap=32 if settings.use_openvino else 16, + output_path=output_path, + image_format=config.generated_images.format, + ) + user_input = input("Continue in SD Upscale mode? 
(Y/n): ") + if user_input.upper() == "N": + settings.inference_steps = steps + return diff --git a/frontend/gui/app_window.py b/frontend/gui/app_window.py new file mode 100644 index 0000000000000000000000000000000000000000..59591affbbbd123add5379b9cd83d147d032d5af --- /dev/null +++ b/frontend/gui/app_window.py @@ -0,0 +1,595 @@ +from datetime import datetime + +from app_settings import AppSettings +from backend.models.lcmdiffusion_setting import DiffusionTask +from constants import ( + APP_NAME, + APP_VERSION, + LCM_DEFAULT_MODEL, + LCM_DEFAULT_MODEL_OPENVINO, +) +from context import Context +from frontend.gui.image_variations_widget import ImageVariationsWidget +from frontend.gui.upscaler_widget import UpscalerWidget +from frontend.gui.img2img_widget import Img2ImgWidget +from frontend.utils import ( + enable_openvino_controls, + get_valid_model_id, + is_reshape_required, +) +from paths import FastStableDiffusionPaths +from PyQt5 import QtCore, QtWidgets +from PyQt5.QtCore import QSize, Qt, QThreadPool, QUrl +from PyQt5.QtGui import QDesktopServices +from PyQt5.QtWidgets import ( + QCheckBox, + QComboBox, + QFileDialog, + QHBoxLayout, + QLabel, + QLineEdit, + QMainWindow, + QPushButton, + QSizePolicy, + QSlider, + QSpacerItem, + QTabWidget, + QToolButton, + QVBoxLayout, + QWidget, +) + +from models.interface_types import InterfaceType +from frontend.gui.base_widget import BaseWidget + +# DPI scale fix +QtWidgets.QApplication.setAttribute(QtCore.Qt.AA_EnableHighDpiScaling, True) +QtWidgets.QApplication.setAttribute(QtCore.Qt.AA_UseHighDpiPixmaps, True) + + +class MainWindow(QMainWindow): + settings_changed = QtCore.pyqtSignal() + """ This signal is used for enabling/disabling the negative prompt field for + modes that support it; in particular, negative prompt is supported with OpenVINO models + and in LCM-LoRA mode but not in LCM mode + """ + + def __init__(self, config: AppSettings): + super().__init__() + self.config = config + # Prevent saved LoRA and ControlNet settings from being used by + # default; in GUI mode, the user must explicitly enable those + if self.config.settings.lcm_diffusion_setting.lora: + self.config.settings.lcm_diffusion_setting.lora.enabled = False + if self.config.settings.lcm_diffusion_setting.controlnet: + self.config.settings.lcm_diffusion_setting.controlnet.enabled = False + self.setWindowTitle(APP_NAME) + self.setFixedSize(QSize(600, 670)) + self.init_ui() + self.pipeline = None + self.threadpool = QThreadPool() + self.device = "cpu" + self.previous_width = 0 + self.previous_height = 0 + self.previous_model = "" + self.previous_num_of_images = 0 + self.context = Context(InterfaceType.GUI) + self.init_ui_values() + self.gen_images = [] + self.image_index = 0 + print(f"Output path : {self.config.settings.generated_images.path}") + + def init_ui_values(self): + self.lcm_model.setEnabled( + not self.config.settings.lcm_diffusion_setting.use_openvino + ) + self.guidance.setValue( + int(self.config.settings.lcm_diffusion_setting.guidance_scale * 10) + ) + self.seed_value.setEnabled(self.config.settings.lcm_diffusion_setting.use_seed) + self.safety_checker.setChecked( + self.config.settings.lcm_diffusion_setting.use_safety_checker + ) + self.use_openvino_check.setChecked( + self.config.settings.lcm_diffusion_setting.use_openvino + ) + self.width.setCurrentText( + str(self.config.settings.lcm_diffusion_setting.image_width) + ) + self.height.setCurrentText( + str(self.config.settings.lcm_diffusion_setting.image_height) + ) + self.inference_steps.setValue( + 
int(self.config.settings.lcm_diffusion_setting.inference_steps) + ) + self.clip_skip.setValue( + int(self.config.settings.lcm_diffusion_setting.clip_skip) + ) + self.token_merging.setValue( + int(self.config.settings.lcm_diffusion_setting.token_merging * 100) + ) + self.seed_check.setChecked(self.config.settings.lcm_diffusion_setting.use_seed) + self.seed_value.setText(str(self.config.settings.lcm_diffusion_setting.seed)) + self.use_local_model_folder.setChecked( + self.config.settings.lcm_diffusion_setting.use_offline_model + ) + self.results_path.setText(self.config.settings.generated_images.path) + self.num_images.setValue( + self.config.settings.lcm_diffusion_setting.number_of_images + ) + self.use_tae_sd.setChecked( + self.config.settings.lcm_diffusion_setting.use_tiny_auto_encoder + ) + self.use_lcm_lora.setChecked( + self.config.settings.lcm_diffusion_setting.use_lcm_lora + ) + self.lcm_model.setCurrentText( + get_valid_model_id( + self.config.lcm_models, + self.config.settings.lcm_diffusion_setting.lcm_model_id, + LCM_DEFAULT_MODEL, + ) + ) + self.base_model_id.setCurrentText( + get_valid_model_id( + self.config.stable_diffsuion_models, + self.config.settings.lcm_diffusion_setting.lcm_lora.base_model_id, + ) + ) + self.lcm_lora_id.setCurrentText( + get_valid_model_id( + self.config.lcm_lora_models, + self.config.settings.lcm_diffusion_setting.lcm_lora.lcm_lora_id, + ) + ) + self.openvino_lcm_model_id.setCurrentText( + get_valid_model_id( + self.config.openvino_lcm_models, + self.config.settings.lcm_diffusion_setting.openvino_lcm_model_id, + LCM_DEFAULT_MODEL_OPENVINO, + ) + ) + self.openvino_lcm_model_id.setEnabled( + self.config.settings.lcm_diffusion_setting.use_openvino + ) + + def init_ui(self): + self.create_main_tab() + self.create_settings_tab() + self.create_about_tab() + self.show() + + def create_main_tab(self): + self.tab_widget = QTabWidget(self) + self.tab_main = BaseWidget(self.config, self) + self.tab_settings = QWidget() + self.tab_about = QWidget() + self.img2img_tab = Img2ImgWidget(self.config, self) + self.variations_tab = ImageVariationsWidget(self.config, self) + self.upscaler_tab = UpscalerWidget(self.config, self) + + # Add main window tabs here + self.tab_widget.addTab(self.tab_main, "Text to Image") + self.tab_widget.addTab(self.img2img_tab, "Image to Image") + self.tab_widget.addTab(self.variations_tab, "Image Variations") + self.tab_widget.addTab(self.upscaler_tab, "Upscaler") + self.tab_widget.addTab(self.tab_settings, "Settings") + self.tab_widget.addTab(self.tab_about, "About") + + self.setCentralWidget(self.tab_widget) + self.use_seed = False + + def create_settings_tab(self): + self.lcm_model_label = QLabel("Latent Consistency Model:") + # self.lcm_model = QLineEdit(LCM_DEFAULT_MODEL) + self.lcm_model = QComboBox(self) + self.lcm_model.addItems(self.config.lcm_models) + self.lcm_model.currentIndexChanged.connect(self.on_lcm_model_changed) + + self.use_lcm_lora = QCheckBox("Use LCM LoRA") + self.use_lcm_lora.setChecked(False) + self.use_lcm_lora.stateChanged.connect(self.use_lcm_lora_changed) + + self.lora_base_model_id_label = QLabel("Lora base model ID :") + self.base_model_id = QComboBox(self) + self.base_model_id.addItems(self.config.stable_diffsuion_models) + self.base_model_id.currentIndexChanged.connect(self.on_base_model_id_changed) + + self.lcm_lora_model_id_label = QLabel("LCM LoRA model ID :") + self.lcm_lora_id = QComboBox(self) + self.lcm_lora_id.addItems(self.config.lcm_lora_models) + 
self.lcm_lora_id.currentIndexChanged.connect(self.on_lcm_lora_id_changed) + + self.inference_steps_value = QLabel("Number of inference steps: 4") + self.inference_steps = QSlider(orientation=Qt.Orientation.Horizontal) + self.inference_steps.setMaximum(25) + self.inference_steps.setMinimum(1) + self.inference_steps.setValue(4) + self.inference_steps.valueChanged.connect(self.update_steps_label) + + self.num_images_value = QLabel("Number of images: 1") + self.num_images = QSlider(orientation=Qt.Orientation.Horizontal) + self.num_images.setMaximum(100) + self.num_images.setMinimum(1) + self.num_images.setValue(1) + self.num_images.valueChanged.connect(self.update_num_images_label) + + self.guidance_value = QLabel("Guidance scale: 1") + self.guidance = QSlider(orientation=Qt.Orientation.Horizontal) + self.guidance.setMaximum(20) + self.guidance.setMinimum(10) + self.guidance.setValue(10) + self.guidance.valueChanged.connect(self.update_guidance_label) + + self.clip_skip_value = QLabel("CLIP Skip: 1") + self.clip_skip = QSlider(orientation=Qt.Orientation.Horizontal) + self.clip_skip.setMaximum(12) + self.clip_skip.setMinimum(1) + self.clip_skip.setValue(1) + self.clip_skip.valueChanged.connect(self.update_clip_skip_label) + + self.token_merging_value = QLabel("Token Merging: 0") + self.token_merging = QSlider(orientation=Qt.Orientation.Horizontal) + self.token_merging.setMaximum(100) + self.token_merging.setMinimum(0) + self.token_merging.setValue(0) + self.token_merging.valueChanged.connect(self.update_token_merging_label) + + self.width_value = QLabel("Width :") + self.width = QComboBox(self) + self.width.addItem("256") + self.width.addItem("512") + self.width.addItem("768") + self.width.addItem("1024") + self.width.setCurrentText("512") + self.width.currentIndexChanged.connect(self.on_width_changed) + + self.height_value = QLabel("Height :") + self.height = QComboBox(self) + self.height.addItem("256") + self.height.addItem("512") + self.height.addItem("768") + self.height.addItem("1024") + self.height.setCurrentText("512") + self.height.currentIndexChanged.connect(self.on_height_changed) + + self.seed_check = QCheckBox("Use seed") + self.seed_value = QLineEdit() + self.seed_value.setInputMask("9999999999") + self.seed_value.setText("123123") + self.seed_check.stateChanged.connect(self.seed_changed) + + self.safety_checker = QCheckBox("Use safety checker") + self.safety_checker.setChecked(True) + self.safety_checker.stateChanged.connect(self.use_safety_checker_changed) + + self.use_openvino_check = QCheckBox("Use OpenVINO") + self.use_openvino_check.setChecked(False) + self.openvino_model_label = QLabel("OpenVINO LCM model:") + self.use_local_model_folder = QCheckBox( + "Use locally cached model or downloaded model folder(offline)" + ) + self.openvino_lcm_model_id = QComboBox(self) + self.openvino_lcm_model_id.addItems(self.config.openvino_lcm_models) + self.openvino_lcm_model_id.currentIndexChanged.connect( + self.on_openvino_lcm_model_id_changed + ) + + self.use_openvino_check.setEnabled(enable_openvino_controls()) + self.use_local_model_folder.setChecked(False) + self.use_local_model_folder.stateChanged.connect(self.use_offline_model_changed) + self.use_openvino_check.stateChanged.connect(self.use_openvino_changed) + + self.use_tae_sd = QCheckBox( + "Use Tiny Auto Encoder - TAESD (Fast, moderate quality)" + ) + self.use_tae_sd.setChecked(False) + self.use_tae_sd.stateChanged.connect(self.use_tae_sd_changed) + + hlayout = QHBoxLayout() + hlayout.addWidget(self.seed_check) + 
hlayout.addWidget(self.seed_value) + hspacer = QSpacerItem(20, 10, QSizePolicy.Expanding, QSizePolicy.Minimum) + slider_hspacer = QSpacerItem(20, 10, QSizePolicy.Expanding, QSizePolicy.Minimum) + + self.results_path_label = QLabel("Output path:") + self.results_path = QLineEdit() + self.results_path.textChanged.connect(self.on_path_changed) + self.browse_folder_btn = QToolButton() + self.browse_folder_btn.setText("...") + self.browse_folder_btn.clicked.connect(self.on_browse_folder) + + self.reset = QPushButton("Reset All") + self.reset.clicked.connect(self.reset_all_settings) + + vlayout = QVBoxLayout() + vspacer = QSpacerItem(20, 20, QSizePolicy.Minimum, QSizePolicy.Expanding) + vlayout.addItem(hspacer) + vlayout.setSpacing(3) + vlayout.addWidget(self.lcm_model_label) + vlayout.addWidget(self.lcm_model) + vlayout.addWidget(self.use_local_model_folder) + vlayout.addWidget(self.use_lcm_lora) + vlayout.addWidget(self.lora_base_model_id_label) + vlayout.addWidget(self.base_model_id) + vlayout.addWidget(self.lcm_lora_model_id_label) + vlayout.addWidget(self.lcm_lora_id) + vlayout.addWidget(self.use_openvino_check) + vlayout.addWidget(self.openvino_model_label) + vlayout.addWidget(self.openvino_lcm_model_id) + vlayout.addWidget(self.use_tae_sd) + vlayout.addItem(slider_hspacer) + vlayout.addWidget(self.inference_steps_value) + vlayout.addWidget(self.inference_steps) + vlayout.addWidget(self.num_images_value) + vlayout.addWidget(self.num_images) + vlayout.addWidget(self.width_value) + vlayout.addWidget(self.width) + vlayout.addWidget(self.height_value) + vlayout.addWidget(self.height) + vlayout.addWidget(self.guidance_value) + vlayout.addWidget(self.guidance) + vlayout.addWidget(self.clip_skip_value) + vlayout.addWidget(self.clip_skip) + vlayout.addWidget(self.token_merging_value) + vlayout.addWidget(self.token_merging) + vlayout.addLayout(hlayout) + vlayout.addWidget(self.safety_checker) + + vlayout.addWidget(self.results_path_label) + hlayout_path = QHBoxLayout() + hlayout_path.addWidget(self.results_path) + hlayout_path.addWidget(self.browse_folder_btn) + vlayout.addLayout(hlayout_path) + self.tab_settings.setLayout(vlayout) + hlayout_reset = QHBoxLayout() + hspacer = QSpacerItem(20, 20, QSizePolicy.Expanding, QSizePolicy.Minimum) + hlayout_reset.addItem(hspacer) + hlayout_reset.addWidget(self.reset) + vlayout.addLayout(hlayout_reset) + vlayout.addItem(vspacer) + + def create_about_tab(self): + self.label = QLabel() + self.label.setAlignment(Qt.AlignCenter) + current_year = datetime.now().year + self.label.setText( + f"""
{APP_VERSION} " + current_year = datetime.now().year + footer_msg = version + ( + f' © 2023 - {current_year} ' + " Rupesh Sreeraman
{APP_VERSION} " + current_year = datetime.now().year + footer_msg = version + ( + f' © 2023 - {current_year} ' + " Rupesh Sreeraman