tejani committed
Commit 5b16c73 · verified · 1 Parent(s): fa5c881

Upload 75 files

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. __init__.py +0 -0
  2. app.py +554 -0
  3. app_settings.py +124 -0
  4. backend/__init__.py +0 -0
  5. backend/annotators/canny_control.py +15 -0
  6. backend/annotators/control_interface.py +12 -0
  7. backend/annotators/depth_control.py +15 -0
  8. backend/annotators/image_control_factory.py +31 -0
  9. backend/annotators/lineart_control.py +11 -0
  10. backend/annotators/mlsd_control.py +10 -0
  11. backend/annotators/normal_control.py +10 -0
  12. backend/annotators/pose_control.py +10 -0
  13. backend/annotators/shuffle_control.py +10 -0
  14. backend/annotators/softedge_control.py +10 -0
  15. backend/api/mcp_server.py +97 -0
  16. backend/api/models/response.py +16 -0
  17. backend/api/web.py +112 -0
  18. backend/base64_image.py +21 -0
  19. backend/controlnet.py +90 -0
  20. backend/device.py +23 -0
  21. backend/gguf/gguf_diffusion.py +319 -0
  22. backend/gguf/sdcpp_types.py +104 -0
  23. backend/image_saver.py +75 -0
  24. backend/lcm_text_to_image.py +577 -0
  25. backend/lora.py +136 -0
  26. backend/models/device.py +9 -0
  27. backend/models/gen_images.py +17 -0
  28. backend/models/lcmdiffusion_setting.py +76 -0
  29. backend/models/upscale.py +9 -0
  30. backend/openvino/custom_ov_model_vae_decoder.py +21 -0
  31. backend/openvino/flux_pipeline.py +36 -0
  32. backend/openvino/ov_hc_stablediffusion_pipeline.py +93 -0
  33. backend/openvino/ovflux.py +675 -0
  34. backend/openvino/pipelines.py +75 -0
  35. backend/openvino/stable_diffusion_engine.py +1817 -0
  36. backend/pipelines/lcm.py +122 -0
  37. backend/pipelines/lcm_lora.py +81 -0
  38. backend/tiny_decoder.py +32 -0
  39. backend/upscale/aura_sr.py +1004 -0
  40. backend/upscale/aura_sr_upscale.py +9 -0
  41. backend/upscale/edsr_upscale_onnx.py +37 -0
  42. backend/upscale/tiled_upscale.py +237 -0
  43. backend/upscale/upscaler.py +52 -0
  44. configs/lcm-lora-models.txt +4 -0
  45. configs/lcm-models.txt +8 -0
  46. configs/openvino-lcm-models.txt +9 -0
  47. configs/stable-diffusion-models.txt +7 -0
  48. constants.py +25 -0
  49. context.py +85 -0
  50. frontend/cli_interactive.py +661 -0
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,554 @@
1
+ import json
2
+ from argparse import ArgumentParser
3
+
4
+ from PIL import Image
5
+
6
+ import constants
7
+ from backend.controlnet import controlnet_settings_from_dict
8
+ from backend.device import get_device_name
9
+ from backend.models.gen_images import ImageFormat
10
+ from backend.models.lcmdiffusion_setting import DiffusionTask
11
+ from backend.upscale.tiled_upscale import generate_upscaled_image
12
+ from constants import APP_VERSION, DEVICE
13
+ from frontend.webui.image_variations_ui import generate_image_variations
14
+ from models.interface_types import InterfaceType
15
+ from paths import FastStableDiffusionPaths, ensure_path
16
+ from state import get_context, get_settings
17
+ from utils import show_system_info
18
+
19
+ parser = ArgumentParser(description=f"FAST SD CPU {constants.APP_VERSION}")
20
+ parser.add_argument(
21
+ "-s",
22
+ "--share",
23
+ action="store_true",
24
+ help="Create sharable link(Web UI)",
25
+ required=False,
26
+ )
27
+ group = parser.add_mutually_exclusive_group(required=False)
28
+ group.add_argument(
29
+ "-g",
30
+ "--gui",
31
+ action="store_true",
32
+ help="Start desktop GUI",
33
+ )
34
+ group.add_argument(
35
+ "-w",
36
+ "--webui",
37
+ action="store_true",
38
+ help="Start Web UI",
39
+ )
40
+ group.add_argument(
41
+ "-a",
42
+ "--api",
43
+ action="store_true",
44
+ help="Start Web API server",
45
+ )
46
+ group.add_argument(
47
+ "-m",
48
+ "--mcp",
49
+ action="store_true",
50
+ help="Start MCP(Model Context Protocol) server",
51
+ )
52
+ group.add_argument(
53
+ "-r",
54
+ "--realtime",
55
+ action="store_true",
56
+ help="Start realtime inference UI(experimental)",
57
+ )
58
+ group.add_argument(
59
+ "-v",
60
+ "--version",
61
+ action="store_true",
62
+ help="Version",
63
+ )
64
+
65
+ parser.add_argument(
66
+ "-b",
67
+ "--benchmark",
68
+ action="store_true",
69
+ help="Run inference benchmark on the selected device",
70
+ )
71
+ parser.add_argument(
72
+ "--lcm_model_id",
73
+ type=str,
74
+ help="Model ID or path,Default stabilityai/sd-turbo",
75
+ default="stabilityai/sd-turbo",
76
+ )
77
+ parser.add_argument(
78
+ "--openvino_lcm_model_id",
79
+ type=str,
80
+ help="OpenVINO Model ID or path,Default rupeshs/sd-turbo-openvino",
81
+ default="rupeshs/sd-turbo-openvino",
82
+ )
83
+ parser.add_argument(
84
+ "--prompt",
85
+ type=str,
86
+ help="Describe the image you want to generate",
87
+ default="",
88
+ )
89
+ parser.add_argument(
90
+ "--negative_prompt",
91
+ type=str,
92
+ help="Describe what you want to exclude from the generation",
93
+ default="",
94
+ )
95
+ parser.add_argument(
96
+ "--image_height",
97
+ type=int,
98
+ help="Height of the image",
99
+ default=512,
100
+ )
101
+ parser.add_argument(
102
+ "--image_width",
103
+ type=int,
104
+ help="Width of the image",
105
+ default=512,
106
+ )
107
+ parser.add_argument(
108
+ "--inference_steps",
109
+ type=int,
110
+ help="Number of steps,default : 1",
111
+ default=1,
112
+ )
113
+ parser.add_argument(
114
+ "--guidance_scale",
115
+ type=float,
116
+ help="Guidance scale,default : 1.0",
117
+ default=1.0,
118
+ )
119
+
120
+ parser.add_argument(
121
+ "--number_of_images",
122
+ type=int,
123
+ help="Number of images to generate ,default : 1",
124
+ default=1,
125
+ )
126
+ parser.add_argument(
127
+ "--seed",
128
+ type=int,
129
+ help="Seed,default : -1 (disabled) ",
130
+ default=-1,
131
+ )
132
+ parser.add_argument(
133
+ "--use_openvino",
134
+ action="store_true",
135
+ help="Use OpenVINO model",
136
+ )
137
+
138
+ parser.add_argument(
139
+ "--use_offline_model",
140
+ action="store_true",
141
+ help="Use offline model",
142
+ )
143
+ parser.add_argument(
144
+ "--clip_skip",
145
+ type=int,
146
+ help="CLIP Skip (1-12), default : 1 (disabled) ",
147
+ default=1,
148
+ )
149
+ parser.add_argument(
150
+ "--token_merging",
151
+ type=float,
152
+ help="Token merging scale, 0.0 - 1.0, default : 0.0",
153
+ default=0.0,
154
+ )
155
+
156
+ parser.add_argument(
157
+ "--use_safety_checker",
158
+ action="store_true",
159
+ help="Use safety checker",
160
+ )
161
+ parser.add_argument(
162
+ "--use_lcm_lora",
163
+ action="store_true",
164
+ help="Use LCM-LoRA",
165
+ )
166
+ parser.add_argument(
167
+ "--base_model_id",
168
+ type=str,
169
+ help="LCM LoRA base model ID,Default Lykon/dreamshaper-8",
170
+ default="Lykon/dreamshaper-8",
171
+ )
172
+ parser.add_argument(
173
+ "--lcm_lora_id",
174
+ type=str,
175
+ help="LCM LoRA model ID,Default latent-consistency/lcm-lora-sdv1-5",
176
+ default="latent-consistency/lcm-lora-sdv1-5",
177
+ )
178
+ parser.add_argument(
179
+ "-i",
180
+ "--interactive",
181
+ action="store_true",
182
+ help="Interactive CLI mode",
183
+ )
184
+ parser.add_argument(
185
+ "-t",
186
+ "--use_tiny_auto_encoder",
187
+ action="store_true",
188
+ help="Use tiny auto encoder for SD (TAESD)",
189
+ )
190
+ parser.add_argument(
191
+ "-f",
192
+ "--file",
193
+ type=str,
194
+ help="Input image for img2img mode",
195
+ default="",
196
+ )
197
+ parser.add_argument(
198
+ "--img2img",
199
+ action="store_true",
200
+ help="img2img mode; requires input file via -f argument",
201
+ )
202
+ parser.add_argument(
203
+ "--batch_count",
204
+ type=int,
205
+ help="Number of sequential generations",
206
+ default=1,
207
+ )
208
+ parser.add_argument(
209
+ "--strength",
210
+ type=float,
211
+ help="Denoising strength for img2img and Image variations",
212
+ default=0.3,
213
+ )
214
+ parser.add_argument(
215
+ "--sdupscale",
216
+ action="store_true",
217
+ help="Tiled SD upscale,works only for the resolution 512x512,(2x upscale)",
218
+ )
219
+ parser.add_argument(
220
+ "--upscale",
221
+ action="store_true",
222
+ help="EDSR SD upscale ",
223
+ )
224
+ parser.add_argument(
225
+ "--custom_settings",
226
+ type=str,
227
+ help="JSON file containing custom generation settings",
228
+ default=None,
229
+ )
230
+ parser.add_argument(
231
+ "--usejpeg",
232
+ action="store_true",
233
+ help="Images will be saved as JPEG format",
234
+ )
235
+ parser.add_argument(
236
+ "--noimagesave",
237
+ action="store_true",
238
+ help="Disable image saving",
239
+ )
240
+ parser.add_argument(
241
+ "--imagequality", type=int, help="Output image quality [0 to 100]", default=90
242
+ )
243
+ parser.add_argument(
244
+ "--lora",
245
+ type=str,
246
+ help="LoRA model full path e.g D:\lora_models\CuteCartoon15V-LiberteRedmodModel-Cartoon-CuteCartoonAF.safetensors",
247
+ default=None,
248
+ )
249
+ parser.add_argument(
250
+ "--lora_weight",
251
+ type=float,
252
+ help="LoRA adapter weight [0 to 1.0]",
253
+ default=0.5,
254
+ )
255
+ parser.add_argument(
256
+ "--port",
257
+ type=int,
258
+ help="Web server port",
259
+ default=8000,
260
+ )
261
+
262
+ args = parser.parse_args()
263
+
264
+ if args.version:
265
+ print(APP_VERSION)
266
+ exit()
267
+
268
+ # parser.print_help()
269
+ print("FastSD CPU - ", APP_VERSION)
270
+ show_system_info()
271
+ print(f"Using device : {constants.DEVICE}")
272
+
273
+
274
+ if args.webui:
275
+ app_settings = get_settings()
276
+ else:
277
+ app_settings = get_settings()
278
+
279
+ print(f"Output path : {app_settings.settings.generated_images.path}")
280
+ ensure_path(app_settings.settings.generated_images.path)
281
+
282
+ print(f"Found {len(app_settings.lcm_models)} LCM models in config/lcm-models.txt")
283
+ print(
284
+ f"Found {len(app_settings.stable_diffsuion_models)} stable diffusion models in config/stable-diffusion-models.txt"
285
+ )
286
+ print(
287
+ f"Found {len(app_settings.lcm_lora_models)} LCM-LoRA models in config/lcm-lora-models.txt"
288
+ )
289
+ print(
290
+ f"Found {len(app_settings.openvino_lcm_models)} OpenVINO LCM models in config/openvino-lcm-models.txt"
291
+ )
292
+
293
+ if args.noimagesave:
294
+ app_settings.settings.generated_images.save_image = False
295
+ else:
296
+ app_settings.settings.generated_images.save_image = True
297
+
298
+ app_settings.settings.generated_images.save_image_quality = args.imagequality
299
+
300
+ if not args.realtime:
301
+ # To minimize realtime mode dependencies
302
+ from backend.upscale.upscaler import upscale_image
303
+ from frontend.cli_interactive import interactive_mode
304
+
305
+ if args.gui:
306
+ from frontend.gui.ui import start_gui
307
+
308
+ print("Starting desktop GUI mode(Qt)")
309
+ start_gui(
310
+ [],
311
+ app_settings,
312
+ )
313
+ elif args.webui:
314
+ from frontend.webui.ui import start_webui
315
+
316
+ print("Starting web UI mode")
317
+ start_webui(
318
+ args.share,
319
+ )
320
+ elif args.realtime:
321
+ from frontend.webui.realtime_ui import start_realtime_text_to_image
322
+
323
+ print("Starting realtime text to image(EXPERIMENTAL)")
324
+ start_realtime_text_to_image(args.share)
325
+ elif args.api:
326
+ from backend.api.web import start_web_server
327
+
328
+ start_web_server(args.port)
329
+ elif args.mcp:
330
+ from backend.api.mcp_server import start_mcp_server
331
+
332
+ start_mcp_server(args.port)
333
+ else:
334
+ context = get_context(InterfaceType.CLI)
335
+ config = app_settings.settings
336
+
337
+ if args.use_openvino:
338
+ config.lcm_diffusion_setting.openvino_lcm_model_id = args.openvino_lcm_model_id
339
+ else:
340
+ config.lcm_diffusion_setting.lcm_model_id = args.lcm_model_id
341
+
342
+ config.lcm_diffusion_setting.prompt = args.prompt
343
+ config.lcm_diffusion_setting.negative_prompt = args.negative_prompt
344
+ config.lcm_diffusion_setting.image_height = args.image_height
345
+ config.lcm_diffusion_setting.image_width = args.image_width
346
+ config.lcm_diffusion_setting.guidance_scale = args.guidance_scale
347
+ config.lcm_diffusion_setting.number_of_images = args.number_of_images
348
+ config.lcm_diffusion_setting.inference_steps = args.inference_steps
349
+ config.lcm_diffusion_setting.strength = args.strength
350
+ config.lcm_diffusion_setting.seed = args.seed
351
+ config.lcm_diffusion_setting.use_openvino = args.use_openvino
352
+ config.lcm_diffusion_setting.use_tiny_auto_encoder = args.use_tiny_auto_encoder
353
+ config.lcm_diffusion_setting.use_lcm_lora = args.use_lcm_lora
354
+ config.lcm_diffusion_setting.lcm_lora.base_model_id = args.base_model_id
355
+ config.lcm_diffusion_setting.lcm_lora.lcm_lora_id = args.lcm_lora_id
356
+ config.lcm_diffusion_setting.diffusion_task = DiffusionTask.text_to_image.value
357
+ config.lcm_diffusion_setting.lora.enabled = False
358
+ config.lcm_diffusion_setting.lora.path = args.lora
359
+ config.lcm_diffusion_setting.lora.weight = args.lora_weight
360
+ config.lcm_diffusion_setting.lora.fuse = True
361
+ if config.lcm_diffusion_setting.lora.path:
362
+ config.lcm_diffusion_setting.lora.enabled = True
363
+ if args.usejpeg:
364
+ config.generated_images.format = ImageFormat.JPEG.value.upper()
365
+ if args.seed > -1:
366
+ config.lcm_diffusion_setting.use_seed = True
367
+ else:
368
+ config.lcm_diffusion_setting.use_seed = False
369
+ config.lcm_diffusion_setting.use_offline_model = args.use_offline_model
370
+ config.lcm_diffusion_setting.clip_skip = args.clip_skip
371
+ config.lcm_diffusion_setting.token_merging = args.token_merging
372
+ config.lcm_diffusion_setting.use_safety_checker = args.use_safety_checker
373
+
374
+ # Read custom settings from JSON file
375
+ custom_settings = {}
376
+ if args.custom_settings:
377
+ with open(args.custom_settings) as f:
378
+ custom_settings = json.load(f)
379
+
380
+ # Basic ControlNet settings; if ControlNet is enabled, an image is
381
+ # required even in txt2img mode
382
+ config.lcm_diffusion_setting.controlnet = None
383
+ controlnet_settings_from_dict(
384
+ config.lcm_diffusion_setting,
385
+ custom_settings,
386
+ )
387
+
388
+ # Interactive mode
389
+ if args.interactive:
390
+ # wrapper(interactive_mode, config, context)
391
+ config.lcm_diffusion_setting.lora.fuse = False
392
+ interactive_mode(config, context)
393
+
394
+ # Start of non-interactive CLI image generation
395
+ if args.img2img and args.file != "":
396
+ config.lcm_diffusion_setting.init_image = Image.open(args.file)
397
+ config.lcm_diffusion_setting.diffusion_task = DiffusionTask.image_to_image.value
398
+ elif args.img2img and args.file == "":
399
+ print("Error : You need to specify a file in img2img mode")
400
+ exit()
401
+ elif args.upscale and args.file == "" and args.custom_settings == None:
402
+ print("Error : You need to specify a file in SD upscale mode")
403
+ exit()
404
+ elif (
405
+ args.prompt == ""
406
+ and args.file == ""
407
+ and args.custom_settings == None
408
+ and not args.benchmark
409
+ ):
410
+ print("Error : You need to provide a prompt")
411
+ exit()
412
+
413
+ if args.upscale:
414
+ # image = Image.open(args.file)
415
+ output_path = FastStableDiffusionPaths.get_upscale_filepath(
416
+ args.file,
417
+ 2,
418
+ config.generated_images.format,
419
+ )
420
+ result = upscale_image(
421
+ context,
422
+ args.file,
423
+ output_path,
424
+ 2,
425
+ )
426
+ # Perform Tiled SD upscale (EXPERIMENTAL)
427
+ elif args.sdupscale:
428
+ if args.use_openvino:
429
+ config.lcm_diffusion_setting.strength = 0.3
430
+ upscale_settings = None
431
+ if custom_settings != {}:
432
+ upscale_settings = custom_settings
433
+ filepath = args.file
434
+ output_format = config.generated_images.format
435
+ if upscale_settings:
436
+ filepath = upscale_settings["source_file"]
437
+ output_format = upscale_settings["output_format"].upper()
438
+ output_path = FastStableDiffusionPaths.get_upscale_filepath(
439
+ filepath,
440
+ 2,
441
+ output_format,
442
+ )
443
+
444
+ generate_upscaled_image(
445
+ config,
446
+ filepath,
447
+ config.lcm_diffusion_setting.strength,
448
+ upscale_settings=upscale_settings,
449
+ context=context,
450
+ tile_overlap=32 if config.lcm_diffusion_setting.use_openvino else 16,
451
+ output_path=output_path,
452
+ image_format=output_format,
453
+ )
454
+ exit()
455
+ # If img2img argument is set and prompt is empty, use image variations mode
456
+ elif args.img2img and args.prompt == "":
457
+ for i in range(0, args.batch_count):
458
+ generate_image_variations(
459
+ config.lcm_diffusion_setting.init_image, args.strength
460
+ )
461
+ else:
462
+ if args.benchmark:
463
+ print("Initializing benchmark...")
464
+ bench_lcm_setting = config.lcm_diffusion_setting
465
+ bench_lcm_setting.prompt = "a cat"
466
+ bench_lcm_setting.use_tiny_auto_encoder = False
467
+ context.generate_text_to_image(
468
+ settings=config,
469
+ device=DEVICE,
470
+ )
471
+
472
+ latencies = []
473
+
474
+ print("Starting benchmark please wait...")
475
+ for _ in range(3):
476
+ context.generate_text_to_image(
477
+ settings=config,
478
+ device=DEVICE,
479
+ )
480
+ latencies.append(context.latency)
481
+
482
+ avg_latency = sum(latencies) / 3
483
+
484
+ bench_lcm_setting.use_tiny_auto_encoder = True
485
+
486
+ context.generate_text_to_image(
487
+ settings=config,
488
+ device=DEVICE,
489
+ )
490
+ latencies = []
491
+ for _ in range(3):
492
+ context.generate_text_to_image(
493
+ settings=config,
494
+ device=DEVICE,
495
+ )
496
+ latencies.append(context.latency)
497
+
498
+ avg_latency_taesd = sum(latencies) / 3
499
+
500
+ benchmark_name = ""
501
+
502
+ if config.lcm_diffusion_setting.use_openvino:
503
+ benchmark_name = "OpenVINO"
504
+ else:
505
+ benchmark_name = "PyTorch"
506
+
507
+ bench_model_id = ""
508
+ if bench_lcm_setting.use_openvino:
509
+ bench_model_id = bench_lcm_setting.openvino_lcm_model_id
510
+ elif bench_lcm_setting.use_lcm_lora:
511
+ bench_model_id = bench_lcm_setting.lcm_lora.base_model_id
512
+ else:
513
+ bench_model_id = bench_lcm_setting.lcm_model_id
514
+
515
+ benchmark_result = [
516
+ ["Device", f"{DEVICE.upper()},{get_device_name()}"],
517
+ ["Stable Diffusion Model", bench_model_id],
518
+ [
519
+ "Image Size ",
520
+ f"{bench_lcm_setting.image_width}x{bench_lcm_setting.image_height}",
521
+ ],
522
+ [
523
+ "Inference Steps",
524
+ f"{bench_lcm_setting.inference_steps}",
525
+ ],
526
+ [
527
+ "Benchmark Passes",
528
+ 3,
529
+ ],
530
+ [
531
+ "Average Latency",
532
+ f"{round(avg_latency, 3)} sec",
533
+ ],
534
+ [
535
+ "Average Latency(TAESD* enabled)",
536
+ f"{round(avg_latency_taesd, 3)} sec",
537
+ ],
538
+ ]
539
+ print()
540
+ print(
541
+ f" FastSD Benchmark - {benchmark_name:8} "
542
+ )
543
+ print(f"-" * 80)
544
+ for benchmark in benchmark_result:
545
+ print(f"{benchmark[0]:35} - {benchmark[1]}")
546
+ print(f"-" * 80)
547
+ print("*TAESD - Tiny AutoEncoder for Stable Diffusion")
548
+
549
+ else:
550
+ for i in range(0, args.batch_count):
551
+ context.generate_text_to_image(
552
+ settings=config,
553
+ device=DEVICE,
554
+ )
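The parser above defines the full CLI surface of FastSD CPU. As an illustrative sketch (not part of the uploaded files), the snippet below drives this entry point from Python with a handful of the flags declared above; the flag names come straight from the parser, while the prompt text, image size, and the choice of OpenVINO are arbitrary example values.

```python
# Illustrative only: invoke the CLI defined in app.py with a few of the flags
# declared by its ArgumentParser. Assumes it runs from the repository root.
import subprocess

subprocess.run(
    [
        "python", "app.py",
        "--use_openvino",                 # use the OpenVINO pipeline
        "--prompt", "a cup of coffee on a wooden table",
        "--image_width", "512",
        "--image_height", "512",
        "--inference_steps", "1",
        "--number_of_images", "1",
    ],
    check=True,
)
```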
app_settings.py ADDED
@@ -0,0 +1,124 @@
1
+ from copy import deepcopy
2
+ from os import makedirs, path
3
+
4
+ import yaml
5
+ from constants import (
6
+ LCM_LORA_MODELS_FILE,
7
+ LCM_MODELS_FILE,
8
+ OPENVINO_LCM_MODELS_FILE,
9
+ SD_MODELS_FILE,
10
+ )
11
+ from paths import FastStableDiffusionPaths, join_paths
12
+ from utils import get_files_in_dir, get_models_from_text_file
13
+
14
+ from models.settings import Settings
15
+
16
+
17
+ class AppSettings:
18
+ def __init__(self):
19
+ self.config_path = FastStableDiffusionPaths().get_app_settings_path()
20
+ self._stable_diffsuion_models = get_models_from_text_file(
21
+ FastStableDiffusionPaths().get_models_config_path(SD_MODELS_FILE)
22
+ )
23
+ self._lcm_lora_models = get_models_from_text_file(
24
+ FastStableDiffusionPaths().get_models_config_path(LCM_LORA_MODELS_FILE)
25
+ )
26
+ self._openvino_lcm_models = get_models_from_text_file(
27
+ FastStableDiffusionPaths().get_models_config_path(OPENVINO_LCM_MODELS_FILE)
28
+ )
29
+ self._lcm_models = get_models_from_text_file(
30
+ FastStableDiffusionPaths().get_models_config_path(LCM_MODELS_FILE)
31
+ )
32
+ self._gguf_diffusion_models = get_files_in_dir(
33
+ join_paths(FastStableDiffusionPaths().get_gguf_models_path(), "diffusion")
34
+ )
35
+ self._gguf_clip_models = get_files_in_dir(
36
+ join_paths(FastStableDiffusionPaths().get_gguf_models_path(), "clip")
37
+ )
38
+ self._gguf_vae_models = get_files_in_dir(
39
+ join_paths(FastStableDiffusionPaths().get_gguf_models_path(), "vae")
40
+ )
41
+ self._gguf_t5xxl_models = get_files_in_dir(
42
+ join_paths(FastStableDiffusionPaths().get_gguf_models_path(), "t5xxl")
43
+ )
44
+ self._config = None
45
+
46
+ @property
47
+ def settings(self):
48
+ return self._config
49
+
50
+ @property
51
+ def stable_diffsuion_models(self):
52
+ return self._stable_diffsuion_models
53
+
54
+ @property
55
+ def openvino_lcm_models(self):
56
+ return self._openvino_lcm_models
57
+
58
+ @property
59
+ def lcm_models(self):
60
+ return self._lcm_models
61
+
62
+ @property
63
+ def lcm_lora_models(self):
64
+ return self._lcm_lora_models
65
+
66
+ @property
67
+ def gguf_diffusion_models(self):
68
+ return self._gguf_diffusion_models
69
+
70
+ @property
71
+ def gguf_clip_models(self):
72
+ return self._gguf_clip_models
73
+
74
+ @property
75
+ def gguf_vae_models(self):
76
+ return self._gguf_vae_models
77
+
78
+ @property
79
+ def gguf_t5xxl_models(self):
80
+ return self._gguf_t5xxl_models
81
+
82
+ def load(self, skip_file=False):
83
+ if skip_file:
84
+ print("Skipping config file")
85
+ settings_dict = self._load_default()
86
+ self._config = Settings.model_validate(settings_dict)
87
+ else:
88
+ if not path.exists(self.config_path):
89
+ base_dir = path.dirname(self.config_path)
90
+ if not path.exists(base_dir):
91
+ makedirs(base_dir)
92
+ try:
93
+ print("Settings not found creating default settings")
94
+ with open(self.config_path, "w") as file:
95
+ yaml.dump(
96
+ self._load_default(),
97
+ file,
98
+ )
99
+ except Exception as ex:
100
+ print(f"Error in creating settings : {ex}")
101
+ exit()
102
+ try:
103
+ with open(self.config_path) as file:
104
+ settings_dict = yaml.safe_load(file)
105
+ self._config = Settings.model_validate(settings_dict)
106
+ except Exception as ex:
107
+ print(f"Error in loading settings : {ex}")
108
+
109
+ def save(self):
110
+ try:
111
+ with open(self.config_path, "w") as file:
112
+ tmp_cfg = deepcopy(self._config)
113
+ tmp_cfg.lcm_diffusion_setting.init_image = None
114
+ configurations = tmp_cfg.model_dump(
115
+ exclude=["init_image"],
116
+ )
117
+ if configurations:
118
+ yaml.dump(configurations, file)
119
+ except Exception as ex:
120
+ print(f"Error in saving settings : {ex}")
121
+
122
+ def _load_default(self) -> dict:
123
+ default_config = Settings()
124
+ return default_config.model_dump()
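Elsewhere in this commit the class is reached through state.get_settings(); the sketch below is an illustrative direct use, assuming the model list files referenced by the constants are present on disk.

```python
# Illustrative sketch: load (or create) the YAML settings file and read the
# parsed model lists. The application itself obtains this object via
# state.get_settings().
from app_settings import AppSettings

app_settings = AppSettings()
app_settings.load()  # writes a default settings file on first run

print(f"LCM models       : {len(app_settings.lcm_models)}")
print(f"OpenVINO models  : {len(app_settings.openvino_lcm_models)}")
print(f"Output directory : {app_settings.settings.generated_images.path}")

app_settings.save()  # persist any changes back to the YAML file
```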
backend/__init__.py ADDED
File without changes
backend/annotators/canny_control.py ADDED
@@ -0,0 +1,15 @@
1
+ import numpy as np
2
+ from backend.annotators.control_interface import ControlInterface
3
+ from cv2 import Canny
4
+ from PIL import Image
5
+
6
+
7
+ class CannyControl(ControlInterface):
8
+ def get_control_image(self, image: Image) -> Image:
9
+ low_threshold = 100
10
+ high_threshold = 200
11
+ image = np.array(image)
12
+ image = Canny(image, low_threshold, high_threshold)
13
+ image = image[:, :, None]
14
+ image = np.concatenate([image, image, image], axis=2)
15
+ return Image.fromarray(image)
backend/annotators/control_interface.py ADDED
@@ -0,0 +1,12 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+ from PIL import Image
4
+
5
+
6
+ class ControlInterface(ABC):
7
+ @abstractmethod
8
+ def get_control_image(
9
+ self,
10
+ image: Image,
11
+ ) -> Image:
12
+ pass
backend/annotators/depth_control.py ADDED
@@ -0,0 +1,15 @@
1
+ import numpy as np
2
+ from backend.annotators.control_interface import ControlInterface
3
+ from PIL import Image
4
+ from transformers import pipeline
5
+
6
+
7
+ class DepthControl(ControlInterface):
8
+ def get_control_image(self, image: Image) -> Image:
9
+ depth_estimator = pipeline("depth-estimation")
10
+ image = depth_estimator(image)["depth"]
11
+ image = np.array(image)
12
+ image = image[:, :, None]
13
+ image = np.concatenate([image, image, image], axis=2)
14
+ image = Image.fromarray(image)
15
+ return image
backend/annotators/image_control_factory.py ADDED
@@ -0,0 +1,31 @@
1
+ from backend.annotators.canny_control import CannyControl
2
+ from backend.annotators.depth_control import DepthControl
3
+ from backend.annotators.lineart_control import LineArtControl
4
+ from backend.annotators.mlsd_control import MlsdControl
5
+ from backend.annotators.normal_control import NormalControl
6
+ from backend.annotators.pose_control import PoseControl
7
+ from backend.annotators.shuffle_control import ShuffleControl
8
+ from backend.annotators.softedge_control import SoftEdgeControl
9
+
10
+
11
+ class ImageControlFactory:
12
+ def create_control(self, controlnet_type: str):
13
+ if controlnet_type == "Canny":
14
+ return CannyControl()
15
+ elif controlnet_type == "Pose":
16
+ return PoseControl()
17
+ elif controlnet_type == "MLSD":
18
+ return MlsdControl()
19
+ elif controlnet_type == "Depth":
20
+ return DepthControl()
21
+ elif controlnet_type == "LineArt":
22
+ return LineArtControl()
23
+ elif controlnet_type == "Shuffle":
24
+ return ShuffleControl()
25
+ elif controlnet_type == "NormalBAE":
26
+ return NormalControl()
27
+ elif controlnet_type == "SoftEdge":
28
+ return SoftEdgeControl()
29
+ else:
30
+ print("Error: Control type not implemented!")
31
+ raise Exception("Error: Control type not implemented!")
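The factory maps a ControlNet type name onto the matching annotator class above. A minimal usage sketch follows (illustrative; "input.png" is a hypothetical file name).

```python
# Illustrative sketch: produce a Canny edge map suitable for ControlNet
# conditioning from an input photo. "input.png" is a hypothetical file.
from PIL import Image

from backend.annotators.image_control_factory import ImageControlFactory

control = ImageControlFactory().create_control("Canny")
control_image = control.get_control_image(Image.open("input.png"))
control_image.save("canny_control.png")
```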
backend/annotators/lineart_control.py ADDED
@@ -0,0 +1,11 @@
1
+ import numpy as np
2
+ from backend.annotators.control_interface import ControlInterface
3
+ from controlnet_aux import LineartDetector
4
+ from PIL import Image
5
+
6
+
7
+ class LineArtControl(ControlInterface):
8
+ def get_control_image(self, image: Image) -> Image:
9
+ processor = LineartDetector.from_pretrained("lllyasviel/Annotators")
10
+ control_image = processor(image)
11
+ return control_image
backend/annotators/mlsd_control.py ADDED
@@ -0,0 +1,10 @@
1
+ from backend.annotators.control_interface import ControlInterface
2
+ from controlnet_aux import MLSDdetector
3
+ from PIL import Image
4
+
5
+
6
+ class MlsdControl(ControlInterface):
7
+ def get_control_image(self, image: Image) -> Image:
8
+ mlsd = MLSDdetector.from_pretrained("lllyasviel/ControlNet")
9
+ image = mlsd(image)
10
+ return image
backend/annotators/normal_control.py ADDED
@@ -0,0 +1,10 @@
1
+ from backend.annotators.control_interface import ControlInterface
2
+ from controlnet_aux import NormalBaeDetector
3
+ from PIL import Image
4
+
5
+
6
+ class NormalControl(ControlInterface):
7
+ def get_control_image(self, image: Image) -> Image:
8
+ processor = NormalBaeDetector.from_pretrained("lllyasviel/Annotators")
9
+ control_image = processor(image)
10
+ return control_image
backend/annotators/pose_control.py ADDED
@@ -0,0 +1,10 @@
1
+ from backend.annotators.control_interface import ControlInterface
2
+ from controlnet_aux import OpenposeDetector
3
+ from PIL import Image
4
+
5
+
6
+ class PoseControl(ControlInterface):
7
+ def get_control_image(self, image: Image) -> Image:
8
+ openpose = OpenposeDetector.from_pretrained("lllyasviel/ControlNet")
9
+ image = openpose(image)
10
+ return image
backend/annotators/shuffle_control.py ADDED
@@ -0,0 +1,10 @@
1
+ from backend.annotators.control_interface import ControlInterface
2
+ from controlnet_aux import ContentShuffleDetector
3
+ from PIL import Image
4
+
5
+
6
+ class ShuffleControl(ControlInterface):
7
+ def get_control_image(self, image: Image) -> Image:
8
+ shuffle_processor = ContentShuffleDetector()
9
+ image = shuffle_processor(image)
10
+ return image
backend/annotators/softedge_control.py ADDED
@@ -0,0 +1,10 @@
1
+ from backend.annotators.control_interface import ControlInterface
2
+ from controlnet_aux import PidiNetDetector
3
+ from PIL import Image
4
+
5
+
6
+ class SoftEdgeControl(ControlInterface):
7
+ def get_control_image(self, image: Image) -> Image:
8
+ processor = PidiNetDetector.from_pretrained("lllyasviel/Annotators")
9
+ control_image = processor(image)
10
+ return control_image
backend/api/mcp_server.py ADDED
@@ -0,0 +1,97 @@
1
+ import platform
2
+
3
+ import uvicorn
4
+ from backend.device import get_device_name
5
+ from backend.models.device import DeviceInfo
6
+ from constants import APP_VERSION, DEVICE
7
+ from context import Context
8
+ from fastapi import FastAPI, Request
9
+ from fastapi_mcp import FastApiMCP
10
+ from state import get_settings
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+ from models.interface_types import InterfaceType
13
+ from fastapi.staticfiles import StaticFiles
14
+
15
+ app_settings = get_settings()
16
+ app = FastAPI(
17
+ title="FastSD CPU",
18
+ description="Fast stable diffusion on CPU",
19
+ version=APP_VERSION,
20
+ license_info={
21
+ "name": "MIT",
22
+ "identifier": "MIT",
23
+ },
24
+ describe_all_responses=True,
25
+ describe_full_response_schema=True,
26
+ )
27
+ origins = ["*"]
28
+
29
+ app.add_middleware(
30
+ CORSMiddleware,
31
+ allow_origins=origins,
32
+ allow_credentials=True,
33
+ allow_methods=["*"],
34
+ allow_headers=["*"],
35
+ )
36
+ print(app_settings.settings.lcm_diffusion_setting)
37
+
38
+ context = Context(InterfaceType.API_SERVER)
39
+ app.mount("/results", StaticFiles(directory="results"), name="results")
40
+
41
+
42
+ @app.get(
43
+ "/info",
44
+ description="Get system information",
45
+ summary="Get system information",
46
+ operation_id="get_system_info",
47
+ )
48
+ async def info() -> dict:
49
+ device_info = DeviceInfo(
50
+ device_type=DEVICE,
51
+ device_name=get_device_name(),
52
+ os=platform.system(),
53
+ platform=platform.platform(),
54
+ processor=platform.processor(),
55
+ )
56
+ return device_info.model_dump()
57
+
58
+
59
+ @app.post(
60
+ "/generate",
61
+ description="Generate image from text prompt",
62
+ summary="Text to image generation",
63
+ operation_id="generate",
64
+ )
65
+ async def generate(
66
+ prompt: str,
67
+ request: Request,
68
+ ) -> str:
69
+ """
70
+ Returns URL of the generated image for text prompt
71
+ """
72
+
73
+ app_settings.settings.lcm_diffusion_setting.prompt = prompt
74
+ images = context.generate_text_to_image(app_settings.settings)
75
+ image_names = context.save_images(
76
+ images,
77
+ app_settings.settings,
78
+ )
79
+ url = request.url_for("results", path=image_names[0])
80
+ image_url = f"The generated image available at the URL {url}"
81
+ return image_url
82
+
83
+
84
+ def start_mcp_server(port: int = 8000):
85
+ mcp = FastApiMCP(
86
+ app,
87
+ name="FastSDCPU MCP",
88
+ description="MCP server for FastSD CPU API",
89
+ base_url=f"http://localhost:{port}",
90
+ )
91
+
92
+ mcp.mount()
93
+ uvicorn.run(
94
+ app,
95
+ host="0.0.0.0",
96
+ port=port,
97
+ )
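Besides MCP clients, the /generate route above can also be exercised as a plain HTTP endpoint. A hedged client sketch (assuming the server was started with start_mcp_server() on port 8000 and that the requests package is installed):

```python
# Illustrative client sketch for the /generate endpoint defined above.
# Assumes the MCP server is running locally on port 8000.
import requests

response = requests.post(
    "http://localhost:8000/generate",
    params={"prompt": "a watercolor painting of a lighthouse"},
    timeout=300,
)
response.raise_for_status()
print(response.json())  # message containing the URL of the generated image
```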
backend/api/models/response.py ADDED
@@ -0,0 +1,16 @@
1
+ from typing import List
2
+
3
+ from pydantic import BaseModel
4
+
5
+
6
+ class StableDiffusionResponse(BaseModel):
7
+ """
8
+ Stable diffusion response model
9
+
10
+ Attributes:
11
+ images (List[str]): List of JPEG image as base64 encoded
12
+ latency (float): Latency in seconds
13
+ """
14
+
15
+ images: List[str]
16
+ latency: float
backend/api/web.py ADDED
@@ -0,0 +1,112 @@
1
+ import platform
2
+
3
+ import uvicorn
4
+ from fastapi import FastAPI
5
+ from fastapi.middleware.cors import CORSMiddleware
6
+
7
+ from backend.api.models.response import StableDiffusionResponse
8
+ from backend.base64_image import base64_image_to_pil, pil_image_to_base64_str
9
+ from backend.device import get_device_name
10
+ from backend.models.device import DeviceInfo
11
+ from backend.models.lcmdiffusion_setting import DiffusionTask, LCMDiffusionSetting
12
+ from constants import APP_VERSION, DEVICE
13
+ from context import Context
14
+ from models.interface_types import InterfaceType
15
+ from state import get_settings
16
+
17
+ app_settings = get_settings()
18
+ app = FastAPI(
19
+ title="FastSD CPU",
20
+ description="Fast stable diffusion on CPU",
21
+ version=APP_VERSION,
22
+ license_info={
23
+ "name": "MIT",
24
+ "identifier": "MIT",
25
+ },
26
+ docs_url="/api/docs",
27
+ redoc_url="/api/redoc",
28
+ openapi_url="/api/openapi.json",
29
+ )
30
+ print(app_settings.settings.lcm_diffusion_setting)
31
+ origins = ["*"]
32
+ app.add_middleware(
33
+ CORSMiddleware,
34
+ allow_origins=origins,
35
+ allow_credentials=True,
36
+ allow_methods=["*"],
37
+ allow_headers=["*"],
38
+ )
39
+ context = Context(InterfaceType.API_SERVER)
40
+
41
+
42
+ @app.get("/api/")
43
+ async def root():
44
+ return {"message": "Welcome to FastSD CPU API"}
45
+
46
+
47
+ @app.get(
48
+ "/api/info",
49
+ description="Get system information",
50
+ summary="Get system information",
51
+ )
52
+ async def info():
53
+ device_info = DeviceInfo(
54
+ device_type=DEVICE,
55
+ device_name=get_device_name(),
56
+ os=platform.system(),
57
+ platform=platform.platform(),
58
+ processor=platform.processor(),
59
+ )
60
+ return device_info.model_dump()
61
+
62
+
63
+ @app.get(
64
+ "/api/config",
65
+ description="Get current configuration",
66
+ summary="Get configurations",
67
+ )
68
+ async def config():
69
+ return app_settings.settings
70
+
71
+
72
+ @app.get(
73
+ "/api/models",
74
+ description="Get available models",
75
+ summary="Get available models",
76
+ )
77
+ async def models():
78
+ return {
79
+ "lcm_lora_models": app_settings.lcm_lora_models,
80
+ "stable_diffusion": app_settings.stable_diffsuion_models,
81
+ "openvino_models": app_settings.openvino_lcm_models,
82
+ "lcm_models": app_settings.lcm_models,
83
+ }
84
+
85
+
86
+ @app.post(
87
+ "/api/generate",
88
+ description="Generate image(Text to image,Image to Image)",
89
+ summary="Generate image(Text to image,Image to Image)",
90
+ )
91
+ async def generate(diffusion_config: LCMDiffusionSetting) -> StableDiffusionResponse:
92
+ app_settings.settings.lcm_diffusion_setting = diffusion_config
93
+ if diffusion_config.diffusion_task == DiffusionTask.image_to_image:
94
+ app_settings.settings.lcm_diffusion_setting.init_image = base64_image_to_pil(
95
+ diffusion_config.init_image
96
+ )
97
+
98
+ images = context.generate_text_to_image(app_settings.settings)
99
+
100
+ images_base64 = [pil_image_to_base64_str(img) for img in images]
101
+ return StableDiffusionResponse(
102
+ latency=round(context.latency, 2),
103
+ images=images_base64,
104
+ )
105
+
106
+
107
+ def start_web_server(port: int = 8000):
108
+ uvicorn.run(
109
+ app,
110
+ host="0.0.0.0",
111
+ port=port,
112
+ )
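A hedged example of calling POST /api/generate from Python (assumes the API server is running locally on port 8000 and the requests package is available). The request body mirrors LCMDiffusionSetting, so unspecified fields fall back to that model's defaults, and the base64 images in the response can be written straight to disk.

```python
# Illustrative client sketch for POST /api/generate. Assumes the server was
# started with start_web_server() on port 8000.
import base64
import requests

payload = {
    "prompt": "a cozy cabin in a snowy forest",
    "image_width": 512,
    "image_height": 512,
    "inference_steps": 1,
}
resp = requests.post("http://localhost:8000/api/generate", json=payload, timeout=600)
resp.raise_for_status()
result = resp.json()

print(f"Latency: {result['latency']} sec")
for i, img_b64 in enumerate(result["images"]):
    with open(f"result_{i}.jpg", "wb") as f:
        f.write(base64.b64decode(img_b64))
```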
backend/base64_image.py ADDED
@@ -0,0 +1,21 @@
1
+ from io import BytesIO
2
+ from base64 import b64encode, b64decode
3
+ from PIL import Image
4
+
5
+
6
+ def pil_image_to_base64_str(
7
+ image: Image,
8
+ format: str = "JPEG",
9
+ ) -> str:
10
+ buffer = BytesIO()
11
+ image.save(buffer, format=format)
12
+ buffer.seek(0)
13
+ img_base64 = b64encode(buffer.getvalue()).decode("utf-8")
14
+ return img_base64
15
+
16
+
17
+ def base64_image_to_pil(base64_str) -> Image:
18
+ image_data = b64decode(base64_str)
19
+ image_buffer = BytesIO(image_data)
20
+ image = Image.open(image_buffer)
21
+ return image
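A quick illustrative round trip through the two helpers above (assumes Pillow is installed):

```python
# Illustrative round trip: PIL image -> base64 string -> PIL image.
from PIL import Image

from backend.base64_image import base64_image_to_pil, pil_image_to_base64_str

original = Image.new("RGB", (64, 64), color="orange")
encoded = pil_image_to_base64_str(original, format="JPEG")
decoded = base64_image_to_pil(encoded)
print(decoded.size, decoded.mode)  # (64, 64) RGB
```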
backend/controlnet.py ADDED
@@ -0,0 +1,90 @@
1
+ import logging
2
+ from PIL import Image
3
+ from diffusers import ControlNetModel
4
+ from backend.models.lcmdiffusion_setting import (
5
+ DiffusionTask,
6
+ ControlNetSetting,
7
+ )
8
+
9
+
10
+ # Prepares ControlNet adapters for use with FastSD CPU
11
+ #
12
+ # This function loads the ControlNet adapters defined by the
13
+ # _lcm_diffusion_setting.controlnet_ object and returns a dictionary
14
+ # with the pipeline arguments required to use the loaded adapters
15
+ def load_controlnet_adapters(lcm_diffusion_setting) -> dict:
16
+ controlnet_args = {}
17
+ if (
18
+ lcm_diffusion_setting.controlnet is None
19
+ or not lcm_diffusion_setting.controlnet.enabled
20
+ ):
21
+ return controlnet_args
22
+
23
+ logging.info("Loading ControlNet adapter")
24
+ controlnet_adapter = ControlNetModel.from_single_file(
25
+ lcm_diffusion_setting.controlnet.adapter_path,
26
+ # local_files_only=True,
27
+ use_safetensors=True,
28
+ )
29
+ controlnet_args["controlnet"] = controlnet_adapter
30
+ return controlnet_args
31
+
32
+
33
+ # Updates the ControlNet pipeline arguments to use for image generation
34
+ #
35
+ # This function uses the contents of the _lcm_diffusion_setting.controlnet_
36
+ # object to generate a dictionary with the corresponding pipeline arguments
37
+ # to be used for image generation; in particular, it sets the ControlNet control
38
+ # image and conditioning scale
39
+ def update_controlnet_arguments(lcm_diffusion_setting) -> dict:
40
+ controlnet_args = {}
41
+ if (
42
+ lcm_diffusion_setting.controlnet is None
43
+ or not lcm_diffusion_setting.controlnet.enabled
44
+ ):
45
+ return controlnet_args
46
+
47
+ controlnet_args["controlnet_conditioning_scale"] = (
48
+ lcm_diffusion_setting.controlnet.conditioning_scale
49
+ )
50
+ if lcm_diffusion_setting.diffusion_task == DiffusionTask.text_to_image.value:
51
+ controlnet_args["image"] = lcm_diffusion_setting.controlnet._control_image
52
+ elif lcm_diffusion_setting.diffusion_task == DiffusionTask.image_to_image.value:
53
+ controlnet_args["control_image"] = (
54
+ lcm_diffusion_setting.controlnet._control_image
55
+ )
56
+ return controlnet_args
57
+
58
+
59
+ # Helper function to adjust ControlNet settings from a dictionary
60
+ def controlnet_settings_from_dict(
61
+ lcm_diffusion_setting,
62
+ dictionary,
63
+ ) -> None:
64
+ if lcm_diffusion_setting is None or dictionary is None:
65
+ logging.error("Invalid arguments!")
66
+ return
67
+ if (
68
+ "controlnet" not in dictionary
69
+ or dictionary["controlnet"] is None
70
+ or len(dictionary["controlnet"]) == 0
71
+ ):
72
+ logging.warning("ControlNet settings not found, ControlNet will be disabled")
73
+ lcm_diffusion_setting.controlnet = None
74
+ return
75
+
76
+ controlnet = ControlNetSetting()
77
+ controlnet.enabled = dictionary["controlnet"][0]["enabled"]
78
+ controlnet.conditioning_scale = dictionary["controlnet"][0]["conditioning_scale"]
79
+ controlnet.adapter_path = dictionary["controlnet"][0]["adapter_path"]
80
+ controlnet._control_image = None
81
+ image_path = dictionary["controlnet"][0]["control_image"]
82
+ if controlnet.enabled:
83
+ try:
84
+ controlnet._control_image = Image.open(image_path)
85
+ except (AttributeError, FileNotFoundError) as err:
86
+ print(err)
87
+ if controlnet._control_image is None:
88
+ logging.error("Wrong ControlNet control image! Disabling ControlNet")
89
+ controlnet.enabled = False
90
+ lcm_diffusion_setting.controlnet = controlnet
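controlnet_settings_from_dict expects the same structure that app.py reads from the --custom_settings JSON file. A hedged sketch of that structure is shown below; the adapter and control image paths are hypothetical placeholders, and the example assumes LCMDiffusionSetting can be constructed with its defaults.

```python
# Illustrative sketch of the dictionary layout consumed by
# controlnet_settings_from_dict. The adapter and image paths are hypothetical.
from backend.controlnet import controlnet_settings_from_dict
from backend.models.lcmdiffusion_setting import LCMDiffusionSetting

custom_settings = {
    "controlnet": [
        {
            "enabled": True,
            "conditioning_scale": 0.5,
            "adapter_path": "controlnet_models/control_canny.safetensors",
            "control_image": "canny_control.png",
        }
    ]
}

lcm_diffusion_setting = LCMDiffusionSetting()
controlnet_settings_from_dict(lcm_diffusion_setting, custom_settings)
```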
backend/device.py ADDED
@@ -0,0 +1,23 @@
1
+ import platform
2
+ from constants import DEVICE
3
+ import torch
4
+ import openvino as ov
5
+
6
+ core = ov.Core()
7
+
8
+
9
+ def is_openvino_device() -> bool:
10
+ if DEVICE.lower() == "cpu" or DEVICE.lower()[0] == "g" or DEVICE.lower()[0] == "n":
11
+ return True
12
+ else:
13
+ return False
14
+
15
+
16
+ def get_device_name() -> str:
17
+ if DEVICE == "cuda" or DEVICE == "mps":
18
+ default_gpu_index = torch.cuda.current_device()
19
+ return torch.cuda.get_device_name(default_gpu_index)
20
+ elif platform.system().lower() == "darwin":
21
+ return platform.processor()
22
+ elif is_openvino_device():
23
+ return core.get_property(DEVICE.upper(), "FULL_DEVICE_NAME")
backend/gguf/gguf_diffusion.py ADDED
@@ -0,0 +1,319 @@
1
+ """
2
+ Wrapper class to call the stablediffusion.cpp shared library for GGUF support
3
+ """
4
+
5
+ import ctypes
6
+ import platform
7
+ from ctypes import (
8
+ POINTER,
9
+ c_bool,
10
+ c_char_p,
11
+ c_float,
12
+ c_int,
13
+ c_int64,
14
+ c_void_p,
15
+ )
16
+ from dataclasses import dataclass
17
+ from os import path
18
+ from typing import List, Any
19
+
20
+ import numpy as np
21
+ from PIL import Image
22
+
23
+ from backend.gguf.sdcpp_types import (
24
+ RngType,
25
+ SampleMethod,
26
+ Schedule,
27
+ SDCPPLogLevel,
28
+ SDImage,
29
+ SdType,
30
+ )
31
+
32
+
33
+ @dataclass
34
+ class ModelConfig:
35
+ model_path: str = ""
36
+ clip_l_path: str = ""
37
+ t5xxl_path: str = ""
38
+ diffusion_model_path: str = ""
39
+ vae_path: str = ""
40
+ taesd_path: str = ""
41
+ control_net_path: str = ""
42
+ lora_model_dir: str = ""
43
+ embed_dir: str = ""
44
+ stacked_id_embed_dir: str = ""
45
+ vae_decode_only: bool = True
46
+ vae_tiling: bool = False
47
+ free_params_immediately: bool = False
48
+ n_threads: int = 4
49
+ wtype: SdType = SdType.SD_TYPE_Q4_0
50
+ rng_type: RngType = RngType.CUDA_RNG
51
+ schedule: Schedule = Schedule.DEFAULT
52
+ keep_clip_on_cpu: bool = False
53
+ keep_control_net_cpu: bool = False
54
+ keep_vae_on_cpu: bool = False
55
+
56
+
57
+ @dataclass
58
+ class Txt2ImgConfig:
59
+ prompt: str = "a man wearing sun glasses, highly detailed"
60
+ negative_prompt: str = ""
61
+ clip_skip: int = -1
62
+ cfg_scale: float = 2.0
63
+ guidance: float = 3.5
64
+ width: int = 512
65
+ height: int = 512
66
+ sample_method: SampleMethod = SampleMethod.EULER_A
67
+ sample_steps: int = 1
68
+ seed: int = -1
69
+ batch_count: int = 2
70
+ control_cond: Image = None
71
+ control_strength: float = 0.90
72
+ style_strength: float = 0.5
73
+ normalize_input: bool = False
74
+ input_id_images_path: bytes = b""
75
+
76
+
77
+ class GGUFDiffusion:
78
+ """GGUF Diffusion
79
+ To support GGUF diffusion model based on stablediffusion.cpp
80
+ https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
81
+ Implmented based on stablediffusion.h
82
+ """
83
+
84
+ def __init__(
85
+ self,
86
+ libpath: str,
87
+ config: ModelConfig,
88
+ logging_enabled: bool = False,
89
+ ):
90
+ sdcpp_shared_lib_path = self._get_sdcpp_shared_lib_path(libpath)
91
+ try:
92
+ self.libsdcpp = ctypes.CDLL(sdcpp_shared_lib_path)
93
+ except OSError as e:
94
+ print(f"Failed to load library {sdcpp_shared_lib_path}")
95
+ raise ValueError(f"Error: {e}")
96
+
97
+ if not config.clip_l_path or not path.exists(config.clip_l_path):
98
+ raise ValueError(
99
+ "CLIP model file not found,please check readme.md for GGUF model usage"
100
+ )
101
+
102
+ if not config.t5xxl_path or not path.exists(config.t5xxl_path):
103
+ raise ValueError(
104
+ "T5XXL model file not found,please check readme.md for GGUF model usage"
105
+ )
106
+
107
+ if not config.diffusion_model_path or not path.exists(
108
+ config.diffusion_model_path
109
+ ):
110
+ raise ValueError(
111
+ "Diffusion model file not found,please check readme.md for GGUF model usage"
112
+ )
113
+
114
+ if not config.vae_path or not path.exists(config.vae_path):
115
+ raise ValueError(
116
+ "VAE model file not found,please check readme.md for GGUF model usage"
117
+ )
118
+
119
+ self.model_config = config
120
+
121
+ self.libsdcpp.new_sd_ctx.argtypes = [
122
+ c_char_p, # const char* model_path
123
+ c_char_p, # const char* clip_l_path
124
+ c_char_p, # const char* t5xxl_path
125
+ c_char_p, # const char* diffusion_model_path
126
+ c_char_p, # const char* vae_path
127
+ c_char_p, # const char* taesd_path
128
+ c_char_p, # const char* control_net_path_c_str
129
+ c_char_p, # const char* lora_model_dir
130
+ c_char_p, # const char* embed_dir_c_str
131
+ c_char_p, # const char* stacked_id_embed_dir_c_str
132
+ c_bool, # bool vae_decode_only
133
+ c_bool, # bool vae_tiling
134
+ c_bool, # bool free_params_immediately
135
+ c_int, # int n_threads
136
+ SdType, # enum sd_type_t wtype
137
+ RngType, # enum rng_type_t rng_type
138
+ Schedule, # enum schedule_t s
139
+ c_bool, # bool keep_clip_on_cpu
140
+ c_bool, # bool keep_control_net_cpu
141
+ c_bool, # bool keep_vae_on_cpu
142
+ ]
143
+
144
+ self.libsdcpp.new_sd_ctx.restype = POINTER(c_void_p)
145
+
146
+ self.sd_ctx = self.libsdcpp.new_sd_ctx(
147
+ self._str_to_bytes(self.model_config.model_path),
148
+ self._str_to_bytes(self.model_config.clip_l_path),
149
+ self._str_to_bytes(self.model_config.t5xxl_path),
150
+ self._str_to_bytes(self.model_config.diffusion_model_path),
151
+ self._str_to_bytes(self.model_config.vae_path),
152
+ self._str_to_bytes(self.model_config.taesd_path),
153
+ self._str_to_bytes(self.model_config.control_net_path),
154
+ self._str_to_bytes(self.model_config.lora_model_dir),
155
+ self._str_to_bytes(self.model_config.embed_dir),
156
+ self._str_to_bytes(self.model_config.stacked_id_embed_dir),
157
+ self.model_config.vae_decode_only,
158
+ self.model_config.vae_tiling,
159
+ self.model_config.free_params_immediately,
160
+ self.model_config.n_threads,
161
+ self.model_config.wtype,
162
+ self.model_config.rng_type,
163
+ self.model_config.schedule,
164
+ self.model_config.keep_clip_on_cpu,
165
+ self.model_config.keep_control_net_cpu,
166
+ self.model_config.keep_vae_on_cpu,
167
+ )
168
+
169
+ if logging_enabled:
170
+ self._set_logcallback()
171
+
172
+ def _set_logcallback(self):
173
+ print("Setting logging callback")
174
+ # Define function callback
175
+ SdLogCallbackType = ctypes.CFUNCTYPE(
176
+ None,
177
+ SDCPPLogLevel,
178
+ ctypes.c_char_p,
179
+ ctypes.c_void_p,
180
+ )
181
+
182
+ self.libsdcpp.sd_set_log_callback.argtypes = [
183
+ SdLogCallbackType,
184
+ ctypes.c_void_p,
185
+ ]
186
+ self.libsdcpp.sd_set_log_callback.restype = None
187
+ # Convert the Python callback to a C func pointer
188
+ self.c_log_callback = SdLogCallbackType(
189
+ self.log_callback
190
+ ) # prevent GC,keep callback as member variable
191
+ self.libsdcpp.sd_set_log_callback(self.c_log_callback, None)
192
+
193
+ def _get_sdcpp_shared_lib_path(
194
+ self,
195
+ root_path: str,
196
+ ) -> str:
197
+ system_name = platform.system()
198
+ print(f"GGUF Diffusion on {system_name}")
199
+ lib_name = "stable-diffusion.dll"
200
+ sdcpp_lib_path = ""
201
+
202
+ if system_name == "Windows":
203
+ sdcpp_lib_path = path.join(root_path, lib_name)
204
+ elif system_name == "Linux":
205
+ lib_name = "libstable-diffusion.so"
206
+ sdcpp_lib_path = path.join(root_path, lib_name)
207
+ elif system_name == "Darwin":
208
+ lib_name = "libstable-diffusion.dylib"
209
+ sdcpp_lib_path = path.join(root_path, lib_name)
210
+ else:
211
+ print("Unknown platform.")
212
+
213
+ return sdcpp_lib_path
214
+
215
+ @staticmethod
216
+ def log_callback(
217
+ level,
218
+ text,
219
+ data,
220
+ ):
221
+ print(f"{text.decode('utf-8')}", end="")
222
+
223
+ def _str_to_bytes(self, in_str: str, encoding: str = "utf-8") -> bytes:
224
+ if in_str:
225
+ return in_str.encode(encoding)
226
+ else:
227
+ return b""
228
+
229
+ def generate_text2mg(self, txt2img_cfg: Txt2ImgConfig) -> List[Any]:
230
+ self.libsdcpp.txt2img.restype = POINTER(SDImage)
231
+ self.libsdcpp.txt2img.argtypes = [
232
+ c_void_p, # sd_ctx_t* sd_ctx (pointer to context object)
233
+ c_char_p, # const char* prompt
234
+ c_char_p, # const char* negative_prompt
235
+ c_int, # int clip_skip
236
+ c_float, # float cfg_scale
237
+ c_float, # float guidance
238
+ c_int, # int width
239
+ c_int, # int height
240
+ SampleMethod, # enum sample_method_t sample_method
241
+ c_int, # int sample_steps
242
+ c_int64, # int64_t seed
243
+ c_int, # int batch_count
244
+ POINTER(SDImage), # const sd_image_t* control_cond (pointer to SDImage)
245
+ c_float, # float control_strength
246
+ c_float, # float style_strength
247
+ c_bool, # bool normalize_input
248
+ c_char_p, # const char* input_id_images_path
249
+ ]
250
+
251
+ image_buffer = self.libsdcpp.txt2img(
252
+ self.sd_ctx,
253
+ self._str_to_bytes(txt2img_cfg.prompt),
254
+ self._str_to_bytes(txt2img_cfg.negative_prompt),
255
+ txt2img_cfg.clip_skip,
256
+ txt2img_cfg.cfg_scale,
257
+ txt2img_cfg.guidance,
258
+ txt2img_cfg.width,
259
+ txt2img_cfg.height,
260
+ txt2img_cfg.sample_method,
261
+ txt2img_cfg.sample_steps,
262
+ txt2img_cfg.seed,
263
+ txt2img_cfg.batch_count,
264
+ txt2img_cfg.control_cond,
265
+ txt2img_cfg.control_strength,
266
+ txt2img_cfg.style_strength,
267
+ txt2img_cfg.normalize_input,
268
+ txt2img_cfg.input_id_images_path,
269
+ )
270
+
271
+ images = self._get_sd_images_from_buffer(
272
+ image_buffer,
273
+ txt2img_cfg.batch_count,
274
+ )
275
+
276
+ return images
277
+
278
+ def _get_sd_images_from_buffer(
279
+ self,
280
+ image_buffer: Any,
281
+ batch_count: int,
282
+ ) -> List[Any]:
283
+ images = []
284
+ if image_buffer:
285
+ for i in range(batch_count):
286
+ image = image_buffer[i]
287
+ print(
288
+ f"Generated image: {image.width}x{image.height} with {image.channel} channels"
289
+ )
290
+
291
+ width = image.width
292
+ height = image.height
293
+ channels = image.channel
294
+ pixel_data = np.ctypeslib.as_array(
295
+ image.data, shape=(height, width, channels)
296
+ )
297
+
298
+ if channels == 1:
299
+ pil_image = Image.fromarray(pixel_data.squeeze(), mode="L")
300
+ elif channels == 3:
301
+ pil_image = Image.fromarray(pixel_data, mode="RGB")
302
+ elif channels == 4:
303
+ pil_image = Image.fromarray(pixel_data, mode="RGBA")
304
+ else:
305
+ raise ValueError(f"Unsupported number of channels: {channels}")
306
+
307
+ images.append(pil_image)
308
+ return images
309
+
310
+ def terminate(self):
311
+ if self.libsdcpp:
312
+ if self.sd_ctx:
313
+ self.libsdcpp.free_sd_ctx.argtypes = [c_void_p]
314
+ self.libsdcpp.free_sd_ctx.restype = None
315
+ self.libsdcpp.free_sd_ctx(self.sd_ctx)
316
+ del self.sd_ctx
317
+ self.sd_ctx = None
318
+ del self.libsdcpp
319
+ self.libsdcpp = None
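A hedged usage sketch of the wrapper above. All model file paths are hypothetical placeholders (see the project readme for actual GGUF model setup), and the libpath argument must point at the directory containing the stablediffusion.cpp shared library.

```python
# Illustrative sketch of driving GGUFDiffusion directly. Every path below is a
# hypothetical placeholder.
from backend.gguf.gguf_diffusion import GGUFDiffusion, ModelConfig, Txt2ImgConfig

config = ModelConfig(
    diffusion_model_path="models/gguf/diffusion/model-q4_0.gguf",
    clip_l_path="models/gguf/clip/clip_l.safetensors",
    t5xxl_path="models/gguf/t5xxl/t5xxl-q4_0.gguf",
    vae_path="models/gguf/vae/ae.safetensors",
    n_threads=4,
)

diffusion = GGUFDiffusion("lib", config, logging_enabled=True)
images = diffusion.generate_text2mg(
    Txt2ImgConfig(prompt="a robot reading a book", width=512, height=512, batch_count=1)
)
images[0].save("gguf_output.png")
diffusion.terminate()
```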
backend/gguf/sdcpp_types.py ADDED
@@ -0,0 +1,104 @@
1
+ """
2
+ Ctypes for stablediffusion.cpp shared library
3
+ This is as per the stablediffusion.h file
4
+ """
5
+
6
+ from enum import IntEnum
7
+ from ctypes import (
8
+ c_int,
9
+ c_uint32,
10
+ c_uint8,
11
+ POINTER,
12
+ Structure,
13
+ )
14
+
15
+
16
+ class CtypesEnum(IntEnum):
17
+ """A ctypes-compatible IntEnum superclass."""
18
+
19
+ @classmethod
20
+ def from_param(cls, obj):
21
+ return int(obj)
22
+
23
+
24
+ class RngType(CtypesEnum):
25
+ STD_DEFAULT_RNG = 0
26
+ CUDA_RNG = 1
27
+
28
+
29
+ class SampleMethod(CtypesEnum):
30
+ EULER_A = 0
31
+ EULER = 1
32
+ HEUN = 2
33
+ DPM2 = 3
34
+ DPMPP2S_A = 4
35
+ DPMPP2M = 5
36
+ DPMPP2Mv2 = 6
37
+ IPNDM = 7
38
+ IPNDM_V = 7
39
+ LCM = 8
40
+ N_SAMPLE_METHODS = 9
41
+
42
+
43
+ class Schedule(CtypesEnum):
44
+ DEFAULT = 0
45
+ DISCRETE = 1
46
+ KARRAS = 2
47
+ EXPONENTIAL = 3
48
+ AYS = 4
49
+ GITS = 5
50
+ N_SCHEDULES = 5
51
+
52
+
53
+ class SdType(CtypesEnum):
54
+ SD_TYPE_F32 = 0
55
+ SD_TYPE_F16 = 1
56
+ SD_TYPE_Q4_0 = 2
57
+ SD_TYPE_Q4_1 = 3
58
+ # SD_TYPE_Q4_2 = 4, support has been removed
59
+ # SD_TYPE_Q4_3 = 5, support has been removed
60
+ SD_TYPE_Q5_0 = 6
61
+ SD_TYPE_Q5_1 = 7
62
+ SD_TYPE_Q8_0 = 8
63
+ SD_TYPE_Q8_1 = 9
64
+ SD_TYPE_Q2_K = 10
65
+ SD_TYPE_Q3_K = 11
66
+ SD_TYPE_Q4_K = 12
67
+ SD_TYPE_Q5_K = 13
68
+ SD_TYPE_Q6_K = 14
69
+ SD_TYPE_Q8_K = 15
70
+ SD_TYPE_IQ2_XXS = 16
71
+ SD_TYPE_IQ2_XS = 17
72
+ SD_TYPE_IQ3_XXS = 18
73
+ SD_TYPE_IQ1_S = 19
74
+ SD_TYPE_IQ4_NL = 20
75
+ SD_TYPE_IQ3_S = 21
76
+ SD_TYPE_IQ2_S = 22
77
+ SD_TYPE_IQ4_XS = 23
78
+ SD_TYPE_I8 = 24
79
+ SD_TYPE_I16 = 25
80
+ SD_TYPE_I32 = 26
81
+ SD_TYPE_I64 = 27
82
+ SD_TYPE_F64 = 28
83
+ SD_TYPE_IQ1_M = 29
84
+ SD_TYPE_BF16 = 30
85
+ SD_TYPE_Q4_0_4_4 = 31
86
+ SD_TYPE_Q4_0_4_8 = 32
87
+ SD_TYPE_Q4_0_8_8 = 33
88
+ SD_TYPE_COUNT = 34
89
+
90
+
91
+ class SDImage(Structure):
92
+ _fields_ = [
93
+ ("width", c_uint32),
94
+ ("height", c_uint32),
95
+ ("channel", c_uint32),
96
+ ("data", POINTER(c_uint8)),
97
+ ]
98
+
99
+
100
+ class SDCPPLogLevel(c_int):
101
+ SD_LOG_LEVEL_DEBUG = 0
102
+ SD_LOG_LEVEL_INFO = 1
103
+ SD_LOG_LEVEL_WARNING = 2
104
+ SD_LOG_LEVEL_ERROR = 3
backend/image_saver.py ADDED
@@ -0,0 +1,75 @@
1
+ import json
2
+ from os import path, mkdir
3
+ from typing import Any
4
+ from uuid import uuid4
5
+ from backend.models.lcmdiffusion_setting import LCMDiffusionSetting
6
+ from utils import get_image_file_extension
7
+
8
+
9
+ def get_exclude_keys():
10
+ exclude_keys = {
11
+ "init_image": True,
12
+ "generated_images": True,
13
+ "lora": {
14
+ "models_dir": True,
15
+ "path": True,
16
+ },
17
+ "dirs": True,
18
+ "controlnet": {
19
+ "adapter_path": True,
20
+ },
21
+ }
22
+ return exclude_keys
23
+
24
+
25
+ class ImageSaver:
26
+ @staticmethod
27
+ def save_images(
28
+ output_path: str,
29
+ images: Any,
30
+ folder_name: str = "",
31
+ format: str = "PNG",
32
+ jpeg_quality: int = 90,
33
+ lcm_diffusion_setting: LCMDiffusionSetting = None,
34
+ ) -> list[str]:
35
+ gen_id = uuid4()
36
+ image_ids = []
37
+
38
+ if images:
39
+ image_seeds = []
40
+
41
+ for index, image in enumerate(images):
42
+
43
+ image_seed = image.info.get('image_seed')
44
+ if image_seed is not None:
45
+ image_seeds.append(image_seed)
46
+
47
+ if not path.exists(output_path):
48
+ mkdir(output_path)
49
+
50
+ if folder_name:
51
+ out_path = path.join(
52
+ output_path,
53
+ folder_name,
54
+ )
55
+ else:
56
+ out_path = output_path
57
+
58
+ if not path.exists(out_path):
59
+ mkdir(out_path)
60
+ image_extension = get_image_file_extension(format)
61
+ image_file_name = f"{gen_id}-{index+1}{image_extension}"
62
+ image_ids.append(image_file_name)
63
+ image.save(path.join(out_path, image_file_name), quality = jpeg_quality)
64
+ if lcm_diffusion_setting:
65
+ data = lcm_diffusion_setting.model_dump(exclude=get_exclude_keys())
66
+ if image_seeds:
67
+ data['image_seeds'] = image_seeds
68
+ with open(path.join(out_path, f"{gen_id}.json"), "w") as json_file:
69
+ json.dump(
70
+ data,
71
+ json_file,
72
+ indent=4,
73
+ )
74
+ return image_ids
75
+
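A hypothetical usage sketch of `ImageSaver.save_images`, assuming the repository modules are importable; the output directory, folder name, and prompt are placeholders. Alongside the image files it writes a `<uuid>.json` sidecar with the serialized settings:

```python
from PIL import Image

from backend.image_saver import ImageSaver
from backend.models.lcmdiffusion_setting import LCMDiffusionSetting

settings = LCMDiffusionSetting(prompt="a cup of coffee")   # placeholder settings
frames = [Image.new("RGB", (512, 512)) for _ in range(2)]  # stand-ins for generated images
saved_files = ImageSaver.save_images(
    "results",              # output_path, created if it does not exist
    images=frames,
    folder_name="demo",
    format="PNG",
    lcm_diffusion_setting=settings,  # also triggers the JSON sidecar
)
print(saved_files)  # e.g. ["<uuid>-1.png", "<uuid>-2.png"]
```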
backend/lcm_text_to_image.py ADDED
@@ -0,0 +1,577 @@
1
+ import gc
2
+ from math import ceil
3
+ from typing import Any, List
4
+ import random
5
+
6
+ import numpy as np
7
+ import torch
8
+ from backend.device import is_openvino_device
9
+ from backend.controlnet import (
10
+ load_controlnet_adapters,
11
+ update_controlnet_arguments,
12
+ )
13
+ from backend.models.lcmdiffusion_setting import (
14
+ DiffusionTask,
15
+ LCMDiffusionSetting,
16
+ LCMLora,
17
+ )
18
+ from backend.openvino.pipelines import (
19
+ get_ov_image_to_image_pipeline,
20
+ get_ov_text_to_image_pipeline,
21
+ ov_load_taesd,
22
+ )
23
+ from backend.pipelines.lcm import (
24
+ get_image_to_image_pipeline,
25
+ get_lcm_model_pipeline,
26
+ load_taesd,
27
+ )
28
+ from backend.pipelines.lcm_lora import get_lcm_lora_pipeline
29
+ from constants import DEVICE, GGUF_THREADS
30
+ from diffusers import LCMScheduler
31
+ from image_ops import resize_pil_image
32
+ from backend.openvino.flux_pipeline import get_flux_pipeline
33
+ from backend.openvino.ov_hc_stablediffusion_pipeline import OvHcLatentConsistency
34
+ from backend.gguf.gguf_diffusion import (
35
+ GGUFDiffusion,
36
+ ModelConfig,
37
+ Txt2ImgConfig,
38
+ SampleMethod,
39
+ )
40
+ from paths import get_app_path
41
+ from pprint import pprint
42
+
43
+ try:
44
+ # support for token merging; keeping it optional for now
45
+ import tomesd
46
+ except ImportError:
47
+ print("tomesd library unavailable; disabling token merging support")
48
+ tomesd = None
49
+
50
+
51
+ class LCMTextToImage:
52
+ def __init__(
53
+ self,
54
+ device: str = "cpu",
55
+ ) -> None:
56
+ self.pipeline = None
57
+ self.use_openvino = False
58
+ self.device = ""
59
+ self.previous_model_id = None
60
+ self.previous_use_tae_sd = False
61
+ self.previous_use_lcm_lora = False
62
+ self.previous_ov_model_id = ""
63
+ self.previous_token_merging = 0.0
64
+ self.previous_safety_checker = False
65
+ self.previous_use_openvino = False
66
+ self.img_to_img_pipeline = None
67
+ self.is_openvino_init = False
68
+ self.previous_lora = None
69
+ self.task_type = DiffusionTask.text_to_image
70
+ self.previous_use_gguf_model = False
71
+ self.previous_gguf_model = None
72
+ self.torch_data_type = (
73
+ torch.float32 if is_openvino_device() or DEVICE == "mps" else torch.float16
74
+ )
75
+ self.ov_model_id = None
76
+ print(f"Torch datatype : {self.torch_data_type}")
77
+
78
+ def _pipeline_to_device(self):
79
+ print(f"Pipeline device : {DEVICE}")
80
+ print(f"Pipeline dtype : {self.torch_data_type}")
81
+ self.pipeline.to(
82
+ torch_device=DEVICE,
83
+ torch_dtype=self.torch_data_type,
84
+ )
85
+
86
+ def _add_freeu(self):
87
+ pipeline_class = self.pipeline.__class__.__name__
88
+ if isinstance(self.pipeline.scheduler, LCMScheduler):
89
+ if pipeline_class == "StableDiffusionPipeline":
90
+ print("Add FreeU - SD")
91
+ self.pipeline.enable_freeu(
92
+ s1=0.9,
93
+ s2=0.2,
94
+ b1=1.2,
95
+ b2=1.4,
96
+ )
97
+ elif pipeline_class == "StableDiffusionXLPipeline":
98
+ print("Add FreeU - SDXL")
99
+ self.pipeline.enable_freeu(
100
+ s1=0.6,
101
+ s2=0.4,
102
+ b1=1.1,
103
+ b2=1.2,
104
+ )
105
+
106
+ def _enable_vae_tiling(self):
107
+ self.pipeline.vae.enable_tiling()
108
+
109
+ def _update_lcm_scheduler_params(self):
110
+ if isinstance(self.pipeline.scheduler, LCMScheduler):
111
+ self.pipeline.scheduler = LCMScheduler.from_config(
112
+ self.pipeline.scheduler.config,
113
+ beta_start=0.001,
114
+ beta_end=0.01,
115
+ )
116
+
117
+ def _is_hetero_pipeline(self) -> bool:
118
+ return "square" in self.ov_model_id.lower()
119
+
120
+ def _load_ov_hetero_pipeline(self):
121
+ print("Loading Heterogeneous Compute pipeline")
122
+ if DEVICE.upper() == "NPU":
123
+ device = ["NPU", "NPU", "NPU"]
124
+ self.pipeline = OvHcLatentConsistency(self.ov_model_id, device)
125
+ else:
126
+ self.pipeline = OvHcLatentConsistency(self.ov_model_id)
127
+
128
+ def _generate_images_hetero_compute(
129
+ self,
130
+ lcm_diffusion_setting: LCMDiffusionSetting,
131
+ ):
132
+ print("Using OpenVINO (heterogeneous compute)")
133
+ if lcm_diffusion_setting.diffusion_task == DiffusionTask.text_to_image.value:
134
+ return [
135
+ self.pipeline.generate(
136
+ prompt=lcm_diffusion_setting.prompt,
137
+ neg_prompt=lcm_diffusion_setting.negative_prompt,
138
+ init_image=None,
139
+ strength=1.0,
140
+ num_inference_steps=lcm_diffusion_setting.inference_steps,
141
+ )
142
+ ]
143
+ else:
144
+ return [
145
+ self.pipeline.generate(
146
+ prompt=lcm_diffusion_setting.prompt,
147
+ neg_prompt=lcm_diffusion_setting.negative_prompt,
148
+ init_image=lcm_diffusion_setting.init_image,
149
+ strength=lcm_diffusion_setting.strength,
150
+ num_inference_steps=lcm_diffusion_setting.inference_steps,
151
+ )
152
+ ]
153
+
154
+ def _is_valid_mode(
155
+ self,
156
+ modes: List,
157
+ ) -> bool:
158
+ return modes.count(True) == 1 or modes.count(False) == 3
159
+
160
+ def _validate_mode(
161
+ self,
162
+ modes: List,
163
+ ) -> None:
164
+ if not self._is_valid_mode(modes):
165
+ raise ValueError("Invalid mode, delete configs/settings.yaml and retry!")
166
+
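`_is_valid_mode` encodes the rule that the GGUF, OpenVINO, and LCM-LoRA flags are mutually exclusive: either exactly one of them is enabled, or none are (the plain PyTorch LCM model path). A standalone restatement of the predicate, for illustration only:

```python
def is_valid_mode(modes: list) -> bool:
    # modes = [use_gguf_model, use_openvino, use_lcm_lora]
    return modes.count(True) == 1 or modes.count(False) == 3


assert is_valid_mode([True, False, False])       # one backend selected
assert is_valid_mode([False, False, False])      # default PyTorch LCM pipeline
assert not is_valid_mode([True, True, False])    # conflicting settings are rejected
```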
167
+ def init(
168
+ self,
169
+ device: str = "cpu",
170
+ lcm_diffusion_setting: LCMDiffusionSetting = LCMDiffusionSetting(),
171
+ ) -> None:
172
+ # Mode validation: use at most one of GGUF, OpenVINO, or LCM-LoRA
173
+
174
+ modes = [
175
+ lcm_diffusion_setting.use_gguf_model,
176
+ lcm_diffusion_setting.use_openvino,
177
+ lcm_diffusion_setting.use_lcm_lora,
178
+ ]
179
+ self._validate_mode(modes)
180
+ self.device = device
181
+ self.use_openvino = lcm_diffusion_setting.use_openvino
182
+ model_id = lcm_diffusion_setting.lcm_model_id
183
+ use_local_model = lcm_diffusion_setting.use_offline_model
184
+ use_tiny_auto_encoder = lcm_diffusion_setting.use_tiny_auto_encoder
185
+ use_lora = lcm_diffusion_setting.use_lcm_lora
186
+ lcm_lora: LCMLora = lcm_diffusion_setting.lcm_lora
187
+ token_merging = lcm_diffusion_setting.token_merging
188
+ self.ov_model_id = lcm_diffusion_setting.openvino_lcm_model_id
189
+
190
+ if lcm_diffusion_setting.diffusion_task == DiffusionTask.image_to_image.value:
191
+ lcm_diffusion_setting.init_image = resize_pil_image(
192
+ lcm_diffusion_setting.init_image,
193
+ lcm_diffusion_setting.image_width,
194
+ lcm_diffusion_setting.image_height,
195
+ )
196
+
197
+ if (
198
+ self.pipeline is None
199
+ or self.previous_model_id != model_id
200
+ or self.previous_use_tae_sd != use_tiny_auto_encoder
201
+ or self.previous_lcm_lora_base_id != lcm_lora.base_model_id
202
+ or self.previous_lcm_lora_id != lcm_lora.lcm_lora_id
203
+ or self.previous_use_lcm_lora != use_lora
204
+ or self.previous_ov_model_id != self.ov_model_id
205
+ or self.previous_token_merging != token_merging
206
+ or self.previous_safety_checker != lcm_diffusion_setting.use_safety_checker
207
+ or self.previous_use_openvino != lcm_diffusion_setting.use_openvino
208
+ or self.previous_use_gguf_model != lcm_diffusion_setting.use_gguf_model
209
+ or self.previous_gguf_model != lcm_diffusion_setting.gguf_model
210
+ or (
211
+ self.use_openvino
212
+ and (
213
+ self.previous_task_type != lcm_diffusion_setting.diffusion_task
214
+ or self.previous_lora != lcm_diffusion_setting.lora
215
+ )
216
+ )
217
+ or lcm_diffusion_setting.rebuild_pipeline
218
+ ):
219
+ if self.use_openvino and is_openvino_device():
220
+ if self.pipeline:
221
+ del self.pipeline
222
+ self.pipeline = None
223
+ gc.collect()
224
+ self.is_openvino_init = True
225
+ if (
226
+ lcm_diffusion_setting.diffusion_task
227
+ == DiffusionTask.text_to_image.value
228
+ ):
229
+ print(
230
+ f"***** Init Text to image (OpenVINO) - {self.ov_model_id} *****"
231
+ )
232
+ if "flux" in self.ov_model_id.lower():
233
+ print("Loading OpenVINO Flux pipeline")
234
+ self.pipeline = get_flux_pipeline(
235
+ self.ov_model_id,
236
+ lcm_diffusion_setting.use_tiny_auto_encoder,
237
+ )
238
+ elif self._is_hetero_pipeline():
239
+ self._load_ov_hetero_pipeline()
240
+ else:
241
+ self.pipeline = get_ov_text_to_image_pipeline(
242
+ self.ov_model_id,
243
+ use_local_model,
244
+ )
245
+ elif (
246
+ lcm_diffusion_setting.diffusion_task
247
+ == DiffusionTask.image_to_image.value
248
+ ):
249
+ if not self.pipeline and self._is_hetero_pipeline():
250
+ self._load_ov_hetero_pipeline()
251
+ else:
252
+ print(
253
+ f"***** Image to image (OpenVINO) - {self.ov_model_id} *****"
254
+ )
255
+ self.pipeline = get_ov_image_to_image_pipeline(
256
+ self.ov_model_id,
257
+ use_local_model,
258
+ )
259
+ elif lcm_diffusion_setting.use_gguf_model:
260
+ model = lcm_diffusion_setting.gguf_model.diffusion_path
261
+ print(f"***** Init Text to image (GGUF) - {model} *****")
262
+ # if self.pipeline:
263
+ # self.pipeline.terminate()
264
+ # del self.pipeline
265
+ # self.pipeline = None
266
+ self._init_gguf_diffusion(lcm_diffusion_setting)
267
+ else:
268
+ if self.pipeline or self.img_to_img_pipeline:
269
+ self.pipeline = None
270
+ self.img_to_img_pipeline = None
271
+ gc.collect()
272
+
273
+ controlnet_args = load_controlnet_adapters(lcm_diffusion_setting)
274
+ if use_lora:
275
+ print(
276
+ f"***** Init LCM-LoRA pipeline - {lcm_lora.base_model_id} *****"
277
+ )
278
+ self.pipeline = get_lcm_lora_pipeline(
279
+ lcm_lora.base_model_id,
280
+ lcm_lora.lcm_lora_id,
281
+ use_local_model,
282
+ torch_data_type=self.torch_data_type,
283
+ pipeline_args=controlnet_args,
284
+ )
285
+
286
+ else:
287
+ print(f"***** Init LCM Model pipeline - {model_id} *****")
288
+ self.pipeline = get_lcm_model_pipeline(
289
+ model_id,
290
+ use_local_model,
291
+ controlnet_args,
292
+ )
293
+
294
+ self.img_to_img_pipeline = get_image_to_image_pipeline(self.pipeline)
295
+
296
+ if tomesd and token_merging > 0.001:
297
+ print(f"***** Token Merging: {token_merging} *****")
298
+ tomesd.apply_patch(self.pipeline, ratio=token_merging)
299
+ tomesd.apply_patch(self.img_to_img_pipeline, ratio=token_merging)
300
+
301
+ if use_tiny_auto_encoder:
302
+ if self.use_openvino and is_openvino_device():
303
+ if self.pipeline.__class__.__name__ != "OVFluxPipeline":
304
+ print("Using Tiny Auto Encoder (OpenVINO)")
305
+ ov_load_taesd(
306
+ self.pipeline,
307
+ use_local_model,
308
+ )
309
+ else:
310
+ print("Using Tiny Auto Encoder")
311
+ load_taesd(
312
+ self.pipeline,
313
+ use_local_model,
314
+ self.torch_data_type,
315
+ )
316
+ load_taesd(
317
+ self.img_to_img_pipeline,
318
+ use_local_model,
319
+ self.torch_data_type,
320
+ )
321
+
322
+ if not self.use_openvino and not is_openvino_device():
323
+ self._pipeline_to_device()
324
+
325
+ if not self._is_hetero_pipeline():
326
+ if (
327
+ lcm_diffusion_setting.diffusion_task
328
+ == DiffusionTask.image_to_image.value
329
+ and lcm_diffusion_setting.use_openvino
330
+ ):
331
+ self.pipeline.scheduler = LCMScheduler.from_config(
332
+ self.pipeline.scheduler.config,
333
+ )
334
+ else:
335
+ if not lcm_diffusion_setting.use_gguf_model:
336
+ self._update_lcm_scheduler_params()
337
+
338
+ if use_lora:
339
+ self._add_freeu()
340
+
341
+ self.previous_model_id = model_id
342
+ self.previous_ov_model_id = self.ov_model_id
343
+ self.previous_use_tae_sd = use_tiny_auto_encoder
344
+ self.previous_lcm_lora_base_id = lcm_lora.base_model_id
345
+ self.previous_lcm_lora_id = lcm_lora.lcm_lora_id
346
+ self.previous_use_lcm_lora = use_lora
347
+ self.previous_token_merging = lcm_diffusion_setting.token_merging
348
+ self.previous_safety_checker = lcm_diffusion_setting.use_safety_checker
349
+ self.previous_use_openvino = lcm_diffusion_setting.use_openvino
350
+ self.previous_task_type = lcm_diffusion_setting.diffusion_task
351
+ self.previous_lora = lcm_diffusion_setting.lora.model_copy(deep=True)
352
+ self.previous_use_gguf_model = lcm_diffusion_setting.use_gguf_model
353
+ self.previous_gguf_model = lcm_diffusion_setting.gguf_model.model_copy(
354
+ deep=True
355
+ )
356
+ lcm_diffusion_setting.rebuild_pipeline = False
357
+ if (
358
+ lcm_diffusion_setting.diffusion_task
359
+ == DiffusionTask.text_to_image.value
360
+ ):
361
+ print(f"Pipeline : {self.pipeline}")
362
+ elif (
363
+ lcm_diffusion_setting.diffusion_task
364
+ == DiffusionTask.image_to_image.value
365
+ ):
366
+ if self.use_openvino and is_openvino_device():
367
+ print(f"Pipeline : {self.pipeline}")
368
+ else:
369
+ print(f"Pipeline : {self.img_to_img_pipeline}")
370
+ if self.use_openvino:
371
+ if lcm_diffusion_setting.lora.enabled:
372
+ print("Warning: LoRA models are not supported in OpenVINO mode")
373
+ elif not lcm_diffusion_setting.use_gguf_model:
374
+ adapters = self.pipeline.get_active_adapters()
375
+ print(f"Active adapters : {adapters}")
376
+
377
+ def _get_timesteps(self):
378
+ time_steps = self.pipeline.scheduler.config.get("timesteps")
379
+ time_steps_value = [int(time_steps)] if time_steps else None
380
+ return time_steps_value
381
+
382
+ def generate(
383
+ self,
384
+ lcm_diffusion_setting: LCMDiffusionSetting,
385
+ reshape: bool = False,
386
+ ) -> Any:
387
+ guidance_scale = lcm_diffusion_setting.guidance_scale
388
+ img_to_img_inference_steps = lcm_diffusion_setting.inference_steps
389
+ check_step_value = int(
390
+ lcm_diffusion_setting.inference_steps * lcm_diffusion_setting.strength
391
+ )
392
+ if (
393
+ lcm_diffusion_setting.diffusion_task == DiffusionTask.image_to_image.value
394
+ and check_step_value < 1
395
+ ):
396
+ img_to_img_inference_steps = ceil(1 / lcm_diffusion_setting.strength)
397
+ print(
398
+ f"Strength: {lcm_diffusion_setting.strength}, adjusted inference steps: {img_to_img_inference_steps}"
399
+ )
400
+
401
+ pipeline_extra_args = {}
402
+
403
+ if lcm_diffusion_setting.use_seed:
404
+ cur_seed = lcm_diffusion_setting.seed
405
+ # for multiple images with a fixed seed, use sequential seeds
406
+ seeds = [
407
+ (cur_seed + i) for i in range(lcm_diffusion_setting.number_of_images)
408
+ ]
409
+ else:
410
+ seeds = [
411
+ random.randint(0, 999999999)
412
+ for i in range(lcm_diffusion_setting.number_of_images)
413
+ ]
414
+
415
+ if self.use_openvino:
416
+ # no support for generators; try at least to ensure reproducible results for single images
417
+ np.random.seed(seeds[0])
418
+ if self._is_hetero_pipeline():
419
+ torch.manual_seed(seeds[0])
420
+ lcm_diffusion_setting.seed = seeds[0]
421
+ else:
422
+ pipeline_extra_args["generator"] = [
423
+ torch.Generator(device=self.device).manual_seed(s) for s in seeds
424
+ ]
425
+
426
+ is_openvino_pipe = lcm_diffusion_setting.use_openvino and is_openvino_device()
427
+ if is_openvino_pipe and not self._is_hetero_pipeline():
428
+ print("Using OpenVINO")
429
+ if reshape and not self.is_openvino_init:
430
+ print("Reshape and compile")
431
+ self.pipeline.reshape(
432
+ batch_size=-1,
433
+ height=lcm_diffusion_setting.image_height,
434
+ width=lcm_diffusion_setting.image_width,
435
+ num_images_per_prompt=lcm_diffusion_setting.number_of_images,
436
+ )
437
+ self.pipeline.compile()
438
+
439
+ if self.is_openvino_init:
440
+ self.is_openvino_init = False
441
+
442
+ if is_openvino_pipe and self._is_hetero_pipeline():
443
+ return self._generate_images_hetero_compute(lcm_diffusion_setting)
444
+ elif lcm_diffusion_setting.use_gguf_model:
445
+ return self._generate_images_gguf(lcm_diffusion_setting)
446
+
447
+ if lcm_diffusion_setting.clip_skip > 1:
448
+ # We follow the convention that "CLIP Skip == 2" means "skip
449
+ # the last layer", so "CLIP Skip == 1" means "no skipping"
450
+ pipeline_extra_args["clip_skip"] = lcm_diffusion_setting.clip_skip - 1
451
+
452
+ if not lcm_diffusion_setting.use_safety_checker:
453
+ self.pipeline.safety_checker = None
454
+ if (
455
+ lcm_diffusion_setting.diffusion_task
456
+ == DiffusionTask.image_to_image.value
457
+ and not is_openvino_pipe
458
+ ):
459
+ self.img_to_img_pipeline.safety_checker = None
460
+
461
+ if (
462
+ not lcm_diffusion_setting.use_lcm_lora
463
+ and not lcm_diffusion_setting.use_openvino
464
+ and lcm_diffusion_setting.guidance_scale != 1.0
465
+ ):
466
+ print("Not using LCM-LoRA, setting guidance_scale to 1.0")
467
+ guidance_scale = 1.0
468
+
469
+ controlnet_args = update_controlnet_arguments(lcm_diffusion_setting)
470
+ if lcm_diffusion_setting.use_openvino:
471
+ if (
472
+ lcm_diffusion_setting.diffusion_task
473
+ == DiffusionTask.text_to_image.value
474
+ ):
475
+ result_images = self.pipeline(
476
+ prompt=lcm_diffusion_setting.prompt,
477
+ negative_prompt=lcm_diffusion_setting.negative_prompt,
478
+ num_inference_steps=lcm_diffusion_setting.inference_steps,
479
+ guidance_scale=guidance_scale,
480
+ width=lcm_diffusion_setting.image_width,
481
+ height=lcm_diffusion_setting.image_height,
482
+ num_images_per_prompt=lcm_diffusion_setting.number_of_images,
483
+ ).images
484
+ elif (
485
+ lcm_diffusion_setting.diffusion_task
486
+ == DiffusionTask.image_to_image.value
487
+ ):
488
+ result_images = self.pipeline(
489
+ image=lcm_diffusion_setting.init_image,
490
+ strength=lcm_diffusion_setting.strength,
491
+ prompt=lcm_diffusion_setting.prompt,
492
+ negative_prompt=lcm_diffusion_setting.negative_prompt,
493
+ num_inference_steps=img_to_img_inference_steps * 3,
494
+ guidance_scale=guidance_scale,
495
+ num_images_per_prompt=lcm_diffusion_setting.number_of_images,
496
+ ).images
497
+
498
+ else:
499
+ if (
500
+ lcm_diffusion_setting.diffusion_task
501
+ == DiffusionTask.text_to_image.value
502
+ ):
503
+ result_images = self.pipeline(
504
+ prompt=lcm_diffusion_setting.prompt,
505
+ negative_prompt=lcm_diffusion_setting.negative_prompt,
506
+ num_inference_steps=lcm_diffusion_setting.inference_steps,
507
+ guidance_scale=guidance_scale,
508
+ width=lcm_diffusion_setting.image_width,
509
+ height=lcm_diffusion_setting.image_height,
510
+ num_images_per_prompt=lcm_diffusion_setting.number_of_images,
511
+ timesteps=self._get_timesteps(),
512
+ **pipeline_extra_args,
513
+ **controlnet_args,
514
+ ).images
515
+
516
+ elif (
517
+ lcm_diffusion_setting.diffusion_task
518
+ == DiffusionTask.image_to_image.value
519
+ ):
520
+ result_images = self.img_to_img_pipeline(
521
+ image=lcm_diffusion_setting.init_image,
522
+ strength=lcm_diffusion_setting.strength,
523
+ prompt=lcm_diffusion_setting.prompt,
524
+ negative_prompt=lcm_diffusion_setting.negative_prompt,
525
+ num_inference_steps=img_to_img_inference_steps,
526
+ guidance_scale=guidance_scale,
527
+ width=lcm_diffusion_setting.image_width,
528
+ height=lcm_diffusion_setting.image_height,
529
+ num_images_per_prompt=lcm_diffusion_setting.number_of_images,
530
+ **pipeline_extra_args,
531
+ **controlnet_args,
532
+ ).images
533
+
534
+ for i, seed in enumerate(seeds):
535
+ result_images[i].info["image_seed"] = seed
536
+
537
+ return result_images
538
+
539
+ def _init_gguf_diffusion(
540
+ self,
541
+ lcm_diffusion_setting: LCMDiffusionSetting,
542
+ ):
543
+ config = ModelConfig()
544
+ config.model_path = lcm_diffusion_setting.gguf_model.diffusion_path
545
+ config.diffusion_model_path = lcm_diffusion_setting.gguf_model.diffusion_path
546
+ config.clip_l_path = lcm_diffusion_setting.gguf_model.clip_path
547
+ config.t5xxl_path = lcm_diffusion_setting.gguf_model.t5xxl_path
548
+ config.vae_path = lcm_diffusion_setting.gguf_model.vae_path
549
+ config.n_threads = GGUF_THREADS
550
+ print(f"GGUF Threads : {GGUF_THREADS}")
551
+ print("GGUF - Model config")
552
+ pprint(lcm_diffusion_setting.gguf_model.model_dump())
553
+ self.pipeline = GGUFDiffusion(
554
+ get_app_path(), # Place DLL in fastsdcpu folder
555
+ config,
556
+ True,
557
+ )
558
+
559
+ def _generate_images_gguf(
560
+ self,
561
+ lcm_diffusion_setting: LCMDiffusionSetting,
562
+ ):
563
+ if lcm_diffusion_setting.diffusion_task == DiffusionTask.text_to_image.value:
564
+ t2iconfig = Txt2ImgConfig()
565
+ t2iconfig.prompt = lcm_diffusion_setting.prompt
566
+ t2iconfig.batch_count = lcm_diffusion_setting.number_of_images
567
+ t2iconfig.cfg_scale = lcm_diffusion_setting.guidance_scale
568
+ t2iconfig.height = lcm_diffusion_setting.image_height
569
+ t2iconfig.width = lcm_diffusion_setting.image_width
570
+ t2iconfig.sample_steps = lcm_diffusion_setting.inference_steps
571
+ t2iconfig.sample_method = SampleMethod.EULER
572
+ if lcm_diffusion_setting.use_seed:
573
+ t2iconfig.seed = lcm_diffusion_setting.seed
574
+ else:
575
+ t2iconfig.seed = -1
576
+
577
+ return self.pipeline.generate_text2mg(t2iconfig)
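A hypothetical end-to-end text-to-image call using the class above with its default settings (the prompt and output filename are placeholders, and the first run downloads the default LCM model from the Hugging Face Hub):

```python
from backend.lcm_text_to_image import LCMTextToImage
from backend.models.lcmdiffusion_setting import LCMDiffusionSetting
from constants import DEVICE

settings = LCMDiffusionSetting(prompt="a watercolor painting of a lighthouse")
lcm = LCMTextToImage(DEVICE)
lcm.init(DEVICE, settings)        # builds the pipeline for the selected mode
images = lcm.generate(settings)   # list of PIL images
images[0].save("lighthouse.png")
```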
backend/lora.py ADDED
@@ -0,0 +1,136 @@
1
+ import glob
2
+ from os import path
3
+ from paths import get_file_name, FastStableDiffusionPaths
4
+ from pathlib import Path
5
+
6
+
7
+ # A basic class to keep track of the currently loaded LoRAs and
8
+ # their weights; the diffusers function get_active_adapters()
9
+ # returns a list of adapter names but not their weights so we need
10
+ # a way to keep track of the current LoRA weights to set whenever
11
+ # a new LoRA is loaded
12
+ class _lora_info:
13
+ def __init__(
14
+ self,
15
+ path: str,
16
+ weight: float,
17
+ ):
18
+ self.path = path
19
+ self.adapter_name = get_file_name(path)
20
+ self.weight = weight
21
+
22
+ def __del__(self):
23
+ self.path = None
24
+ self.adapter_name = None
25
+
26
+
27
+ _loaded_loras = []
28
+ _current_pipeline = None
29
+
30
+
31
+ # This function loads a LoRA from the LoRA path setting, so it's
32
+ # possible to load multiple LoRAs by calling this function more than
33
+ # once with a different LoRA path setting; note that if you plan to
34
+ # load multiple LoRAs and dynamically change their weights, you
35
+ # might want to set the LoRA fuse option to False
36
+ def load_lora_weight(
37
+ pipeline,
38
+ lcm_diffusion_setting,
39
+ ):
40
+ if not lcm_diffusion_setting.lora.path:
41
+ raise Exception("Empty lora model path")
42
+
43
+ if not path.exists(lcm_diffusion_setting.lora.path):
44
+ raise Exception("Lora model path is invalid")
45
+
46
+ # If the pipeline has been rebuilt since the last call, remove all
47
+ # references to previously loaded LoRAs and store the new pipeline
48
+ global _loaded_loras
49
+ global _current_pipeline
50
+ if pipeline != _current_pipeline:
51
+ for lora in _loaded_loras:
52
+ del lora
53
+ del _loaded_loras
54
+ _loaded_loras = []
55
+ _current_pipeline = pipeline
56
+
57
+ current_lora = _lora_info(
58
+ lcm_diffusion_setting.lora.path,
59
+ lcm_diffusion_setting.lora.weight,
60
+ )
61
+ _loaded_loras.append(current_lora)
62
+
63
+ if lcm_diffusion_setting.lora.enabled:
64
+ print(f"LoRA adapter name : {current_lora.adapter_name}")
65
+ pipeline.load_lora_weights(
66
+ FastStableDiffusionPaths.get_lora_models_path(),
67
+ weight_name=Path(lcm_diffusion_setting.lora.path).name,
68
+ local_files_only=True,
69
+ adapter_name=current_lora.adapter_name,
70
+ )
71
+ update_lora_weights(
72
+ pipeline,
73
+ lcm_diffusion_setting,
74
+ )
75
+
76
+ if lcm_diffusion_setting.lora.fuse:
77
+ pipeline.fuse_lora()
78
+
79
+
80
+ def get_lora_models(root_dir: str):
81
+ lora_models = glob.glob(f"{root_dir}/**/*.safetensors", recursive=True)
82
+ lora_models_map = {}
83
+ for file_path in lora_models:
84
+ lora_name = get_file_name(file_path)
85
+ if lora_name is not None:
86
+ lora_models_map[lora_name] = file_path
87
+ return lora_models_map
88
+
89
+
90
+ # This function returns a list of (adapter_name, weight) tuples for the
91
+ # currently loaded LoRAs
92
+ def get_active_lora_weights():
93
+ active_loras = []
94
+ for lora_info in _loaded_loras:
95
+ active_loras.append(
96
+ (
97
+ lora_info.adapter_name,
98
+ lora_info.weight,
99
+ )
100
+ )
101
+ return active_loras
102
+
103
+
104
+ # This function receives a pipeline, an lcm_diffusion_setting object and
105
+ # an optional list of updated (adapter_name, weight) tuples
106
+ def update_lora_weights(
107
+ pipeline,
108
+ lcm_diffusion_setting,
109
+ lora_weights=None,
110
+ ):
111
+ global _loaded_loras
112
+ global _current_pipeline
113
+ if pipeline != _current_pipeline:
114
+ print("Wrong pipeline when trying to update LoRA weights")
115
+ return
116
+ if lora_weights:
117
+ for idx, lora in enumerate(lora_weights):
118
+ if _loaded_loras[idx].adapter_name != lora[0]:
119
+ print("Wrong adapter name in LoRA enumeration!")
120
+ continue
121
+ _loaded_loras[idx].weight = lora[1]
122
+
123
+ adapter_names = []
124
+ adapter_weights = []
125
+ if lcm_diffusion_setting.use_lcm_lora:
126
+ adapter_names.append("lcm")
127
+ adapter_weights.append(1.0)
128
+ for lora in _loaded_loras:
129
+ adapter_names.append(lora.adapter_name)
130
+ adapter_weights.append(lora.weight)
131
+ pipeline.set_adapters(
132
+ adapter_names,
133
+ adapter_weights=adapter_weights,
134
+ )
135
+ adapter_weights = zip(adapter_names, adapter_weights)
136
+ print(f"Adapters: {list(adapter_weights)}")
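As implied by the comments above, `get_active_lora_weights` and `update_lora_weights` are meant to be used together to rescale already-loaded adapters without reloading them (this assumes the LoRAs were loaded with `fuse=False`; `pipeline` and `lcm_diffusion_setting` stand for the objects used in the earlier `load_lora_weight` call):

```python
from backend.lora import get_active_lora_weights, update_lora_weights

# Hypothetical: halve the weight of every currently loaded LoRA adapter
active = get_active_lora_weights()            # [(adapter_name, weight), ...]
rescaled = [(name, weight * 0.5) for name, weight in active]
update_lora_weights(pipeline, lcm_diffusion_setting, rescaled)
```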
backend/models/device.py ADDED
@@ -0,0 +1,9 @@
1
+ from pydantic import BaseModel
2
+
3
+
4
+ class DeviceInfo(BaseModel):
5
+ device_type: str
6
+ device_name: str
7
+ os: str
8
+ platform: str
9
+ processor: str
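A sketch of how this model could be populated from the standard library `platform` module; the `device_type` and `device_name` values below are placeholders, since in the app they come from its own device detection:

```python
import platform

from backend.models.device import DeviceInfo

info = DeviceInfo(
    device_type="cpu",                               # placeholder
    device_name=platform.processor() or "unknown",   # placeholder fallback
    os=platform.system(),
    platform=platform.platform(),
    processor=platform.processor(),
)
print(info.model_dump())
```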
backend/models/gen_images.py ADDED
@@ -0,0 +1,17 @@
1
+ from pydantic import BaseModel
2
+ from enum import Enum
3
+ from paths import FastStableDiffusionPaths
4
+
5
+
6
+ class ImageFormat(str, Enum):
7
+ """Image format"""
8
+
9
+ JPEG = "jpeg"
10
+ PNG = "png"
11
+
12
+
13
+ class GeneratedImages(BaseModel):
14
+ path: str = FastStableDiffusionPaths.get_results_path()
15
+ format: str = ImageFormat.PNG.value.upper()
16
+ save_image: bool = True
17
+ save_image_quality: int = 90
backend/models/lcmdiffusion_setting.py ADDED
@@ -0,0 +1,76 @@
1
+ from enum import Enum
2
+ from PIL import Image
3
+ from typing import Any, Optional, Union
4
+
5
+ from constants import LCM_DEFAULT_MODEL, LCM_DEFAULT_MODEL_OPENVINO
6
+ from paths import FastStableDiffusionPaths
7
+ from pydantic import BaseModel
8
+
9
+
10
+ class LCMLora(BaseModel):
11
+ base_model_id: str = "Lykon/dreamshaper-8"
12
+ lcm_lora_id: str = "latent-consistency/lcm-lora-sdv1-5"
13
+
14
+
15
+ class DiffusionTask(str, Enum):
16
+ """Diffusion task types"""
17
+
18
+ text_to_image = "text_to_image"
19
+ image_to_image = "image_to_image"
20
+
21
+
22
+ class Lora(BaseModel):
23
+ models_dir: str = FastStableDiffusionPaths.get_lora_models_path()
24
+ path: Optional[Any] = None
25
+ weight: Optional[float] = 0.5
26
+ fuse: bool = True
27
+ enabled: bool = False
28
+
29
+
30
+ class ControlNetSetting(BaseModel):
31
+ adapter_path: Optional[str] = None # ControlNet adapter path
32
+ conditioning_scale: float = 0.5
33
+ enabled: bool = False
34
+ _control_image: Image = None # Control image, PIL image
35
+
36
+
37
+ class GGUFModel(BaseModel):
38
+ gguf_models: str = FastStableDiffusionPaths.get_gguf_models_path()
39
+ diffusion_path: Optional[str] = None
40
+ clip_path: Optional[str] = None
41
+ t5xxl_path: Optional[str] = None
42
+ vae_path: Optional[str] = None
43
+
44
+
45
+ class LCMDiffusionSetting(BaseModel):
46
+ lcm_model_id: str = LCM_DEFAULT_MODEL
47
+ openvino_lcm_model_id: str = LCM_DEFAULT_MODEL_OPENVINO
48
+ use_offline_model: bool = False
49
+ use_lcm_lora: bool = False
50
+ lcm_lora: Optional[LCMLora] = LCMLora()
51
+ use_tiny_auto_encoder: bool = False
52
+ use_openvino: bool = False
53
+ prompt: str = ""
54
+ negative_prompt: str = ""
55
+ init_image: Any = None
56
+ strength: Optional[float] = 0.6
57
+ image_height: Optional[int] = 512
58
+ image_width: Optional[int] = 512
59
+ inference_steps: Optional[int] = 1
60
+ guidance_scale: Optional[float] = 1
61
+ clip_skip: Optional[int] = 1
62
+ token_merging: Optional[float] = 0
63
+ number_of_images: Optional[int] = 1
64
+ seed: Optional[int] = 123123
65
+ use_seed: bool = False
66
+ use_safety_checker: bool = False
67
+ diffusion_task: str = DiffusionTask.text_to_image.value
68
+ lora: Optional[Lora] = Lora()
69
+ controlnet: Optional[Union[ControlNetSetting, list[ControlNetSetting]]] = None
70
+ dirs: dict = {
71
+ "controlnet": FastStableDiffusionPaths.get_controlnet_models_path(),
72
+ "lora": FastStableDiffusionPaths.get_lora_models_path(),
73
+ }
74
+ rebuild_pipeline: bool = False
75
+ use_gguf_model: bool = False
76
+ gguf_model: Optional[GGUFModel] = GGUFModel()
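A short sketch of constructing and serializing the settings model; the field values are placeholders, and `model_dump` is the Pydantic v2 call that `ImageSaver` uses for its JSON sidecar:

```python
from backend.models.lcmdiffusion_setting import DiffusionTask, LCMDiffusionSetting

settings = LCMDiffusionSetting(
    prompt="oil painting of a harbour at dusk",
    diffusion_task=DiffusionTask.image_to_image.value,
    image_width=768,
    image_height=512,
    strength=0.4,
    inference_steps=4,
)
print(settings.model_dump(exclude={"init_image"}))
```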
backend/models/upscale.py ADDED
@@ -0,0 +1,9 @@
1
+ from enum import Enum
2
+
3
+
4
+ class UpscaleMode(str, Enum):
5
+ """Diffusion task types"""
6
+
7
+ normal = "normal"
8
+ sd_upscale = "sd_upscale"
9
+ aura_sr = "aura_sr"
backend/openvino/custom_ov_model_vae_decoder.py ADDED
@@ -0,0 +1,21 @@
1
+ from backend.device import is_openvino_device
2
+
3
+ if is_openvino_device():
4
+ from optimum.intel.openvino.modeling_diffusion import OVModelVaeDecoder
5
+
6
+
7
+ class CustomOVModelVaeDecoder(OVModelVaeDecoder):
8
+ def __init__(
9
+ self,
10
+ model,
11
+ parent_model,
12
+ ov_config=None,
13
+ model_dir=None,
14
+ ):
15
+ super(OVModelVaeDecoder, self).__init__(
16
+ model,
17
+ parent_model,
18
+ ov_config,
19
+ "vae_decoder",
20
+ model_dir,
21
+ )
backend/openvino/flux_pipeline.py ADDED
@@ -0,0 +1,36 @@
1
+ from pathlib import Path
2
+
3
+ from constants import DEVICE, LCM_DEFAULT_MODEL_OPENVINO, TAEF1_MODEL_OPENVINO
4
+ from huggingface_hub import snapshot_download
5
+
6
+ from backend.openvino.ovflux import (
7
+ TEXT_ENCODER_2_PATH,
8
+ TEXT_ENCODER_PATH,
9
+ TRANSFORMER_PATH,
10
+ VAE_DECODER_PATH,
11
+ init_pipeline,
12
+ )
13
+
14
+
15
+ def get_flux_pipeline(
16
+ model_id: str = LCM_DEFAULT_MODEL_OPENVINO,
17
+ use_taef1: bool = False,
18
+ taef1_path: str = TAEF1_MODEL_OPENVINO,
19
+ ):
20
+ model_dir = Path(snapshot_download(model_id))
21
+ vae_dir = Path(snapshot_download(taef1_path)) if use_taef1 else model_dir
22
+
23
+ model_dict = {
24
+ "transformer": model_dir / TRANSFORMER_PATH,
25
+ "text_encoder": model_dir / TEXT_ENCODER_PATH,
26
+ "text_encoder_2": model_dir / TEXT_ENCODER_2_PATH,
27
+ "vae": vae_dir / VAE_DECODER_PATH,
28
+ }
29
+ ov_pipe = init_pipeline(
30
+ model_dir,
31
+ model_dict,
32
+ device=DEVICE.upper(),
33
+ use_taef1=use_taef1,
34
+ )
35
+
36
+ return ov_pipe
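A hypothetical call of the helper above; the model id is a placeholder for an OpenVINO FLUX export on the Hugging Face Hub, and the low step count with zero guidance assumes a schnell-style model:

```python
from backend.openvino.flux_pipeline import get_flux_pipeline

pipe = get_flux_pipeline("your-org/flux.1-schnell-openvino")  # placeholder model id
result = pipe(
    prompt="a macro photo of a snowflake",
    width=512,
    height=512,
    num_inference_steps=4,
    guidance_scale=0.0,
)
result.images[0].save("snowflake.png")
```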
backend/openvino/ov_hc_stablediffusion_pipeline.py ADDED
@@ -0,0 +1,93 @@
1
+ """This is an experimental pipeline used to test AI PC NPU and GPU"""
2
+
3
+ from pathlib import Path
4
+
5
+ from diffusers import EulerDiscreteScheduler, LCMScheduler
6
+ from huggingface_hub import snapshot_download
7
+ from PIL import Image
8
+ from backend.openvino.stable_diffusion_engine import (
9
+ StableDiffusionEngineAdvanced,
10
+ LatentConsistencyEngineAdvanced
11
+ )
12
+
13
+
14
+ class OvHcStableDiffusion:
15
+ "OpenVINO heterogeneous compute Stable Diffusion pipeline"
16
+
17
+ def __init__(
18
+ self,
19
+ model_path,
20
+ device: list = ["GPU", "NPU", "GPU", "GPU"],
21
+ ):
22
+ model_dir = Path(snapshot_download(model_path))
23
+ self.scheduler = EulerDiscreteScheduler(
24
+ beta_start=0.00085,
25
+ beta_end=0.012,
26
+ beta_schedule="scaled_linear",
27
+ )
28
+ self.ov_sd_pipeline = StableDiffusionEngineAdvanced(
29
+ model=model_dir,
30
+ device=device,
31
+ )
32
+
33
+ def generate(
34
+ self,
35
+ prompt: str,
36
+ neg_prompt: str,
37
+ init_image: Image = None,
38
+ strength: float = 1.0,
39
+ ):
40
+ image = self.ov_sd_pipeline(
41
+ prompt=prompt,
42
+ negative_prompt=neg_prompt,
43
+ init_image=init_image,
44
+ strength=strength,
45
+ num_inference_steps=25,
46
+ scheduler=self.scheduler,
47
+ )
48
+ image_rgb = image[..., ::-1]
49
+ return Image.fromarray(image_rgb)
50
+
51
+
52
+ class OvHcLatentConsistency:
53
+ """
54
+ OpenVINO Heterogeneous compute Latent consistency models
55
+ On current Intel Core Ultra processors, the text encoder and UNet can run on the NPU
56
+ Supports text to image, image to image, and image variations
57
+ """
58
+
59
+ def __init__(
60
+ self,
61
+ model_path,
62
+ device: list = ["NPU", "NPU", "GPU"],
63
+ ):
64
+
65
+ model_dir = Path(snapshot_download(model_path))
66
+
67
+ self.scheduler = LCMScheduler(
68
+ beta_start=0.001,
69
+ beta_end=0.01,
70
+ )
71
+ self.ov_sd_pipeline = LatentConsistencyEngineAdvanced(
72
+ model=model_dir,
73
+ device=device,
74
+ )
75
+
76
+ def generate(
77
+ self,
78
+ prompt: str,
79
+ neg_prompt: str,
80
+ init_image: Image = None,
81
+ num_inference_steps=4,
82
+ strength: float = 0.5,
83
+ ):
84
+ image = self.ov_sd_pipeline(
85
+ prompt=prompt,
86
+ init_image=init_image,
87
+ strength=strength,
88
+ num_inference_steps=num_inference_steps,
89
+ scheduler=self.scheduler,
90
+ seed=None,
91
+ )
92
+
93
+ return image
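A hypothetical text-to-image call for the LCM variant; the model id is a placeholder for a "square" OpenVINO LCM export (the hetero-pipeline check in `lcm_text_to_image.py` keys off that word), and the returned object is assumed to be a PIL image:

```python
from backend.openvino.ov_hc_stablediffusion_pipeline import OvHcLatentConsistency

pipe = OvHcLatentConsistency("your-org/lcm-sd15-square-openvino")  # placeholder model id
image = pipe.generate(
    prompt="isometric illustration of a tiny island",
    neg_prompt="blurry, low quality",
    init_image=None,
    num_inference_steps=4,
)
image.save("island.png")  # assuming the engine returns a PIL image
```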
backend/openvino/ovflux.py ADDED
@@ -0,0 +1,675 @@
1
+ """Based on https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/flux.1-image-generation/flux_helper.py"""
2
+
3
+ import inspect
4
+ import json
5
+ from pathlib import Path
6
+ from typing import Any, Dict, List, Optional, Union
7
+
8
+ import numpy as np
9
+ import openvino as ov
10
+ import torch
11
+ from diffusers.image_processor import VaeImageProcessor
12
+ from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput
13
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
14
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
15
+ from diffusers.utils.torch_utils import randn_tensor
16
+ from transformers import AutoTokenizer
17
+
18
+ TRANSFORMER_PATH = Path("transformer/transformer.xml")
19
+ VAE_DECODER_PATH = Path("vae/vae_decoder.xml")
20
+ TEXT_ENCODER_PATH = Path("text_encoder/text_encoder.xml")
21
+ TEXT_ENCODER_2_PATH = Path("text_encoder_2/text_encoder_2.xml")
22
+
23
+
24
+ def cleanup_torchscript_cache():
25
+ """
26
+ Helper for removing cached model representation
27
+ """
28
+ torch._C._jit_clear_class_registry()
29
+ torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore()
30
+ torch.jit._state._clear_class_state()
31
+
32
+
33
+ def _prepare_latent_image_ids(
34
+ batch_size, height, width, device=torch.device("cpu"), dtype=torch.float32
35
+ ):
36
+ latent_image_ids = torch.zeros(height // 2, width // 2, 3)
37
+ latent_image_ids[..., 1] = (
38
+ latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
39
+ )
40
+ latent_image_ids[..., 2] = (
41
+ latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
42
+ )
43
+
44
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = (
45
+ latent_image_ids.shape
46
+ )
47
+
48
+ latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1)
49
+ latent_image_ids = latent_image_ids.reshape(
50
+ batch_size,
51
+ latent_image_id_height * latent_image_id_width,
52
+ latent_image_id_channels,
53
+ )
54
+
55
+ return latent_image_ids.to(device=device, dtype=dtype)
56
+
57
+
58
+ def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
59
+ assert dim % 2 == 0, "The dimension must be even."
60
+
61
+ scale = torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device) / dim
62
+ omega = 1.0 / (theta**scale)
63
+
64
+ batch_size, seq_length = pos.shape
65
+ out = pos.unsqueeze(-1) * omega.unsqueeze(0).unsqueeze(0)
66
+ cos_out = torch.cos(out)
67
+ sin_out = torch.sin(out)
68
+
69
+ stacked_out = torch.stack([cos_out, -sin_out, sin_out, cos_out], dim=-1)
70
+ out = stacked_out.view(batch_size, -1, dim // 2, 2, 2)
71
+ return out.float()
72
+
73
+
74
+ def calculate_shift(
75
+ image_seq_len,
76
+ base_seq_len: int = 256,
77
+ max_seq_len: int = 4096,
78
+ base_shift: float = 0.5,
79
+ max_shift: float = 1.16,
80
+ ):
81
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
82
+ b = base_shift - m * base_seq_len
83
+ mu = image_seq_len * m + b
84
+ return mu
85
+
86
+
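With the default arguments, `calculate_shift` is a linear interpolation from `base_shift` to `max_shift` over the packed sequence length. A quick sanity check of the formula (assuming the dependencies of this module are installed so it can be imported):

```python
from backend.openvino.ovflux import calculate_shift

# m = (1.16 - 0.5) / (4096 - 256) ~= 1.72e-4 and b = 0.5 - m * 256 ~= 0.456
assert abs(calculate_shift(256) - 0.5) < 1e-6    # shortest schedule -> base_shift
assert abs(calculate_shift(4096) - 1.16) < 1e-6  # longest schedule -> max_shift
print(calculate_shift(1024))                     # ~0.63 for a 512x512 packed latent
```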
87
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
88
+ def retrieve_timesteps(
89
+ scheduler,
90
+ num_inference_steps: Optional[int] = None,
91
+ timesteps: Optional[List[int]] = None,
92
+ sigmas: Optional[List[float]] = None,
93
+ **kwargs,
94
+ ):
95
+ """
96
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
97
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
98
+
99
+ Args:
100
+ scheduler (`SchedulerMixin`):
101
+ The scheduler to get timesteps from.
102
+ num_inference_steps (`int`):
103
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
104
+ must be `None`.
105
+ device (`str` or `torch.device`, *optional*):
106
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
107
+ timesteps (`List[int]`, *optional*):
108
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
109
+ `num_inference_steps` and `sigmas` must be `None`.
110
+ sigmas (`List[float]`, *optional*):
111
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
112
+ `num_inference_steps` and `timesteps` must be `None`.
113
+
114
+ Returns:
115
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
116
+ second element is the number of inference steps.
117
+ """
118
+ if timesteps is not None and sigmas is not None:
119
+ raise ValueError(
120
+ "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
121
+ )
122
+ if timesteps is not None:
123
+ accepts_timesteps = "timesteps" in set(
124
+ inspect.signature(scheduler.set_timesteps).parameters.keys()
125
+ )
126
+ if not accepts_timesteps:
127
+ raise ValueError(
128
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
129
+ f" timestep schedules. Please check whether you are using the correct scheduler."
130
+ )
131
+ scheduler.set_timesteps(timesteps=timesteps, **kwargs)
132
+ timesteps = scheduler.timesteps
133
+ num_inference_steps = len(timesteps)
134
+ elif sigmas is not None:
135
+ accept_sigmas = "sigmas" in set(
136
+ inspect.signature(scheduler.set_timesteps).parameters.keys()
137
+ )
138
+ if not accept_sigmas:
139
+ raise ValueError(
140
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
141
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
142
+ )
143
+ scheduler.set_timesteps(sigmas=sigmas, **kwargs)
144
+ timesteps = scheduler.timesteps
145
+ num_inference_steps = len(timesteps)
146
+ else:
147
+ scheduler.set_timesteps(num_inference_steps, **kwargs)
148
+ timesteps = scheduler.timesteps
149
+ return timesteps, num_inference_steps
150
+
151
+
152
+ class OVFluxPipeline(DiffusionPipeline):
153
+ def __init__(
154
+ self,
155
+ scheduler,
156
+ transformer,
157
+ vae,
158
+ text_encoder,
159
+ text_encoder_2,
160
+ tokenizer,
161
+ tokenizer_2,
162
+ transformer_config,
163
+ vae_config,
164
+ ):
165
+ super().__init__()
166
+
167
+ self.register_modules(
168
+ vae=vae,
169
+ text_encoder=text_encoder,
170
+ text_encoder_2=text_encoder_2,
171
+ tokenizer=tokenizer,
172
+ tokenizer_2=tokenizer_2,
173
+ transformer=transformer,
174
+ scheduler=scheduler,
175
+ )
176
+ self.vae_config = vae_config
177
+ self.transformer_config = transformer_config
178
+ self.vae_scale_factor = 2 ** (
179
+ len(self.vae_config.get("block_out_channels", [0] * 16))
180
+ if hasattr(self, "vae") and self.vae is not None
181
+ else 16
182
+ )
183
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
184
+ self.tokenizer_max_length = (
185
+ self.tokenizer.model_max_length
186
+ if hasattr(self, "tokenizer") and self.tokenizer is not None
187
+ else 77
188
+ )
189
+ self.default_sample_size = 64
190
+
191
+ def _get_t5_prompt_embeds(
192
+ self,
193
+ prompt: Union[str, List[str]] = None,
194
+ num_images_per_prompt: int = 1,
195
+ max_sequence_length: int = 512,
196
+ ):
197
+ prompt = [prompt] if isinstance(prompt, str) else prompt
198
+ batch_size = len(prompt)
199
+
200
+ text_inputs = self.tokenizer_2(
201
+ prompt,
202
+ padding="max_length",
203
+ max_length=max_sequence_length,
204
+ truncation=True,
205
+ return_length=False,
206
+ return_overflowing_tokens=False,
207
+ return_tensors="pt",
208
+ )
209
+ text_input_ids = text_inputs.input_ids
210
+ prompt_embeds = torch.from_numpy(self.text_encoder_2(text_input_ids)[0])
211
+
212
+ _, seq_len, _ = prompt_embeds.shape
213
+
214
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
215
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
216
+ prompt_embeds = prompt_embeds.view(
217
+ batch_size * num_images_per_prompt, seq_len, -1
218
+ )
219
+
220
+ return prompt_embeds
221
+
222
+ def _get_clip_prompt_embeds(
223
+ self,
224
+ prompt: Union[str, List[str]],
225
+ num_images_per_prompt: int = 1,
226
+ ):
227
+
228
+ prompt = [prompt] if isinstance(prompt, str) else prompt
229
+ batch_size = len(prompt)
230
+
231
+ text_inputs = self.tokenizer(
232
+ prompt,
233
+ padding="max_length",
234
+ max_length=self.tokenizer_max_length,
235
+ truncation=True,
236
+ return_overflowing_tokens=False,
237
+ return_length=False,
238
+ return_tensors="pt",
239
+ )
240
+
241
+ text_input_ids = text_inputs.input_ids
242
+ prompt_embeds = torch.from_numpy(self.text_encoder(text_input_ids)[1])
243
+
244
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
245
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
246
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
247
+
248
+ return prompt_embeds
249
+
250
+ def encode_prompt(
251
+ self,
252
+ prompt: Union[str, List[str]],
253
+ prompt_2: Union[str, List[str]],
254
+ num_images_per_prompt: int = 1,
255
+ prompt_embeds: Optional[torch.FloatTensor] = None,
256
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
257
+ max_sequence_length: int = 512,
258
+ ):
259
+ r"""
260
+
261
+ Args:
262
+ prompt (`str` or `List[str]`, *optional*):
263
+ prompt to be encoded
264
+ prompt_2 (`str` or `List[str]`, *optional*):
265
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
266
+ used in all text-encoders
267
+ num_images_per_prompt (`int`):
268
+ number of images that should be generated per prompt
269
+ prompt_embeds (`torch.FloatTensor`, *optional*):
270
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
271
+ provided, text embeddings will be generated from `prompt` input argument.
272
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
273
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
274
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
275
+ lora_scale (`float`, *optional*):
276
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
277
+ """
278
+
279
+ prompt = [prompt] if isinstance(prompt, str) else prompt
280
+ if prompt is not None:
281
+ batch_size = len(prompt)
282
+ else:
283
+ batch_size = prompt_embeds.shape[0]
284
+
285
+ if prompt_embeds is None:
286
+ prompt_2 = prompt_2 or prompt
287
+ prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
288
+
289
+ # We only use the pooled prompt output from the CLIPTextModel
290
+ pooled_prompt_embeds = self._get_clip_prompt_embeds(
291
+ prompt=prompt,
292
+ num_images_per_prompt=num_images_per_prompt,
293
+ )
294
+ prompt_embeds = self._get_t5_prompt_embeds(
295
+ prompt=prompt_2,
296
+ num_images_per_prompt=num_images_per_prompt,
297
+ max_sequence_length=max_sequence_length,
298
+ )
299
+ text_ids = torch.zeros(batch_size, prompt_embeds.shape[1], 3)
300
+ text_ids = text_ids.repeat(num_images_per_prompt, 1, 1)
301
+
302
+ return prompt_embeds, pooled_prompt_embeds, text_ids
303
+
304
+ def check_inputs(
305
+ self,
306
+ prompt,
307
+ prompt_2,
308
+ height,
309
+ width,
310
+ prompt_embeds=None,
311
+ pooled_prompt_embeds=None,
312
+ max_sequence_length=None,
313
+ ):
314
+ if height % 8 != 0 or width % 8 != 0:
315
+ raise ValueError(
316
+ f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
317
+ )
318
+
319
+ if prompt is not None and prompt_embeds is not None:
320
+ raise ValueError(
321
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
322
+ " only forward one of the two."
323
+ )
324
+ elif prompt_2 is not None and prompt_embeds is not None:
325
+ raise ValueError(
326
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
327
+ " only forward one of the two."
328
+ )
329
+ elif prompt is None and prompt_embeds is None:
330
+ raise ValueError(
331
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
332
+ )
333
+ elif prompt is not None and (
334
+ not isinstance(prompt, str) and not isinstance(prompt, list)
335
+ ):
336
+ raise ValueError(
337
+ f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
338
+ )
339
+ elif prompt_2 is not None and (
340
+ not isinstance(prompt_2, str) and not isinstance(prompt_2, list)
341
+ ):
342
+ raise ValueError(
343
+ f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}"
344
+ )
345
+
346
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
347
+ raise ValueError(
348
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
349
+ )
350
+
351
+ if max_sequence_length is not None and max_sequence_length > 512:
352
+ raise ValueError(
353
+ f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}"
354
+ )
355
+
356
+ @staticmethod
357
+ def _prepare_latent_image_ids(batch_size, height, width):
358
+ return _prepare_latent_image_ids(batch_size, height, width)
359
+
360
+ @staticmethod
361
+ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
362
+ latents = latents.view(
363
+ batch_size, num_channels_latents, height // 2, 2, width // 2, 2
364
+ )
365
+ latents = latents.permute(0, 2, 4, 1, 3, 5)
366
+ latents = latents.reshape(
367
+ batch_size, (height // 2) * (width // 2), num_channels_latents * 4
368
+ )
369
+
370
+ return latents
371
+
372
+ @staticmethod
373
+ def _unpack_latents(latents, height, width, vae_scale_factor):
374
+ batch_size, num_patches, channels = latents.shape
375
+
376
+ height = height // vae_scale_factor
377
+ width = width // vae_scale_factor
378
+
379
+ latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
380
+ latents = latents.permute(0, 3, 1, 4, 2, 5)
381
+
382
+ latents = latents.reshape(
383
+ batch_size, channels // (2 * 2), height * 2, width * 2
384
+ )
385
+
386
+ return latents
387
+
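`_pack_latents` and `_unpack_latents` are shape-level inverses: packing folds each 2x2 patch of the latent grid into the channel dimension, and unpacking restores the original layout. A small round-trip sketch, using the sizes produced for a 512x512 image with a `vae_scale_factor` of 16 (so the latent grid passed to `_pack_latents` is 2 * (512 // 16) = 64, as in `prepare_latents` below):

```python
import torch

from backend.openvino.ovflux import OVFluxPipeline

lat = torch.randn(1, 16, 64, 64)  # (batch, latent_channels, height, width)
packed = OVFluxPipeline._pack_latents(lat, 1, 16, 64, 64)
print(packed.shape)               # torch.Size([1, 1024, 64])
restored = OVFluxPipeline._unpack_latents(packed, 512, 512, 16)
print(torch.allclose(lat, restored))  # True
```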
388
+ def prepare_latents(
389
+ self,
390
+ batch_size,
391
+ num_channels_latents,
392
+ height,
393
+ width,
394
+ generator,
395
+ latents=None,
396
+ ):
397
+ height = 2 * (int(height) // self.vae_scale_factor)
398
+ width = 2 * (int(width) // self.vae_scale_factor)
399
+
400
+ shape = (batch_size, num_channels_latents, height, width)
401
+
402
+ if latents is not None:
403
+ latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width)
404
+ return latents, latent_image_ids
405
+
406
+ if isinstance(generator, list) and len(generator) != batch_size:
407
+ raise ValueError(
408
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
409
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
410
+ )
411
+
412
+ latents = randn_tensor(shape, generator=generator)
413
+ latents = self._pack_latents(
414
+ latents, batch_size, num_channels_latents, height, width
415
+ )
416
+
417
+ latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width)
418
+
419
+ return latents, latent_image_ids
420
+
421
+ @property
422
+ def guidance_scale(self):
423
+ return self._guidance_scale
424
+
425
+ @property
426
+ def num_timesteps(self):
427
+ return self._num_timesteps
428
+
429
+ @property
430
+ def interrupt(self):
431
+ return self._interrupt
432
+
433
+ def __call__(
434
+ self,
435
+ prompt: Union[str, List[str]] = None,
436
+ prompt_2: Optional[Union[str, List[str]]] = None,
437
+ height: Optional[int] = None,
438
+ width: Optional[int] = None,
439
+ negative_prompt: str = None,
440
+ num_inference_steps: int = 28,
441
+ timesteps: List[int] = None,
442
+ guidance_scale: float = 7.0,
443
+ num_images_per_prompt: Optional[int] = 1,
444
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
445
+ latents: Optional[torch.FloatTensor] = None,
446
+ prompt_embeds: Optional[torch.FloatTensor] = None,
447
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
448
+ output_type: Optional[str] = "pil",
449
+ return_dict: bool = True,
450
+ max_sequence_length: int = 512,
451
+ ):
452
+ r"""
453
+ Function invoked when calling the pipeline for generation.
454
+
455
+ Args:
456
+ prompt (`str` or `List[str]`, *optional*):
457
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
458
+ instead.
459
+ prompt_2 (`str` or `List[str]`, *optional*):
460
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt`
461
+ will be used instead
462
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
463
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
464
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
465
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
466
+ num_inference_steps (`int`, *optional*, defaults to 50):
467
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
468
+ expense of slower inference.
469
+ timesteps (`List[int]`, *optional*):
470
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
471
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
472
+ passed will be used. Must be in descending order.
473
+ guidance_scale (`float`, *optional*, defaults to 7.0):
474
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
475
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
476
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
477
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
478
+ usually at the expense of lower image quality.
479
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
480
+ The number of images to generate per prompt.
481
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
482
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
483
+ to make generation deterministic.
484
+ latents (`torch.FloatTensor`, *optional*):
485
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
486
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
487
+ tensor will be generated by sampling using the supplied random `generator`.
488
+ prompt_embeds (`torch.FloatTensor`, *optional*):
489
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
490
+ provided, text embeddings will be generated from `prompt` input argument.
491
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
492
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
493
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
494
+ output_type (`str`, *optional*, defaults to `"pil"`):
495
+ The output format of the generated image. Choose between
496
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
497
+ return_dict (`bool`, *optional*, defaults to `True`):
498
+ Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
499
+ max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
500
+ Returns:
501
+ [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
502
+ is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
503
+ images.
504
+ """
505
+
506
+ height = height or self.default_sample_size * self.vae_scale_factor
507
+ width = width or self.default_sample_size * self.vae_scale_factor
508
+
509
+ # 1. Check inputs. Raise error if not correct
510
+ self.check_inputs(
511
+ prompt,
512
+ prompt_2,
513
+ height,
514
+ width,
515
+ prompt_embeds=prompt_embeds,
516
+ pooled_prompt_embeds=pooled_prompt_embeds,
517
+ max_sequence_length=max_sequence_length,
518
+ )
519
+
520
+ self._guidance_scale = guidance_scale
521
+ self._interrupt = False
522
+
523
+ # 2. Define call parameters
524
+ if prompt is not None and isinstance(prompt, str):
525
+ batch_size = 1
526
+ elif prompt is not None and isinstance(prompt, list):
527
+ batch_size = len(prompt)
528
+ else:
529
+ batch_size = prompt_embeds.shape[0]
530
+
531
+ (
532
+ prompt_embeds,
533
+ pooled_prompt_embeds,
534
+ text_ids,
535
+ ) = self.encode_prompt(
536
+ prompt=prompt,
537
+ prompt_2=prompt_2,
538
+ prompt_embeds=prompt_embeds,
539
+ pooled_prompt_embeds=pooled_prompt_embeds,
540
+ num_images_per_prompt=num_images_per_prompt,
541
+ max_sequence_length=max_sequence_length,
542
+ )
543
+
544
+ # 4. Prepare latent variables
545
+ num_channels_latents = self.transformer_config.get("in_channels", 64) // 4
546
+ latents, latent_image_ids = self.prepare_latents(
547
+ batch_size * num_images_per_prompt,
548
+ num_channels_latents,
549
+ height,
550
+ width,
551
+ generator,
552
+ latents,
553
+ )
554
+
555
+ # 5. Prepare timesteps
556
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
557
+ image_seq_len = latents.shape[1]
558
+ mu = calculate_shift(
559
+ image_seq_len,
560
+ self.scheduler.config.base_image_seq_len,
561
+ self.scheduler.config.max_image_seq_len,
562
+ self.scheduler.config.base_shift,
563
+ self.scheduler.config.max_shift,
564
+ )
565
+ timesteps, num_inference_steps = retrieve_timesteps(
566
+ scheduler=self.scheduler,
567
+ num_inference_steps=num_inference_steps,
568
+ timesteps=timesteps,
569
+ sigmas=sigmas,
570
+ mu=mu,
571
+ )
572
+ num_warmup_steps = max(
573
+ len(timesteps) - num_inference_steps * self.scheduler.order, 0
574
+ )
575
+ self._num_timesteps = len(timesteps)
576
+
577
+ # 6. Denoising loop
578
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
579
+ for i, t in enumerate(timesteps):
580
+ if self.interrupt:
581
+ continue
582
+
583
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
584
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
585
+
586
+ # handle guidance
587
+ if self.transformer_config.get("guidance_embeds"):
588
+ guidance = torch.tensor([guidance_scale])
589
+ guidance = guidance.expand(latents.shape[0])
590
+ else:
591
+ guidance = None
592
+
593
+ transformer_input = {
594
+ "hidden_states": latents,
595
+ "timestep": timestep / 1000,
596
+ "pooled_projections": pooled_prompt_embeds,
597
+ "encoder_hidden_states": prompt_embeds,
598
+ "txt_ids": text_ids,
599
+ "img_ids": latent_image_ids,
600
+ }
601
+ if guidance is not None:
602
+ transformer_input["guidance"] = guidance
603
+
604
+ noise_pred = torch.from_numpy(self.transformer(transformer_input)[0])
605
+
606
+ latents = self.scheduler.step(
607
+ noise_pred, t, latents, return_dict=False
608
+ )[0]
609
+
610
+ # call the callback, if provided
611
+ if i == len(timesteps) - 1 or (
612
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
613
+ ):
614
+ progress_bar.update()
615
+
616
+ if output_type == "latent":
617
+ image = latents
618
+
619
+ else:
620
+ latents = self._unpack_latents(
621
+ latents, height, width, self.vae_scale_factor
622
+ )
623
+ latents = latents / self.vae_config.get(
624
+ "scaling_factor"
625
+ ) + self.vae_config.get("shift_factor")
626
+ image = self.vae(latents)[0]
627
+ image = self.image_processor.postprocess(
628
+ torch.from_numpy(image), output_type=output_type
629
+ )
630
+
631
+ if not return_dict:
632
+ return (image,)
633
+
634
+ return FluxPipelineOutput(images=image)
635
+
636
+
637
+ def init_pipeline(
638
+ model_dir,
639
+ models_dict: Dict[str, Any],
640
+ device: str,
641
+ use_taef1: bool = False,
642
+ ):
643
+ pipeline_args = {}
644
+
645
+ print("OpenVINO FLUX Model compilation")
646
+ core = ov.Core()
647
+ for model_name, model_path in models_dict.items():
648
+ pipeline_args[model_name] = core.compile_model(model_path, device)
649
+ if model_name == "vae" and use_taef1:
650
+ print(f"✅ VAE(TAEF1) - Done!")
651
+ else:
652
+ print(f"✅ {model_name} - Done!")
653
+
654
+ transformer_path = models_dict["transformer"]
655
+ transformer_config_path = transformer_path.parent / "config.json"
656
+ with transformer_config_path.open("r") as f:
657
+ transformer_config = json.load(f)
658
+ vae_path = models_dict["vae"]
659
+ vae_config_path = vae_path.parent / "config.json"
660
+ with vae_config_path.open("r") as f:
661
+ vae_config = json.load(f)
662
+
663
+ pipeline_args["vae_config"] = vae_config
664
+ pipeline_args["transformer_config"] = transformer_config
665
+
666
+ scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(model_dir / "scheduler")
667
+
668
+ tokenizer = AutoTokenizer.from_pretrained(model_dir / "tokenizer")
669
+ tokenizer_2 = AutoTokenizer.from_pretrained(model_dir / "tokenizer_2")
670
+
671
+ pipeline_args["scheduler"] = scheduler
672
+ pipeline_args["tokenizer"] = tokenizer
673
+ pipeline_args["tokenizer_2"] = tokenizer_2
674
+ ov_pipe = OVFluxPipeline(**pipeline_args)
675
+ return ov_pipe
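For reference, a minimal usage sketch of `init_pipeline`, assuming an exported OpenVINO FLUX model directory; the paths, model-dict keys, prompt, and device string below are illustrative assumptions, not part of the diff:

from pathlib import Path

# Illustrative paths and model keys; the actual export layout may differ.
model_dir = Path("models/flux-ov")
models_dict = {
    "transformer": model_dir / "transformer" / "openvino_model.xml",
    "vae": model_dir / "vae" / "openvino_model.xml",
    "text_encoder": model_dir / "text_encoder" / "openvino_model.xml",
    "text_encoder_2": model_dir / "text_encoder_2" / "openvino_model.xml",
}

pipe = init_pipeline(model_dir, models_dict, device="CPU")
image = pipe(
    prompt="a watercolor fox in a snowy forest",  # illustrative prompt
    num_inference_steps=4,
    height=512,
    width=512,
).images[0]
image.save("flux_ov.png")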
backend/openvino/pipelines.py ADDED
@@ -0,0 +1,75 @@
1
+ from constants import DEVICE, LCM_DEFAULT_MODEL_OPENVINO
2
+ from backend.tiny_decoder import get_tiny_decoder_vae_model
3
+ from typing import Any
4
+ from backend.device import is_openvino_device
5
+ from paths import get_base_folder_name
6
+
7
+ if is_openvino_device():
8
+ from huggingface_hub import snapshot_download
9
+ from optimum.intel.openvino.modeling_diffusion import OVBaseModel
10
+
11
+ from optimum.intel.openvino.modeling_diffusion import (
12
+ OVStableDiffusionPipeline,
13
+ OVStableDiffusionImg2ImgPipeline,
14
+ OVStableDiffusionXLPipeline,
15
+ OVStableDiffusionXLImg2ImgPipeline,
16
+ )
17
+ from backend.openvino.custom_ov_model_vae_decoder import CustomOVModelVaeDecoder
18
+
19
+
20
+ def ov_load_taesd(
21
+ pipeline: Any,
22
+ use_local_model: bool = False,
23
+ ):
24
+ taesd_dir = snapshot_download(
25
+ repo_id=get_tiny_decoder_vae_model(pipeline.__class__.__name__),
26
+ local_files_only=use_local_model,
27
+ )
28
+ pipeline.vae_decoder = CustomOVModelVaeDecoder(
29
+ model=OVBaseModel.load_model(f"{taesd_dir}/vae_decoder/openvino_model.xml"),
30
+ parent_model=pipeline,
31
+ model_dir=taesd_dir,
32
+ )
33
+
34
+
35
+ def get_ov_text_to_image_pipeline(
36
+ model_id: str = LCM_DEFAULT_MODEL_OPENVINO,
37
+ use_local_model: bool = False,
38
+ ) -> Any:
39
+ if "xl" in get_base_folder_name(model_id).lower():
40
+ pipeline = OVStableDiffusionXLPipeline.from_pretrained(
41
+ model_id,
42
+ local_files_only=use_local_model,
43
+ ov_config={"CACHE_DIR": ""},
44
+ device=DEVICE.upper(),
45
+ )
46
+ else:
47
+ pipeline = OVStableDiffusionPipeline.from_pretrained(
48
+ model_id,
49
+ local_files_only=use_local_model,
50
+ ov_config={"CACHE_DIR": ""},
51
+ device=DEVICE.upper(),
52
+ )
53
+
54
+ return pipeline
55
+
56
+
57
+ def get_ov_image_to_image_pipeline(
58
+ model_id: str = LCM_DEFAULT_MODEL_OPENVINO,
59
+ use_local_model: bool = False,
60
+ ) -> Any:
61
+ if "xl" in get_base_folder_name(model_id).lower():
62
+ pipeline = OVStableDiffusionXLImg2ImgPipeline.from_pretrained(
63
+ model_id,
64
+ local_files_only=use_local_model,
65
+ ov_config={"CACHE_DIR": ""},
66
+ device=DEVICE.upper(),
67
+ )
68
+ else:
69
+ pipeline = OVStableDiffusionImg2ImgPipeline.from_pretrained(
70
+ model_id,
71
+ local_files_only=use_local_model,
72
+ ov_config={"CACHE_DIR": ""},
73
+ device=DEVICE.upper(),
74
+ )
75
+ return pipeline
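A minimal usage sketch of the helpers above, assuming a standard diffusers-style call on the returned OpenVINO pipeline; the model id, prompt, and step count are illustrative assumptions:

# Illustrative only; substitute a real OpenVINO LCM model id.
pipeline = get_ov_text_to_image_pipeline("rupeshs/LCM-dreamshaper-v7-openvino")
ov_load_taesd(pipeline)  # optional: swap in the tiny TAESD decoder for faster VAE decoding
result = pipeline(
    prompt="an isometric pixel-art castle",
    num_inference_steps=4,
    guidance_scale=1.0,
)
result.images[0].save("out.png")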
backend/openvino/stable_diffusion_engine.py ADDED
@@ -0,0 +1,1817 @@
1
+ """
2
+ Copyright(C) 2022-2023 Intel Corporation
3
+ SPDX - License - Identifier: Apache - 2.0
4
+
5
+ """
6
+ import inspect
7
+ from typing import Union, Optional, Any, List, Dict
8
+ import numpy as np
9
+ # openvino
10
+ from openvino.runtime import Core
11
+ # tokenizer
12
+ from transformers import CLIPTokenizer
13
+ import torch
14
+ import random
15
+
16
+ from diffusers import DiffusionPipeline
17
+ from diffusers.schedulers import (DDIMScheduler,
18
+ LMSDiscreteScheduler,
19
+ PNDMScheduler,
20
+ EulerDiscreteScheduler,
21
+ EulerAncestralDiscreteScheduler)
22
+
23
+
24
+ from diffusers.image_processor import VaeImageProcessor
25
+ from diffusers.utils.torch_utils import randn_tensor
26
+ from diffusers.utils import PIL_INTERPOLATION
27
+
28
+ import cv2
29
+ import os
30
+ import sys
31
+
32
+ # for multithreading
33
+ import concurrent.futures
34
+
35
+ #For GIF
36
+ import PIL
37
+ from PIL import Image
38
+ import glob
39
+ import json
40
+ import time
41
+
42
+ def scale_fit_to_window(dst_width:int, dst_height:int, image_width:int, image_height:int):
43
+ """
44
+ Preprocessing helper function for calculating the resized image size while preserving the original aspect ratio
45
+ and fitting image to specific window size
46
+
47
+ Parameters:
48
+ dst_width (int): destination window width
49
+ dst_height (int): destination window height
50
+ image_width (int): source image width
51
+ image_height (int): source image height
52
+ Returns:
53
+ result_width (int): calculated width for resize
54
+ result_height (int): calculated height for resize
55
+ """
56
+ im_scale = min(dst_height / image_height, dst_width / image_width)
57
+ return int(im_scale * image_width), int(im_scale * image_height)
58
+
59
+ def preprocess(image: PIL.Image.Image, ht=512, wt=512):
60
+ """
61
+ Image preprocessing function. Takes an image in PIL.Image format, resizes it to keep the aspect ratio and fit the model input window (512x512 by default),
62
+ then converts it to np.ndarray and adds zero padding on the right or bottom side of the image (depending on the aspect ratio); after that it
63
+ converts the data to float32, changes the range of values from [0, 255] to [-1, 1], and finally converts the data layout from NHWC to NCHW.
64
+ The function returns preprocessed input tensor and padding size, which can be used in postprocessing.
65
+
66
+ Parameters:
67
+ image (PIL.Image.Image): input image
68
+ Returns:
69
+ image (np.ndarray): preprocessed image tensor
70
+ meta (Dict): dictionary with preprocessing metadata info
71
+ """
72
+
73
+ src_width, src_height = image.size
74
+ image = image.convert('RGB')
75
+ dst_width, dst_height = scale_fit_to_window(
76
+ wt, ht, src_width, src_height)
77
+ image = np.array(image.resize((dst_width, dst_height),
78
+ resample=PIL.Image.Resampling.LANCZOS))[None, :]
79
+
80
+ pad_width = wt - dst_width
81
+ pad_height = ht - dst_height
82
+ pad = ((0, 0), (0, pad_height), (0, pad_width), (0, 0))
83
+ image = np.pad(image, pad, mode="constant")
84
+ image = image.astype(np.float32) / 255.0
85
+ image = 2.0 * image - 1.0
86
+ image = image.transpose(0, 3, 1, 2)
87
+
88
+ return image, {"padding": pad, "src_width": src_width, "src_height": src_height}
89
+
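A quick numeric check of the resize/pad math above; the 640x480 source size is an illustrative assumption:

# A 640x480 source fit into a 512x512 window:
#   im_scale = min(512/480, 512/640) = 0.8  ->  resized to 512x384
# preprocess() then zero-pads 512 - 384 = 128 rows at the bottom, so
# meta["padding"] == ((0, 0), (0, 128), (0, 0), (0, 0)).
w, h = scale_fit_to_window(512, 512, 640, 480)
assert (w, h) == (512, 384)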
90
+ def try_enable_npu_turbo(device, core):
91
+ import platform
92
+ if "windows" in platform.system().lower():
93
+ if "NPU" in device and "3720" not in core.get_property('NPU', 'DEVICE_ARCHITECTURE'):
94
+ try:
95
+ core.set_property(properties={'NPU_TURBO': 'YES'},device_name='NPU')
96
+ except:
97
+ print(f"Failed loading NPU_TURBO for device {device}. Skipping... ")
98
+ else:
99
+ print_npu_turbo_art()
100
+ else:
101
+ print(f"Skipping NPU_TURBO for device {device}")
102
+ elif "linux" in platform.system().lower():
103
+ if os.path.isfile('/sys/module/intel_vpu/parameters/test_mode'):
104
+ with open('/sys/module/intel_vpu/version', 'r') as f:
105
+ version = f.readline().split()[0]
106
+ if tuple(map(int, version.split('.'))) < tuple(map(int, '1.9.0'.split('.'))):
107
+ print(f"The driver intel_vpu-1.9.0 (or later) needs to be loaded for NPU Turbo (currently {version}). Skipping...")
108
+ else:
109
+ with open('/sys/module/intel_vpu/parameters/test_mode', 'r') as tm_file:
110
+ test_mode = int(tm_file.readline().split()[0])
111
+ if test_mode == 512:
112
+ print_npu_turbo_art()
113
+ else:
114
+ print("The driver >=intel_vpu-1.9.0 was must be loaded with "
115
+ "\"modprobe intel_vpu test_mode=512\" to enable NPU_TURBO "
116
+ f"(currently test_mode={test_mode}). Skipping...")
117
+ else:
118
+ print(f"The driver >=intel_vpu-1.9.0 must be loaded with \"modprobe intel_vpu test_mode=512\" to enable NPU_TURBO. Skipping...")
119
+ else:
120
+ print(f"This platform ({platform.system()}) does not support NPU Turbo")
121
+
122
+ def result(var):
123
+ return next(iter(var.values()))
124
+
125
+ class StableDiffusionEngineAdvanced(DiffusionPipeline):
126
+ def __init__(self, model="runwayml/stable-diffusion-v1-5",
127
+ tokenizer="openai/clip-vit-large-patch14",
128
+ device=["CPU", "CPU", "CPU", "CPU"]):
129
+ try:
130
+ self.tokenizer = CLIPTokenizer.from_pretrained(model, local_files_only=True)
131
+ except:
132
+ self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer)
133
+ self.tokenizer.save_pretrained(model)
134
+
135
+ self.core = Core()
136
+ self.core.set_property({'CACHE_DIR': os.path.join(model, 'cache')})
137
+ try_enable_npu_turbo(device, self.core)
138
+
139
+ print("Loading models... ")
140
+
141
+
142
+
143
+ with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
144
+ futures = {
145
+ "unet_time_proj": executor.submit(self.core.compile_model, os.path.join(model, "unet_time_proj.xml"), device[0]),
146
+ "text": executor.submit(self.load_model, model, "text_encoder", device[0]),
147
+ "unet": executor.submit(self.load_model, model, "unet_int8", device[1]),
148
+ "unet_neg": executor.submit(self.load_model, model, "unet_int8", device[2]) if device[1] != device[2] else None,
149
+ "vae_decoder": executor.submit(self.load_model, model, "vae_decoder", device[3]),
150
+ "vae_encoder": executor.submit(self.load_model, model, "vae_encoder", device[3])
151
+ }
152
+
153
+ self.unet_time_proj = futures["unet_time_proj"].result()
154
+ self.text_encoder = futures["text"].result()
155
+ self.unet = futures["unet"].result()
156
+ self.unet_neg = futures["unet_neg"].result() if futures["unet_neg"] else self.unet
157
+ self.vae_decoder = futures["vae_decoder"].result()
158
+ self.vae_encoder = futures["vae_encoder"].result()
159
+ print("Text Device:", device[0])
160
+ print("unet Device:", device[1])
161
+ print("unet-neg Device:", device[2])
162
+ print("VAE Device:", device[3])
163
+
164
+ self._text_encoder_output = self.text_encoder.output(0)
165
+ self._vae_d_output = self.vae_decoder.output(0)
166
+ self._vae_e_output = self.vae_encoder.output(0) if self.vae_encoder else None
167
+
168
+ self.set_dimensions()
169
+ self.infer_request_neg = self.unet_neg.create_infer_request()
170
+ self.infer_request = self.unet.create_infer_request()
171
+ self.infer_request_time_proj = self.unet_time_proj.create_infer_request()
172
+ self.time_proj_constants = np.load(os.path.join(model, "time_proj_constants.npy"))
173
+
174
+ def load_model(self, model, model_name, device):
175
+ if "NPU" in device:
176
+ with open(os.path.join(model, f"{model_name}.blob"), "rb") as f:
177
+ return self.core.import_model(f.read(), device)
178
+ return self.core.compile_model(os.path.join(model, f"{model_name}.xml"), device)
179
+
180
+ def set_dimensions(self):
181
+ latent_shape = self.unet.input("latent_model_input").shape
182
+ if latent_shape[1] == 4:
183
+ self.height = latent_shape[2] * 8
184
+ self.width = latent_shape[3] * 8
185
+ else:
186
+ self.height = latent_shape[1] * 8
187
+ self.width = latent_shape[2] * 8
188
+
189
+ def __call__(
190
+ self,
191
+ prompt,
192
+ init_image = None,
193
+ negative_prompt=None,
194
+ scheduler=None,
195
+ strength = 0.5,
196
+ num_inference_steps = 32,
197
+ guidance_scale = 7.5,
198
+ eta = 0.0,
199
+ create_gif = False,
200
+ model = None,
201
+ callback = None,
202
+ callback_userdata = None
203
+ ):
204
+
205
+ # extract condition
206
+ text_input = self.tokenizer(
207
+ prompt,
208
+ padding="max_length",
209
+ max_length=self.tokenizer.model_max_length,
210
+ truncation=True,
211
+ return_tensors="np",
212
+ )
213
+ text_embeddings = self.text_encoder(text_input.input_ids)[self._text_encoder_output]
214
+
215
+ # do classifier free guidance
216
+ do_classifier_free_guidance = guidance_scale > 1.0
217
+ if do_classifier_free_guidance:
218
+
219
+ if negative_prompt is None:
220
+ uncond_tokens = [""]
221
+ elif isinstance(negative_prompt, str):
222
+ uncond_tokens = [negative_prompt]
223
+ else:
224
+ uncond_tokens = negative_prompt
225
+
226
+ tokens_uncond = self.tokenizer(
227
+ uncond_tokens,
228
+ padding="max_length",
229
+ max_length=self.tokenizer.model_max_length, #truncation=True,
230
+ return_tensors="np"
231
+ )
232
+ uncond_embeddings = self.text_encoder(tokens_uncond.input_ids)[self._text_encoder_output]
233
+ text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
234
+
235
+ # set timesteps
236
+ accepts_offset = "offset" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
237
+ extra_set_kwargs = {}
238
+
239
+ if accepts_offset:
240
+ extra_set_kwargs["offset"] = 1
241
+
242
+ scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
243
+
244
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, scheduler)
245
+ latent_timestep = timesteps[:1]
246
+
247
+ # get the initial random noise unless the user supplied it
248
+ latents, meta = self.prepare_latents(init_image, latent_timestep, scheduler)
249
+
250
+
251
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
252
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
253
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
254
+ # and should be between [0, 1]
255
+ accepts_eta = "eta" in set(inspect.signature(scheduler.step).parameters.keys())
256
+ extra_step_kwargs = {}
257
+ if accepts_eta:
258
+ extra_step_kwargs["eta"] = eta
259
+ if create_gif:
260
+ frames = []
261
+
262
+ for i, t in enumerate(self.progress_bar(timesteps)):
263
+ if callback:
264
+ callback(i, callback_userdata)
265
+
266
+ # expand the latents if we are doing classifier free guidance
267
+ noise_pred = []
268
+ latent_model_input = latents
269
+ latent_model_input = scheduler.scale_model_input(latent_model_input, t)
270
+
271
+ latent_model_input_neg = latent_model_input
272
+ if self.unet.input("latent_model_input").shape[1] != 4:
273
+ #print("In transpose")
274
+ try:
275
+ latent_model_input = latent_model_input.permute(0,2,3,1)
276
+ except:
277
+ latent_model_input = latent_model_input.transpose(0,2,3,1)
278
+
279
+ if self.unet_neg.input("latent_model_input").shape[1] != 4:
280
+ #print("In transpose")
281
+ try:
282
+ latent_model_input_neg = latent_model_input_neg.permute(0,2,3,1)
283
+ except:
284
+ latent_model_input_neg = latent_model_input_neg.transpose(0,2,3,1)
285
+
286
+
287
+ time_proj_constants_fp16 = np.float16(self.time_proj_constants)
288
+ t_scaled_fp16 = time_proj_constants_fp16 * np.float16(t)
289
+ cosine_t_fp16 = np.cos(t_scaled_fp16)
290
+ sine_t_fp16 = np.sin(t_scaled_fp16)
291
+
292
+ t_scaled = self.time_proj_constants * np.float32(t)
293
+
294
+ cosine_t = np.cos(t_scaled)
295
+ sine_t = np.sin(t_scaled)
296
+
297
+ time_proj_dict = {"sine_t" : np.float32(sine_t), "cosine_t" : np.float32(cosine_t)}
298
+ self.infer_request_time_proj.start_async(time_proj_dict)
299
+ self.infer_request_time_proj.wait()
300
+ time_proj = self.infer_request_time_proj.get_output_tensor(0).data.astype(np.float32)
301
+
302
+ input_tens_neg_dict = {"time_proj": np.float32(time_proj), "latent_model_input":latent_model_input_neg, "encoder_hidden_states": np.expand_dims(text_embeddings[0], axis=0)}
303
+ input_tens_dict = {"time_proj": np.float32(time_proj), "latent_model_input":latent_model_input, "encoder_hidden_states": np.expand_dims(text_embeddings[1], axis=0)}
304
+
305
+ self.infer_request_neg.start_async(input_tens_neg_dict)
306
+ self.infer_request.start_async(input_tens_dict)
307
+ self.infer_request_neg.wait()
308
+ self.infer_request.wait()
309
+
310
+ noise_pred_neg = self.infer_request_neg.get_output_tensor(0)
311
+ noise_pred_pos = self.infer_request.get_output_tensor(0)
312
+
313
+ noise_pred.append(noise_pred_neg.data.astype(np.float32))
314
+ noise_pred.append(noise_pred_pos.data.astype(np.float32))
315
+
316
+ # perform guidance
317
+ if do_classifier_free_guidance:
318
+ noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1]
319
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
320
+
321
+ # compute the previous noisy sample x_t -> x_t-1
322
+ latents = scheduler.step(torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs)["prev_sample"].numpy()
323
+
324
+ if create_gif:
325
+ frames.append(latents)
326
+
327
+ if callback:
328
+ callback(num_inference_steps, callback_userdata)
329
+
330
+ # scale and decode the image latents with vae
331
+ latents = 1 / 0.18215 * latents
332
+
333
+ start = time.time()
334
+ image = self.vae_decoder(latents)[self._vae_d_output]
335
+ print("Decoder ended:",time.time() - start)
336
+
337
+ image = self.postprocess_image(image, meta)
338
+
339
+ if create_gif:
340
+ gif_folder=os.path.join(model,"../../../gif")
341
+ print("gif_folder:",gif_folder)
342
+ if not os.path.exists(gif_folder):
343
+ os.makedirs(gif_folder)
344
+ for i in range(0,len(frames)):
345
+ image = self.vae_decoder(frames[i]*(1/0.18215))[self._vae_d_output]
346
+ image = self.postprocess_image(image, meta)
347
+ output = gif_folder + "/" + str(i).zfill(3) +".png"
348
+ cv2.imwrite(output, image)
349
+ with open(os.path.join(gif_folder, "prompt.json"), "w") as file:
350
+ json.dump({"prompt": prompt}, file)
351
+ frames_image = [Image.open(image) for image in glob.glob(f"{gif_folder}/*.png")]
352
+ frame_one = frames_image[0]
353
+ gif_file=os.path.join(gif_folder,"stable_diffusion.gif")
354
+ frame_one.save(gif_file, format="GIF", append_images=frames_image, save_all=True, duration=100, loop=0)
355
+
356
+ return image
357
+
358
+ def prepare_latents(self, image:PIL.Image.Image = None, latent_timestep:torch.Tensor = None, scheduler = LMSDiscreteScheduler):
359
+ """
360
+ Function for getting initial latents for starting generation
361
+
362
+ Parameters:
363
+ image (PIL.Image.Image, *optional*, None):
364
+ Input image for generation, if not provided randon noise will be used as starting point
365
+ latent_timestep (torch.Tensor, *optional*, None):
366
+ Predicted by scheduler initial step for image generation, required for latent image mixing with nosie
367
+ Returns:
368
+ latents (np.ndarray):
369
+ Image encoded in latent space
370
+ """
371
+ latents_shape = (1, 4, self.height // 8, self.width // 8)
372
+
373
+ noise = np.random.randn(*latents_shape).astype(np.float32)
374
+ if image is None:
375
+ ##print("Image is NONE")
376
+ # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
377
+ if isinstance(scheduler, LMSDiscreteScheduler):
378
+
379
+ noise = noise * scheduler.sigmas[0].numpy()
380
+ return noise, {}
381
+ elif isinstance(scheduler, EulerDiscreteScheduler) or isinstance(scheduler,EulerAncestralDiscreteScheduler):
382
+
383
+ noise = noise * scheduler.sigmas.max().numpy()
384
+ return noise, {}
385
+ else:
386
+ return noise, {}
387
+ input_image, meta = preprocess(image,self.height,self.width)
388
+
389
+ moments = self.vae_encoder(input_image)[self._vae_e_output]
390
+
391
+ mean, logvar = np.split(moments, 2, axis=1)
392
+
393
+ std = np.exp(logvar * 0.5)
394
+ latents = (mean + std * np.random.randn(*mean.shape)) * 0.18215
395
+
396
+
397
+ latents = scheduler.add_noise(torch.from_numpy(latents), torch.from_numpy(noise), latent_timestep).numpy()
398
+ return latents, meta
399
+
400
+ def postprocess_image(self, image:np.ndarray, meta:Dict):
401
+ """
402
+ Postprocessing for the decoded image. Takes the generated image decoded by the VAE decoder, unpads it to the initial image size (if required),
403
+ normalizes and converts it to the [0, 255] pixel range. Optionally converts it from np.ndarray to PIL.Image format
404
+
405
+ Parameters:
406
+ image (np.ndarray):
407
+ Generated image
408
+ meta (Dict):
409
+ Metadata obtained on latents preparing step, can be empty
410
+ output_type (str, *optional*, pil):
411
+ Output format for result, can be pil or numpy
412
+ Returns:
413
+ image (List of np.ndarray or PIL.Image.Image):
414
+ Postprocessed images
415
+
416
+ if "src_height" in meta:
417
+ orig_height, orig_width = meta["src_height"], meta["src_width"]
418
+ image = [cv2.resize(img, (orig_width, orig_height))
419
+ for img in image]
420
+
421
+ return image
422
+ """
423
+ if "padding" in meta:
424
+ pad = meta["padding"]
425
+ (_, end_h), (_, end_w) = pad[1:3]
426
+ h, w = image.shape[2:]
427
+ #print("image shape",image.shape[2:])
428
+ unpad_h = h - end_h
429
+ unpad_w = w - end_w
430
+ image = image[:, :, :unpad_h, :unpad_w]
431
+ image = np.clip(image / 2 + 0.5, 0, 1)
432
+ image = (image[0].transpose(1, 2, 0)[:, :, ::-1] * 255).astype(np.uint8)
433
+
434
+
435
+
436
+ if "src_height" in meta:
437
+ orig_height, orig_width = meta["src_height"], meta["src_width"]
438
+ image = cv2.resize(image, (orig_width, orig_height))
439
+
440
+ return image
441
+
442
+
443
+
444
+
445
+ def get_timesteps(self, num_inference_steps:int, strength:float, scheduler):
446
+ """
447
+ Helper function for getting scheduler timesteps for generation
448
+ In case of image-to-image generation, it updates number of steps according to strength
449
+
450
+ Parameters:
451
+ num_inference_steps (int):
452
+ number of inference steps for generation
453
+ strength (float):
454
+ value between 0.0 and 1.0, that controls the amount of noise that is added to the input image.
455
+ Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input.
456
+ """
457
+ # get the original timestep using init_timestep
458
+
459
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
460
+
461
+ t_start = max(num_inference_steps - init_timestep, 0)
462
+ timesteps = scheduler.timesteps[t_start:]
463
+
464
+ return timesteps, num_inference_steps - t_start
465
+
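To make the strength/steps trade-off above concrete (the values are illustrative):

num_inference_steps, strength = 32, 0.5  # illustrative values
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # 16
t_start = max(num_inference_steps - init_timestep, 0)                          # 16
# Only scheduler.timesteps[t_start:] are run, so the img2img pass denoises
# over roughly the last half of the schedule.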
466
+ class StableDiffusionEngine(DiffusionPipeline):
467
+ def __init__(
468
+ self,
469
+ model="bes-dev/stable-diffusion-v1-4-openvino",
470
+ tokenizer="openai/clip-vit-large-patch14",
471
+ device=["CPU","CPU","CPU","CPU"]):
472
+
473
+ self.core = Core()
474
+ self.core.set_property({'CACHE_DIR': os.path.join(model, 'cache')})
475
+
476
+ self.batch_size = 2 if device[1] == device[2] and device[1] == "GPU" else 1
477
+ try_enable_npu_turbo(device, self.core)
478
+
479
+ try:
480
+ self.tokenizer = CLIPTokenizer.from_pretrained(model, local_files_only=True)
481
+ except Exception as e:
482
+ print("Local tokenizer not found. Attempting to download...")
483
+ self.tokenizer = self.download_tokenizer(tokenizer, model)
484
+
485
+ print("Loading models... ")
486
+
487
+ with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
488
+ text_future = executor.submit(self.load_model, model, "text_encoder", device[0])
489
+ vae_de_future = executor.submit(self.load_model, model, "vae_decoder", device[3])
490
+ vae_en_future = executor.submit(self.load_model, model, "vae_encoder", device[3])
491
+
492
+ if self.batch_size == 1:
493
+ if "int8" not in model:
494
+ unet_future = executor.submit(self.load_model, model, "unet_bs1", device[1])
495
+ unet_neg_future = executor.submit(self.load_model, model, "unet_bs1", device[2]) if device[1] != device[2] else None
496
+ else:
497
+ unet_future = executor.submit(self.load_model, model, "unet_int8a16", device[1])
498
+ unet_neg_future = executor.submit(self.load_model, model, "unet_int8a16", device[2]) if device[1] != device[2] else None
499
+ else:
500
+ unet_future = executor.submit(self.load_model, model, "unet", device[1])
501
+ unet_neg_future = None
502
+
503
+ self.unet = unet_future.result()
504
+ self.unet_neg = unet_neg_future.result() if unet_neg_future else self.unet
505
+ self.text_encoder = text_future.result()
506
+ self.vae_decoder = vae_de_future.result()
507
+ self.vae_encoder = vae_en_future.result()
508
+ print("Text Device:", device[0])
509
+ print("unet Device:", device[1])
510
+ print("unet-neg Device:", device[2])
511
+ print("VAE Device:", device[3])
512
+
513
+ self._text_encoder_output = self.text_encoder.output(0)
514
+ self._unet_output = self.unet.output(0)
515
+ self._vae_d_output = self.vae_decoder.output(0)
516
+ self._vae_e_output = self.vae_encoder.output(0) if self.vae_encoder else None
517
+
518
+ self.unet_input_tensor_name = "sample" if 'sample' in self.unet.input(0).names else "latent_model_input"
519
+
520
+ if self.batch_size == 1:
521
+ self.infer_request = self.unet.create_infer_request()
522
+ self.infer_request_neg = self.unet_neg.create_infer_request()
523
+ self._unet_neg_output = self.unet_neg.output(0)
524
+ else:
525
+ self.infer_request = None
526
+ self.infer_request_neg = None
527
+ self._unet_neg_output = None
528
+
529
+ self.set_dimensions()
530
+
531
+
532
+
533
+ def load_model(self, model, model_name, device):
534
+ if "NPU" in device:
535
+ with open(os.path.join(model, f"{model_name}.blob"), "rb") as f:
536
+ return self.core.import_model(f.read(), device)
537
+ return self.core.compile_model(os.path.join(model, f"{model_name}.xml"), device)
538
+
539
+ def set_dimensions(self):
540
+ latent_shape = self.unet.input(self.unet_input_tensor_name).shape
541
+ if latent_shape[1] == 4:
542
+ self.height = latent_shape[2] * 8
543
+ self.width = latent_shape[3] * 8
544
+ else:
545
+ self.height = latent_shape[1] * 8
546
+ self.width = latent_shape[2] * 8
547
+
548
+ def __call__(
549
+ self,
550
+ prompt,
551
+ init_image=None,
552
+ negative_prompt=None,
553
+ scheduler=None,
554
+ strength=0.5,
555
+ num_inference_steps=32,
556
+ guidance_scale=7.5,
557
+ eta=0.0,
558
+ create_gif=False,
559
+ model=None,
560
+ callback=None,
561
+ callback_userdata=None
562
+ ):
563
+ # extract condition
564
+ text_input = self.tokenizer(
565
+ prompt,
566
+ padding="max_length",
567
+ max_length=self.tokenizer.model_max_length,
568
+ truncation=True,
569
+ return_tensors="np",
570
+ )
571
+ text_embeddings = self.text_encoder(text_input.input_ids)[self._text_encoder_output]
572
+
573
+
574
+ # do classifier free guidance
575
+ do_classifier_free_guidance = guidance_scale > 1.0
576
+ if do_classifier_free_guidance:
577
+ if negative_prompt is None:
578
+ uncond_tokens = [""]
579
+ elif isinstance(negative_prompt, str):
580
+ uncond_tokens = [negative_prompt]
581
+ else:
582
+ uncond_tokens = negative_prompt
583
+
584
+ tokens_uncond = self.tokenizer(
585
+ uncond_tokens,
586
+ padding="max_length",
587
+ max_length=self.tokenizer.model_max_length, # truncation=True,
588
+ return_tensors="np"
589
+ )
590
+ uncond_embeddings = self.text_encoder(tokens_uncond.input_ids)[self._text_encoder_output]
591
+ text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
592
+
593
+ # set timesteps
594
+ accepts_offset = "offset" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
595
+ extra_set_kwargs = {}
596
+
597
+ if accepts_offset:
598
+ extra_set_kwargs["offset"] = 1
599
+
600
+ scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
601
+
602
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, scheduler)
603
+ latent_timestep = timesteps[:1]
604
+
605
+ # get the initial random noise unless the user supplied it
606
+ latents, meta = self.prepare_latents(init_image, latent_timestep, scheduler,model)
607
+
608
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
609
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
610
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
611
+ # and should be between [0, 1]
612
+ accepts_eta = "eta" in set(inspect.signature(scheduler.step).parameters.keys())
613
+ extra_step_kwargs = {}
614
+ if accepts_eta:
615
+ extra_step_kwargs["eta"] = eta
616
+ if create_gif:
617
+ frames = []
618
+
619
+ for i, t in enumerate(self.progress_bar(timesteps)):
620
+ if callback:
621
+ callback(i, callback_userdata)
622
+
623
+ if self.batch_size == 1:
624
+ # expand the latents if we are doing classifier free guidance
625
+ noise_pred = []
626
+ latent_model_input = latents
627
+
628
+ #Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
629
+ latent_model_input = scheduler.scale_model_input(latent_model_input, t)
630
+ latent_model_input_pos = latent_model_input
631
+ latent_model_input_neg = latent_model_input
632
+
633
+ if self.unet.input(self.unet_input_tensor_name).shape[1] != 4:
634
+ try:
635
+ latent_model_input_pos = latent_model_input_pos.permute(0,2,3,1)
636
+ except:
637
+ latent_model_input_pos = latent_model_input_pos.transpose(0,2,3,1)
638
+
639
+ if self.unet_neg.input(self.unet_input_tensor_name).shape[1] != 4:
640
+ try:
641
+ latent_model_input_neg = latent_model_input_neg.permute(0,2,3,1)
642
+ except:
643
+ latent_model_input_neg = latent_model_input_neg.transpose(0,2,3,1)
644
+
645
+ if "sample" in self.unet_input_tensor_name:
646
+ input_tens_neg_dict = {"sample" : latent_model_input_neg, "encoder_hidden_states": np.expand_dims(text_embeddings[0], axis=0), "timestep": np.expand_dims(np.float32(t), axis=0)}
647
+ input_tens_pos_dict = {"sample" : latent_model_input_pos, "encoder_hidden_states": np.expand_dims(text_embeddings[1], axis=0), "timestep": np.expand_dims(np.float32(t), axis=0)}
648
+ else:
649
+ input_tens_neg_dict = {"latent_model_input" : latent_model_input_neg, "encoder_hidden_states": np.expand_dims(text_embeddings[0], axis=0), "t": np.expand_dims(np.float32(t), axis=0)}
650
+ input_tens_pos_dict = {"latent_model_input" : latent_model_input_pos, "encoder_hidden_states": np.expand_dims(text_embeddings[1], axis=0), "t": np.expand_dims(np.float32(t), axis=0)}
651
+
652
+ self.infer_request_neg.start_async(input_tens_neg_dict)
653
+ self.infer_request.start_async(input_tens_pos_dict)
654
+
655
+ self.infer_request_neg.wait()
656
+ self.infer_request.wait()
657
+
658
+ noise_pred_neg = self.infer_request_neg.get_output_tensor(0)
659
+ noise_pred_pos = self.infer_request.get_output_tensor(0)
660
+
661
+ noise_pred.append(noise_pred_neg.data.astype(np.float32))
662
+ noise_pred.append(noise_pred_pos.data.astype(np.float32))
663
+ else:
664
+ latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
665
+ latent_model_input = scheduler.scale_model_input(latent_model_input, t)
666
+ noise_pred = self.unet([latent_model_input, np.array(t, dtype=np.float32), text_embeddings])[self._unet_output]
667
+
668
+ if do_classifier_free_guidance:
669
+ noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1]
670
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
671
+
672
+ # compute the previous noisy sample x_t -> x_t-1
673
+ latents = scheduler.step(torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs)["prev_sample"].numpy()
674
+
675
+ if create_gif:
676
+ frames.append(latents)
677
+
678
+ if callback:
679
+ callback(num_inference_steps, callback_userdata)
680
+
681
+ # scale and decode the image latents with vae
682
+ #if self.height == 512 and self.width == 512:
683
+ latents = 1 / 0.18215 * latents
684
+ image = self.vae_decoder(latents)[self._vae_d_output]
685
+ image = self.postprocess_image(image, meta)
686
+
687
+ return image
688
+
689
+ def prepare_latents(self, image: PIL.Image.Image = None, latent_timestep: torch.Tensor = None,
690
+ scheduler=LMSDiscreteScheduler,model=None):
691
+ """
692
+ Function for getting initial latents for starting generation
693
+
694
+ Parameters:
695
+ image (PIL.Image.Image, *optional*, None):
696
+ Input image for generation, if not provided randon noise will be used as starting point
697
+ latent_timestep (torch.Tensor, *optional*, None):
698
+ Predicted by scheduler initial step for image generation, required for latent image mixing with nosie
699
+ Returns:
700
+ latents (np.ndarray):
701
+ Image encoded in latent space
702
+ """
703
+ latents_shape = (1, 4, self.height // 8, self.width // 8)
704
+
705
+ noise = np.random.randn(*latents_shape).astype(np.float32)
706
+ if image is None:
707
+ #print("Image is NONE")
708
+ # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
709
+ if isinstance(scheduler, LMSDiscreteScheduler):
710
+
711
+ noise = noise * scheduler.sigmas[0].numpy()
712
+ return noise, {}
713
+ elif isinstance(scheduler, EulerDiscreteScheduler):
714
+
715
+ noise = noise * scheduler.sigmas.max().numpy()
716
+ return noise, {}
717
+ else:
718
+ return noise, {}
719
+ input_image, meta = preprocess(image, self.height, self.width)
720
+
721
+ moments = self.vae_encoder(input_image)[self._vae_e_output]
722
+
723
+ if "sd_2.1" in model:
724
+ latents = moments * 0.18215
725
+
726
+ else:
727
+
728
+ mean, logvar = np.split(moments, 2, axis=1)
729
+
730
+ std = np.exp(logvar * 0.5)
731
+ latents = (mean + std * np.random.randn(*mean.shape)) * 0.18215
732
+
733
+ latents = scheduler.add_noise(torch.from_numpy(latents), torch.from_numpy(noise), latent_timestep).numpy()
734
+ return latents, meta
735
+
736
+
737
+ def postprocess_image(self, image: np.ndarray, meta: Dict):
738
+ """
739
+ Postprocessing for the decoded image. Takes the generated image decoded by the VAE decoder, unpads it to the initial image size (if required),
740
+ normalizes and converts it to the [0, 255] pixel range. Optionally converts it from np.ndarray to PIL.Image format
741
+
742
+ Parameters:
743
+ image (np.ndarray):
744
+ Generated image
745
+ meta (Dict):
746
+ Metadata obtained on latents preparing step, can be empty
747
+ output_type (str, *optional*, pil):
748
+ Output format for result, can be pil or numpy
749
+ Returns:
750
+ image (List of np.ndarray or PIL.Image.Image):
751
+ Postprocessed images
752
+
753
+ if "src_height" in meta:
754
+ orig_height, orig_width = meta["src_height"], meta["src_width"]
755
+ image = [cv2.resize(img, (orig_width, orig_height))
756
+ for img in image]
757
+
758
+ return image
759
+ """
760
+ if "padding" in meta:
761
+ pad = meta["padding"]
762
+ (_, end_h), (_, end_w) = pad[1:3]
763
+ h, w = image.shape[2:]
764
+ # print("image shape",image.shape[2:])
765
+ unpad_h = h - end_h
766
+ unpad_w = w - end_w
767
+ image = image[:, :, :unpad_h, :unpad_w]
768
+ image = np.clip(image / 2 + 0.5, 0, 1)
769
+ image = (image[0].transpose(1, 2, 0)[:, :, ::-1] * 255).astype(np.uint8)
770
+
771
+ if "src_height" in meta:
772
+ orig_height, orig_width = meta["src_height"], meta["src_width"]
773
+ image = cv2.resize(image, (orig_width, orig_height))
774
+
775
+ return image
776
+
777
+ # image = (image / 2 + 0.5).clip(0, 1)
778
+ # image = (image[0].transpose(1, 2, 0)[:, :, ::-1] * 255).astype(np.uint8)
779
+
780
+ def get_timesteps(self, num_inference_steps: int, strength: float, scheduler):
781
+ """
782
+ Helper function for getting scheduler timesteps for generation
783
+ In case of image-to-image generation, it updates number of steps according to strength
784
+
785
+ Parameters:
786
+ num_inference_steps (int):
787
+ number of inference steps for generation
788
+ strength (float):
789
+ value between 0.0 and 1.0, that controls the amount of noise that is added to the input image.
790
+ Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input.
791
+ """
792
+ # get the original timestep using init_timestep
793
+
794
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
795
+
796
+ t_start = max(num_inference_steps - init_timestep, 0)
797
+ timesteps = scheduler.timesteps[t_start:]
798
+
799
+ return timesteps, num_inference_steps - t_start
800
+
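The classifier-free guidance step shared by both engines above reduces to one blend of the two UNet outputs; a minimal NumPy sketch follows, with placeholder shapes and values:

import numpy as np

guidance_scale = 7.5
noise_pred_uncond = np.zeros((1, 4, 64, 64), dtype=np.float32)  # UNet output for the negative prompt (placeholder)
noise_pred_text = np.ones((1, 4, 64, 64), dtype=np.float32)     # UNet output for the positive prompt (placeholder)

# uncond + w * (text - uncond): w > 1 pushes the prediction toward the prompt.
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
assert noise_pred.shape == (1, 4, 64, 64)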
801
+ class LatentConsistencyEngine(DiffusionPipeline):
802
+ def __init__(
803
+ self,
804
+ model="SimianLuo/LCM_Dreamshaper_v7",
805
+ tokenizer="openai/clip-vit-large-patch14",
806
+ device=["CPU", "CPU", "CPU"],
807
+ ):
808
+ super().__init__()
809
+ try:
810
+ self.tokenizer = CLIPTokenizer.from_pretrained(model, local_files_only=True)
811
+ except:
812
+ self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer)
813
+ self.tokenizer.save_pretrained(model)
814
+
815
+ self.core = Core()
816
+ self.core.set_property({'CACHE_DIR': os.path.join(model, 'cache')}) # adding caching to reduce init time
817
+ try_enable_npu_turbo(device, self.core)
818
+
819
+
820
+ with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
821
+ text_future = executor.submit(self.load_model, model, "text_encoder", device[0])
822
+ unet_future = executor.submit(self.load_model, model, "unet", device[1])
823
+ vae_de_future = executor.submit(self.load_model, model, "vae_decoder", device[2])
824
+
825
+ print("Text Device:", device[0])
826
+ self.text_encoder = text_future.result()
827
+ self._text_encoder_output = self.text_encoder.output(0)
828
+
829
+ print("Unet Device:", device[1])
830
+ self.unet = unet_future.result()
831
+ self._unet_output = self.unet.output(0)
832
+ self.infer_request = self.unet.create_infer_request()
833
+
834
+ print(f"VAE Device: {device[2]}")
835
+ self.vae_decoder = vae_de_future.result()
836
+ self.infer_request_vae = self.vae_decoder.create_infer_request()
837
+ self.safety_checker = None #pipe.safety_checker
838
+ self.feature_extractor = None #pipe.feature_extractor
839
+ self.vae_scale_factor = 2 ** 3
840
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
841
+
842
+ def load_model(self, model, model_name, device):
843
+ if "NPU" in device:
844
+ with open(os.path.join(model, f"{model_name}.blob"), "rb") as f:
845
+ return self.core.import_model(f.read(), device)
846
+ return self.core.compile_model(os.path.join(model, f"{model_name}.xml"), device)
847
+
848
+ def _encode_prompt(
849
+ self,
850
+ prompt,
851
+ num_images_per_prompt,
852
+ prompt_embeds: None,
853
+ ):
854
+ r"""
855
+ Encodes the prompt into text encoder hidden states.
856
+ Args:
857
+ prompt (`str` or `List[str]`, *optional*):
858
+ prompt to be encoded
859
+ num_images_per_prompt (`int`):
860
+ number of images that should be generated per prompt
861
+ prompt_embeds (`torch.FloatTensor`, *optional*):
862
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
863
+ provided, text embeddings will be generated from `prompt` input argument.
864
+ """
865
+
866
+ if prompt_embeds is None:
867
+
868
+ text_inputs = self.tokenizer(
869
+ prompt,
870
+ padding="max_length",
871
+ max_length=self.tokenizer.model_max_length,
872
+ truncation=True,
873
+ return_tensors="pt",
874
+ )
875
+ text_input_ids = text_inputs.input_ids
876
+ untruncated_ids = self.tokenizer(
877
+ prompt, padding="longest", return_tensors="pt"
878
+ ).input_ids
879
+
880
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[
881
+ -1
882
+ ] and not torch.equal(text_input_ids, untruncated_ids):
883
+ removed_text = self.tokenizer.batch_decode(
884
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
885
+ )
886
+ logger.warning(
887
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
888
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
889
+ )
890
+
891
+ prompt_embeds = self.text_encoder(text_input_ids, share_inputs=True, share_outputs=True)
892
+ prompt_embeds = torch.from_numpy(prompt_embeds[0])
893
+
894
+ bs_embed, seq_len, _ = prompt_embeds.shape
895
+ # duplicate text embeddings for each generation per prompt
896
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
897
+ prompt_embeds = prompt_embeds.view(
898
+ bs_embed * num_images_per_prompt, seq_len, -1
899
+ )
900
+
901
+ # Don't need to get uncond prompt embedding because of LCM Guided Distillation
902
+ return prompt_embeds
903
+
904
+ def run_safety_checker(self, image, dtype):
905
+ if self.safety_checker is None:
906
+ has_nsfw_concept = None
907
+ else:
908
+ if torch.is_tensor(image):
909
+ feature_extractor_input = self.image_processor.postprocess(
910
+ image, output_type="pil"
911
+ )
912
+ else:
913
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
914
+ safety_checker_input = self.feature_extractor(
915
+ feature_extractor_input, return_tensors="pt"
916
+ )
917
+ image, has_nsfw_concept = self.safety_checker(
918
+ images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
919
+ )
920
+ return image, has_nsfw_concept
921
+
922
+ def prepare_latents(
923
+ self, batch_size, num_channels_latents, height, width, dtype, latents=None
924
+ ):
925
+ shape = (
926
+ batch_size,
927
+ num_channels_latents,
928
+ height // self.vae_scale_factor,
929
+ width // self.vae_scale_factor,
930
+ )
931
+ if latents is None:
932
+ latents = torch.randn(shape, dtype=dtype)
933
+ # scale the initial noise by the standard deviation required by the scheduler
934
+ return latents
935
+
936
+ def get_w_embedding(self, w, embedding_dim=512, dtype=torch.float32):
937
+ """
938
+ see https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
939
+ Args:
940
+ timesteps: torch.Tensor: generate embedding vectors at these timesteps
941
+ embedding_dim: int: dimension of the embeddings to generate
942
+ dtype: data type of the generated embeddings
943
+ Returns:
944
+ embedding vectors with shape `(len(timesteps), embedding_dim)`
945
+ """
946
+ assert len(w.shape) == 1
947
+ w = w * 1000.0
948
+
949
+ half_dim = embedding_dim // 2
950
+ emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
951
+ emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
952
+ emb = w.to(dtype)[:, None] * emb[None, :]
953
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
954
+ if embedding_dim % 2 == 1: # zero pad
955
+ emb = torch.nn.functional.pad(emb, (0, 1))
956
+ assert emb.shape == (w.shape[0], embedding_dim)
957
+ return emb
958
+
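A standalone check of the guidance-embedding math above, assuming batch size 2 and guidance_scale 8.0 (both illustrative):

import torch

w = torch.tensor(8.0).repeat(2) * 1000.0
half_dim = 256 // 2
emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
emb = w[:, None] * emb[None, :]
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
assert emb.shape == (2, 256)  # same result as get_w_embedding(w, embedding_dim=256)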
959
+ @torch.no_grad()
960
+ def __call__(
961
+ self,
962
+ prompt: Union[str, List[str]] = None,
963
+ height: Optional[int] = 512,
964
+ width: Optional[int] = 512,
965
+ guidance_scale: float = 7.5,
966
+ scheduler = None,
967
+ num_images_per_prompt: Optional[int] = 1,
968
+ latents: Optional[torch.FloatTensor] = None,
969
+ num_inference_steps: int = 4,
970
+ lcm_origin_steps: int = 50,
971
+ prompt_embeds: Optional[torch.FloatTensor] = None,
972
+ output_type: Optional[str] = "pil",
973
+ return_dict: bool = True,
974
+ model: Optional[Dict[str, any]] = None,
975
+ seed: Optional[int] = 1234567,
976
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
977
+ callback = None,
978
+ callback_userdata = None
979
+ ):
980
+
981
+ # 1. Define call parameters
982
+ if prompt is not None and isinstance(prompt, str):
983
+ batch_size = 1
984
+ elif prompt is not None and isinstance(prompt, list):
985
+ batch_size = len(prompt)
986
+ else:
987
+ batch_size = prompt_embeds.shape[0]
988
+
989
+ if seed is not None:
990
+ torch.manual_seed(seed)
991
+
992
+ #print("After Step 1: batch size is ", batch_size)
993
+ # do_classifier_free_guidance = guidance_scale > 0.0
994
+ # In LCM Implementation: cfg_noise = noise_cond + cfg_scale * (noise_cond - noise_uncond) , (cfg_scale > 0.0 using CFG)
995
+
996
+ # 2. Encode input prompt
997
+ prompt_embeds = self._encode_prompt(
998
+ prompt,
999
+ num_images_per_prompt,
1000
+ prompt_embeds=prompt_embeds,
1001
+ )
1002
+ #print("After Step 2: prompt embeds is ", prompt_embeds)
1003
+ #print("After Step 2: scheduler is ", scheduler )
1004
+ # 3. Prepare timesteps
1005
+ scheduler.set_timesteps(num_inference_steps, original_inference_steps=lcm_origin_steps)
1006
+ timesteps = scheduler.timesteps
1007
+
1008
+ #print("After Step 3: timesteps is ", timesteps)
1009
+
1010
+ # 4. Prepare latent variable
1011
+ num_channels_latents = 4
1012
+ latents = self.prepare_latents(
1013
+ batch_size * num_images_per_prompt,
1014
+ num_channels_latents,
1015
+ height,
1016
+ width,
1017
+ prompt_embeds.dtype,
1018
+ latents,
1019
+ )
1020
+ latents = latents * scheduler.init_noise_sigma
1021
+
1022
+ #print("After Step 4: ")
1023
+ bs = batch_size * num_images_per_prompt
1024
+
1025
+ # 5. Get Guidance Scale Embedding
1026
+ w = torch.tensor(guidance_scale).repeat(bs)
1027
+ w_embedding = self.get_w_embedding(w, embedding_dim=256)
1028
+ #print("After Step 5: ")
1029
+ # 6. LCM MultiStep Sampling Loop:
1030
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
1031
+ for i, t in enumerate(timesteps):
1032
+ if callback:
1033
+ callback(i+1, callback_userdata)
1034
+
1035
+ ts = torch.full((bs,), t, dtype=torch.long)
1036
+
1037
+ # model prediction (v-prediction, eps, x)
1038
+ model_pred = self.unet([latents, ts, prompt_embeds, w_embedding],share_inputs=True, share_outputs=True)[0]
1039
+
1040
+ # compute the previous noisy sample x_t -> x_t-1
1041
+ latents, denoised = scheduler.step(
1042
+ torch.from_numpy(model_pred), t, latents, return_dict=False
1043
+ )
1044
+ progress_bar.update()
1045
+
1046
+ #print("After Step 6: ")
1047
+
1048
+ vae_start = time.time()
1049
+
1050
+ if not output_type == "latent":
1051
+ image = torch.from_numpy(self.vae_decoder(denoised / 0.18215, share_inputs=True, share_outputs=True)[0])
1052
+ else:
1053
+ image = denoised
1054
+
1055
+ print("Decoder Ended: ", time.time() - vae_start)
1056
+ #post_start = time.time()
1057
+
1058
+ #if has_nsfw_concept is None:
1059
+ do_denormalize = [True] * image.shape[0]
1060
+ #else:
1061
+ # do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
1062
+
1063
+ #print ("After do_denormalize: image is ", image)
1064
+
1065
+ image = self.image_processor.postprocess(
1066
+ image, output_type=output_type, do_denormalize=do_denormalize
1067
+ )
1068
+
1069
+ return image[0]
1070
+
1071
+ class LatentConsistencyEngineAdvanced(DiffusionPipeline):
1072
+ def __init__(
1073
+ self,
1074
+ model="SimianLuo/LCM_Dreamshaper_v7",
1075
+ tokenizer="openai/clip-vit-large-patch14",
1076
+ device=["CPU", "CPU", "CPU"],
1077
+ ):
1078
+ super().__init__()
1079
+ try:
1080
+ self.tokenizer = CLIPTokenizer.from_pretrained(model, local_files_only=True)
1081
+ except:
1082
+ self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer)
1083
+ self.tokenizer.save_pretrained(model)
1084
+
1085
+ self.core = Core()
1086
+ self.core.set_property({'CACHE_DIR': os.path.join(model, 'cache')}) # adding caching to reduce init time
1087
+ #try_enable_npu_turbo(device, self.core)
1088
+
1089
+
1090
+ with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
1091
+ text_future = executor.submit(self.load_model, model, "text_encoder", device[0])
1092
+ unet_future = executor.submit(self.load_model, model, "unet", device[1])
1093
+ vae_de_future = executor.submit(self.load_model, model, "vae_decoder", device[2])
1094
+ vae_encoder_future = executor.submit(self.load_model, model, "vae_encoder", device[2])
1095
+
1096
+
1097
+ print("Text Device:", device[0])
1098
+ self.text_encoder = text_future.result()
1099
+ self._text_encoder_output = self.text_encoder.output(0)
1100
+
1101
+ print("Unet Device:", device[1])
1102
+ self.unet = unet_future.result()
1103
+ self._unet_output = self.unet.output(0)
1104
+ self.infer_request = self.unet.create_infer_request()
1105
+
1106
+ print(f"VAE Device: {device[2]}")
1107
+ self.vae_decoder = vae_de_future.result()
1108
+ self.vae_encoder = vae_encoder_future.result()
1109
+ self._vae_e_output = self.vae_encoder.output(0) if self.vae_encoder else None
1110
+
1111
+ self.infer_request_vae = self.vae_decoder.create_infer_request()
1112
+ self.safety_checker = None #pipe.safety_checker
1113
+ self.feature_extractor = None #pipe.feature_extractor
1114
+ self.vae_scale_factor = 2 ** 3
1115
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
1116
+
1117
+ def load_model(self, model, model_name, device):
1118
+ print(f"Compiling the {model_name} to {device} ...")
1119
+ return self.core.compile_model(os.path.join(model, f"{model_name}.xml"), device)
1120
+
1121
+ def get_timesteps(self, num_inference_steps:int, strength:float, scheduler):
1122
+ """
1123
+ Helper function for getting scheduler timesteps for generation
1124
+ In case of image-to-image generation, it updates number of steps according to strength
1125
+
1126
+ Parameters:
1127
+ num_inference_steps (int):
1128
+ number of inference steps for generation
1129
+ strength (float):
1130
+ value between 0.0 and 1.0, that controls the amount of noise that is added to the input image.
1131
+ Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input.
1132
+ """
1133
+ # get the original timestep using init_timestep
1134
+
1135
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
1136
+
1137
+ t_start = max(num_inference_steps - init_timestep, 0)
1138
+ timesteps = scheduler.timesteps[t_start:]
1139
+
1140
+ return timesteps, num_inference_steps - t_start
1141
+
1142
+ def _encode_prompt(
1143
+ self,
1144
+ prompt,
1145
+ num_images_per_prompt,
1146
+ prompt_embeds: None,
1147
+ ):
1148
+ r"""
1149
+ Encodes the prompt into text encoder hidden states.
1150
+ Args:
1151
+ prompt (`str` or `List[str]`, *optional*):
1152
+ prompt to be encoded
1153
+ num_images_per_prompt (`int`):
1154
+ number of images that should be generated per prompt
1155
+ prompt_embeds (`torch.FloatTensor`, *optional*):
1156
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
1157
+ provided, text embeddings will be generated from `prompt` input argument.
1158
+ """
1159
+
1160
+ if prompt_embeds is None:
1161
+
1162
+ text_inputs = self.tokenizer(
1163
+ prompt,
1164
+ padding="max_length",
1165
+ max_length=self.tokenizer.model_max_length,
1166
+ truncation=True,
1167
+ return_tensors="pt",
1168
+ )
1169
+ text_input_ids = text_inputs.input_ids
1170
+ untruncated_ids = self.tokenizer(
1171
+ prompt, padding="longest", return_tensors="pt"
1172
+ ).input_ids
1173
+
1174
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[
1175
+ -1
1176
+ ] and not torch.equal(text_input_ids, untruncated_ids):
1177
+ removed_text = self.tokenizer.batch_decode(
1178
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
1179
+ )
1180
+ logger.warning(
1181
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
1182
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
1183
+ )
1184
+
1185
+ prompt_embeds = self.text_encoder(text_input_ids, share_inputs=True, share_outputs=True)
1186
+ prompt_embeds = torch.from_numpy(prompt_embeds[0])
1187
+
1188
+ bs_embed, seq_len, _ = prompt_embeds.shape
1189
+ # duplicate text embeddings for each generation per prompt
1190
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
1191
+ prompt_embeds = prompt_embeds.view(
1192
+ bs_embed * num_images_per_prompt, seq_len, -1
1193
+ )
1194
+
1195
+ # Don't need to get uncond prompt embedding because of LCM Guided Distillation
1196
+ return prompt_embeds
1197
+
1198
+ def run_safety_checker(self, image, dtype):
1199
+ if self.safety_checker is None:
1200
+ has_nsfw_concept = None
1201
+ else:
1202
+ if torch.is_tensor(image):
1203
+ feature_extractor_input = self.image_processor.postprocess(
1204
+ image, output_type="pil"
1205
+ )
1206
+ else:
1207
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
1208
+ safety_checker_input = self.feature_extractor(
1209
+ feature_extractor_input, return_tensors="pt"
1210
+ )
1211
+ image, has_nsfw_concept = self.safety_checker(
1212
+ images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
1213
+ )
1214
+ return image, has_nsfw_concept
1215
+
1216
+ def prepare_latents(
1217
+ self,image,timestep,batch_size, num_channels_latents, height, width, dtype, scheduler,latents=None,
1218
+ ):
1219
+ shape = (
1220
+ batch_size,
1221
+ num_channels_latents,
1222
+ height // self.vae_scale_factor,
1223
+ width // self.vae_scale_factor,
1224
+ )
1225
+ if image:
1226
+ #latents_shape = (1, 4, 512, 512 // 8)
1227
+ #input_image, meta = preprocess(image,512,512)
1228
+ latents_shape = (1, 4, 512 // 8, 512 // 8)
1229
+ noise = np.random.randn(*latents_shape).astype(np.float32)
1230
+ input_image,meta = preprocess(image,512,512)
1231
+ moments = self.vae_encoder(input_image)[self._vae_e_output]
1232
+ mean, logvar = np.split(moments, 2, axis=1)
1233
+ std = np.exp(logvar * 0.5)
1234
+ latents = (mean + std * np.random.randn(*mean.shape)) * 0.18215
1235
+ noise = torch.randn(shape, dtype=dtype)
1236
+ #latents = scheduler.add_noise(init_latents, noise, timestep)
1237
+ latents = scheduler.add_noise(torch.from_numpy(latents), noise, timestep)
1238
+
1239
+ else:
1240
+ latents = torch.randn(shape, dtype=dtype)
1241
+ # scale the initial noise by the standard deviation required by the scheduler
1242
+ return latents
1243
+
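The image branch above encodes the input through the VAE and samples a latent with the reparameterization trick before noising it. A minimal NumPy-only sketch of that sampling step, with a random array standing in for the OpenVINO vae_encoder output:

# illustrative sketch only; 'moments' stands in for the VAE encoder output
import numpy as np

moments = np.random.randn(1, 8, 64, 64).astype(np.float32)  # concatenated [mean, logvar]
mean, logvar = np.split(moments, 2, axis=1)
std = np.exp(logvar * 0.5)
latents = (mean + std * np.random.randn(*mean.shape)) * 0.18215  # SD latent scaling factor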
1244
+ def get_w_embedding(self, w, embedding_dim=512, dtype=torch.float32):
1245
+ """
1246
+ see https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
1247
+ Args:
1248
+ timesteps: torch.Tensor: generate embedding vectors at these timesteps
1249
+ embedding_dim: int: dimension of the embeddings to generate
1250
+ dtype: data type of the generated embeddings
1251
+ Returns:
1252
+ embedding vectors with shape `(len(timesteps), embedding_dim)`
1253
+ """
1254
+ assert len(w.shape) == 1
1255
+ w = w * 1000.0
1256
+
1257
+ half_dim = embedding_dim // 2
1258
+ emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
1259
+ emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
1260
+ emb = w.to(dtype)[:, None] * emb[None, :]
1261
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
1262
+ if embedding_dim % 2 == 1: # zero pad
1263
+ emb = torch.nn.functional.pad(emb, (0, 1))
1264
+ assert emb.shape == (w.shape[0], embedding_dim)
1265
+ return emb
1266
+
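A quick self-contained check of the embedding above: a single guidance scale w maps to a sinusoidal vector of length embedding_dim, matching what the UNet receives as w_embedding (the dimension mirrors the call in __call__ below but is otherwise arbitrary):

# illustrative sketch only
import torch

w = torch.tensor([8.0]) * 1000.0            # guidance scale, scaled as in get_w_embedding
half_dim = 256 // 2
emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
emb = w[:, None] * emb[None, :]
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
assert emb.shape == (1, 256)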
1267
+ @torch.no_grad()
1268
+ def __call__(
1269
+ self,
1270
+ prompt: Union[str, List[str]] = None,
1271
+ init_image: Optional[PIL.Image.Image] = None,
1272
+ strength: Optional[float] = 0.8,
1273
+ height: Optional[int] = 512,
1274
+ width: Optional[int] = 512,
1275
+ guidance_scale: float = 7.5,
1276
+ scheduler = None,
1277
+ num_images_per_prompt: Optional[int] = 1,
1278
+ latents: Optional[torch.FloatTensor] = None,
1279
+ num_inference_steps: int = 4,
1280
+ lcm_origin_steps: int = 50,
1281
+ prompt_embeds: Optional[torch.FloatTensor] = None,
1282
+ output_type: Optional[str] = "pil",
1283
+ return_dict: bool = True,
1284
+ model: Optional[Dict[str, any]] = None,
1285
+ seed: Optional[int] = 1234567,
1286
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
1287
+ callback = None,
1288
+ callback_userdata = None
1289
+ ):
1290
+
1291
+ # 1. Define call parameters
1292
+ if prompt is not None and isinstance(prompt, str):
1293
+ batch_size = 1
1294
+ elif prompt is not None and isinstance(prompt, list):
1295
+ batch_size = len(prompt)
1296
+ else:
1297
+ batch_size = prompt_embeds.shape[0]
1298
+
1299
+ if seed is not None:
1300
+ torch.manual_seed(seed)
1301
+
1302
+ #print("After Step 1: batch size is ", batch_size)
1303
+ # do_classifier_free_guidance = guidance_scale > 0.0
1304
+ # In LCM Implementation: cfg_noise = noise_cond + cfg_scale * (noise_cond - noise_uncond) , (cfg_scale > 0.0 using CFG)
1305
+
1306
+ # 2. Encode input prompt
1307
+ prompt_embeds = self._encode_prompt(
1308
+ prompt,
1309
+ num_images_per_prompt,
1310
+ prompt_embeds=prompt_embeds,
1311
+ )
1312
+ #print("After Step 2: prompt embeds is ", prompt_embeds)
1313
+ #print("After Step 2: scheduler is ", scheduler )
1314
+ # 3. Prepare timesteps
1315
+ #scheduler.set_timesteps(num_inference_steps, original_inference_steps=lcm_origin_steps)
1316
+ latent_timestep = None
1317
+ if init_image:
1318
+ scheduler.set_timesteps(num_inference_steps, original_inference_steps=lcm_origin_steps)
1319
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, scheduler)
1320
+ latent_timestep = timesteps[:1]
1321
+ else:
1322
+ scheduler.set_timesteps(num_inference_steps, original_inference_steps=lcm_origin_steps)
1323
+ timesteps = scheduler.timesteps
1324
+ #timesteps = scheduler.timesteps
1325
+ #latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
1326
+ #print("timesteps: ", latent_timestep)
1327
+
1328
+ #print("After Step 3: timesteps is ", timesteps)
1329
+
1330
+ # 4. Prepare latent variable
1331
+ num_channels_latents = 4
1332
+ latents = self.prepare_latents(
1333
+ init_image,
1334
+ latent_timestep,
1335
+ batch_size * num_images_per_prompt,
1336
+ num_channels_latents,
1337
+ height,
1338
+ width,
1339
+ prompt_embeds.dtype,
1340
+ scheduler,
1341
+ latents,
1342
+ )
1343
+
1344
+ latents = latents * scheduler.init_noise_sigma
1345
+
1346
+ #print("After Step 4: ")
1347
+ bs = batch_size * num_images_per_prompt
1348
+
1349
+ # 5. Get Guidance Scale Embedding
1350
+ w = torch.tensor(guidance_scale).repeat(bs)
1351
+ w_embedding = self.get_w_embedding(w, embedding_dim=256)
1352
+ #print("After Step 5: ")
1353
+ # 6. LCM MultiStep Sampling Loop:
1354
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
1355
+ for i, t in enumerate(timesteps):
1356
+ if callback:
1357
+ callback(i+1, callback_userdata)
1358
+
1359
+ ts = torch.full((bs,), t, dtype=torch.long)
1360
+
1361
+ # model prediction (v-prediction, eps, x)
1362
+ model_pred = self.unet([latents, ts, prompt_embeds, w_embedding],share_inputs=True, share_outputs=True)[0]
1363
+
1364
+ # compute the previous noisy sample x_t -> x_t-1
1365
+ latents, denoised = scheduler.step(
1366
+ torch.from_numpy(model_pred), t, latents, return_dict=False
1367
+ )
1368
+ progress_bar.update()
1369
+
1370
+ #print("After Step 6: ")
1371
+
1372
+ vae_start = time.time()
1373
+
1374
+ if not output_type == "latent":
1375
+ image = torch.from_numpy(self.vae_decoder(denoised / 0.18215, share_inputs=True, share_outputs=True)[0])
1376
+ else:
1377
+ image = denoised
1378
+
1379
+ print("Decoder Ended: ", time.time() - vae_start)
1380
+ #post_start = time.time()
1381
+
1382
+ #if has_nsfw_concept is None:
1383
+ do_denormalize = [True] * image.shape[0]
1384
+ #else:
1385
+ # do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
1386
+
1387
+ #print ("After do_denormalize: image is ", image)
1388
+
1389
+ image = self.image_processor.postprocess(
1390
+ image, output_type=output_type, do_denormalize=do_denormalize
1391
+ )
1392
+
1393
+ return image[0]
1394
+
1395
+ class StableDiffusionEngineReferenceOnly(DiffusionPipeline):
1396
+ def __init__(
1397
+ self,
1398
+ #scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
1399
+ model="bes-dev/stable-diffusion-v1-4-openvino",
1400
+ tokenizer="openai/clip-vit-large-patch14",
1401
+ device=["CPU","CPU","CPU"]
1402
+ ):
1403
+ #self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer)
1404
+ try:
1405
+ self.tokenizer = CLIPTokenizer.from_pretrained(model,local_files_only=True)
1406
+ except:
1407
+ self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer)
1408
+ self.tokenizer.save_pretrained(model)
1409
+
1410
+ #self.scheduler = scheduler
1411
+ # models
1412
+
1413
+ self.core = Core()
1414
+ self.core.set_property({'CACHE_DIR': os.path.join(model, 'cache')}) #adding caching to reduce init time
1415
+ # text features
1416
+
1417
+ print("Text Device:",device[0])
1418
+ self.text_encoder = self.core.compile_model(os.path.join(model, "text_encoder.xml"), device[0])
1419
+
1420
+ self._text_encoder_output = self.text_encoder.output(0)
1421
+
1422
+ # diffusion
1423
+ print("unet_w Device:",device[1])
1424
+ self.unet_w = self.core.compile_model(os.path.join(model, "unet_reference_write.xml"), device[1])
1425
+ self._unet_w_output = self.unet_w.output(0)
1426
+ self.latent_shape = tuple(self.unet_w.inputs[0].shape)[1:]
1427
+
1428
+ print("unet_r Device:",device[1])
1429
+ self.unet_r = self.core.compile_model(os.path.join(model, "unet_reference_read.xml"), device[1])
1430
+ self._unet_r_output = self.unet_r.output(0)
1431
+ # decoder
1432
+ print("Vae Device:",device[2])
1433
+
1434
+ self.vae_decoder = self.core.compile_model(os.path.join(model, "vae_decoder.xml"), device[2])
1435
+
1436
+ # encoder
1437
+
1438
+ self.vae_encoder = self.core.compile_model(os.path.join(model, "vae_encoder.xml"), device[2])
1439
+
1440
+ self.init_image_shape = tuple(self.vae_encoder.inputs[0].shape)[2:]
1441
+
1442
+ self._vae_d_output = self.vae_decoder.output(0)
1443
+ self._vae_e_output = self.vae_encoder.output(0) if self.vae_encoder is not None else None
1444
+
1445
+ self.height = self.unet_w.input(0).shape[2] * 8
1446
+ self.width = self.unet_w.input(0).shape[3] * 8
1447
+
1448
+
1449
+
1450
+ def __call__(
1451
+ self,
1452
+ prompt,
1453
+ image = None,
1454
+ negative_prompt=None,
1455
+ scheduler=None,
1456
+ strength = 1.0,
1457
+ num_inference_steps = 32,
1458
+ guidance_scale = 7.5,
1459
+ eta = 0.0,
1460
+ create_gif = False,
1461
+ model = None,
1462
+ callback = None,
1463
+ callback_userdata = None
1464
+ ):
1465
+ # extract condition
1466
+ text_input = self.tokenizer(
1467
+ prompt,
1468
+ padding="max_length",
1469
+ max_length=self.tokenizer.model_max_length,
1470
+ truncation=True,
1471
+ return_tensors="np",
1472
+ )
1473
+ text_embeddings = self.text_encoder(text_input.input_ids)[self._text_encoder_output]
1474
+
1475
+
1476
+ # do classifier free guidance
1477
+ do_classifier_free_guidance = guidance_scale > 1.0
1478
+ if do_classifier_free_guidance:
1479
+
1480
+ if negative_prompt is None:
1481
+ uncond_tokens = [""]
1482
+ elif isinstance(negative_prompt, str):
1483
+ uncond_tokens = [negative_prompt]
1484
+ else:
1485
+ uncond_tokens = negative_prompt
1486
+
1487
+ tokens_uncond = self.tokenizer(
1488
+ uncond_tokens,
1489
+ padding="max_length",
1490
+ max_length=self.tokenizer.model_max_length, #truncation=True,
1491
+ return_tensors="np"
1492
+ )
1493
+ uncond_embeddings = self.text_encoder(tokens_uncond.input_ids)[self._text_encoder_output]
1494
+ text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
1495
+
1496
+ # set timesteps
1497
+ accepts_offset = "offset" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
1498
+ extra_set_kwargs = {}
1499
+
1500
+ if accepts_offset:
1501
+ extra_set_kwargs["offset"] = 1
1502
+
1503
+ scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
1504
+
1505
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, scheduler)
1506
+ latent_timestep = timesteps[:1]
1507
+
1508
+ ref_image = self.prepare_image(
1509
+ image=image,
1510
+ width=512,
1511
+ height=512,
1512
+ )
1513
+ # get the initial random noise unless the user supplied it
1514
+ latents, meta = self.prepare_latents(None, latent_timestep, scheduler)
1515
+ #ref_image_latents, _ = self.prepare_latents(init_image, latent_timestep, scheduler)
1516
+ ref_image_latents = self.ov_prepare_ref_latents(ref_image)
1517
+
1518
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
1519
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
1520
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
1521
+ # and should be between [0, 1]
1522
+ accepts_eta = "eta" in set(inspect.signature(scheduler.step).parameters.keys())
1523
+ extra_step_kwargs = {}
1524
+ if accepts_eta:
1525
+ extra_step_kwargs["eta"] = eta
1526
+ if create_gif:
1527
+ frames = []
1528
+
1529
+ for i, t in enumerate(self.progress_bar(timesteps)):
1530
+ if callback:
1531
+ callback(i, callback_userdata)
1532
+
1533
+ # expand the latents if we are doing classifier free guidance
1534
+ latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
1535
+ latent_model_input = scheduler.scale_model_input(latent_model_input, t)
1536
+
1537
+ # ref only part
1538
+ noise = randn_tensor(
1539
+ ref_image_latents.shape
1540
+ )
1541
+
1542
+ ref_xt = scheduler.add_noise(
1543
+ torch.from_numpy(ref_image_latents),
1544
+ noise,
1545
+ t.reshape(
1546
+ 1,
1547
+ ),
1548
+ ).numpy()
1549
+ ref_xt = np.concatenate([ref_xt] * 2) if do_classifier_free_guidance else ref_xt
1550
+ ref_xt = scheduler.scale_model_input(ref_xt, t)
1551
+
1552
+ # MODE = "write"
1553
+ result_w_dict = self.unet_w([
1554
+ ref_xt,
1555
+ t,
1556
+ text_embeddings
1557
+ ])
1558
+ down_0_attn0 = result_w_dict["/unet/down_blocks.0/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0"]
1559
+ down_0_attn1 = result_w_dict["/unet/down_blocks.0/attentions.1/transformer_blocks.0/norm1/LayerNormalization_output_0"]
1560
+ down_1_attn0 = result_w_dict["/unet/down_blocks.1/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0"]
1561
+ down_1_attn1 = result_w_dict["/unet/down_blocks.1/attentions.1/transformer_blocks.0/norm1/LayerNormalization_output_0"]
1562
+ down_2_attn0 = result_w_dict["/unet/down_blocks.2/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0"]
1563
+ down_2_attn1 = result_w_dict["/unet/down_blocks.2/attentions.1/transformer_blocks.0/norm1/LayerNormalization_output_0"]
1564
+ mid_attn0 = result_w_dict["/unet/mid_block/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0"]
1565
+ up_1_attn0 = result_w_dict["/unet/up_blocks.1/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0"]
1566
+ up_1_attn1 = result_w_dict["/unet/up_blocks.1/attentions.1/transformer_blocks.0/norm1/LayerNormalization_output_0"]
1567
+ up_1_attn2 = result_w_dict["/unet/up_blocks.1/attentions.2/transformer_blocks.0/norm1/LayerNormalization_output_0"]
1568
+ up_2_attn0 = result_w_dict["/unet/up_blocks.2/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0"]
1569
+ up_2_attn1 = result_w_dict["/unet/up_blocks.2/attentions.1/transformer_blocks.0/norm1/LayerNormalization_output_0"]
1570
+ up_2_attn2 = result_w_dict["/unet/up_blocks.2/attentions.2/transformer_blocks.0/norm1/LayerNormalization_output_0"]
1571
+ up_3_attn0 = result_w_dict["/unet/up_blocks.3/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0"]
1572
+ up_3_attn1 = result_w_dict["/unet/up_blocks.3/attentions.1/transformer_blocks.0/norm1/LayerNormalization_output_0"]
1573
+ up_3_attn2 = result_w_dict["/unet/up_blocks.3/attentions.2/transformer_blocks.0/norm1/LayerNormalization_output_0"]
1574
+
1575
+ # MODE = "read"
1576
+ noise_pred = self.unet_r([
1577
+ latent_model_input, t, text_embeddings, down_0_attn0, down_0_attn1, down_1_attn0,
1578
+ down_1_attn1, down_2_attn0, down_2_attn1, mid_attn0, up_1_attn0, up_1_attn1, up_1_attn2,
1579
+ up_2_attn0, up_2_attn1, up_2_attn2, up_3_attn0, up_3_attn1, up_3_attn2
1580
+ ])[0]
1581
+
1582
+ # perform guidance
1583
+ if do_classifier_free_guidance:
1584
+ noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1]
1585
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
1586
+
1587
+ # compute the previous noisy sample x_t -> x_t-1
1588
+ latents = scheduler.step(torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs)["prev_sample"].numpy()
1589
+
1590
+ if create_gif:
1591
+ frames.append(latents)
1592
+
1593
+ if callback:
1594
+ callback(num_inference_steps, callback_userdata)
1595
+
1596
+ # scale and decode the image latents with vae
1597
+
1598
+ image = self.vae_decoder(latents)[self._vae_d_output]
1599
+
1600
+ image = self.postprocess_image(image, meta)
1601
+
1602
+ if create_gif:
1603
+ gif_folder=os.path.join(model,"../../../gif")
1604
+ if not os.path.exists(gif_folder):
1605
+ os.makedirs(gif_folder)
1606
+ for i in range(0,len(frames)):
1607
+ image = self.vae_decoder(frames[i])[self._vae_d_output]
1608
+ image = self.postprocess_image(image, meta)
1609
+ output = gif_folder + "/" + str(i).zfill(3) +".png"
1610
+ cv2.imwrite(output, image)
1611
+ with open(os.path.join(gif_folder, "prompt.json"), "w") as file:
1612
+ json.dump({"prompt": prompt}, file)
1613
+ frames_image = [Image.open(image) for image in glob.glob(f"{gif_folder}/*.png")]
1614
+ frame_one = frames_image[0]
1615
+ gif_file=os.path.join(gif_folder,"stable_diffusion.gif")
1616
+ frame_one.save(gif_file, format="GIF", append_images=frames_image, save_all=True, duration=100, loop=0)
1617
+
1618
+ return image
1619
+
1620
+ def ov_prepare_ref_latents(self, refimage, vae_scaling_factor=0.18215):
1621
+ #refimage = refimage.to(device=device, dtype=dtype)
1622
+
1623
+ # encode the mask image into latents space so we can concatenate it to the latents
1624
+ moments = self.vae_encoder(refimage)[0]
1625
+ mean, logvar = np.split(moments, 2, axis=1)
1626
+ std = np.exp(logvar * 0.5)
1627
+ ref_image_latents = (mean + std * np.random.randn(*mean.shape))
1628
+ ref_image_latents = vae_scaling_factor * ref_image_latents
1629
+ #ref_image_latents = scheduler.add_noise(torch.from_numpy(ref_image_latents), torch.from_numpy(noise), latent_timestep).numpy()
1630
+
1631
+ # aligning device to prevent device errors when concating it with the latent model input
1632
+ #ref_image_latents = ref_image_latents.to(device=device, dtype=dtype)
1633
+ return ref_image_latents
1634
+
1635
+ def prepare_latents(self, image:PIL.Image.Image = None, latent_timestep:torch.Tensor = None, scheduler = LMSDiscreteScheduler):
1636
+ """
1637
+ Function for getting initial latents for starting generation
1638
+
1639
+ Parameters:
1640
+ image (PIL.Image.Image, *optional*, None):
1641
+ Input image for generation; if not provided, random noise will be used as the starting point
1642
+ latent_timestep (torch.Tensor, *optional*, None):
1643
+ Initial step predicted by the scheduler for image generation, required for mixing the latent image with noise
1644
+ Returns:
1645
+ latents (np.ndarray):
1646
+ Image encoded in latent space
1647
+ """
1648
+ latents_shape = (1, 4, self.height // 8, self.width // 8)
1649
+
1650
+ noise = np.random.randn(*latents_shape).astype(np.float32)
1651
+ if image is None:
1652
+ #print("Image is NONE")
1653
+ # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
1654
+ if isinstance(scheduler, LMSDiscreteScheduler):
1655
+
1656
+ noise = noise * scheduler.sigmas[0].numpy()
1657
+ return noise, {}
1658
+ elif isinstance(scheduler, EulerDiscreteScheduler):
1659
+
1660
+ noise = noise * scheduler.sigmas.max().numpy()
1661
+ return noise, {}
1662
+ else:
1663
+ return noise, {}
1664
+ input_image, meta = preprocess(image,self.height,self.width)
1665
+
1666
+ moments = self.vae_encoder(input_image)[self._vae_e_output]
1667
+
1668
+ mean, logvar = np.split(moments, 2, axis=1)
1669
+
1670
+ std = np.exp(logvar * 0.5)
1671
+ latents = (mean + std * np.random.randn(*mean.shape)) * 0.18215
1672
+
1673
+
1674
+ latents = scheduler.add_noise(torch.from_numpy(latents), torch.from_numpy(noise), latent_timestep).numpy()
1675
+ return latents, meta
1676
+
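The scheduler branches above scale the initial noise differently; a hedged sketch for the LMSDiscreteScheduler case, assuming diffusers (and its scipy dependency for LMS) is installed and using an illustrative 64x64 latent:

# illustrative sketch only
import numpy as np
from diffusers import LMSDiscreteScheduler

scheduler = LMSDiscreteScheduler()
scheduler.set_timesteps(32)
noise = np.random.randn(1, 4, 64, 64).astype(np.float32)
noise = noise * scheduler.sigmas[0].numpy()  # pre-scale by the largest sigma, as in prepare_latents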
1677
+ def postprocess_image(self, image:np.ndarray, meta:Dict):
1678
+ """
1679
+ Postprocessing for the decoded image. Takes the image generated by the VAE decoder, unpads it to the initial image size (if required),
1680
+ normalizes and converts it to the [0, 255] pixel range. Optionally, converts it from np.ndarray to PIL.Image format
1681
+
1682
+ Parameters:
1683
+ image (np.ndarray):
1684
+ Generated image
1685
+ meta (Dict):
1686
+ Metadata obtained on latents preparing step, can be empty
1687
+ output_type (str, *optional*, pil):
1688
+ Output format for result, can be pil or numpy
1689
+ Returns:
1690
+ image (List of np.ndarray or PIL.Image.Image):
1691
+ Postprocessed images
1692
+
1693
+ if "src_height" in meta:
1694
+ orig_height, orig_width = meta["src_height"], meta["src_width"]
1695
+ image = [cv2.resize(img, (orig_width, orig_height))
1696
+ for img in image]
1697
+
1698
+ return image
1699
+ """
1700
+ if "padding" in meta:
1701
+ pad = meta["padding"]
1702
+ (_, end_h), (_, end_w) = pad[1:3]
1703
+ h, w = image.shape[2:]
1704
+ #print("image shape",image.shape[2:])
1705
+ unpad_h = h - end_h
1706
+ unpad_w = w - end_w
1707
+ image = image[:, :, :unpad_h, :unpad_w]
1708
+ image = np.clip(image / 2 + 0.5, 0, 1)
1709
+ image = (image[0].transpose(1, 2, 0)[:, :, ::-1] * 255).astype(np.uint8)
1710
+
1711
+
1712
+
1713
+ if "src_height" in meta:
1714
+ orig_height, orig_width = meta["src_height"], meta["src_width"]
1715
+ image = cv2.resize(image, (orig_width, orig_height))
1716
+
1717
+ return image
1718
+
1719
+
1720
+ #image = (image / 2 + 0.5).clip(0, 1)
1721
+ #image = (image[0].transpose(1, 2, 0)[:, :, ::-1] * 255).astype(np.uint8)
1722
+
1723
+
1724
+ def get_timesteps(self, num_inference_steps:int, strength:float, scheduler):
1725
+ """
1726
+ Helper function for getting scheduler timesteps for generation
1727
+ In the case of image-to-image generation, it updates the number of steps according to strength
1728
+
1729
+ Parameters:
1730
+ num_inference_steps (int):
1731
+ number of inference steps for generation
1732
+ strength (float):
1733
+ value between 0.0 and 1.0 that controls the amount of noise that is added to the input image.
1734
+ Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input.
1735
+ """
1736
+ # get the original timestep using init_timestep
1737
+
1738
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
1739
+
1740
+ t_start = max(num_inference_steps - init_timestep, 0)
1741
+ timesteps = scheduler.timesteps[t_start:]
1742
+
1743
+ return timesteps, num_inference_steps - t_start
1744
+ def prepare_image(
1745
+ self,
1746
+ image,
1747
+ width,
1748
+ height,
1749
+ do_classifier_free_guidance=False,
1750
+ guess_mode=False,
1751
+ ):
1752
+ if not isinstance(image, np.ndarray):
1753
+ if isinstance(image, PIL.Image.Image):
1754
+ image = [image]
1755
+
1756
+ if isinstance(image[0], PIL.Image.Image):
1757
+ images = []
1758
+
1759
+ for image_ in image:
1760
+ image_ = image_.convert("RGB")
1761
+ image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
1762
+ image_ = np.array(image_)
1763
+ image_ = image_[None, :]
1764
+ images.append(image_)
1765
+
1766
+ image = images
1767
+
1768
+ image = np.concatenate(image, axis=0)
1769
+ image = np.array(image).astype(np.float32) / 255.0
1770
+ image = (image - 0.5) / 0.5
1771
+ image = image.transpose(0, 3, 1, 2)
1772
+ elif isinstance(image[0], np.ndarray):
1773
+ image = np.concatenate(image, dim=0)
1774
+
1775
+ if do_classifier_free_guidance and not guess_mode:
1776
+ image = np.concatenate([image] * 2)
1777
+
1778
+ return image
1779
+
1780
+ def print_npu_turbo_art():
1781
+ random_number = random.randint(1, 3)
1782
+
1783
+ if random_number == 1:
1784
+ print(" ")
1785
+ print(" ___ ___ ___ ___ ___ ___ ")
1786
+ print(" /\ \ /\ \ /\ \ /\ \ /\ \ _____ /\ \ ")
1787
+ print(" \:\ \ /::\ \ \:\ \ ___ \:\ \ /::\ \ /::\ \ /::\ \ ")
1788
+ print(" \:\ \ /:/\:\__\ \:\ \ /\__\ \:\ \ /:/\:\__\ /:/\:\ \ /:/\:\ \ ")
1789
+ print(" _____\:\ \ /:/ /:/ / ___ \:\ \ /:/ / ___ \:\ \ /:/ /:/ / /:/ /::\__\ /:/ \:\ \ ")
1790
+ print(" /::::::::\__\ /:/_/:/ / /\ \ \:\__\ /:/__/ /\ \ \:\__\ /:/_/:/__/___ /:/_/:/\:|__| /:/__/ \:\__\ ")
1791
+ print(" \:\~~\~~\/__/ \:\/:/ / \:\ \ /:/ / /::\ \ \:\ \ /:/ / \:\/:::::/ / \:\/:/ /:/ / \:\ \ /:/ / ")
1792
+ print(" \:\ \ \::/__/ \:\ /:/ / /:/\:\ \ \:\ /:/ / \::/~~/~~~~ \::/_/:/ / \:\ /:/ / ")
1793
+ print(" \:\ \ \:\ \ \:\/:/ / \/__\:\ \ \:\/:/ / \:\~~\ \:\/:/ / \:\/:/ / ")
1794
+ print(" \:\__\ \:\__\ \::/ / \:\__\ \::/ / \:\__\ \::/ / \::/ / ")
1795
+ print(" \/__/ \/__/ \/__/ \/__/ \/__/ \/__/ \/__/ \/__/ ")
1796
+ print(" ")
1797
+ elif random_number == 2:
1798
+ print(" _ _ ____ _ _ _____ _ _ ____ ____ ___ ")
1799
+ print("| \ | | | _ \ | | | | |_ _| | | | | | _ \ | __ ) / _ \ ")
1800
+ print("| \| | | |_) | | | | | | | | | | | | |_) | | _ \ | | | |")
1801
+ print("| |\ | | __/ | |_| | | | | |_| | | _ < | |_) | | |_| |")
1802
+ print("|_| \_| |_| \___/ |_| \___/ |_| \_\ |____/ \___/ ")
1803
+ print(" ")
1804
+ else:
1805
+ print("")
1806
+ print(" ) ( ( ) ")
1807
+ print(" ( /( )\ ) * ) )\ ) ( ( /( ")
1808
+ print(" )\()) (()/( ( ` ) /( ( (()/( ( )\ )\()) ")
1809
+ print("((_)\ /(_)) )\ ( )(_)) )\ /(_)) )((_) ((_)\ ")
1810
+ print(" _((_) (_)) _ ((_) (_(_()) _ ((_) (_)) ((_)_ ((_) ")
1811
+ print("| \| | | _ \ | | | | |_ _| | | | | | _ \ | _ ) / _ \ ")
1812
+ print("| .` | | _/ | |_| | | | | |_| | | / | _ \ | (_) | ")
1813
+ print("|_|\_| |_| \___/ |_| \___/ |_|_\ |___/ \___/ ")
1814
+ print(" ")
1815
+
1816
+
1817
+
backend/pipelines/lcm.py ADDED
@@ -0,0 +1,122 @@
1
+ from constants import LCM_DEFAULT_MODEL
2
+ from diffusers import (
3
+ DiffusionPipeline,
4
+ AutoencoderTiny,
5
+ UNet2DConditionModel,
6
+ LCMScheduler,
7
+ StableDiffusionPipeline,
8
+ )
9
+ import torch
10
+ from backend.tiny_decoder import get_tiny_decoder_vae_model
11
+ from typing import Any
12
+ from diffusers import (
13
+ LCMScheduler,
14
+ StableDiffusionImg2ImgPipeline,
15
+ StableDiffusionXLImg2ImgPipeline,
16
+ AutoPipelineForText2Image,
17
+ AutoPipelineForImage2Image,
18
+ StableDiffusionControlNetPipeline,
19
+ )
20
+ import pathlib
21
+
22
+
23
+ def _get_lcm_pipeline_from_base_model(
24
+ lcm_model_id: str,
25
+ base_model_id: str,
26
+ use_local_model: bool,
27
+ ):
28
+ pipeline = None
29
+ unet = UNet2DConditionModel.from_pretrained(
30
+ lcm_model_id,
31
+ torch_dtype=torch.float32,
32
+ local_files_only=use_local_model,
33
+ resume_download=True,
34
+ )
35
+ pipeline = DiffusionPipeline.from_pretrained(
36
+ base_model_id,
37
+ unet=unet,
38
+ torch_dtype=torch.float32,
39
+ local_files_only=use_local_model,
40
+ resume_download=True,
41
+ )
42
+ pipeline.scheduler = LCMScheduler.from_config(pipeline.scheduler.config)
43
+ return pipeline
44
+
45
+
46
+ def load_taesd(
47
+ pipeline: Any,
48
+ use_local_model: bool = False,
49
+ torch_data_type: torch.dtype = torch.float32,
50
+ ):
51
+ vae_model = get_tiny_decoder_vae_model(pipeline.__class__.__name__)
52
+ pipeline.vae = AutoencoderTiny.from_pretrained(
53
+ vae_model,
54
+ torch_dtype=torch_data_type,
55
+ local_files_only=use_local_model,
56
+ )
57
+
58
+
59
+ def get_lcm_model_pipeline(
60
+ model_id: str = LCM_DEFAULT_MODEL,
61
+ use_local_model: bool = False,
62
+ pipeline_args={},
63
+ ):
64
+ pipeline = None
65
+ if model_id == "latent-consistency/lcm-sdxl":
66
+ pipeline = _get_lcm_pipeline_from_base_model(
67
+ model_id,
68
+ "stabilityai/stable-diffusion-xl-base-1.0",
69
+ use_local_model,
70
+ )
71
+
72
+ elif model_id == "latent-consistency/lcm-ssd-1b":
73
+ pipeline = _get_lcm_pipeline_from_base_model(
74
+ model_id,
75
+ "segmind/SSD-1B",
76
+ use_local_model,
77
+ )
78
+ elif pathlib.Path(model_id).suffix == ".safetensors":
79
+ # When loading a .safetensors model, the pipeline has to be created
80
+ # with StableDiffusionPipeline() since it's the only class that
81
+ # defines the method from_single_file()
82
+ dummy_pipeline = StableDiffusionPipeline.from_single_file(
83
+ model_id,
84
+ safety_checker=None,
85
+ run_safety_checker=False,
86
+ load_safety_checker=False,
87
+ local_files_only=use_local_model,
88
+ use_safetensors=True,
89
+ )
90
+ if 'lcm' in model_id.lower():
91
+ dummy_pipeline.scheduler = LCMScheduler.from_config(dummy_pipeline.scheduler.config)
92
+
93
+ pipeline = AutoPipelineForText2Image.from_pipe(
94
+ dummy_pipeline,
95
+ **pipeline_args,
96
+ )
97
+ del dummy_pipeline
98
+ else:
99
+ # pipeline = DiffusionPipeline.from_pretrained(
100
+ pipeline = AutoPipelineForText2Image.from_pretrained(
101
+ model_id,
102
+ local_files_only=use_local_model,
103
+ **pipeline_args,
104
+ )
105
+
106
+ return pipeline
107
+
108
+
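A hedged usage sketch of the loader above (the prompt is arbitrary and the call assumes the default LCM model can be downloaded or is already cached locally):

# illustrative usage only
from backend.pipelines.lcm import get_lcm_model_pipeline, load_taesd

pipeline = get_lcm_model_pipeline()   # defaults to LCM_DEFAULT_MODEL
load_taesd(pipeline)                  # optional: swap in the tiny VAE decoder
result = pipeline(prompt="a cup of coffee", num_inference_steps=4, guidance_scale=1.0)
result.images[0].save("out.png")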
109
+ def get_image_to_image_pipeline(pipeline: Any) -> Any:
110
+ components = pipeline.components
111
+ pipeline_class = pipeline.__class__.__name__
112
+ if (
113
+ pipeline_class == "LatentConsistencyModelPipeline"
114
+ or pipeline_class == "StableDiffusionPipeline"
115
+ ):
116
+ return StableDiffusionImg2ImgPipeline(**components)
117
+ elif pipeline_class == "StableDiffusionControlNetPipeline":
118
+ return AutoPipelineForImage2Image.from_pipe(pipeline)
119
+ elif pipeline_class == "StableDiffusionXLPipeline":
120
+ return StableDiffusionXLImg2ImgPipeline(**components)
121
+ else:
122
+ raise Exception(f"Unknown pipeline {pipeline_class}")
backend/pipelines/lcm_lora.py ADDED
@@ -0,0 +1,81 @@
1
+ import pathlib
2
+ from os import path
3
+
4
+ import torch
5
+ from diffusers import (
6
+ AutoPipelineForText2Image,
7
+ LCMScheduler,
8
+ StableDiffusionPipeline,
9
+ )
10
+
11
+
12
+ def load_lcm_weights(
13
+ pipeline,
14
+ use_local_model,
15
+ lcm_lora_id,
16
+ ):
17
+ kwargs = {
18
+ "local_files_only": use_local_model,
19
+ "weight_name": "pytorch_lora_weights.safetensors",
20
+ }
21
+ pipeline.load_lora_weights(
22
+ lcm_lora_id,
23
+ **kwargs,
24
+ adapter_name="lcm",
25
+ )
26
+
27
+
28
+ def get_lcm_lora_pipeline(
29
+ base_model_id: str,
30
+ lcm_lora_id: str,
31
+ use_local_model: bool,
32
+ torch_data_type: torch.dtype,
33
+ pipeline_args={},
34
+ ):
35
+ if pathlib.Path(base_model_id).suffix == ".safetensors":
36
+ # SD 1.5 models only
37
+ # When loading a .safetensors model, the pipeline has to be created
38
+ # with StableDiffusionPipeline() since it's the only class that
39
+ # defines the method from_single_file(); afterwards a new pipeline
40
+ # is created using AutoPipelineForText2Image() for ControlNet
41
+ # support, in case ControlNet is enabled
42
+ if not path.exists(base_model_id):
43
+ raise FileNotFoundError(
44
+ f"Model file not found,Please check your model path: {base_model_id}"
45
+ )
46
+ print("Using single file Safetensors model (Supported models - SD 1.5 models)")
47
+
48
+ dummy_pipeline = StableDiffusionPipeline.from_single_file(
49
+ base_model_id,
50
+ torch_dtype=torch_data_type,
51
+ safety_checker=None,
52
+ local_files_only=use_local_model,
53
+ use_safetensors=True,
54
+ )
55
+ pipeline = AutoPipelineForText2Image.from_pipe(
56
+ dummy_pipeline,
57
+ **pipeline_args,
58
+ )
59
+ del dummy_pipeline
60
+ else:
61
+ pipeline = AutoPipelineForText2Image.from_pretrained(
62
+ base_model_id,
63
+ torch_dtype=torch_data_type,
64
+ local_files_only=use_local_model,
65
+ **pipeline_args,
66
+ )
67
+
68
+ load_lcm_weights(
69
+ pipeline,
70
+ use_local_model,
71
+ lcm_lora_id,
72
+ )
73
+ # Always fuse LCM-LoRA
74
+ # pipeline.fuse_lora()
75
+
76
+ if "lcm" in lcm_lora_id.lower() or "hypersd" in lcm_lora_id.lower():
77
+ print("LCM LoRA model detected so using recommended LCMScheduler")
78
+ pipeline.scheduler = LCMScheduler.from_config(pipeline.scheduler.config)
79
+
80
+ # pipeline.unet.to(memory_format=torch.channels_last)
81
+ return pipeline
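A hedged usage sketch of get_lcm_lora_pipeline; the model ids are common examples from the Hugging Face Hub, not values hard-coded anywhere in this repository, and the call assumes the weights can be downloaded:

# illustrative usage only
import torch
from backend.pipelines.lcm_lora import get_lcm_lora_pipeline

pipeline = get_lcm_lora_pipeline(
    base_model_id="Lykon/dreamshaper-8",               # example base SD 1.5 model
    lcm_lora_id="latent-consistency/lcm-lora-sdv1-5",  # example LCM-LoRA
    use_local_model=False,
    torch_data_type=torch.float32,
)
image = pipeline("a watercolor landscape", num_inference_steps=4, guidance_scale=1.0).images[0]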
backend/tiny_decoder.py ADDED
@@ -0,0 +1,32 @@
1
+ from constants import (
2
+ TAESD_MODEL,
3
+ TAESDXL_MODEL,
4
+ TAESD_MODEL_OPENVINO,
5
+ TAESDXL_MODEL_OPENVINO,
6
+ )
7
+
8
+
9
+ def get_tiny_decoder_vae_model(pipeline_class) -> str:
10
+ print(f"Pipeline class : {pipeline_class}")
11
+ if (
12
+ pipeline_class == "LatentConsistencyModelPipeline"
13
+ or pipeline_class == "StableDiffusionPipeline"
14
+ or pipeline_class == "StableDiffusionImg2ImgPipeline"
15
+ or pipeline_class == "StableDiffusionControlNetPipeline"
16
+ or pipeline_class == "StableDiffusionControlNetImg2ImgPipeline"
17
+ ):
18
+ return TAESD_MODEL
19
+ elif (
20
+ pipeline_class == "StableDiffusionXLPipeline"
21
+ or pipeline_class == "StableDiffusionXLImg2ImgPipeline"
22
+ ):
23
+ return TAESDXL_MODEL
24
+ elif (
25
+ pipeline_class == "OVStableDiffusionPipeline"
26
+ or pipeline_class == "OVStableDiffusionImg2ImgPipeline"
27
+ ):
28
+ return TAESD_MODEL_OPENVINO
29
+ elif pipeline_class == "OVStableDiffusionXLPipeline":
30
+ return TAESDXL_MODEL_OPENVINO
31
+ else:
32
+ raise Exception("No valid pipeline class found!")
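A small illustrative check of the mapping above; the class names are the same strings the rest of the backend passes in via pipeline.__class__.__name__:

# illustrative usage only
from backend.tiny_decoder import get_tiny_decoder_vae_model

sd_vae = get_tiny_decoder_vae_model("StableDiffusionPipeline")      # TAESD_MODEL
sdxl_vae = get_tiny_decoder_vae_model("StableDiffusionXLPipeline")  # TAESDXL_MODEL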
backend/upscale/aura_sr.py ADDED
@@ -0,0 +1,1004 @@
1
+ # AuraSR: GAN-based Super-Resolution for real-world, a reproduction of the GigaGAN* paper. Implementation is
2
+ # based on the unofficial lucidrains/gigagan-pytorch repository. Heavily modified from there.
3
+ #
4
+ # https://mingukkang.github.io/GigaGAN/
5
+ from math import log2, ceil
6
+ from functools import partial
7
+ from typing import Any, Optional, List, Iterable
8
+
9
+ import torch
10
+ from torchvision import transforms
11
+ from PIL import Image
12
+ from torch import nn, einsum, Tensor
13
+ import torch.nn.functional as F
14
+
15
+ from einops import rearrange, repeat, reduce
16
+ from einops.layers.torch import Rearrange
17
+ from torchvision.utils import save_image
18
+ import math
19
+
20
+
21
+ def get_same_padding(size, kernel, dilation, stride):
22
+ return ((size - 1) * (stride - 1) + dilation * (kernel - 1)) // 2
23
+
24
+
25
+ class AdaptiveConv2DMod(nn.Module):
26
+ def __init__(
27
+ self,
28
+ dim,
29
+ dim_out,
30
+ kernel,
31
+ *,
32
+ demod=True,
33
+ stride=1,
34
+ dilation=1,
35
+ eps=1e-8,
36
+ num_conv_kernels=1, # set this to be greater than 1 for adaptive
37
+ ):
38
+ super().__init__()
39
+ self.eps = eps
40
+
41
+ self.dim_out = dim_out
42
+
43
+ self.kernel = kernel
44
+ self.stride = stride
45
+ self.dilation = dilation
46
+ self.adaptive = num_conv_kernels > 1
47
+
48
+ self.weights = nn.Parameter(
49
+ torch.randn((num_conv_kernels, dim_out, dim, kernel, kernel))
50
+ )
51
+
52
+ self.demod = demod
53
+
54
+ nn.init.kaiming_normal_(
55
+ self.weights, a=0, mode="fan_in", nonlinearity="leaky_relu"
56
+ )
57
+
58
+ def forward(
59
+ self, fmap, mod: Optional[Tensor] = None, kernel_mod: Optional[Tensor] = None
60
+ ):
61
+ """
62
+ notation
63
+
64
+ b - batch
65
+ n - convs
66
+ o - output
67
+ i - input
68
+ k - kernel
69
+ """
70
+
71
+ b, h = fmap.shape[0], fmap.shape[-2]
72
+
73
+ # account for feature map that has been expanded by the scale in the first dimension
74
+ # due to multiscale inputs and outputs
75
+
76
+ if mod.shape[0] != b:
77
+ mod = repeat(mod, "b ... -> (s b) ...", s=b // mod.shape[0])
78
+
79
+ if exists(kernel_mod):
80
+ kernel_mod_has_el = kernel_mod.numel() > 0
81
+
82
+ assert self.adaptive or not kernel_mod_has_el
83
+
84
+ if kernel_mod_has_el and kernel_mod.shape[0] != b:
85
+ kernel_mod = repeat(
86
+ kernel_mod, "b ... -> (s b) ...", s=b // kernel_mod.shape[0]
87
+ )
88
+
89
+ # prepare weights for modulation
90
+
91
+ weights = self.weights
92
+
93
+ if self.adaptive:
94
+ weights = repeat(weights, "... -> b ...", b=b)
95
+
96
+ # determine an adaptive weight and 'select' the kernel to use with softmax
97
+
98
+ assert exists(kernel_mod) and kernel_mod.numel() > 0
99
+
100
+ kernel_attn = kernel_mod.softmax(dim=-1)
101
+ kernel_attn = rearrange(kernel_attn, "b n -> b n 1 1 1 1")
102
+
103
+ weights = reduce(weights * kernel_attn, "b n ... -> b ...", "sum")
104
+
105
+ # do the modulation, demodulation, as done in stylegan2
106
+
107
+ mod = rearrange(mod, "b i -> b 1 i 1 1")
108
+
109
+ weights = weights * (mod + 1)
110
+
111
+ if self.demod:
112
+ inv_norm = (
113
+ reduce(weights**2, "b o i k1 k2 -> b o 1 1 1", "sum")
114
+ .clamp(min=self.eps)
115
+ .rsqrt()
116
+ )
117
+ weights = weights * inv_norm
118
+
119
+ fmap = rearrange(fmap, "b c h w -> 1 (b c) h w")
120
+
121
+ weights = rearrange(weights, "b o ... -> (b o) ...")
122
+
123
+ padding = get_same_padding(h, self.kernel, self.dilation, self.stride)
124
+ fmap = F.conv2d(fmap, weights, padding=padding, groups=b)
125
+
126
+ return rearrange(fmap, "1 (b o) ... -> b o ...", b=b)
127
+
128
+
129
+ class Attend(nn.Module):
130
+ def __init__(self, dropout=0.0, flash=False):
131
+ super().__init__()
132
+ self.dropout = dropout
133
+ self.attn_dropout = nn.Dropout(dropout)
134
+ self.scale = nn.Parameter(torch.randn(1))
135
+ self.flash = flash
136
+
137
+ def flash_attn(self, q, k, v):
138
+ q, k, v = map(lambda t: t.contiguous(), (q, k, v))
139
+ out = F.scaled_dot_product_attention(
140
+ q, k, v, dropout_p=self.dropout if self.training else 0.0
141
+ )
142
+ return out
143
+
144
+ def forward(self, q, k, v):
145
+ if self.flash:
146
+ return self.flash_attn(q, k, v)
147
+
148
+ scale = q.shape[-1] ** -0.5
149
+
150
+ # similarity
151
+ sim = einsum("b h i d, b h j d -> b h i j", q, k) * scale
152
+
153
+ # attention
154
+ attn = sim.softmax(dim=-1)
155
+ attn = self.attn_dropout(attn)
156
+
157
+ # aggregate values
158
+ out = einsum("b h i j, b h j d -> b h i d", attn, v)
159
+
160
+ return out
161
+
162
+
163
+ def exists(x):
164
+ return x is not None
165
+
166
+
167
+ def default(val, d):
168
+ if exists(val):
169
+ return val
170
+ return d() if callable(d) else d
171
+
172
+
173
+ def cast_tuple(t, length=1):
174
+ if isinstance(t, tuple):
175
+ return t
176
+ return (t,) * length
177
+
178
+
179
+ def identity(t, *args, **kwargs):
180
+ return t
181
+
182
+
183
+ def is_power_of_two(n):
184
+ return log2(n).is_integer()
185
+
186
+
187
+ def null_iterator():
188
+ while True:
189
+ yield None
190
+
191
+
192
+ def Downsample(dim, dim_out=None):
193
+ return nn.Sequential(
194
+ Rearrange("b c (h p1) (w p2) -> b (c p1 p2) h w", p1=2, p2=2),
195
+ nn.Conv2d(dim * 4, default(dim_out, dim), 1),
196
+ )
197
+
198
+
199
+ class RMSNorm(nn.Module):
200
+ def __init__(self, dim):
201
+ super().__init__()
202
+ self.g = nn.Parameter(torch.ones(1, dim, 1, 1))
203
+ self.eps = 1e-4
204
+
205
+ def forward(self, x):
206
+ return F.normalize(x, dim=1) * self.g * (x.shape[1] ** 0.5)
207
+
208
+
209
+ # building block modules
210
+
211
+
212
+ class Block(nn.Module):
213
+ def __init__(self, dim, dim_out, groups=8, num_conv_kernels=0):
214
+ super().__init__()
215
+ self.proj = AdaptiveConv2DMod(
216
+ dim, dim_out, kernel=3, num_conv_kernels=num_conv_kernels
217
+ )
218
+ self.kernel = 3
219
+ self.dilation = 1
220
+ self.stride = 1
221
+
222
+ self.act = nn.SiLU()
223
+
224
+ def forward(self, x, conv_mods_iter: Optional[Iterable] = None):
225
+ conv_mods_iter = default(conv_mods_iter, null_iterator())
226
+
227
+ x = self.proj(x, mod=next(conv_mods_iter), kernel_mod=next(conv_mods_iter))
228
+
229
+ x = self.act(x)
230
+ return x
231
+
232
+
233
+ class ResnetBlock(nn.Module):
234
+ def __init__(
235
+ self, dim, dim_out, *, groups=8, num_conv_kernels=0, style_dims: List = []
236
+ ):
237
+ super().__init__()
238
+ style_dims.extend([dim, num_conv_kernels, dim_out, num_conv_kernels])
239
+
240
+ self.block1 = Block(
241
+ dim, dim_out, groups=groups, num_conv_kernels=num_conv_kernels
242
+ )
243
+ self.block2 = Block(
244
+ dim_out, dim_out, groups=groups, num_conv_kernels=num_conv_kernels
245
+ )
246
+ self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity()
247
+
248
+ def forward(self, x, conv_mods_iter: Optional[Iterable] = None):
249
+ h = self.block1(x, conv_mods_iter=conv_mods_iter)
250
+ h = self.block2(h, conv_mods_iter=conv_mods_iter)
251
+
252
+ return h + self.res_conv(x)
253
+
254
+
255
+ class LinearAttention(nn.Module):
256
+ def __init__(self, dim, heads=4, dim_head=32):
257
+ super().__init__()
258
+ self.scale = dim_head**-0.5
259
+ self.heads = heads
260
+ hidden_dim = dim_head * heads
261
+
262
+ self.norm = RMSNorm(dim)
263
+ self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
264
+
265
+ self.to_out = nn.Sequential(nn.Conv2d(hidden_dim, dim, 1), RMSNorm(dim))
266
+
267
+ def forward(self, x):
268
+ b, c, h, w = x.shape
269
+
270
+ x = self.norm(x)
271
+
272
+ qkv = self.to_qkv(x).chunk(3, dim=1)
273
+ q, k, v = map(
274
+ lambda t: rearrange(t, "b (h c) x y -> b h c (x y)", h=self.heads), qkv
275
+ )
276
+
277
+ q = q.softmax(dim=-2)
278
+ k = k.softmax(dim=-1)
279
+
280
+ q = q * self.scale
281
+
282
+ context = torch.einsum("b h d n, b h e n -> b h d e", k, v)
283
+
284
+ out = torch.einsum("b h d e, b h d n -> b h e n", context, q)
285
+ out = rearrange(out, "b h c (x y) -> b (h c) x y", h=self.heads, x=h, y=w)
286
+ return self.to_out(out)
287
+
288
+
289
+ class Attention(nn.Module):
290
+ def __init__(self, dim, heads=4, dim_head=32, flash=False):
291
+ super().__init__()
292
+ self.heads = heads
293
+ hidden_dim = dim_head * heads
294
+
295
+ self.norm = RMSNorm(dim)
296
+
297
+ self.attend = Attend(flash=flash)
298
+ self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
299
+ self.to_out = nn.Conv2d(hidden_dim, dim, 1)
300
+
301
+ def forward(self, x):
302
+ b, c, h, w = x.shape
303
+ x = self.norm(x)
304
+ qkv = self.to_qkv(x).chunk(3, dim=1)
305
+
306
+ q, k, v = map(
307
+ lambda t: rearrange(t, "b (h c) x y -> b h (x y) c", h=self.heads), qkv
308
+ )
309
+
310
+ out = self.attend(q, k, v)
311
+ out = rearrange(out, "b h (x y) d -> b (h d) x y", x=h, y=w)
312
+
313
+ return self.to_out(out)
314
+
315
+
316
+ # feedforward
317
+ def FeedForward(dim, mult=4):
318
+ return nn.Sequential(
319
+ RMSNorm(dim),
320
+ nn.Conv2d(dim, dim * mult, 1),
321
+ nn.GELU(),
322
+ nn.Conv2d(dim * mult, dim, 1),
323
+ )
324
+
325
+
326
+ # transformers
327
+ class Transformer(nn.Module):
328
+ def __init__(self, dim, dim_head=64, heads=8, depth=1, flash_attn=True, ff_mult=4):
329
+ super().__init__()
330
+ self.layers = nn.ModuleList([])
331
+
332
+ for _ in range(depth):
333
+ self.layers.append(
334
+ nn.ModuleList(
335
+ [
336
+ Attention(
337
+ dim=dim, dim_head=dim_head, heads=heads, flash=flash_attn
338
+ ),
339
+ FeedForward(dim=dim, mult=ff_mult),
340
+ ]
341
+ )
342
+ )
343
+
344
+ def forward(self, x):
345
+ for attn, ff in self.layers:
346
+ x = attn(x) + x
347
+ x = ff(x) + x
348
+
349
+ return x
350
+
351
+
352
+ class LinearTransformer(nn.Module):
353
+ def __init__(self, dim, dim_head=64, heads=8, depth=1, ff_mult=4):
354
+ super().__init__()
355
+ self.layers = nn.ModuleList([])
356
+
357
+ for _ in range(depth):
358
+ self.layers.append(
359
+ nn.ModuleList(
360
+ [
361
+ LinearAttention(dim=dim, dim_head=dim_head, heads=heads),
362
+ FeedForward(dim=dim, mult=ff_mult),
363
+ ]
364
+ )
365
+ )
366
+
367
+ def forward(self, x):
368
+ for attn, ff in self.layers:
369
+ x = attn(x) + x
370
+ x = ff(x) + x
371
+
372
+ return x
373
+
374
+
375
+ class NearestNeighborhoodUpsample(nn.Module):
376
+ def __init__(self, dim, dim_out=None):
377
+ super().__init__()
378
+ dim_out = default(dim_out, dim)
379
+ self.conv = nn.Conv2d(dim, dim_out, kernel_size=3, stride=1, padding=1)
380
+
381
+ def forward(self, x):
382
+
383
+ if x.shape[0] >= 64:
384
+ x = x.contiguous()
385
+
386
+ x = F.interpolate(x, scale_factor=2.0, mode="nearest")
387
+ x = self.conv(x)
388
+
389
+ return x
390
+
391
+
392
+ class EqualLinear(nn.Module):
393
+ def __init__(self, dim, dim_out, lr_mul=1, bias=True):
394
+ super().__init__()
395
+ self.weight = nn.Parameter(torch.randn(dim_out, dim))
396
+ if bias:
397
+ self.bias = nn.Parameter(torch.zeros(dim_out))
398
+
399
+ self.lr_mul = lr_mul
400
+
401
+ def forward(self, input):
402
+ return F.linear(input, self.weight * self.lr_mul, bias=self.bias * self.lr_mul)
403
+
404
+
405
+ class StyleGanNetwork(nn.Module):
406
+ def __init__(self, dim_in=128, dim_out=512, depth=8, lr_mul=0.1, dim_text_latent=0):
407
+ super().__init__()
408
+ self.dim_in = dim_in
409
+ self.dim_out = dim_out
410
+ self.dim_text_latent = dim_text_latent
411
+
412
+ layers = []
413
+ for i in range(depth):
414
+ is_first = i == 0
415
+
416
+ if is_first:
417
+ dim_in_layer = dim_in + dim_text_latent
418
+ else:
419
+ dim_in_layer = dim_out
420
+
421
+ dim_out_layer = dim_out
422
+
423
+ layers.extend(
424
+ [EqualLinear(dim_in_layer, dim_out_layer, lr_mul), nn.LeakyReLU(0.2)]
425
+ )
426
+
427
+ self.net = nn.Sequential(*layers)
428
+
429
+ def forward(self, x, text_latent=None):
430
+ x = F.normalize(x, dim=1)
431
+ if self.dim_text_latent > 0:
432
+ assert exists(text_latent)
433
+ x = torch.cat((x, text_latent), dim=-1)
434
+ return self.net(x)
435
+
436
+
437
+ class UnetUpsampler(torch.nn.Module):
438
+
439
+ def __init__(
440
+ self,
441
+ dim: int,
442
+ *,
443
+ image_size: int,
444
+ input_image_size: int,
445
+ init_dim: Optional[int] = None,
446
+ out_dim: Optional[int] = None,
447
+ style_network: Optional[dict] = None,
448
+ up_dim_mults: tuple = (1, 2, 4, 8, 16),
449
+ down_dim_mults: tuple = (4, 8, 16),
450
+ channels: int = 3,
451
+ resnet_block_groups: int = 8,
452
+ full_attn: tuple = (False, False, False, True, True),
453
+ flash_attn: bool = True,
454
+ self_attn_dim_head: int = 64,
455
+ self_attn_heads: int = 8,
456
+ attn_depths: tuple = (2, 2, 2, 2, 4),
457
+ mid_attn_depth: int = 4,
458
+ num_conv_kernels: int = 4,
459
+ resize_mode: str = "bilinear",
460
+ unconditional: bool = True,
461
+ skip_connect_scale: Optional[float] = None,
462
+ ):
463
+ super().__init__()
464
+ self.style_network = style_network = StyleGanNetwork(**style_network)
465
+ self.unconditional = unconditional
466
+ assert not (
467
+ unconditional
468
+ and exists(style_network)
469
+ and style_network.dim_text_latent > 0
470
+ )
471
+
472
+ assert is_power_of_two(image_size) and is_power_of_two(
473
+ input_image_size
474
+ ), "both output image size and input image size must be power of 2"
475
+ assert (
476
+ input_image_size < image_size
477
+ ), "input image size must be smaller than the output image size, thus upsampling"
478
+
479
+ self.image_size = image_size
480
+ self.input_image_size = input_image_size
481
+
482
+ style_embed_split_dims = []
483
+
484
+ self.channels = channels
485
+ input_channels = channels
486
+
487
+ init_dim = default(init_dim, dim)
488
+
489
+ up_dims = [init_dim, *map(lambda m: dim * m, up_dim_mults)]
490
+ init_down_dim = up_dims[len(up_dim_mults) - len(down_dim_mults)]
491
+ down_dims = [init_down_dim, *map(lambda m: dim * m, down_dim_mults)]
492
+ self.init_conv = nn.Conv2d(input_channels, init_down_dim, 7, padding=3)
493
+
494
+ up_in_out = list(zip(up_dims[:-1], up_dims[1:]))
495
+ down_in_out = list(zip(down_dims[:-1], down_dims[1:]))
496
+
497
+ block_klass = partial(
498
+ ResnetBlock,
499
+ groups=resnet_block_groups,
500
+ num_conv_kernels=num_conv_kernels,
501
+ style_dims=style_embed_split_dims,
502
+ )
503
+
504
+ FullAttention = partial(Transformer, flash_attn=flash_attn)
505
+ *_, mid_dim = up_dims
506
+
507
+ self.skip_connect_scale = default(skip_connect_scale, 2**-0.5)
508
+
509
+ self.downs = nn.ModuleList([])
510
+ self.ups = nn.ModuleList([])
511
+
512
+ block_count = 6
513
+
514
+ for ind, (
515
+ (dim_in, dim_out),
516
+ layer_full_attn,
517
+ layer_attn_depth,
518
+ ) in enumerate(zip(down_in_out, full_attn, attn_depths)):
519
+ attn_klass = FullAttention if layer_full_attn else LinearTransformer
520
+
521
+ blocks = []
522
+ for i in range(block_count):
523
+ blocks.append(block_klass(dim_in, dim_in))
524
+
525
+ self.downs.append(
526
+ nn.ModuleList(
527
+ [
528
+ nn.ModuleList(blocks),
529
+ nn.ModuleList(
530
+ [
531
+ (
532
+ attn_klass(
533
+ dim_in,
534
+ dim_head=self_attn_dim_head,
535
+ heads=self_attn_heads,
536
+ depth=layer_attn_depth,
537
+ )
538
+ if layer_full_attn
539
+ else None
540
+ ),
541
+ nn.Conv2d(
542
+ dim_in, dim_out, kernel_size=3, stride=2, padding=1
543
+ ),
544
+ ]
545
+ ),
546
+ ]
547
+ )
548
+ )
549
+
550
+ self.mid_block1 = block_klass(mid_dim, mid_dim)
551
+ self.mid_attn = FullAttention(
552
+ mid_dim,
553
+ dim_head=self_attn_dim_head,
554
+ heads=self_attn_heads,
555
+ depth=mid_attn_depth,
556
+ )
557
+ self.mid_block2 = block_klass(mid_dim, mid_dim)
558
+
559
+ *_, last_dim = up_dims
560
+
561
+ for ind, (
562
+ (dim_in, dim_out),
563
+ layer_full_attn,
564
+ layer_attn_depth,
565
+ ) in enumerate(
566
+ zip(
567
+ reversed(up_in_out),
568
+ reversed(full_attn),
569
+ reversed(attn_depths),
570
+ )
571
+ ):
572
+ attn_klass = FullAttention if layer_full_attn else LinearTransformer
573
+
574
+ blocks = []
575
+ input_dim = dim_in * 2 if ind < len(down_in_out) else dim_in
576
+ for i in range(block_count):
577
+ blocks.append(block_klass(input_dim, dim_in))
578
+
579
+ self.ups.append(
580
+ nn.ModuleList(
581
+ [
582
+ nn.ModuleList(blocks),
583
+ nn.ModuleList(
584
+ [
585
+ NearestNeighborhoodUpsample(
586
+ last_dim if ind == 0 else dim_out,
587
+ dim_in,
588
+ ),
589
+ (
590
+ attn_klass(
591
+ dim_in,
592
+ dim_head=self_attn_dim_head,
593
+ heads=self_attn_heads,
594
+ depth=layer_attn_depth,
595
+ )
596
+ if layer_full_attn
597
+ else None
598
+ ),
599
+ ]
600
+ ),
601
+ ]
602
+ )
603
+ )
604
+
605
+ self.out_dim = default(out_dim, channels)
606
+ self.final_res_block = block_klass(dim, dim)
607
+ self.final_to_rgb = nn.Conv2d(dim, channels, 1)
608
+ self.resize_mode = resize_mode
609
+ self.style_to_conv_modulations = nn.Linear(
610
+ style_network.dim_out, sum(style_embed_split_dims)
611
+ )
612
+ self.style_embed_split_dims = style_embed_split_dims
613
+
614
+ @property
615
+ def allowable_rgb_resolutions(self):
616
+ input_res_base = int(log2(self.input_image_size))
617
+ output_res_base = int(log2(self.image_size))
618
+ allowed_rgb_res_base = list(range(input_res_base, output_res_base))
619
+ return [*map(lambda p: 2**p, allowed_rgb_res_base)]
620
+
621
+ @property
622
+ def device(self):
623
+ return next(self.parameters()).device
624
+
625
+ @property
626
+ def total_params(self):
627
+ return sum([p.numel() for p in self.parameters()])
628
+
629
+ def resize_image_to(self, x, size):
630
+ return F.interpolate(x, (size, size), mode=self.resize_mode)
631
+
632
+ def forward(
633
+ self,
634
+ lowres_image: torch.Tensor,
635
+ styles: Optional[torch.Tensor] = None,
636
+ noise: Optional[torch.Tensor] = None,
637
+ global_text_tokens: Optional[torch.Tensor] = None,
638
+ return_all_rgbs: bool = False,
639
+ ):
640
+ x = lowres_image
641
+
642
+ noise_scale = 0.001 # Adjust the scale of the noise as needed
643
+ noise_aug = torch.randn_like(x) * noise_scale
644
+ x = x + noise_aug
645
+ x = x.clamp(0, 1)
646
+
647
+ shape = x.shape
648
+ batch_size = shape[0]
649
+
650
+ assert shape[-2:] == ((self.input_image_size,) * 2)
651
+
652
+ # styles
653
+ if not exists(styles):
654
+ assert exists(self.style_network)
655
+
656
+ noise = default(
657
+ noise,
658
+ torch.randn(
659
+ (batch_size, self.style_network.dim_in), device=self.device
660
+ ),
661
+ )
662
+ styles = self.style_network(noise, global_text_tokens)
663
+
664
+ # project styles to conv modulations
665
+ conv_mods = self.style_to_conv_modulations(styles)
666
+ conv_mods = conv_mods.split(self.style_embed_split_dims, dim=-1)
667
+ conv_mods = iter(conv_mods)
668
+
669
+ x = self.init_conv(x)
670
+
671
+ h = []
672
+ for blocks, (attn, downsample) in self.downs:
673
+ for block in blocks:
674
+ x = block(x, conv_mods_iter=conv_mods)
675
+ h.append(x)
676
+
677
+ if attn is not None:
678
+ x = attn(x)
679
+
680
+ x = downsample(x)
681
+
682
+ x = self.mid_block1(x, conv_mods_iter=conv_mods)
683
+ x = self.mid_attn(x)
684
+ x = self.mid_block2(x, conv_mods_iter=conv_mods)
685
+
686
+ for (
687
+ blocks,
688
+ (
689
+ upsample,
690
+ attn,
691
+ ),
692
+ ) in self.ups:
693
+ x = upsample(x)
694
+ for block in blocks:
695
+ if h != []:
696
+ res = h.pop()
697
+ res = res * self.skip_connect_scale
698
+ x = torch.cat((x, res), dim=1)
699
+
700
+ x = block(x, conv_mods_iter=conv_mods)
701
+
702
+ if attn is not None:
703
+ x = attn(x)
704
+
705
+ x = self.final_res_block(x, conv_mods_iter=conv_mods)
706
+ rgb = self.final_to_rgb(x)
707
+
708
+ if not return_all_rgbs:
709
+ return rgb
710
+
711
+ return rgb, []
712
+
713
+
714
+ def tile_image(image, chunk_size=64):
715
+ c, h, w = image.shape
716
+ h_chunks = ceil(h / chunk_size)
717
+ w_chunks = ceil(w / chunk_size)
718
+ tiles = []
719
+ for i in range(h_chunks):
720
+ for j in range(w_chunks):
721
+ tile = image[
722
+ :,
723
+ i * chunk_size : (i + 1) * chunk_size,
724
+ j * chunk_size : (j + 1) * chunk_size,
725
+ ]
726
+ tiles.append(tile)
727
+ return tiles, h_chunks, w_chunks
728
+
729
+
730
+ # This helps create a checkerboard pattern with some edge blending
731
+ def create_checkerboard_weights(tile_size):
732
+ x = torch.linspace(-1, 1, tile_size)
733
+ y = torch.linspace(-1, 1, tile_size)
734
+
735
+ x, y = torch.meshgrid(x, y, indexing="ij")
736
+ d = torch.sqrt(x * x + y * y)
737
+ sigma, mu = 0.5, 0.0
738
+ weights = torch.exp(-((d - mu) ** 2 / (2.0 * sigma**2)))
739
+
740
+ # saturate the values to make sure we get high weights in the center
741
+ weights = weights**8
742
+
743
+ return weights / weights.max() # Normalize to [0, 1]
744
+
745
+
746
+ def repeat_weights(weights, image_size):
747
+ tile_size = weights.shape[0]
748
+ repeats = (
749
+ math.ceil(image_size[0] / tile_size),
750
+ math.ceil(image_size[1] / tile_size),
751
+ )
752
+ return weights.repeat(repeats)[: image_size[0], : image_size[1]]
753
+
754
+
755
+ def create_offset_weights(weights, image_size):
756
+ tile_size = weights.shape[0]
757
+ offset = tile_size // 2
758
+ full_weights = repeat_weights(
759
+ weights, (image_size[0] + offset, image_size[1] + offset)
760
+ )
761
+ return full_weights[offset:, offset:]
762
+
763
+
764
+ def merge_tiles(tiles, h_chunks, w_chunks, chunk_size=64):
765
+ # Determine the shape of the output tensor
766
+ c = tiles[0].shape[0]
767
+ h = h_chunks * chunk_size
768
+ w = w_chunks * chunk_size
769
+
770
+ # Create an empty tensor to hold the merged image
771
+ merged = torch.zeros((c, h, w), dtype=tiles[0].dtype)
772
+
773
+ # Iterate over the tiles and place them in the correct position
774
+ for idx, tile in enumerate(tiles):
775
+ i = idx // w_chunks
776
+ j = idx % w_chunks
777
+
778
+ h_start = i * chunk_size
779
+ w_start = j * chunk_size
780
+
781
+ tile_h, tile_w = tile.shape[1:]
782
+ merged[:, h_start : h_start + tile_h, w_start : w_start + tile_w] = tile
783
+
784
+ return merged
785
+
786
+
787
+ class AuraSR:
788
+ def __init__(self, config: dict[str, Any], device: str = "cuda"):
789
+ self.upsampler = UnetUpsampler(**config).to(device)
790
+ self.input_image_size = config["input_image_size"]
791
+
792
+ @classmethod
793
+ def from_pretrained(
794
+ cls,
795
+ model_id: str = "fal-ai/AuraSR",
796
+ use_safetensors: bool = True,
797
+ device: str = "cuda",
798
+ ):
799
+ import json
800
+ import torch
801
+ from pathlib import Path
802
+ from huggingface_hub import snapshot_download
803
+
804
+ # Check if model_id is a local file
805
+ if Path(model_id).is_file():
806
+ local_file = Path(model_id)
807
+ if local_file.suffix == ".safetensors":
808
+ use_safetensors = True
809
+ elif local_file.suffix == ".ckpt":
810
+ use_safetensors = False
811
+ else:
812
+ raise ValueError(
813
+ f"Unsupported file format: {local_file.suffix}. Please use .safetensors or .ckpt files."
814
+ )
815
+
816
+ # For local files, we need to provide the config separately
817
+ config_path = local_file.with_name("config.json")
818
+ if not config_path.exists():
819
+ raise FileNotFoundError(
820
+ f"Config file not found: {config_path}. "
821
+ f"When loading from a local file, ensure that 'config.json' "
822
+ f"is present in the same directory as '{local_file.name}'. "
823
+ f"If you're trying to load a model from Hugging Face, "
824
+ f"please provide the model ID instead of a file path."
825
+ )
826
+
827
+ config = json.loads(config_path.read_text())
828
+ hf_model_path = local_file.parent
829
+ else:
830
+ hf_model_path = Path(
831
+ snapshot_download(model_id, ignore_patterns=["*.ckpt"])
832
+ )
833
+ config = json.loads((hf_model_path / "config.json").read_text())
834
+
835
+ model = cls(config, device)
836
+
837
+ if use_safetensors:
838
+ try:
839
+ from safetensors.torch import load_file
840
+
841
+ checkpoint = load_file(
842
+ hf_model_path / "model.safetensors"
843
+ if not Path(model_id).is_file()
844
+ else model_id
845
+ )
846
+ except ImportError:
847
+ raise ImportError(
848
+ "The safetensors library is not installed. "
849
+ "Please install it with `pip install safetensors` "
850
+ "or use `use_safetensors=False` to load the model with PyTorch."
851
+ )
852
+ else:
853
+ checkpoint = torch.load(
854
+ hf_model_path / "model.ckpt"
855
+ if not Path(model_id).is_file()
856
+ else model_id
857
+ )
858
+
859
+ model.upsampler.load_state_dict(checkpoint, strict=True)
860
+ return model
861
+
862
+ @torch.no_grad()
863
+ def upscale_4x(self, image: Image.Image, max_batch_size=8) -> Image.Image:
864
+ tensor_transform = transforms.ToTensor()
865
+ device = self.upsampler.device
866
+
867
+ image_tensor = tensor_transform(image).unsqueeze(0)
868
+ _, _, h, w = image_tensor.shape
869
+ pad_h = (
870
+ self.input_image_size - h % self.input_image_size
871
+ ) % self.input_image_size
872
+ pad_w = (
873
+ self.input_image_size - w % self.input_image_size
874
+ ) % self.input_image_size
875
+
876
+ # Pad the image
877
+ image_tensor = torch.nn.functional.pad(
878
+ image_tensor, (0, pad_w, 0, pad_h), mode="reflect"
879
+ ).squeeze(0)
880
+ tiles, h_chunks, w_chunks = tile_image(image_tensor, self.input_image_size)
881
+
882
+ # Batch processing of tiles
883
+ num_tiles = len(tiles)
884
+ batches = [
885
+ tiles[i : i + max_batch_size] for i in range(0, num_tiles, max_batch_size)
886
+ ]
887
+ reconstructed_tiles = []
888
+
889
+ for batch in batches:
890
+ model_input = torch.stack(batch).to(device)
891
+ generator_output = self.upsampler(
892
+ lowres_image=model_input,
893
+ noise=torch.randn(model_input.shape[0], 128, device=device),
894
+ )
895
+ reconstructed_tiles.extend(
896
+ list(generator_output.clamp_(0, 1).detach().cpu())
897
+ )
898
+
899
+ merged_tensor = merge_tiles(
900
+ reconstructed_tiles, h_chunks, w_chunks, self.input_image_size * 4
901
+ )
902
+ unpadded = merged_tensor[:, : h * 4, : w * 4]
903
+
904
+ to_pil = transforms.ToPILImage()
905
+ return to_pil(unpadded)
906
+
907
+ # Tiled 4x upscaling with overlapping tiles to reduce seam artifacts
908
+ # weight_type options are 'checkboard' and 'constant'
909
+ @torch.no_grad()
910
+ def upscale_4x_overlapped(self, image, max_batch_size=8, weight_type="checkboard"):
911
+ tensor_transform = transforms.ToTensor()
912
+ device = self.upsampler.device
913
+
914
+ image_tensor = tensor_transform(image).unsqueeze(0)
915
+ _, _, h, w = image_tensor.shape
916
+
917
+ # Calculate paddings
918
+ pad_h = (
919
+ self.input_image_size - h % self.input_image_size
920
+ ) % self.input_image_size
921
+ pad_w = (
922
+ self.input_image_size - w % self.input_image_size
923
+ ) % self.input_image_size
924
+
925
+ # Pad the image
926
+ image_tensor = torch.nn.functional.pad(
927
+ image_tensor, (0, pad_w, 0, pad_h), mode="reflect"
928
+ ).squeeze(0)
929
+
930
+ # Function to process tiles
931
+ def process_tiles(tiles, h_chunks, w_chunks):
932
+ num_tiles = len(tiles)
933
+ batches = [
934
+ tiles[i : i + max_batch_size]
935
+ for i in range(0, num_tiles, max_batch_size)
936
+ ]
937
+ reconstructed_tiles = []
938
+
939
+ for batch in batches:
940
+ model_input = torch.stack(batch).to(device)
941
+ generator_output = self.upsampler(
942
+ lowres_image=model_input,
943
+ noise=torch.randn(model_input.shape[0], 128, device=device),
944
+ )
945
+ reconstructed_tiles.extend(
946
+ list(generator_output.clamp_(0, 1).detach().cpu())
947
+ )
948
+
949
+ return merge_tiles(
950
+ reconstructed_tiles, h_chunks, w_chunks, self.input_image_size * 4
951
+ )
952
+
953
+ # First pass
954
+ tiles1, h_chunks1, w_chunks1 = tile_image(image_tensor, self.input_image_size)
955
+ result1 = process_tiles(tiles1, h_chunks1, w_chunks1)
956
+
957
+ # Second pass with offset
958
+ offset = self.input_image_size // 2
959
+ image_tensor_offset = torch.nn.functional.pad(
960
+ image_tensor, (offset, offset, offset, offset), mode="reflect"
961
+ ).squeeze(0)
962
+
963
+ tiles2, h_chunks2, w_chunks2 = tile_image(
964
+ image_tensor_offset, self.input_image_size
965
+ )
966
+ result2 = process_tiles(tiles2, h_chunks2, w_chunks2)
967
+
968
+ # unpad
969
+ offset_4x = offset * 4
970
+ result2_interior = result2[:, offset_4x:-offset_4x, offset_4x:-offset_4x]
971
+
972
+ if weight_type == "checkboard":
973
+ weight_tile = create_checkerboard_weights(self.input_image_size * 4)
974
+
975
+ weight_shape = result2_interior.shape[1:]
976
+ weights_1 = create_offset_weights(weight_tile, weight_shape)
977
+ weights_2 = repeat_weights(weight_tile, weight_shape)
978
+
979
+ normalizer = weights_1 + weights_2
980
+ weights_1 = weights_1 / normalizer
981
+ weights_2 = weights_2 / normalizer
982
+
983
+ weights_1 = weights_1.unsqueeze(0).repeat(3, 1, 1)
984
+ weights_2 = weights_2.unsqueeze(0).repeat(3, 1, 1)
985
+ elif weight_type == "constant":
986
+ weights_1 = torch.ones_like(result2_interior) * 0.5
987
+ weights_2 = weights_1
988
+ else:
989
+ raise ValueError(
990
+ "weight_type should be either 'checkboard' or 'constant' but got",
991
+ weight_type,
992
+ )
993
+
994
+ result1 = result1 * weights_2
995
+ result2 = result2_interior * weights_1
996
+
997
+ # Average the overlapping region
998
+ result1 = result1 + result2
999
+
1000
+ # Remove padding
1001
+ unpadded = result1[:, : h * 4, : w * 4]
1002
+
1003
+ to_pil = transforms.ToPILImage()
1004
+ return to_pil(unpadded)
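A quick usage note for the AuraSR wrapper above: from_pretrained pulls the weights and config.json from the Hugging Face Hub (or a local .safetensors/.ckpt file), and both upscale_4x methods return a PIL image at 4x the input resolution. A minimal sketch, assuming the fal/AuraSR-v2 checkpoint used elsewhere in this commit and a placeholder input.png:

from PIL import Image
from backend.upscale.aura_sr import AuraSR

# Load the GAN upsampler; device can be "cpu" or "cuda"
aura_sr = AuraSR.from_pretrained("fal/AuraSR-v2", device="cpu")

image = Image.open("input.png").convert("RGB")  # placeholder path

# Plain tiled 4x upscale (tile seams may be visible on some images)
upscaled = aura_sr.upscale_4x(image)

# Overlapped variant: blends two offset tilings with checkerboard weights
upscaled_smooth = aura_sr.upscale_4x_overlapped(image, weight_type="checkboard")
upscaled_smooth.save("output_4x.png")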
backend/upscale/aura_sr_upscale.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from backend.upscale.aura_sr import AuraSR
2
+ from PIL import Image
3
+
4
+
5
+ def upscale_aura_sr(image_path: str):
6
+
7
+ aura_sr = AuraSR.from_pretrained("fal/AuraSR-v2", device="cpu")
8
+ image_in = Image.open(image_path) # .resize((256, 256))
9
+ return aura_sr.upscale_4x(image_in)
backend/upscale/edsr_upscale_onnx.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import onnxruntime
3
+ from huggingface_hub import hf_hub_download
4
+ from PIL import Image
5
+
6
+
7
+ def upscale_edsr_2x(image_path: str):
8
+ input_image = Image.open(image_path).convert("RGB")
9
+ input_image = np.array(input_image).astype("float32")
10
+ input_image = np.transpose(input_image, (2, 0, 1))
11
+ img_arr = np.expand_dims(input_image, axis=0)
12
+
13
+ if np.max(img_arr) > 256: # 16-bit image
14
+ max_range = 65535
15
+ else:
16
+ max_range = 255.0
17
+ img = img_arr / max_range
18
+
19
+ model_path = hf_hub_download(
20
+ repo_id="rupeshs/edsr-onnx",
21
+ filename="edsr_onnxsim_2x.onnx",
22
+ )
23
+ sess = onnxruntime.InferenceSession(model_path)
24
+
25
+ input_name = sess.get_inputs()[0].name
26
+ output_name = sess.get_outputs()[0].name
27
+ output = sess.run(
28
+ [output_name],
29
+ {input_name: img},
30
+ )[0]
31
+
32
+ result = output.squeeze()
33
+ result = result.clip(0, 1)
34
+ image_array = np.transpose(result, (1, 2, 0))
35
+ image_array = np.uint8(image_array * 255)
36
+ upscaled_image = Image.fromarray(image_array)
37
+ return upscaled_image
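upscale_edsr_2x above downloads the EDSR ONNX model from the rupeshs/edsr-onnx repo on first use and returns a PIL image at twice the input resolution. A minimal sketch, with placeholder file names:

from backend.upscale.edsr_upscale_onnx import upscale_edsr_2x

upscaled = upscale_edsr_2x("photo.png")  # placeholder input path
print(upscaled.size)  # width and height are doubled
upscaled.save("photo_2x.png")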
backend/upscale/tiled_upscale.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import math
3
+ import logging
4
+ from PIL import Image, ImageDraw, ImageFilter
5
+ from backend.models.lcmdiffusion_setting import DiffusionTask
6
+ from context import Context
7
+ from constants import DEVICE
8
+
9
+
10
+ def generate_upscaled_image(
11
+ config,
12
+ input_path=None,
13
+ strength=0.3,
14
+ scale_factor=2.0,
15
+ tile_overlap=16,
16
+ upscale_settings=None,
17
+ context: Context = None,
18
+ output_path=None,
19
+ image_format="PNG",
20
+ ):
21
+ if config is None or (
22
+ (input_path is None or input_path == "") and upscale_settings is None
23
+ ):
24
+ logging.error("Wrong arguments in tiled upscale function call!")
25
+ return
26
+
27
+ # Use the upscale_settings dict if provided; otherwise, build the
28
+ # upscale_settings dict using the function arguments and default values
29
+ if upscale_settings == None:
30
+ upscale_settings = {
31
+ "source_file": input_path,
32
+ "target_file": None,
33
+ "output_format": image_format,
34
+ "strength": strength,
35
+ "scale_factor": scale_factor,
36
+ "prompt": config.lcm_diffusion_setting.prompt,
37
+ "tile_overlap": tile_overlap,
38
+ "tile_size": 256,
39
+ "tiles": [],
40
+ }
41
+ source_image = Image.open(input_path) # PIL image
42
+ else:
43
+ source_image = Image.open(upscale_settings["source_file"])
44
+
45
+ upscale_settings["source_image"] = source_image
46
+
47
+ if upscale_settings["target_file"]:
48
+ result = Image.open(upscale_settings["target_file"])
49
+ else:
50
+ result = Image.new(
51
+ mode="RGBA",
52
+ size=(
53
+ source_image.size[0] * int(upscale_settings["scale_factor"]),
54
+ source_image.size[1] * int(upscale_settings["scale_factor"]),
55
+ ),
56
+ color=(0, 0, 0, 0),
57
+ )
58
+ upscale_settings["target_image"] = result
59
+
60
+ # If the custom tile definition array 'tiles' is empty, proceed with the
61
+ # default tiled upscale task by defining all the possible image tiles; note
62
+ # that the actual tile size is 'tile_size' + 'tile_overlap' and the target
63
+ # image width and height are no longer constrained to multiples of 256 but
64
+ # are instead multiples of the actual tile size
65
+ if len(upscale_settings["tiles"]) == 0:
66
+ tile_size = upscale_settings["tile_size"]
67
+ scale_factor = upscale_settings["scale_factor"]
68
+ tile_overlap = upscale_settings["tile_overlap"]
69
+ total_cols = math.ceil(
70
+ source_image.size[0] / tile_size
71
+ ) # Image width / tile size
72
+ total_rows = math.ceil(
73
+ source_image.size[1] / tile_size
74
+ ) # Image height / tile size
75
+ for y in range(0, total_rows):
76
+ y_offset = tile_overlap if y > 0 else 0 # Tile mask offset
77
+ for x in range(0, total_cols):
78
+ x_offset = tile_overlap if x > 0 else 0 # Tile mask offset
79
+ x1 = x * tile_size
80
+ y1 = y * tile_size
81
+ w = tile_size + (tile_overlap if x < total_cols - 1 else 0)
82
+ h = tile_size + (tile_overlap if y < total_rows - 1 else 0)
83
+ mask_box = ( # Default tile mask box definition
84
+ x_offset,
85
+ y_offset,
86
+ int(w * scale_factor),
87
+ int(h * scale_factor),
88
+ )
89
+ upscale_settings["tiles"].append(
90
+ {
91
+ "x": x1,
92
+ "y": y1,
93
+ "w": w,
94
+ "h": h,
95
+ "mask_box": mask_box,
96
+ "prompt": upscale_settings["prompt"], # Use top level prompt if available
97
+ "scale_factor": scale_factor,
98
+ }
99
+ )
100
+
101
+ # Generate the output image tiles
102
+ for i in range(0, len(upscale_settings["tiles"])):
103
+ generate_upscaled_tile(
104
+ config,
105
+ i,
106
+ upscale_settings,
107
+ context=context,
108
+ )
109
+
110
+ # Save completed upscaled image
111
+ if upscale_settings["output_format"].upper() == "JPEG":
112
+ result_rgb = result.convert("RGB")
113
+ result.close()
114
+ result = result_rgb
115
+ result.save(output_path)
116
+ result.close()
117
+ source_image.close()
118
+ return
119
+
120
+
121
+ def get_current_tile(
122
+ config,
123
+ context,
124
+ strength,
125
+ ):
126
+ config.lcm_diffusion_setting.strength = strength
127
+ config.lcm_diffusion_setting.diffusion_task = DiffusionTask.image_to_image.value
128
+ if (
129
+ config.lcm_diffusion_setting.use_tiny_auto_encoder
130
+ and config.lcm_diffusion_setting.use_openvino
131
+ ):
132
+ config.lcm_diffusion_setting.use_tiny_auto_encoder = False
133
+ current_tile = context.generate_text_to_image(
134
+ settings=config,
135
+ reshape=True,
136
+ device=DEVICE,
137
+ save_config=False,
138
+ )[0]
139
+ return current_tile
140
+
141
+
142
+ # Generates a single tile from the source image as defined in the
143
+ # upscale_settings["tiles"] array with the corresponding index and pastes the
144
+ # generated tile into the target image using the corresponding mask and scale
145
+ # factor; note that scale factor for the target image and the individual tiles
146
+ # can be different; this function adjusts the scale factors as needed
147
+ def generate_upscaled_tile(
148
+ config,
149
+ index,
150
+ upscale_settings,
151
+ context: Context = None,
152
+ ):
153
+ if config == None or upscale_settings == None:
154
+ logging.error("Wrong arguments in tile creation function call!")
155
+ return
156
+
157
+ x = upscale_settings["tiles"][index]["x"]
158
+ y = upscale_settings["tiles"][index]["y"]
159
+ w = upscale_settings["tiles"][index]["w"]
160
+ h = upscale_settings["tiles"][index]["h"]
161
+ tile_prompt = upscale_settings["tiles"][index]["prompt"]
162
+ scale_factor = upscale_settings["scale_factor"]
163
+ tile_scale_factor = upscale_settings["tiles"][index]["scale_factor"]
164
+ target_width = int(w * tile_scale_factor)
165
+ target_height = int(h * tile_scale_factor)
166
+ strength = upscale_settings["strength"]
167
+ source_image = upscale_settings["source_image"]
168
+ target_image = upscale_settings["target_image"]
169
+ mask_image = generate_tile_mask(config, index, upscale_settings)
170
+
171
+ config.lcm_diffusion_setting.number_of_images = 1
172
+ config.lcm_diffusion_setting.prompt = tile_prompt
173
+ config.lcm_diffusion_setting.image_width = target_width
174
+ config.lcm_diffusion_setting.image_height = target_height
175
+ config.lcm_diffusion_setting.init_image = source_image.crop((x, y, x + w, y + h))
176
+
177
+ current_tile = None
178
+ print(f"[SD Upscale] Generating tile {index + 1}/{len(upscale_settings['tiles'])} ")
179
+ if tile_prompt == None or tile_prompt == "":
180
+ config.lcm_diffusion_setting.prompt = ""
181
+ config.lcm_diffusion_setting.negative_prompt = ""
182
+ current_tile = get_current_tile(config, context, strength)
183
+ else:
184
+ # Attempt to use img2img with low denoising strength to
185
+ # generate the tiles with the extra aid of a prompt
186
+ # context = get_context(InterfaceType.CLI)
187
+ current_tile = get_current_tile(config, context, strength)
188
+
189
+ if math.isclose(scale_factor, tile_scale_factor):
190
+ target_image.paste(
191
+ current_tile, (int(x * scale_factor), int(y * scale_factor)), mask_image
192
+ )
193
+ else:
194
+ target_image.paste(
195
+ current_tile.resize((int(w * scale_factor), int(h * scale_factor))),
196
+ (int(x * scale_factor), int(y * scale_factor)),
197
+ mask_image.resize((int(w * scale_factor), int(h * scale_factor))),
198
+ )
199
+ mask_image.close()
200
+ current_tile.close()
201
+ config.lcm_diffusion_setting.init_image.close()
202
+
203
+
204
+ # Generate tile mask using the box definition in the upscale_settings["tiles"]
205
+ # array with the corresponding index; note that tile masks for the default
206
+ # tiled upscale task can be reused but that would complicate the code, so
207
+ # new tile masks are instead created for each tile
208
+ def generate_tile_mask(
209
+ config,
210
+ index,
211
+ upscale_settings,
212
+ ):
213
+ scale_factor = upscale_settings["scale_factor"]
214
+ tile_overlap = upscale_settings["tile_overlap"]
215
+ tile_scale_factor = upscale_settings["tiles"][index]["scale_factor"]
216
+ w = int(upscale_settings["tiles"][index]["w"] * tile_scale_factor)
217
+ h = int(upscale_settings["tiles"][index]["h"] * tile_scale_factor)
218
+ # The Stable Diffusion pipeline automatically adjusts the output size
219
+ # to multiples of 8 pixels; the mask must be created with the same
220
+ # size as the output tile
221
+ w = w - (w % 8)
222
+ h = h - (h % 8)
223
+ mask_box = upscale_settings["tiles"][index]["mask_box"]
224
+ if mask_box == None:
225
+ # Build a default solid mask with soft/transparent edges
226
+ mask_box = (
227
+ tile_overlap,
228
+ tile_overlap,
229
+ w - tile_overlap,
230
+ h - tile_overlap,
231
+ )
232
+ mask_image = Image.new(mode="RGBA", size=(w, h), color=(0, 0, 0, 0))
233
+ mask_draw = ImageDraw.Draw(mask_image)
234
+ mask_draw.rectangle(tuple(mask_box), fill=(0, 0, 0))
235
+ mask_blur = mask_image.filter(ImageFilter.BoxBlur(tile_overlap - 1))
236
+ mask_image.close()
237
+ return mask_blur
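The tiled upscaler above is driven entirely by the upscale_settings dict: when the tiles list is empty, generate_upscaled_image covers the whole source image with tile_size x tile_size tiles (plus tile_overlap), regenerates each tile with img2img at the given strength, and pastes it into the target image through a blurred mask. A sketch of the dict shape it expects, with placeholder paths and prompt; config and context are assumed to be the application's Settings and Context objects:

custom_settings = {
    "source_file": "input.png",   # placeholder path
    "target_file": None,          # set to an existing upscale to patch only selected tiles
    "output_format": "PNG",
    "strength": 0.3,              # img2img denoising strength per tile
    "scale_factor": 2.0,
    "prompt": "a detailed photo", # placeholder; reused for every auto-generated tile
    "tile_overlap": 16,
    "tile_size": 256,
    "tiles": [],                  # empty list -> tiles are generated automatically
}

generate_upscaled_image(
    config,
    input_path="input.png",            # placeholder path
    upscale_settings=custom_settings,  # takes precedence over the bare arguments
    context=context,
    output_path="output_2x.png",       # placeholder path
)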
backend/upscale/upscaler.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from backend.models.lcmdiffusion_setting import DiffusionTask
2
+ from backend.models.upscale import UpscaleMode
3
+ from backend.upscale.edsr_upscale_onnx import upscale_edsr_2x
4
+ from backend.upscale.aura_sr_upscale import upscale_aura_sr
5
+ from backend.upscale.tiled_upscale import generate_upscaled_image
6
+ from context import Context
7
+ from PIL import Image
8
+ from state import get_settings
9
+
10
+
11
+ config = get_settings()
12
+
13
+
14
+ def upscale_image(
15
+ context: Context,
16
+ src_image_path: str,
17
+ dst_image_path: str,
18
+ scale_factor: int = 2,
19
+ upscale_mode: UpscaleMode = UpscaleMode.normal.value,
20
+ strength: float = 0.1,
21
+ ):
22
+ if upscale_mode == UpscaleMode.normal.value:
23
+ upscaled_img = upscale_edsr_2x(src_image_path)
24
+ upscaled_img.save(dst_image_path)
25
+ print(f"Upscaled image saved {dst_image_path}")
26
+ elif upscale_mode == UpscaleMode.aura_sr.value:
27
+ upscaled_img = upscale_aura_sr(src_image_path)
28
+ upscaled_img.save(dst_image_path)
29
+ print(f"Upscaled image saved {dst_image_path}")
30
+ else:
31
+ config.settings.lcm_diffusion_setting.strength = (
32
+ 0.3 if config.settings.lcm_diffusion_setting.use_openvino else strength
33
+ )
34
+ config.settings.lcm_diffusion_setting.diffusion_task = (
35
+ DiffusionTask.image_to_image.value
36
+ )
37
+
38
+ generate_upscaled_image(
39
+ config.settings,
40
+ src_image_path,
41
+ config.settings.lcm_diffusion_setting.strength,
42
+ upscale_settings=None,
43
+ context=context,
44
+ tile_overlap=(
45
+ 32 if config.settings.lcm_diffusion_setting.use_openvino else 16
46
+ ),
47
+ output_path=dst_image_path,
48
+ image_format=config.settings.generated_images.format,
49
+ )
50
+ print(f"Upscaled image saved {dst_image_path}")
51
+
52
+ return [Image.open(dst_image_path)]
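upscale_image above dispatches on upscale_mode: UpscaleMode.normal runs the 2x EDSR ONNX path, UpscaleMode.aura_sr runs the 4x AuraSR GAN, and anything else falls through to the SD tiled upscale. A minimal sketch, assuming the application settings have already been initialized (upscaler.py calls get_settings() at import time) and a CLI-style Context as referenced elsewhere in this commit; paths are placeholders:

from backend.models.upscale import UpscaleMode
from backend.upscale.upscaler import upscale_image
from context import Context
from models.interface_types import InterfaceType

context = Context(InterfaceType.CLI)
results = upscale_image(
    context,
    src_image_path="photo.png",           # placeholder
    dst_image_path="photo_upscaled.png",  # placeholder
    upscale_mode=UpscaleMode.aura_sr.value,
)
results[0].show()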
configs/lcm-lora-models.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ latent-consistency/lcm-lora-sdv1-5
2
+ latent-consistency/lcm-lora-sdxl
3
+ latent-consistency/lcm-lora-ssd-1b
4
+ rupeshs/hypersd-sd1-5-1-step-lora
configs/lcm-models.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ stabilityai/sd-turbo
2
+ rupeshs/sdxs-512-0.9-orig-vae
3
+ rupeshs/hyper-sd-sdxl-1-step
4
+ rupeshs/SDXL-Lightning-2steps
5
+ stabilityai/sdxl-turbo
6
+ SimianLuo/LCM_Dreamshaper_v7
7
+ latent-consistency/lcm-sdxl
8
+ latent-consistency/lcm-ssd-1b
configs/openvino-lcm-models.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ rupeshs/sd-turbo-openvino
2
+ rupeshs/sdxs-512-0.9-openvino
3
+ rupeshs/hyper-sd-sdxl-1-step-openvino-int8
4
+ rupeshs/SDXL-Lightning-2steps-openvino-int8
5
+ rupeshs/sdxl-turbo-openvino-int8
6
+ rupeshs/LCM-dreamshaper-v7-openvino
7
+ Disty0/LCM_SoteMix
8
+ rupeshs/FLUX.1-schnell-openvino-int4
9
+ rupeshs/sd15-lcm-square-openvino-int8
configs/stable-diffusion-models.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Lykon/dreamshaper-8
2
+ Fictiverse/Stable_Diffusion_PaperCut_Model
3
+ stabilityai/stable-diffusion-xl-base-1.0
4
+ runwayml/stable-diffusion-v1-5
5
+ segmind/SSD-1B
6
+ stablediffusionapi/anything-v5
7
+ prompthero/openjourney-v4
constants.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from os import environ, cpu_count
2
+
3
+ cpu_cores = cpu_count()
4
+ cpus = cpu_cores // 2 if cpu_cores else 0
5
+ APP_VERSION = "v1.0.0 beta 200"
6
+ LCM_DEFAULT_MODEL = "stabilityai/sd-turbo"
7
+ LCM_DEFAULT_MODEL_OPENVINO = "rupeshs/sd-turbo-openvino"
8
+ APP_NAME = "FastSD CPU"
9
+ APP_SETTINGS_FILE = "settings.yaml"
10
+ RESULTS_DIRECTORY = "results"
11
+ CONFIG_DIRECTORY = "configs"
12
+ DEVICE = environ.get("DEVICE", "cpu")
13
+ SD_MODELS_FILE = "stable-diffusion-models.txt"
14
+ LCM_LORA_MODELS_FILE = "lcm-lora-models.txt"
15
+ OPENVINO_LCM_MODELS_FILE = "openvino-lcm-models.txt"
16
+ TAESD_MODEL = "madebyollin/taesd"
17
+ TAESDXL_MODEL = "madebyollin/taesdxl"
18
+ TAESD_MODEL_OPENVINO = "deinferno/taesd-openvino"
19
+ LCM_MODELS_FILE = "lcm-models.txt"
20
+ TAESDXL_MODEL_OPENVINO = "rupeshs/taesdxl-openvino"
21
+ LORA_DIRECTORY = "lora_models"
22
+ CONTROLNET_DIRECTORY = "controlnet_models"
23
+ MODELS_DIRECTORY = "models"
24
+ GGUF_THREADS = environ.get("GGUF_THREADS", cpus)
25
+ TAEF1_MODEL_OPENVINO = "rupeshs/taef1-openvino"
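Both DEVICE and GGUF_THREADS are read from the environment at import time, so they can be overridden without touching the code. A small sketch; the values shown are only examples:

import os

# Must be set before constants is imported anywhere in the process
os.environ["DEVICE"] = "cpu"       # example value
os.environ["GGUF_THREADS"] = "8"   # example value; note env values arrive as strings

import constants

print(constants.APP_NAME, constants.APP_VERSION)
print(constants.DEVICE, constants.GGUF_THREADS)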
context.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+ from app_settings import Settings
3
+ from models.interface_types import InterfaceType
4
+ from backend.models.lcmdiffusion_setting import DiffusionTask
5
+ from backend.lcm_text_to_image import LCMTextToImage
6
+ from time import perf_counter
7
+ from backend.image_saver import ImageSaver
8
+ from pprint import pprint
9
+
10
+
11
+ class Context:
12
+ def __init__(
13
+ self,
14
+ interface_type: InterfaceType,
15
+ device="cpu",
16
+ ):
17
+ self.interface_type = interface_type.value
18
+ self.lcm_text_to_image = LCMTextToImage(device)
19
+ self._latency = 0
20
+
21
+ @property
22
+ def latency(self):
23
+ return self._latency
24
+
25
+ def generate_text_to_image(
26
+ self,
27
+ settings: Settings,
28
+ reshape: bool = False,
29
+ device: str = "cpu",
30
+ save_config=True,
31
+ ) -> Any:
32
+ if (
33
+ settings.lcm_diffusion_setting.use_tiny_auto_encoder
34
+ and settings.lcm_diffusion_setting.use_openvino
35
+ ):
36
+ print(
37
+ "WARNING: Tiny AutoEncoder is not supported in Image to image mode (OpenVINO)"
38
+ )
39
+ tick = perf_counter()
40
+ from state import get_settings
41
+
42
+ if (
43
+ settings.lcm_diffusion_setting.diffusion_task
44
+ == DiffusionTask.text_to_image.value
45
+ ):
46
+ settings.lcm_diffusion_setting.init_image = None
47
+
48
+ if save_config:
49
+ get_settings().save()
50
+
51
+ pprint(settings.lcm_diffusion_setting.model_dump())
52
+ if not settings.lcm_diffusion_setting.lcm_lora:
53
+ return None
54
+ self.lcm_text_to_image.init(
55
+ device,
56
+ settings.lcm_diffusion_setting,
57
+ )
58
+ images = self.lcm_text_to_image.generate(
59
+ settings.lcm_diffusion_setting,
60
+ reshape,
61
+ )
62
+ elapsed = perf_counter() - tick
63
+ self._latency = elapsed
64
+ print(f"Latency : {elapsed:.2f} seconds")
65
+ if settings.lcm_diffusion_setting.controlnet:
66
+ if settings.lcm_diffusion_setting.controlnet.enabled:
67
+ images.append(settings.lcm_diffusion_setting.controlnet._control_image)
68
+ return images
69
+
70
+
71
+ def save_images(
72
+ self,
73
+ images: Any,
74
+ settings: Settings,
75
+ ) -> list[str]:
76
+ saved_images = []
77
+ if images and settings.generated_images.save_image:
78
+ saved_images = ImageSaver.save_images(
79
+ settings.generated_images.path,
80
+ images=images,
81
+ lcm_diffusion_setting=settings.lcm_diffusion_setting,
82
+ format=settings.generated_images.format,
83
+ jpeg_quality=settings.generated_images.save_image_quality,
84
+ )
85
+ return saved_images
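Context ties the pieces together: generate_text_to_image (re)initializes the LCMTextToImage pipeline from the current LCM diffusion settings, records the latency, and returns PIL images, while save_images writes them out according to the generated_images settings. A minimal sketch of direct use, assuming settings are loaded through state.get_settings() as in the rest of this commit and an example prompt:

from constants import DEVICE
from context import Context
from models.interface_types import InterfaceType
from state import get_settings

app_settings = get_settings()
app_settings.settings.lcm_diffusion_setting.prompt = "a cat wearing sunglasses"  # example prompt

context = Context(InterfaceType.CLI)
images = context.generate_text_to_image(
    settings=app_settings.settings,
    device=DEVICE,
)
context.save_images(images, app_settings.settings)
print(f"Latency: {context.latency:.2f} seconds")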
frontend/cli_interactive.py ADDED
@@ -0,0 +1,661 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from os import path
2
+ from PIL import Image
3
+ from typing import Any
4
+
5
+ from constants import DEVICE
6
+ from paths import FastStableDiffusionPaths
7
+ from backend.upscale.upscaler import upscale_image
8
+ from backend.upscale.tiled_upscale import generate_upscaled_image
9
+ from frontend.webui.image_variations_ui import generate_image_variations
10
+ from backend.lora import (
11
+ get_active_lora_weights,
12
+ update_lora_weights,
13
+ load_lora_weight,
14
+ )
15
+ from backend.models.lcmdiffusion_setting import (
16
+ DiffusionTask,
17
+ ControlNetSetting,
18
+ )
19
+
20
+
21
+ _batch_count = 1
22
+ _edit_lora_settings = False
23
+
24
+
25
+ def user_value(
26
+ value_type: type,
27
+ message: str,
28
+ default_value: Any,
29
+ ) -> Any:
30
+ try:
31
+ value = value_type(input(message))
32
+ except (TypeError, ValueError, EOFError):
33
+ value = default_value
34
+ return value
35
+
36
+
37
+ def interactive_mode(
38
+ config,
39
+ context,
40
+ ):
41
+ print("=============================================")
42
+ print("Welcome to FastSD CPU Interactive CLI")
43
+ print("=============================================")
44
+ while True:
45
+ print("> 1. Text to Image")
46
+ print("> 2. Image to Image")
47
+ print("> 3. Image Variations")
48
+ print("> 4. EDSR Upscale")
49
+ print("> 5. SD Upscale")
50
+ print("> 6. Edit default generation settings")
51
+ print("> 7. Edit LoRA settings")
52
+ print("> 8. Edit ControlNet settings")
53
+ print("> 9. Edit negative prompt")
54
+ print("> 10. Quit")
55
+ option = user_value(
56
+ int,
57
+ "Enter a Diffusion Task number (1): ",
58
+ 1,
59
+ )
60
+ if option not in range(1, 11):
61
+ print("Wrong Diffusion Task number!")
62
+ exit()
63
+
64
+ if option == 1:
65
+ interactive_txt2img(
66
+ config,
67
+ context,
68
+ )
69
+ elif option == 2:
70
+ interactive_img2img(
71
+ config,
72
+ context,
73
+ )
74
+ elif option == 3:
75
+ interactive_variations(
76
+ config,
77
+ context,
78
+ )
79
+ elif option == 4:
80
+ interactive_edsr(
81
+ config,
82
+ context,
83
+ )
84
+ elif option == 5:
85
+ interactive_sdupscale(
86
+ config,
87
+ context,
88
+ )
89
+ elif option == 6:
90
+ interactive_settings(
91
+ config,
92
+ context,
93
+ )
94
+ elif option == 7:
95
+ interactive_lora(
96
+ config,
97
+ context,
98
+ True,
99
+ )
100
+ elif option == 8:
101
+ interactive_controlnet(
102
+ config,
103
+ context,
104
+ True,
105
+ )
106
+ elif option == 9:
107
+ interactive_negative(
108
+ config,
109
+ context,
110
+ )
111
+ elif option == 10:
112
+ exit()
113
+
114
+
115
+ def interactive_negative(
116
+ config,
117
+ context,
118
+ ):
119
+ settings = config.lcm_diffusion_setting
120
+ print(f"Current negative prompt: '{settings.negative_prompt}'")
121
+ user_input = input("Write a negative prompt (set guidance > 1.0): ")
122
+ if user_input == "":
123
+ return
124
+ else:
125
+ settings.negative_prompt = user_input
126
+
127
+
128
+ def interactive_controlnet(
129
+ config,
130
+ context,
131
+ menu_flag=False,
132
+ ):
133
+ """
134
+ @param menu_flag: Indicates whether this function was called from the main
135
+ interactive CLI menu; _True_ if called from the main menu, _False_ otherwise
136
+ """
137
+ settings = config.lcm_diffusion_setting
138
+ if not settings.controlnet:
139
+ settings.controlnet = ControlNetSetting()
140
+
141
+ current_enabled = settings.controlnet.enabled
142
+ current_adapter_path = settings.controlnet.adapter_path
143
+ current_conditioning_scale = settings.controlnet.conditioning_scale
144
+ current_control_image = settings.controlnet._control_image
145
+
146
+ option = input("Enable ControlNet? (y/N): ")
147
+ settings.controlnet.enabled = True if option.upper() == "Y" else False
148
+ if settings.controlnet.enabled:
149
+ option = input(
150
+ f"Enter ControlNet adapter path ({settings.controlnet.adapter_path}): "
151
+ )
152
+ if option != "":
153
+ settings.controlnet.adapter_path = option
154
+ settings.controlnet.conditioning_scale = user_value(
155
+ float,
156
+ f"Enter ControlNet conditioning scale ({settings.controlnet.conditioning_scale}): ",
157
+ settings.controlnet.conditioning_scale,
158
+ )
159
+ option = input(
160
+ f"Enter ControlNet control image path (Leave empty to reuse current): "
161
+ )
162
+ if option != "":
163
+ try:
164
+ new_image = Image.open(option)
165
+ settings.controlnet._control_image = new_image
166
+ except (AttributeError, FileNotFoundError) as e:
167
+ settings.controlnet._control_image = None
168
+ if (
169
+ not settings.controlnet.adapter_path
170
+ or not path.exists(settings.controlnet.adapter_path)
171
+ or not settings.controlnet._control_image
172
+ ):
173
+ print("Invalid ControlNet settings! Disabling ControlNet")
174
+ settings.controlnet.enabled = False
175
+
176
+ if (
177
+ settings.controlnet.enabled != current_enabled
178
+ or settings.controlnet.adapter_path != current_adapter_path
179
+ ):
180
+ settings.rebuild_pipeline = True
181
+
182
+
183
+ def interactive_lora(
184
+ config,
185
+ context,
186
+ menu_flag=False,
187
+ ):
188
+ """
189
+ @param menu_flag: Indicates whether this function was called from the main
190
+ interactive CLI menu; _True_ if called from the main menu, _False_ otherwise
191
+ """
192
+ if context == None or context.lcm_text_to_image.pipeline == None:
193
+ print("Diffusion pipeline not initialized, please run a generation task first!")
194
+ return
195
+
196
+ print("> 1. Change LoRA weights")
197
+ print("> 2. Load new LoRA model")
198
+ option = user_value(
199
+ int,
200
+ "Enter a LoRA option (1): ",
201
+ 1,
202
+ )
203
+ if option not in range(1, 3):
204
+ print("Wrong LoRA option!")
205
+ return
206
+
207
+ if option == 1:
208
+ update_weights = []
209
+ active_weights = get_active_lora_weights()
210
+ for lora in active_weights:
211
+ weight = user_value(
212
+ float,
213
+ f"Enter a new LoRA weight for {lora[0]} ({lora[1]}): ",
214
+ lora[1],
215
+ )
216
+ update_weights.append(
217
+ (
218
+ lora[0],
219
+ weight,
220
+ )
221
+ )
222
+ if len(update_weights) > 0:
223
+ update_lora_weights(
224
+ context.lcm_text_to_image.pipeline,
225
+ config.lcm_diffusion_setting,
226
+ update_weights,
227
+ )
228
+ elif option == 2:
229
+ # Load a new LoRA
230
+ settings = config.lcm_diffusion_setting
231
+ settings.lora.fuse = False
232
+ settings.lora.enabled = False
233
+ settings.lora.path = input("Enter LoRA model path: ")
234
+ settings.lora.weight = user_value(
235
+ float,
236
+ "Enter a LoRA weight (0.5): ",
237
+ 0.5,
238
+ )
239
+ if not path.exists(settings.lora.path):
240
+ print("Invalid LoRA model path!")
241
+ return
242
+ settings.lora.enabled = True
243
+ load_lora_weight(context.lcm_text_to_image.pipeline, settings)
244
+
245
+ if menu_flag:
246
+ global _edit_lora_settings
247
+ _edit_lora_settings = False
248
+ option = input("Edit LoRA settings after every generation? (y/N): ")
249
+ if option.upper() == "Y":
250
+ _edit_lora_settings = True
251
+
252
+
253
+ def interactive_settings(
254
+ config,
255
+ context,
256
+ ):
257
+ global _batch_count
258
+ settings = config.lcm_diffusion_setting
259
+ print("Enter generation settings (leave empty to use current value)")
260
+ print("> 1. Use LCM")
261
+ print("> 2. Use LCM-Lora")
262
+ print("> 3. Use OpenVINO")
263
+ option = user_value(
264
+ int,
265
+ "Select inference model option (1): ",
266
+ 1,
267
+ )
268
+ if option not in range(1, 4):
269
+ print("Wrong inference model option! Falling back to defaults")
270
+ return
271
+
272
+ settings.use_lcm_lora = False
273
+ settings.use_openvino = False
274
+ if option == 1:
275
+ lcm_model_id = input(f"Enter LCM model ID ({settings.lcm_model_id}): ")
276
+ if lcm_model_id != "":
277
+ settings.lcm_model_id = lcm_model_id
278
+ elif option == 2:
279
+ settings.use_lcm_lora = True
280
+ lcm_lora_id = input(
281
+ f"Enter LCM-Lora model ID ({settings.lcm_lora.lcm_lora_id}): "
282
+ )
283
+ if lcm_lora_id != "":
284
+ settings.lcm_lora.lcm_lora_id = lcm_lora_id
285
+ base_model_id = input(
286
+ f"Enter Base model ID ({settings.lcm_lora.base_model_id}): "
287
+ )
288
+ if base_model_id != "":
289
+ settings.lcm_lora.base_model_id = base_model_id
290
+ elif option == 3:
291
+ settings.use_openvino = True
292
+ openvino_lcm_model_id = input(
293
+ f"Enter OpenVINO model ID ({settings.openvino_lcm_model_id}): "
294
+ )
295
+ if openvino_lcm_model_id != "":
296
+ settings.openvino_lcm_model_id = openvino_lcm_model_id
297
+
298
+ settings.use_offline_model = True
299
+ settings.use_tiny_auto_encoder = True
300
+ option = input("Work offline? (Y/n): ")
301
+ if option.upper() == "N":
302
+ settings.use_offline_model = False
303
+ option = input("Use Tiny Auto Encoder? (Y/n): ")
304
+ if option.upper() == "N":
305
+ settings.use_tiny_auto_encoder = False
306
+
307
+ settings.image_width = user_value(
308
+ int,
309
+ f"Image width ({settings.image_width}): ",
310
+ settings.image_width,
311
+ )
312
+ settings.image_height = user_value(
313
+ int,
314
+ f"Image height ({settings.image_height}): ",
315
+ settings.image_height,
316
+ )
317
+ settings.inference_steps = user_value(
318
+ int,
319
+ f"Inference steps ({settings.inference_steps}): ",
320
+ settings.inference_steps,
321
+ )
322
+ settings.guidance_scale = user_value(
323
+ float,
324
+ f"Guidance scale ({settings.guidance_scale}): ",
325
+ settings.guidance_scale,
326
+ )
327
+ settings.number_of_images = user_value(
328
+ int,
329
+ f"Number of images per batch ({settings.number_of_images}): ",
330
+ settings.number_of_images,
331
+ )
332
+ _batch_count = user_value(
333
+ int,
334
+ f"Batch count ({_batch_count}): ",
335
+ _batch_count,
336
+ )
337
+ # output_format = user_value(int, f"Output format (PNG)", 1)
338
+ print(config.lcm_diffusion_setting)
339
+
340
+
341
+ def interactive_txt2img(
342
+ config,
343
+ context,
344
+ ):
345
+ global _batch_count
346
+ config.lcm_diffusion_setting.diffusion_task = DiffusionTask.text_to_image.value
347
+ user_input = input("Write a prompt (write 'exit' to quit): ")
348
+ while True:
349
+ if user_input == "exit":
350
+ return
351
+ elif user_input == "":
352
+ user_input = config.lcm_diffusion_setting.prompt
353
+ config.lcm_diffusion_setting.prompt = user_input
354
+ for _ in range(0, _batch_count):
355
+ images = context.generate_text_to_image(
356
+ settings=config,
357
+ device=DEVICE,
358
+ )
359
+ context.save_images(
360
+ images,
361
+ config,
362
+ )
363
+ if _edit_lora_settings:
364
+ interactive_lora(
365
+ config,
366
+ context,
367
+ )
368
+ user_input = input("Write a prompt: ")
369
+
370
+
371
+ def interactive_img2img(
372
+ config,
373
+ context,
374
+ ):
375
+ global _batch_count
376
+ settings = config.lcm_diffusion_setting
377
+ settings.diffusion_task = DiffusionTask.image_to_image.value
378
+ steps = settings.inference_steps
379
+ source_path = input("Image path: ")
380
+ if source_path == "":
381
+ print("Error : You need to provide a file in img2img mode")
382
+ return
383
+ settings.strength = user_value(
384
+ float,
385
+ f"img2img strength ({settings.strength}): ",
386
+ settings.strength,
387
+ )
388
+ settings.inference_steps = int(steps / settings.strength + 1)
389
+ user_input = input("Write a prompt (write 'exit' to quit): ")
390
+ while True:
391
+ if user_input == "exit":
392
+ settings.inference_steps = steps
393
+ return
394
+ settings.init_image = Image.open(source_path)
395
+ settings.prompt = user_input
396
+ for _ in range(0, _batch_count):
397
+ images = context.generate_text_to_image(
398
+ settings=config,
399
+ device=DEVICE,
400
+ )
401
+ context.save_images(
402
+ images,
403
+ config,
404
+ )
405
+ new_path = input(f"Image path ({source_path}): ")
406
+ if new_path != "":
407
+ source_path = new_path
408
+ settings.strength = user_value(
409
+ float,
410
+ f"img2img strength ({settings.strength}): ",
411
+ settings.strength,
412
+ )
413
+ if _edit_lora_settings:
414
+ interactive_lora(
415
+ config,
416
+ context,
417
+ )
418
+ settings.inference_steps = int(steps / settings.strength + 1)
419
+ user_input = input("Write a prompt: ")
420
+
421
+
422
+ def interactive_variations(
423
+ config,
424
+ context,
425
+ ):
426
+ global _batch_count
427
+ settings = config.lcm_diffusion_setting
428
+ settings.diffusion_task = DiffusionTask.image_to_image.value
429
+ steps = settings.inference_steps
430
+ source_path = input("Image path: ")
431
+ if source_path == "":
432
+ print("Error : You need to provide a file in Image variations mode")
433
+ return
434
+ settings.strength = user_value(
435
+ float,
436
+ f"Image variations strength ({settings.strength}): ",
437
+ settings.strength,
438
+ )
439
+ settings.inference_steps = int(steps / settings.strength + 1)
440
+ while True:
441
+ settings.init_image = Image.open(source_path)
442
+ settings.prompt = ""
443
+ for i in range(0, _batch_count):
444
+ generate_image_variations(
445
+ settings.init_image,
446
+ settings.strength,
447
+ )
448
+ if _edit_lora_settings:
449
+ interactive_lora(
450
+ config,
451
+ context,
452
+ )
453
+ user_input = input("Continue in Image variations mode? (Y/n): ")
454
+ if user_input.upper() == "N":
455
+ settings.inference_steps = steps
456
+ return
457
+ new_path = input(f"Image path ({source_path}): ")
458
+ if new_path != "":
459
+ source_path = new_path
460
+ settings.strength = user_value(
461
+ float,
462
+ f"Image variations strength ({settings.strength}): ",
463
+ settings.strength,
464
+ )
465
+ settings.inference_steps = int(steps / settings.strength + 1)
466
+
467
+
468
+ def interactive_edsr(
469
+ config,
470
+ context,
471
+ ):
472
+ source_path = input("Image path: ")
473
+ if source_path == "":
474
+ print("Error : You need to provide a file in EDSR mode")
475
+ return
476
+ while True:
477
+ output_path = FastStableDiffusionPaths.get_upscale_filepath(
478
+ source_path,
479
+ 2,
480
+ config.generated_images.format,
481
+ )
482
+ result = upscale_image(
483
+ context,
484
+ source_path,
485
+ output_path,
486
+ 2,
487
+ )
488
+ user_input = input("Continue in EDSR upscale mode? (Y/n): ")
489
+ if user_input.upper() == "N":
490
+ return
491
+ new_path = input(f"Image path ({source_path}): ")
492
+ if new_path != "":
493
+ source_path = new_path
494
+
495
+
496
+ def interactive_sdupscale_settings(config):
497
+ steps = config.lcm_diffusion_setting.inference_steps
498
+ custom_settings = {}
499
+ print("> 1. Upscale whole image")
500
+ print("> 2. Define custom tiles (advanced)")
501
+ option = user_value(
502
+ int,
503
+ "Select an SD Upscale option (1): ",
504
+ 1,
505
+ )
506
+ if option not in range(1, 3):
507
+ print("Wrong SD Upscale option!")
508
+ return
509
+
510
+ # custom_settings["source_file"] = args.file
511
+ custom_settings["source_file"] = ""
512
+ new_path = input(f"Input image path ({custom_settings['source_file']}): ")
513
+ if new_path != "":
514
+ custom_settings["source_file"] = new_path
515
+ if custom_settings["source_file"] == "":
516
+ print("Error : You need to provide a file in SD Upscale mode")
517
+ return
518
+ custom_settings["target_file"] = None
519
+ if option == 2:
520
+ custom_settings["target_file"] = input("Image to patch: ")
521
+ if custom_settings["target_file"] == "":
522
+ print("No target file provided, upscaling whole input image instead!")
523
+ custom_settings["target_file"] = None
524
+ option = 1
525
+ custom_settings["output_format"] = config.generated_images.format
526
+ custom_settings["strength"] = user_value(
527
+ float,
528
+ f"SD Upscale strength ({config.lcm_diffusion_setting.strength}): ",
529
+ config.lcm_diffusion_setting.strength,
530
+ )
531
+ config.lcm_diffusion_setting.inference_steps = int(
532
+ steps / custom_settings["strength"] + 1
533
+ )
534
+ if option == 1:
535
+ custom_settings["scale_factor"] = user_value(
536
+ float,
537
+ f"Scale factor (2.0): ",
538
+ 2.0,
539
+ )
540
+ custom_settings["tile_size"] = user_value(
541
+ int,
542
+ f"Split input image into tiles of the following size, in pixels (256): ",
543
+ 256,
544
+ )
545
+ custom_settings["tile_overlap"] = user_value(
546
+ int,
547
+ f"Tile overlap, in pixels (16): ",
548
+ 16,
549
+ )
550
+ elif option == 2:
551
+ custom_settings["scale_factor"] = user_value(
552
+ float,
553
+ "Input image to Image-to-patch scale_factor (2.0): ",
554
+ 2.0,
555
+ )
556
+ custom_settings["tile_size"] = 256
557
+ custom_settings["tile_overlap"] = 16
558
+ custom_settings["prompt"] = input(
559
+ "Write a prompt describing the input image (optional): "
560
+ )
561
+ custom_settings["tiles"] = []
562
+ if option == 2:
563
+ add_tile = True
564
+ while add_tile:
565
+ print("=== Define custom SD Upscale tile ===")
566
+ tile_x = user_value(
567
+ int,
568
+ "Enter tile's X position: ",
569
+ 0,
570
+ )
571
+ tile_y = user_value(
572
+ int,
573
+ "Enter tile's Y position: ",
574
+ 0,
575
+ )
576
+ tile_w = user_value(
577
+ int,
578
+ "Enter tile's width (256): ",
579
+ 256,
580
+ )
581
+ tile_h = user_value(
582
+ int,
583
+ "Enter tile's height (256): ",
584
+ 256,
585
+ )
586
+ tile_scale = user_value(
587
+ float,
588
+ "Enter tile's scale factor (2.0): ",
589
+ 2.0,
590
+ )
591
+ tile_prompt = input("Enter tile's prompt (optional): ")
592
+ custom_settings["tiles"].append(
593
+ {
594
+ "x": tile_x,
595
+ "y": tile_y,
596
+ "w": tile_w,
597
+ "h": tile_h,
598
+ "mask_box": None,
599
+ "prompt": tile_prompt,
600
+ "scale_factor": tile_scale,
601
+ }
602
+ )
603
+ tile_option = input("Do you want to define another tile? (y/N): ")
604
+ if tile_option == "" or tile_option.upper() == "N":
605
+ add_tile = False
606
+
607
+ return custom_settings
608
+
609
+
610
+ def interactive_sdupscale(
611
+ config,
612
+ context,
613
+ ):
614
+ settings = config.lcm_diffusion_setting
615
+ settings.diffusion_task = DiffusionTask.image_to_image.value
616
+ settings.init_image = ""
617
+ source_path = ""
618
+ steps = settings.inference_steps
619
+
620
+ while True:
621
+ custom_upscale_settings = None
622
+ option = input("Edit custom SD Upscale settings? (y/N): ")
623
+ if option.upper() == "Y":
624
+ config.lcm_diffusion_setting.inference_steps = steps
625
+ custom_upscale_settings = interactive_sdupscale_settings(config)
626
+ if not custom_upscale_settings:
627
+ return
628
+ source_path = custom_upscale_settings["source_file"]
629
+ else:
630
+ new_path = input(f"Image path ({source_path}): ")
631
+ if new_path != "":
632
+ source_path = new_path
633
+ if source_path == "":
634
+ print("Error : You need to provide a file in SD Upscale mode")
635
+ return
636
+ settings.strength = user_value(
637
+ float,
638
+ f"SD Upscale strength ({settings.strength}): ",
639
+ settings.strength,
640
+ )
641
+ settings.inference_steps = int(steps / settings.strength + 1)
642
+
643
+ output_path = FastStableDiffusionPaths.get_upscale_filepath(
644
+ source_path,
645
+ 2,
646
+ config.generated_images.format,
647
+ )
648
+ generate_upscaled_image(
649
+ config,
650
+ source_path,
651
+ settings.strength,
652
+ upscale_settings=custom_upscale_settings,
653
+ context=context,
654
+ tile_overlap=32 if settings.use_openvino else 16,
655
+ output_path=output_path,
656
+ image_format=config.generated_images.format,
657
+ )
658
+ user_input = input("Continue in SD Upscale mode? (Y/n): ")
659
+ if user_input.upper() == "N":
660
+ settings.inference_steps = steps
661
+ return
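interactive_mode above is the entry point for the whole menu loop; it only needs the application's Settings object and an initialized Context. The real wiring lives in app.py (not shown in this view), so treat the following launch sketch as an assumption:

from context import Context
from frontend.cli_interactive import interactive_mode
from models.interface_types import InterfaceType
from state import get_settings

app_settings = get_settings()
context = Context(InterfaceType.CLI)

# Starts the menu: text to image, image to image, variations, upscaling, settings
interactive_mode(app_settings.settings, context)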