Meismaxandmaxisme committed on
Commit
4d7970b
·
verified ·
1 Parent(s): 1e8cffa

Upload 3 files

Browse files
src/backend/openvino/ov_hc_stablediffusion_pipeline.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """This is an experimental pipeline used to test AI PC NPU and GPU"""
2
+
3
+ from pathlib import Path
4
+
5
+ from diffusers import EulerDiscreteScheduler,LCMScheduler
6
+ from huggingface_hub import snapshot_download
7
+ from PIL import Image
8
+ from backend.openvino.stable_diffusion_engine import (
9
+ StableDiffusionEngineAdvanced,
10
+ LatentConsistencyEngineAdvanced
11
+ )
12
+
13
+
14
class OvHcStableDiffusion:
    """OpenVINO heterogeneous-compute Stable Diffusion pipeline.

    Thin wrapper that downloads a model snapshot, builds a
    ``StableDiffusionEngineAdvanced`` on the requested devices and drives it
    with an ``EulerDiscreteScheduler``.
    """

    def __init__(
        self,
        model_path,
        device: list = None,
    ):
        """Download the model and create the scheduler and the engine.

        Args:
            model_path: Identifier handed to ``snapshot_download``.
            device: Device names forwarded to the engine. When omitted,
                ``["GPU", "NPU", "GPU", "GPU"]`` is used.
        """
        # Build the default per call so instances never share one list object.
        if device is None:
            device = ["GPU", "NPU", "GPU", "GPU"]

        model_dir = Path(snapshot_download(model_path))
        self.scheduler = EulerDiscreteScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
        )
        # NOTE: the attribute name keeps its historical spelling ("pipleline")
        # because code outside this class may read it.
        self.ov_sd_pipleline = StableDiffusionEngineAdvanced(
            model=model_dir,
            device=device,
        )

    def generate(
        self,
        prompt: str,
        neg_prompt: str,
        init_image: "Image.Image | None" = None,
        strength: float = 1.0,
    ):
        """Generate one image with a fixed 25 inference steps.

        Args:
            prompt: Text prompt.
            neg_prompt: Negative prompt, forwarded as ``negative_prompt``.
            init_image: Optional starting image for image-to-image.
            strength: Forwarded unchanged to the engine.

        Returns:
            A ``PIL.Image.Image`` built from the engine output after the
            channel axis has been reversed.
        """
        image = self.ov_sd_pipleline(
            prompt=prompt,
            negative_prompt=neg_prompt,
            init_image=init_image,
            strength=strength,
            num_inference_steps=25,
            scheduler=self.scheduler,
        )
        # The engine reverses the channel axis in its post-processing step;
        # reverse it again before handing the array to PIL.
        image_rgb = image[..., ::-1]
        return Image.fromarray(image_rgb)
50
+
51
+
52
class OvHcLatentConsistency:
    """
    OpenVINO Heterogeneous compute Latent consistency models
    For the current Intel Core Ultra, the Text Encoder and Unet can run on NPU
    Supports following - Text to image , Image to image and image variations
    """

    def __init__(
        self,
        model_path,
        device: list = None,
    ):
        """Download the model and create the scheduler and the engine.

        Args:
            model_path: Identifier handed to ``snapshot_download``.
            device: Device names forwarded to the engine. When omitted,
                ``["NPU", "NPU", "GPU"]`` is used.
        """
        # Build the default per call so instances never share one list object.
        if device is None:
            device = ["NPU", "NPU", "GPU"]

        model_dir = Path(snapshot_download(model_path))

        self.scheduler = LCMScheduler(
            beta_start=0.001,
            beta_end=0.01,
        )
        # NOTE: the attribute name keeps its historical spelling ("pipleline")
        # because code outside this class may read it.
        self.ov_sd_pipleline = LatentConsistencyEngineAdvanced(
            model=model_dir,
            device=device,
        )

    def generate(
        self,
        prompt: str,
        neg_prompt: str,
        init_image: "Image.Image | None" = None,
        num_inference_steps=4,
        strength: float = 0.5,
    ):
        """Run the latent-consistency engine once.

        Args:
            prompt: Text prompt.
            neg_prompt: Accepted for signature parity with
                ``OvHcStableDiffusion.generate`` but NOT forwarded to the
                engine.
            init_image: Optional starting image.
            num_inference_steps: Number of inference steps.
            strength: Forwarded unchanged to the engine.

        Returns:
            Whatever the engine call returns, unmodified.
        """
        image = self.ov_sd_pipleline(
            prompt=prompt,
            init_image=init_image,
            strength=strength,
            num_inference_steps=num_inference_steps,
            scheduler=self.scheduler,
            seed=None,
        )

        return image
src/backend/openvino/pipelines.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Any
3
+
4
+ from optimum.intel.openvino import OVDiffusionPipeline
5
+ from optimum.intel.openvino.modeling_diffusion import (
6
+ OVModelVae,
7
+ OVModelVaeDecoder,
8
+ OVModelVaeEncoder,
9
+ )
10
+
11
+ from backend.device import is_openvino_device
12
+ from backend.tiny_autoencoder import get_tiny_autoencoder_repo_id
13
+ from constants import DEVICE, LCM_DEFAULT_MODEL_OPENVINO
14
+ from paths import get_base_folder_name
15
+
16
+ if is_openvino_device():
17
+ from huggingface_hub import snapshot_download
18
+ from optimum.intel.openvino.modeling_diffusion import (
19
+ OVBaseModel,
20
+ OVStableDiffusionImg2ImgPipeline,
21
+ OVStableDiffusionPipeline,
22
+ OVStableDiffusionXLImg2ImgPipeline,
23
+ OVStableDiffusionXLPipeline,
24
+ )
25
+
26
+
27
def ov_load_tiny_autoencoder(
    pipeline: Any,
    use_local_model: bool = False,
):
    """Replace ``pipeline.vae`` with a tiny autoencoder loaded from the hub.

    The repository is chosen from the pipeline's class name, its
    ``vae_decoder`` and ``vae_encoder`` OpenVINO models are loaded, and the
    resulting VAE is attached to the pipeline with a scaling factor of 1.0.

    Args:
        pipeline: Pipeline object whose ``vae`` attribute is replaced in place.
        use_local_model: Forwarded as ``local_files_only`` to the download.
    """
    repo_id = get_tiny_autoencoder_repo_id(pipeline.__class__.__name__)
    taesd_dir = snapshot_download(
        repo_id=repo_id,
        local_files_only=use_local_model,
    )

    decoder_model = OVBaseModel.load_model(f"{taesd_dir}/vae_decoder/openvino_model.xml")
    vae_decoder = OVModelVaeDecoder(
        model=decoder_model,
        parent_pipeline=pipeline,
        model_name="vae_decoder",
    )

    encoder_model = OVBaseModel.load_model(f"{taesd_dir}/vae_encoder/openvino_model.xml")
    vae_encoder = OVModelVaeEncoder(
        model=encoder_model,
        parent_pipeline=pipeline,
        model_name="vae_encoder",
    )

    pipeline.vae = OVModelVae(decoder=vae_decoder, encoder=vae_encoder)
    pipeline.vae.config.scaling_factor = 1.0
50
+
51
+
52
def get_ov_text_to_image_pipeline(
    model_id: str = LCM_DEFAULT_MODEL_OPENVINO,
    use_local_model: bool = False,
) -> Any:
    """Load an OpenVINO text-to-image pipeline for ``model_id``.

    The SDXL pipeline class is used when the model's base folder name
    contains "xl" (case-insensitive); otherwise the regular Stable Diffusion
    pipeline class is used.

    Args:
        model_id: Model identifier or path passed to ``from_pretrained``.
        use_local_model: Forwarded as ``local_files_only``.

    Returns:
        The loaded pipeline.
    """
    is_xl_model = "xl" in get_base_folder_name(model_id).lower()
    pipeline_cls = (
        OVStableDiffusionXLPipeline if is_xl_model else OVStableDiffusionPipeline
    )
    return pipeline_cls.from_pretrained(
        model_id,
        local_files_only=use_local_model,
        ov_config={"CACHE_DIR": ""},
        device=DEVICE.upper(),
    )
72
+
73
+
74
def get_ov_image_to_image_pipeline(
    model_id: str = LCM_DEFAULT_MODEL_OPENVINO,
    use_local_model: bool = False,
) -> Any:
    """Load an OpenVINO image-to-image pipeline for ``model_id``.

    The SDXL img2img pipeline class is used when the model's base folder name
    contains "xl" (case-insensitive); otherwise the regular Stable Diffusion
    img2img pipeline class is used.

    Args:
        model_id: Model identifier or path passed to ``from_pretrained``.
        use_local_model: Forwarded as ``local_files_only``.

    Returns:
        The loaded pipeline.
    """
    is_xl_model = "xl" in get_base_folder_name(model_id).lower()
    pipeline_cls = (
        OVStableDiffusionXLImg2ImgPipeline
        if is_xl_model
        else OVStableDiffusionImg2ImgPipeline
    )
    return pipeline_cls.from_pretrained(
        model_id,
        local_files_only=use_local_model,
        ov_config={"CACHE_DIR": ""},
        device=DEVICE.upper(),
    )
93
+
94
+
95
def get_ov_diffusion_pipeline(
    model_id: str,
    use_local_model: bool = False,
) -> Any:
    """Load a generic ``OVDiffusionPipeline`` for ``model_id``.

    Args:
        model_id: Model identifier or path passed to ``from_pretrained``.
        use_local_model: Forwarded as ``local_files_only``.

    Returns:
        The loaded pipeline.
    """
    load_kwargs = {
        "local_files_only": use_local_model,
        "ov_config": {"CACHE_DIR": ""},
        "device": DEVICE.upper(),
    }
    return OVDiffusionPipeline.from_pretrained(model_id, **load_kwargs)
src/backend/openvino/stable_diffusion_engine.py ADDED
@@ -0,0 +1,1817 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright(C) 2022-2023 Intel Corporation
3
+ SPDX-License-Identifier: Apache-2.0
4
+
5
+ """
6
+ import inspect
7
+ from typing import Union, Optional, Any, List, Dict
8
+ import numpy as np
9
+ # openvino
10
+ from openvino.runtime import Core
11
+ # tokenizer
12
+ from transformers import CLIPTokenizer
13
+ import torch
14
+ import random
15
+
16
+ from diffusers import DiffusionPipeline
17
+ from diffusers.schedulers import (DDIMScheduler,
18
+ LMSDiscreteScheduler,
19
+ PNDMScheduler,
20
+ EulerDiscreteScheduler,
21
+ EulerAncestralDiscreteScheduler)
22
+
23
+
24
+ from diffusers.image_processor import VaeImageProcessor
25
+ from diffusers.utils.torch_utils import randn_tensor
26
+ from diffusers.utils import PIL_INTERPOLATION
27
+
28
+ import cv2
29
+ import os
30
+ import sys
31
+
32
+ # for multithreading
33
+ import concurrent.futures
34
+
35
+ #For GIF
36
+ import PIL
37
+ from PIL import Image
38
+ import glob
39
+ import json
40
+ import time
41
+
42
def scale_fit_to_window(dst_width:int, dst_height:int, image_width:int, image_height:int):
    """
    Preprocessing helper function for calculating image size for resize with preserving original aspect ratio
    and fitting image to specific window size

    Parameters:
      dst_width (int): destination window width
      dst_height (int): destination window height
      image_width (int): source image width
      image_height (int): source image height
    Returns:
      result_width (int): calculated width for resize (at least 1)
      result_height (int): calculated height for resize (at least 1)
    """
    im_scale = min(dst_height / image_height, dst_width / image_width)
    # int() truncates, so a very elongated source could otherwise collapse to a
    # zero-pixel side, which cannot be used as a resize target.
    result_width = max(1, int(im_scale * image_width))
    result_height = max(1, int(im_scale * image_height))
    return result_width, result_height
58
+
59
def preprocess(image: PIL.Image.Image, ht=512, wt=512):
    """
    Image preprocessing function. Converts the PIL image to RGB, resizes it (keeping the aspect ratio) so it fits
    inside a ``wt`` x ``ht`` window, zero-pads the right and/or bottom side up to exactly ``ht`` x ``wt``, converts
    the data to float32, rescales values from [0, 255] to [-1, 1] and finally changes the layout from NHWC to NCHW.

    Parameters:
      image (PIL.Image.Image): input image
      ht (int): target height of the model input window
      wt (int): target width of the model input window
    Returns:
      image (np.ndarray): preprocessed image tensor
      meta (Dict): dictionary with the applied "padding" and the original "src_width" / "src_height"
    """
    src_width, src_height = image.size
    rgb_image = image.convert('RGB')

    dst_width, dst_height = scale_fit_to_window(wt, ht, src_width, src_height)
    resized = rgb_image.resize((dst_width, dst_height), resample=PIL.Image.Resampling.LANCZOS)
    # Add a leading batch axis: HWC -> 1HWC.
    batch = np.array(resized)[None, :]

    # Pad only at the end of the height and width axes.
    pad = ((0, 0), (0, ht - dst_height), (0, wt - dst_width), (0, 0))
    batch = np.pad(batch, pad, mode="constant")

    batch = batch.astype(np.float32) / 255.0
    batch = 2.0 * batch - 1.0
    batch = batch.transpose(0, 3, 1, 2)

    meta = {"padding": pad, "src_width": src_width, "src_height": src_height}
    return batch, meta
89
+
90
def try_enable_npu_turbo(device, core):
    """
    Best-effort attempt to enable / report the NPU turbo mode. Never raises on a failed enable; it only prints.

    On Windows the ``NPU_TURBO`` property is set on ``core`` when "NPU" is in ``device`` and the reported NPU
    architecture does not contain "3720". On Linux nothing is set; the function only inspects the ``intel_vpu``
    driver files under ``/sys/module`` and reports whether turbo is available. Other platforms are unsupported.

    Parameters:
      device: requested device names (checked with ``"NPU" in device``)
      core: OpenVINO core object used to query and set NPU properties
    """
    import platform

    system_name = platform.system().lower()
    if "windows" in system_name:
        if "NPU" in device and "3720" not in core.get_property('NPU', 'DEVICE_ARCHITECTURE'):
            try:
                core.set_property(properties={'NPU_TURBO': 'YES'},device_name='NPU')
            except Exception:
                # Turbo is optional: report and continue. ``Exception`` (not a bare except)
                # so KeyboardInterrupt / SystemExit still propagate.
                print(f"Failed loading NPU_TURBO for device {device}. Skipping... ")
            else:
                print_npu_turbo_art()
        else:
            print(f"Skipping NPU_TURBO for device {device}")
    elif "linux" in system_name:
        if os.path.isfile('/sys/module/intel_vpu/parameters/test_mode'):
            with open('/sys/module/intel_vpu/version', 'r') as f:
                version = f.readline().split()[0]
            # Compare versions numerically, component by component.
            if tuple(map(int, version.split('.'))) < tuple(map(int, '1.9.0'.split('.'))):
                print(f"The driver intel_vpu-1.9.0 (or later) needs to be loaded for NPU Turbo (currently {version}). Skipping...")
            else:
                with open('/sys/module/intel_vpu/parameters/test_mode', 'r') as tm_file:
                    test_mode = int(tm_file.readline().split()[0])
                if test_mode == 512:
                    print_npu_turbo_art()
                else:
                    print("The driver >=intel_vpu-1.9.0 must be loaded with "
                          "\"modprobe intel_vpu test_mode=512\" to enable NPU_TURBO "
                          f"(currently test_mode={test_mode}). Skipping...")
        else:
            print(f"The driver >=intel_vpu-1.9.0 must be loaded with \"modprobe intel_vpu test_mode=512\" to enable NPU_TURBO. Skipping...")
    else:
        print(f"This platform ({platform.system()}) does not support NPU Turbo")
121
+
122
def result(var):
    """Return the first value of the mapping ``var`` (in its iteration order)."""
    values_iter = iter(var.values())
    return next(values_iter)
124
+
125
class StableDiffusionEngineAdvanced(DiffusionPipeline):
    """
    Stable Diffusion pipeline running on OpenVINO with separate UNet infer requests for the
    conditional and the unconditional (negative prompt) branch.

    Besides the text encoder, the UNet and the VAE encoder/decoder, a small ``unet_time_proj`` model is
    compiled; it turns the per-step sine/cosine timestep features into the ``time_proj`` UNet input.
    """

    def __init__(self, model="runwayml/stable-diffusion-v1-5",
                 tokenizer="openai/clip-vit-large-patch14",
                 device=None):
        """
        Load the tokenizer and compile / import all sub-models.

        Parameters:
            model: directory that contains the model files (``*.xml`` / ``*.blob``, ``time_proj_constants.npy``)
            tokenizer: tokenizer to download when ``model`` holds no local tokenizer
            device (list, *optional*): four device names, in order: text encoder (also used for the
                time-projection model), UNet, negative-prompt UNet, VAE. Defaults to CPU for all four.
        """
        # Build the default per call so instances never share one list object.
        if device is None:
            device = ["CPU", "CPU", "CPU", "CPU"]

        try:
            self.tokenizer = CLIPTokenizer.from_pretrained(model, local_files_only=True)
        except Exception:
            # No usable local tokenizer: fetch the fallback and cache it next to the model.
            self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer)
            self.tokenizer.save_pretrained(model)

        self.core = Core()
        self.core.set_property({'CACHE_DIR': os.path.join(model, 'cache')})
        try_enable_npu_turbo(device, self.core)

        print("Loading models... ")

        # Load all sub-models concurrently.
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            futures = {
                "unet_time_proj": executor.submit(self.core.compile_model, os.path.join(model, "unet_time_proj.xml"), device[0]),
                "text": executor.submit(self.load_model, model, "text_encoder", device[0]),
                "unet": executor.submit(self.load_model, model, "unet_int8", device[1]),
                # A second UNet is only needed when the negative branch runs on another device.
                "unet_neg": executor.submit(self.load_model, model, "unet_int8", device[2]) if device[1] != device[2] else None,
                "vae_decoder": executor.submit(self.load_model, model, "vae_decoder", device[3]),
                "vae_encoder": executor.submit(self.load_model, model, "vae_encoder", device[3])
            }

        self.unet_time_proj = futures["unet_time_proj"].result()
        self.text_encoder = futures["text"].result()
        self.unet = futures["unet"].result()
        self.unet_neg = futures["unet_neg"].result() if futures["unet_neg"] else self.unet
        self.vae_decoder = futures["vae_decoder"].result()
        self.vae_encoder = futures["vae_encoder"].result()
        print("Text Device:", device[0])
        print("unet Device:", device[1])
        print("unet-neg Device:", device[2])
        print("VAE Device:", device[3])

        self._text_encoder_output = self.text_encoder.output(0)
        self._vae_d_output = self.vae_decoder.output(0)
        self._vae_e_output = self.vae_encoder.output(0) if self.vae_encoder else None

        self.set_dimensions()
        self.infer_request_neg = self.unet_neg.create_infer_request()
        self.infer_request = self.unet.create_infer_request()
        self.infer_request_time_proj = self.unet_time_proj.create_infer_request()
        self.time_proj_constants = np.load(os.path.join(model, "time_proj_constants.npy"))

    def load_model(self, model, model_name, device):
        """
        Load one sub-model: import the pre-compiled ``<model_name>.blob`` when the device name
        contains "NPU", otherwise compile ``<model_name>.xml`` for ``device``.
        """
        if "NPU" in device:
            with open(os.path.join(model, f"{model_name}.blob"), "rb") as f:
                return self.core.import_model(f.read(), device)
        return self.core.compile_model(os.path.join(model, f"{model_name}.xml"), device)

    def set_dimensions(self):
        """
        Derive ``self.height`` / ``self.width`` (8x the latent size) from the UNet latent input shape.
        If dimension 1 of that shape is 4 the spatial size is read from dimensions 2 and 3,
        otherwise from dimensions 1 and 2.
        """
        latent_shape = self.unet.input("latent_model_input").shape
        if latent_shape[1] == 4:
            self.height = latent_shape[2] * 8
            self.width = latent_shape[3] * 8
        else:
            self.height = latent_shape[1] * 8
            self.width = latent_shape[2] * 8

    @staticmethod
    def _to_channels_last(latent):
        """Reorder axes (0, 1, 2, 3) -> (0, 2, 3, 1) for an object offering ``permute`` or numpy-style ``transpose``."""
        try:
            return latent.permute(0, 2, 3, 1)
        except AttributeError:
            # No ``permute`` method (e.g. a numpy array): use the numpy spelling.
            return latent.transpose(0, 2, 3, 1)

    def __call__(
            self,
            prompt,
            init_image = None,
            negative_prompt=None,
            scheduler=None,
            strength = 0.5,
            num_inference_steps = 32,
            guidance_scale = 7.5,
            eta = 0.0,
            create_gif = False,
            model = None,
            callback = None,
            callback_userdata = None
    ):
        """
        Run the full generation loop.

        Parameters:
            prompt: text prompt
            init_image (PIL.Image.Image, *optional*): starting image for image-to-image
            negative_prompt (str or list, *optional*): negative prompt, used when ``guidance_scale > 1.0``
            scheduler: scheduler instance; required (``None`` is not usable)
            strength (float): see ``get_timesteps``
            num_inference_steps (int): number of scheduler steps before ``strength`` is applied
            guidance_scale (float): classifier-free guidance is applied only when this is > 1.0
            eta (float): forwarded to ``scheduler.step`` when it accepts an ``eta`` argument
            create_gif (bool): also write every intermediate step as PNG plus an animated GIF
            model: model directory; the GIF folder is ``<model>/../../../gif`` (needed with ``create_gif``)
            callback: optional callable, invoked as ``callback(step, callback_userdata)``
            callback_userdata: opaque value handed back to ``callback``
        Returns:
            image (np.ndarray): uint8 image as produced by ``postprocess_image``
        """
        if create_gif and model is None:
            # The GIF folder is derived from the model path; without it export is impossible.
            print("create_gif requires the model path to locate the gif folder. Skipping GIF creation... ")
            create_gif = False

        # extract condition
        text_input = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="np",
        )
        text_embeddings = self.text_encoder(text_input.input_ids)[self._text_encoder_output]

        # do classifier free guidance
        do_classifier_free_guidance = guidance_scale > 1.0
        if do_classifier_free_guidance:
            if negative_prompt is None:
                uncond_tokens = [""]
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt]
            else:
                uncond_tokens = negative_prompt

            tokens_uncond = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                # Truncate like the positive prompt so an over-long negative prompt cannot
                # exceed the tokenizer's maximum length.
                truncation=True,
                return_tensors="np"
            )
            uncond_embeddings = self.text_encoder(tokens_uncond.input_ids)[self._text_encoder_output]
            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])

        # Per-branch encoder states do not change between steps, so slice them once.
        if do_classifier_free_guidance:
            uncond_hidden_states = np.expand_dims(text_embeddings[0], axis=0)
            cond_hidden_states = np.expand_dims(text_embeddings[1], axis=0)
        else:
            uncond_hidden_states = None
            cond_hidden_states = np.expand_dims(text_embeddings[0], axis=0)

        # set timesteps
        accepts_offset = "offset" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        extra_set_kwargs = {}

        if accepts_offset:
            extra_set_kwargs["offset"] = 1

        scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)

        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, scheduler)
        latent_timestep = timesteps[:1]

        # get the initial random noise unless the user supplied it
        latents, meta = self.prepare_latents(init_image, latent_timestep, scheduler)

        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
        # and should be between [0, 1]
        accepts_eta = "eta" in set(inspect.signature(scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        frames = []

        # Whether each UNet needs its latent input reordered is fixed by the model, not the step.
        unet_channels_last = self.unet.input("latent_model_input").shape[1] != 4
        unet_neg_channels_last = self.unet_neg.input("latent_model_input").shape[1] != 4

        for i, t in enumerate(self.progress_bar(timesteps)):
            if callback:
                callback(i, callback_userdata)

            latent_model_input = scheduler.scale_model_input(latents, t)
            latent_model_input_neg = latent_model_input

            if unet_channels_last:
                latent_model_input = self._to_channels_last(latent_model_input)
            if unet_neg_channels_last:
                latent_model_input_neg = self._to_channels_last(latent_model_input_neg)

            # Timestep features for the time-projection model.
            t_scaled = self.time_proj_constants * np.float32(t)
            time_proj_dict = {"sine_t" : np.float32(np.sin(t_scaled)), "cosine_t" : np.float32(np.cos(t_scaled))}
            self.infer_request_time_proj.start_async(time_proj_dict)
            self.infer_request_time_proj.wait()
            time_proj = self.infer_request_time_proj.get_output_tensor(0).data.astype(np.float32)

            # Start both branches before waiting so they can overlap.
            if do_classifier_free_guidance:
                self.infer_request_neg.start_async({
                    "time_proj": time_proj,
                    "latent_model_input": latent_model_input_neg,
                    "encoder_hidden_states": uncond_hidden_states,
                })
            self.infer_request.start_async({
                "time_proj": time_proj,
                "latent_model_input": latent_model_input,
                "encoder_hidden_states": cond_hidden_states,
            })
            if do_classifier_free_guidance:
                self.infer_request_neg.wait()
            self.infer_request.wait()

            noise_pred = self.infer_request.get_output_tensor(0).data.astype(np.float32)

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond = self.infer_request_neg.get_output_tensor(0).data.astype(np.float32)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            latents = scheduler.step(torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs)["prev_sample"].numpy()

            if create_gif:
                frames.append(latents)

        if callback:
            callback(num_inference_steps, callback_userdata)

        # scale and decode the image latents with vae
        latents = 1 / 0.18215 * latents

        start = time.time()
        image = self.vae_decoder(latents)[self._vae_d_output]
        print("Decoder ended:",time.time() - start)

        image = self.postprocess_image(image, meta)

        if create_gif:
            self._write_gif(frames, meta, prompt, model)

        return image

    def _write_gif(self, frames, meta, prompt, model):
        """Decode every intermediate latent to a PNG and assemble the PNGs of this run into an animated GIF."""
        gif_folder=os.path.join(model,"../../../gif")
        print("gif_folder:",gif_folder)
        os.makedirs(gif_folder, exist_ok=True)

        frame_paths = []
        for i, frame_latents in enumerate(frames):
            frame = self.vae_decoder(frame_latents*(1/0.18215))[self._vae_d_output]
            frame = self.postprocess_image(frame, meta)
            output = gif_folder + "/" + str(i).zfill(3) +".png"
            cv2.imwrite(output, frame)
            frame_paths.append(output)

        with open(os.path.join(gif_folder, "prompt.json"), "w") as file:
            json.dump({"prompt": prompt}, file)

        if not frame_paths:
            return

        # Use exactly the frames written above, in step order, instead of globbing the folder:
        # glob order is arbitrary and would also pick up PNGs left over from earlier runs.
        frames_image = [Image.open(path) for path in frame_paths]
        try:
            gif_file=os.path.join(gif_folder,"stable_diffusion.gif")
            # append_images must not repeat the first frame.
            frames_image[0].save(gif_file, format="GIF", append_images=frames_image[1:], save_all=True, duration=100, loop=0)
        finally:
            for frame_image in frames_image:
                frame_image.close()

    def prepare_latents(self, image:PIL.Image.Image = None, latent_timestep:torch.Tensor = None, scheduler = LMSDiscreteScheduler):
        """
        Function for getting initial latents for starting generation

        Parameters:
            image (PIL.Image.Image, *optional*, None):
                Input image for generation, if not provided random noise will be used as starting point
            latent_timestep (torch.Tensor, *optional*, None):
                Predicted by scheduler initial step for image generation, required for latent image mixing with noise
            scheduler:
                Scheduler instance; used to scale the initial noise or to add noise to the encoded image
        Returns:
            latents (np.ndarray):
                Image encoded in latent space
            meta (Dict):
                Preprocessing metadata (empty when no image was given)
        """
        latents_shape = (1, 4, self.height // 8, self.width // 8)

        noise = np.random.randn(*latents_shape).astype(np.float32)
        if image is None:
            # Some schedulers expect the initial noise to be multiplied by their sigma.
            if isinstance(scheduler, LMSDiscreteScheduler):
                noise = noise * scheduler.sigmas[0].numpy()
            elif isinstance(scheduler, (EulerDiscreteScheduler, EulerAncestralDiscreteScheduler)):
                noise = noise * scheduler.sigmas.max().numpy()
            return noise, {}

        input_image, meta = preprocess(image,self.height,self.width)

        moments = self.vae_encoder(input_image)[self._vae_e_output]

        # The encoder output is split in half along axis 1 into mean and log-variance.
        mean, logvar = np.split(moments, 2, axis=1)

        std = np.exp(logvar * 0.5)
        latents = (mean + std * np.random.randn(*mean.shape)) * 0.18215

        latents = scheduler.add_noise(torch.from_numpy(latents), torch.from_numpy(noise), latent_timestep).numpy()
        return latents, meta

    def postprocess_image(self, image:np.ndarray, meta:Dict):
        """
        Postprocessing for decoded image. Takes generated image decoded by VAE decoder, unpads it (if required),
        maps values from [-1, 1] to [0, 255], converts the first batch item to HWC uint8 with the channel axis
        reversed, and finally resizes it to the source image size when that is recorded in ``meta``.

        Parameters:
            image (np.ndarray):
                Generated image
            meta (Dict):
                Metadata obtained on latents preparing step, can be empty
        Returns:
            image (np.ndarray):
                Postprocessed uint8 image
        """
        if "padding" in meta:
            pad = meta["padding"]
            # Only the trailing pad of the height and width axes is non-zero.
            (_, end_h), (_, end_w) = pad[1:3]
            h, w = image.shape[2:]
            unpad_h = h - end_h
            unpad_w = w - end_w
            image = image[:, :, :unpad_h, :unpad_w]
        image = np.clip(image / 2 + 0.5, 0, 1)
        image = (image[0].transpose(1, 2, 0)[:, :, ::-1] * 255).astype(np.uint8)

        if "src_height" in meta:
            orig_height, orig_width = meta["src_height"], meta["src_width"]
            image = cv2.resize(image, (orig_width, orig_height))

        return image

    def get_timesteps(self, num_inference_steps:int, strength:float, scheduler):
        """
        Helper function for getting scheduler timesteps for generation
        In case of image-to-image generation, it updates number of steps according to strength

        Parameters:
            num_inference_steps (int):
                number of inference steps for generation
            strength (float):
                value between 0.0 and 1.0, that controls the amount of noise that is added to the input image.
                Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input.
            scheduler:
                scheduler whose ``timesteps`` are sliced
        Returns:
            timesteps: the trailing part of ``scheduler.timesteps`` that will be executed
            num_inference_steps (int): number of steps that remain
        """
        # get the original timestep using init_timestep
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)

        t_start = max(num_inference_steps - init_timestep, 0)
        timesteps = scheduler.timesteps[t_start:]

        return timesteps, num_inference_steps - t_start
465
+
466
+ class StableDiffusionEngine(DiffusionPipeline):
467
+ def __init__(
468
+ self,
469
+ model="bes-dev/stable-diffusion-v1-4-openvino",
470
+ tokenizer="openai/clip-vit-large-patch14",
471
+ device=["CPU","CPU","CPU","CPU"]):
472
+
473
+ self.core = Core()
474
+ self.core.set_property({'CACHE_DIR': os.path.join(model, 'cache')})
475
+
476
+ self.batch_size = 2 if device[1] == device[2] and device[1] == "GPU" else 1
477
+ try_enable_npu_turbo(device, self.core)
478
+
479
+ try:
480
+ self.tokenizer = CLIPTokenizer.from_pretrained(model, local_files_only=True)
481
+ except Exception as e:
482
+ print("Local tokenizer not found. Attempting to download...")
483
+ self.tokenizer = self.download_tokenizer(tokenizer, model)
484
+
485
+ print("Loading models... ")
486
+
487
+ with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
488
+ text_future = executor.submit(self.load_model, model, "text_encoder", device[0])
489
+ vae_de_future = executor.submit(self.load_model, model, "vae_decoder", device[3])
490
+ vae_en_future = executor.submit(self.load_model, model, "vae_encoder", device[3])
491
+
492
+ if self.batch_size == 1:
493
+ if "int8" not in model:
494
+ unet_future = executor.submit(self.load_model, model, "unet_bs1", device[1])
495
+ unet_neg_future = executor.submit(self.load_model, model, "unet_bs1", device[2]) if device[1] != device[2] else None
496
+ else:
497
+ unet_future = executor.submit(self.load_model, model, "unet_int8a16", device[1])
498
+ unet_neg_future = executor.submit(self.load_model, model, "unet_int8a16", device[2]) if device[1] != device[2] else None
499
+ else:
500
+ unet_future = executor.submit(self.load_model, model, "unet", device[1])
501
+ unet_neg_future = None
502
+
503
+ self.unet = unet_future.result()
504
+ self.unet_neg = unet_neg_future.result() if unet_neg_future else self.unet
505
+ self.text_encoder = text_future.result()
506
+ self.vae_decoder = vae_de_future.result()
507
+ self.vae_encoder = vae_en_future.result()
508
+ print("Text Device:", device[0])
509
+ print("unet Device:", device[1])
510
+ print("unet-neg Device:", device[2])
511
+ print("VAE Device:", device[3])
512
+
513
+ self._text_encoder_output = self.text_encoder.output(0)
514
+ self._unet_output = self.unet.output(0)
515
+ self._vae_d_output = self.vae_decoder.output(0)
516
+ self._vae_e_output = self.vae_encoder.output(0) if self.vae_encoder else None
517
+
518
+ self.unet_input_tensor_name = "sample" if 'sample' in self.unet.input(0).names else "latent_model_input"
519
+
520
+ if self.batch_size == 1:
521
+ self.infer_request = self.unet.create_infer_request()
522
+ self.infer_request_neg = self.unet_neg.create_infer_request()
523
+ self._unet_neg_output = self.unet_neg.output(0)
524
+ else:
525
+ self.infer_request = None
526
+ self.infer_request_neg = None
527
+ self._unet_neg_output = None
528
+
529
+ self.set_dimensions()
530
+
531
+
532
+
533
+ def load_model(self, model, model_name, device):
534
+ if "NPU" in device:
535
+ with open(os.path.join(model, f"{model_name}.blob"), "rb") as f:
536
+ return self.core.import_model(f.read(), device)
537
+ return self.core.compile_model(os.path.join(model, f"{model_name}.xml"), device)
538
+
539
+ def set_dimensions(self):
540
+ latent_shape = self.unet.input(self.unet_input_tensor_name).shape
541
+ if latent_shape[1] == 4:
542
+ self.height = latent_shape[2] * 8
543
+ self.width = latent_shape[3] * 8
544
+ else:
545
+ self.height = latent_shape[1] * 8
546
+ self.width = latent_shape[2] * 8
547
+
548
def __call__(
    self,
    prompt,
    init_image=None,
    negative_prompt=None,
    scheduler=None,
    strength=0.5,
    num_inference_steps=32,
    guidance_scale=7.5,
    eta=0.0,
    create_gif=False,
    model=None,
    callback=None,
    callback_userdata=None
):
    """
    Run the full denoising loop and return the post-processed image.

    Parameters:
        prompt (str or list of str):
            Text that is tokenized and encoded as the conditioning
        init_image (*optional*, None):
            Forwarded to `prepare_latents` as the starting image
        negative_prompt (str or list of str, *optional*, None):
            Unconditional text; an empty string is used when it is None
        scheduler:
            Scheduler instance providing `set_timesteps`, `scale_model_input`,
            `step` and `timesteps`. Required despite the None default.
        strength (float):
            Forwarded to `get_timesteps` to select how many steps are run
        num_inference_steps (int):
            Number of steps passed to `scheduler.set_timesteps`
        guidance_scale (float):
            Classifier free guidance is applied only when this is > 1.0
        eta (float):
            Passed to `scheduler.step` only when that method accepts `eta`
        create_gif (bool):
            When True the latents of every step are collected in a local list
        model (*optional*, None):
            Forwarded to `prepare_latents`
        callback (*optional*, None):
            Called as `callback(step_index, callback_userdata)` before every step
            and once more with the number of executed steps after the loop
        callback_userdata (*optional*, None):
            Opaque value handed back to `callback`
    Returns:
        image:
            Result of `postprocess_image` applied to the decoded latents
    Raises:
        ValueError: if no scheduler is supplied
    """
    if scheduler is None:
        raise ValueError("A scheduler instance is required to run the pipeline.")

    # extract condition
    text_input = self.tokenizer(
        prompt,
        padding="max_length",
        max_length=self.tokenizer.model_max_length,
        truncation=True,
        return_tensors="np",
    )
    text_embeddings = self.text_encoder(text_input.input_ids)[self._text_encoder_output]

    # do classifier free guidance
    do_classifier_free_guidance = guidance_scale > 1.0
    if do_classifier_free_guidance:
        if negative_prompt is None:
            uncond_tokens = [""]
        elif isinstance(negative_prompt, str):
            uncond_tokens = [negative_prompt]
        else:
            uncond_tokens = negative_prompt

        # Truncate like the positive prompt so an over-long negative prompt
        # cannot produce a sequence longer than the positive embeddings.
        tokens_uncond = self.tokenizer(
            uncond_tokens,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="np",
        )
        uncond_embeddings = self.text_encoder(tokens_uncond.input_ids)[self._text_encoder_output]
        text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])

    # set timesteps
    accepts_offset = "offset" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
    extra_set_kwargs = {}
    if accepts_offset:
        extra_set_kwargs["offset"] = 1

    scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)

    timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, scheduler)
    latent_timestep = timesteps[:1]

    # get the initial random noise unless the user supplied it
    latents, meta = self.prepare_latents(init_image, latent_timestep, scheduler, model)

    # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
    # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
    # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
    # and should be between [0, 1]
    accepts_eta = "eta" in set(inspect.signature(scheduler.step).parameters.keys())
    extra_step_kwargs = {}
    if accepts_eta:
        extra_step_kwargs["eta"] = eta
    if create_gif:
        frames = []

    def to_channels_last(tensor):
        # torch tensors expose permute(); numpy arrays only offer transpose()
        try:
            return tensor.permute(0, 2, 3, 1)
        except AttributeError:
            return tensor.transpose(0, 2, 3, 1)

    single_batch = self.batch_size == 1
    if single_batch:
        # Model layout and input names do not change between steps, so resolve them once.
        pos_channels_last = self.unet.input(self.unet_input_tensor_name).shape[1] != 4
        neg_channels_last = (
            do_classifier_free_guidance
            and self.unet_neg.input(self.unet_input_tensor_name).shape[1] != 4
        )
        if "sample" in self.unet_input_tensor_name:
            latent_key, timestep_key = "sample", "timestep"
        else:
            latent_key, timestep_key = "latent_model_input", "t"
        # With guidance the embeddings are [uncond, cond]; without it only [cond].
        cond_index = 1 if do_classifier_free_guidance else 0

    def build_inputs(latent_input, embedding, t):
        # Input dictionary for one single-batch UNet request.
        return {
            latent_key: latent_input,
            "encoder_hidden_states": np.expand_dims(embedding, axis=0),
            timestep_key: np.expand_dims(np.float32(t), axis=0),
        }

    for i, t in enumerate(self.progress_bar(timesteps)):
        if callback:
            callback(i, callback_userdata)

        if single_batch:
            # Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
            latent_model_input = scheduler.scale_model_input(latents, t)

            latent_model_input_pos = latent_model_input
            if pos_channels_last:
                latent_model_input_pos = to_channels_last(latent_model_input_pos)
            input_tens_pos_dict = build_inputs(latent_model_input_pos, text_embeddings[cond_index], t)

            if do_classifier_free_guidance:
                latent_model_input_neg = latent_model_input
                if neg_channels_last:
                    latent_model_input_neg = to_channels_last(latent_model_input_neg)
                input_tens_neg_dict = build_inputs(latent_model_input_neg, text_embeddings[0], t)

                # Launch both requests before waiting so they can overlap.
                self.infer_request_neg.start_async(input_tens_neg_dict)
                self.infer_request.start_async(input_tens_pos_dict)

                self.infer_request_neg.wait()
                self.infer_request.wait()

                noise_pred_neg = self.infer_request_neg.get_output_tensor(0)
                noise_pred_pos = self.infer_request.get_output_tensor(0)
                noise_pred = [
                    noise_pred_neg.data.astype(np.float32),
                    noise_pred_pos.data.astype(np.float32),
                ]
            else:
                # No guidance: only the conditional prediction is needed.
                self.infer_request.start_async(input_tens_pos_dict)
                self.infer_request.wait()
                noise_pred = self.infer_request.get_output_tensor(0).data.astype(np.float32)
        else:
            latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
            latent_model_input = scheduler.scale_model_input(latent_model_input, t)
            noise_pred = self.unet([latent_model_input, np.array(t, dtype=np.float32), text_embeddings])[self._unet_output]

        if do_classifier_free_guidance:
            noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1]
            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

        # compute the previous noisy sample x_t -> x_t-1
        latents = scheduler.step(torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs)["prev_sample"].numpy()

        if create_gif:
            frames.append(latents)

    if callback:
        callback(num_inference_steps, callback_userdata)

    # scale and decode the image latents with vae
    latents = 1 / 0.18215 * latents
    image = self.vae_decoder(latents)[self._vae_d_output]
    image = self.postprocess_image(image, meta)

    return image
688
+
689
def prepare_latents(self, image: PIL.Image.Image = None, latent_timestep: torch.Tensor = None,
                    scheduler=LMSDiscreteScheduler, model=None):
    """
    Function for getting initial latents for starting generation

    Parameters:
        image (PIL.Image.Image, *optional*, None):
            Input image for generation, if not provided random noise will be used as starting point
        latent_timestep (torch.Tensor, *optional*, None):
            Predicted by scheduler initial step for image generation, required for latent image mixing with noise
        scheduler:
            Scheduler instance; LMS and Euler discrete schedulers get their noise scaled by sigma
        model (*optional*, None):
            Model identifier; when it contains "sd_2.1" the VAE encoder output is used as the latents directly
    Returns:
        latents (np.ndarray):
            Image encoded in latent space
        meta (dict):
            Metadata returned by `preprocess`, empty when no image was supplied
    """
    latents_shape = (1, 4, self.height // 8, self.width // 8)
    noise = np.random.randn(*latents_shape).astype(np.float32)

    if image is None:
        # LMS / Euler discrete schedulers expect the initial noise multiplied by their sigma
        if isinstance(scheduler, LMSDiscreteScheduler):
            noise = noise * scheduler.sigmas[0].numpy()
        elif isinstance(scheduler, EulerDiscreteScheduler):
            noise = noise * scheduler.sigmas.max().numpy()
        return noise, {}

    input_image, meta = preprocess(image, self.height, self.width)
    moments = self.vae_encoder(input_image)[self._vae_e_output]

    # `model` defaults to None, so guard the substring test instead of
    # raising TypeError on `"sd_2.1" in None`.
    if model is not None and "sd_2.1" in str(model):
        latents = moments * 0.18215
    else:
        # Sample from the distribution described by the two halves of the encoder output.
        mean, logvar = np.split(moments, 2, axis=1)
        std = np.exp(logvar * 0.5)
        latents = (mean + std * np.random.randn(*mean.shape)) * 0.18215

    latents = scheduler.add_noise(torch.from_numpy(latents), torch.from_numpy(noise), latent_timestep).numpy()
    return latents, meta
735
+
736
+
737
def postprocess_image(self, image: np.ndarray, meta: Dict):
    """
    Postprocessing for decoded image. Takes the image decoded by the VAE decoder,
    removes the padding recorded in `meta` (if any), maps values from [-1, 1] to
    [0, 255] and, when `meta` carries the source size, resizes back to it.

    Parameters:
        image (np.ndarray):
            Generated image; indexed as (batch, channels, height, width)
        meta (Dict):
            Metadata obtained on latents preparing step, can be empty
    Returns:
        image (np.ndarray):
            uint8 array of the first batch item in (height, width, channels)
            layout with the channel order reversed
    """
    if "padding" in meta:
        (_, pad_bottom), (_, pad_right) = meta["padding"][1:3]
        full_h, full_w = image.shape[2:]
        image = image[:, :, : full_h - pad_bottom, : full_w - pad_right]

    normalized = np.clip(image / 2 + 0.5, 0, 1)
    # first batch item only: CHW -> HWC, then reverse the channel axis
    hwc_reversed = normalized[0].transpose(1, 2, 0)[:, :, ::-1]
    image = (hwc_reversed * 255).astype(np.uint8)

    if "src_height" in meta:
        orig_height, orig_width = meta["src_height"], meta["src_width"]
        image = cv2.resize(image, (orig_width, orig_height))

    return image
776
+
777
+ # image = (image / 2 + 0.5).clip(0, 1)
778
+ # image = (image[0].transpose(1, 2, 0)[:, :, ::-1] * 255).astype(np.uint8)
779
+
780
def get_timesteps(self, num_inference_steps: int, strength: float, scheduler):
    """
    Helper function for getting scheduler timesteps for generation
    In case of image-to-image generation, it updates number of steps according to strength

    Parameters:
        num_inference_steps (int):
            number of inference steps for generation
        strength (float):
            value between 0.0 and 1.0, that controls the amount of noise that is added to the input image.
            Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input.
        scheduler:
            object whose `timesteps` sequence is sliced
    Returns:
        tuple of the remaining timesteps and the number of steps that will be run
    """
    # number of steps that are actually executed, capped at the full schedule
    steps_to_run = int(num_inference_steps * strength)
    if steps_to_run > num_inference_steps:
        steps_to_run = num_inference_steps

    # index of the first timestep to execute, never negative
    first_index = num_inference_steps - steps_to_run
    if first_index < 0:
        first_index = 0

    remaining = scheduler.timesteps[first_index:]
    return remaining, num_inference_steps - first_index
800
+
801
class LatentConsistencyEngine(DiffusionPipeline):
    """
    Text-to-image Latent Consistency Model pipeline whose text encoder, UNet and
    VAE decoder are OpenVINO models loaded through `self.core`.
    """

    def __init__(
        self,
        model="SimianLuo/LCM_Dreamshaper_v7",
        tokenizer="openai/clip-vit-large-patch14",
        device=["CPU", "CPU", "CPU"],
    ):
        """
        Load the tokenizer and the three sub-models.

        Parameters:
            model (str):
                Directory holding the model files; also used for the tokenizer and the compile cache
            tokenizer (str):
                Tokenizer identifier used when `model` holds no local tokenizer
            device (list of str):
                Devices for the text encoder, the UNet and the VAE decoder, in that order
        """
        super().__init__()
        try:
            self.tokenizer = CLIPTokenizer.from_pretrained(model, local_files_only=True)
        except Exception:
            # No usable local copy: fetch the tokenizer and store it next to the model.
            self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer)
            self.tokenizer.save_pretrained(model)

        self.core = Core()
        self.core.set_property({'CACHE_DIR': os.path.join(model, 'cache')})  # adding caching to reduce init time
        try_enable_npu_turbo(device, self.core)

        # Load the three sub-models concurrently.
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            text_future = executor.submit(self.load_model, model, "text_encoder", device[0])
            unet_future = executor.submit(self.load_model, model, "unet", device[1])
            vae_de_future = executor.submit(self.load_model, model, "vae_decoder", device[2])

        print("Text Device:", device[0])
        self.text_encoder = text_future.result()
        self._text_encoder_output = self.text_encoder.output(0)

        print("Unet Device:", device[1])
        self.unet = unet_future.result()
        self._unet_output = self.unet.output(0)
        self.infer_request = self.unet.create_infer_request()

        print(f"VAE Device: {device[2]}")
        self.vae_decoder = vae_de_future.result()
        self.infer_request_vae = self.vae_decoder.create_infer_request()
        self.safety_checker = None
        self.feature_extractor = None
        self.vae_scale_factor = 2 ** 3
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

    def load_model(self, model, model_name, device):
        """
        Load one sub-model: import `<model_name>.blob` when the device name
        contains "NPU", otherwise compile `<model_name>.xml`.
        """
        if "NPU" in device:
            with open(os.path.join(model, f"{model_name}.blob"), "rb") as f:
                return self.core.import_model(f.read(), device)
        return self.core.compile_model(os.path.join(model, f"{model_name}.xml"), device)

    def _encode_prompt(
        self,
        prompt,
        num_images_per_prompt,
        prompt_embeds: Optional[torch.FloatTensor] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.
        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
        """
        if prompt_embeds is None:
            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            untruncated_ids = self.tokenizer(
                prompt, padding="longest", return_tensors="pt"
            ).input_ids

            # Warn when the tokenizer had to drop part of the prompt.
            if untruncated_ids.shape[-1] >= text_input_ids.shape[
                -1
            ] and not torch.equal(text_input_ids, untruncated_ids):
                removed_text = self.tokenizer.batch_decode(
                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                )

            prompt_embeds = self.text_encoder(text_input_ids, share_inputs=True, share_outputs=True)
            prompt_embeds = torch.from_numpy(prompt_embeds[0])

        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(
            bs_embed * num_images_per_prompt, seq_len, -1
        )

        # Don't need to get uncond prompt embedding because of LCM Guided Distillation
        return prompt_embeds

    def run_safety_checker(self, image, dtype):
        """
        Run the optional safety checker. Returns `(image, has_nsfw_concept)`;
        the flag is None when no safety checker is configured.
        """
        if self.safety_checker is None:
            has_nsfw_concept = None
        else:
            if torch.is_tensor(image):
                feature_extractor_input = self.image_processor.postprocess(
                    image, output_type="pil"
                )
            else:
                feature_extractor_input = self.image_processor.numpy_to_pil(image)
            safety_checker_input = self.feature_extractor(
                feature_extractor_input, return_tensors="pt"
            )
            image, has_nsfw_concept = self.safety_checker(
                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
            )
        return image, has_nsfw_concept

    def prepare_latents(
        self, batch_size, num_channels_latents, height, width, dtype, latents=None
    ):
        """
        Return `latents` unchanged when given, otherwise random normal latents of
        shape (batch_size, num_channels_latents, height // 8, width // 8).
        """
        shape = (
            batch_size,
            num_channels_latents,
            height // self.vae_scale_factor,
            width // self.vae_scale_factor,
        )
        if latents is None:
            latents = torch.randn(shape, dtype=dtype)
        # scaling by the scheduler's initial sigma is done by the caller
        return latents

    def get_w_embedding(self, w, embedding_dim=512, dtype=torch.float32):
        """
        see https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
        Args:
        w: torch.Tensor: 1-D tensor of guidance scale values to embed
        embedding_dim: int: dimension of the embeddings to generate
        dtype: data type of the generated embeddings
        Returns:
        embedding vectors with shape `(len(w), embedding_dim)`
        """
        assert len(w.shape) == 1
        w = w * 1000.0

        half_dim = embedding_dim // 2
        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
        emb = w.to(dtype)[:, None] * emb[None, :]
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
        if embedding_dim % 2 == 1:  # zero pad
            emb = torch.nn.functional.pad(emb, (0, 1))
        assert emb.shape == (w.shape[0], embedding_dim)
        return emb

    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        height: Optional[int] = 512,
        width: Optional[int] = 512,
        guidance_scale: float = 7.5,
        scheduler = None,
        num_images_per_prompt: Optional[int] = 1,
        latents: Optional[torch.FloatTensor] = None,
        num_inference_steps: int = 4,
        lcm_origin_steps: int = 50,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        model: Optional[Dict[str, Any]] = None,
        seed: Optional[int] = 1234567,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        callback = None,
        callback_userdata = None
    ):
        """
        Generate an image from `prompt` (or from pre-computed `prompt_embeds`).

        `scheduler` is required despite its None default; a ValueError is raised
        when it is missing. `callback`, when given, is invoked as
        `callback(step_number, callback_userdata)` with a 1-based step number
        before every step. `return_dict`, `model` and `cross_attention_kwargs`
        are accepted but not used by this method.

        Returns:
            The first item of `self.image_processor.postprocess(...)`
        """
        if scheduler is None:
            raise ValueError("A scheduler instance is required to run the pipeline.")

        # 1. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if seed is not None:
            torch.manual_seed(seed)

        # In LCM Implementation: cfg_noise = noise_cond + cfg_scale * (noise_cond - noise_uncond) , (cfg_scale > 0.0 using CFG)

        # 2. Encode input prompt
        prompt_embeds = self._encode_prompt(
            prompt,
            num_images_per_prompt,
            prompt_embeds=prompt_embeds,
        )

        # 3. Prepare timesteps
        scheduler.set_timesteps(num_inference_steps, original_inference_steps=lcm_origin_steps)
        timesteps = scheduler.timesteps

        # 4. Prepare latent variable
        num_channels_latents = 4
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            latents,
        )
        latents = latents * scheduler.init_noise_sigma

        bs = batch_size * num_images_per_prompt

        # 5. Get Guidance Scale Embedding
        w = torch.tensor(guidance_scale).repeat(bs)
        w_embedding = self.get_w_embedding(w, embedding_dim=256)

        # 6. LCM MultiStep Sampling Loop:
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                if callback:
                    callback(i+1, callback_userdata)

                ts = torch.full((bs,), t, dtype=torch.long)

                # model prediction (v-prediction, eps, x)
                model_pred = self.unet([latents, ts, prompt_embeds, w_embedding], share_inputs=True, share_outputs=True)[0]

                # compute the previous noisy sample x_t -> x_t-1
                latents, denoised = scheduler.step(
                    torch.from_numpy(model_pred), t, latents, return_dict=False
                )
                progress_bar.update()

        vae_start = time.time()

        if not output_type == "latent":
            image = torch.from_numpy(self.vae_decoder(denoised / 0.18215, share_inputs=True, share_outputs=True)[0])
        else:
            image = denoised

        print("Decoder Ended: ", time.time() - vae_start)

        # no safety checker is run here, so every image is denormalized
        do_denormalize = [True] * image.shape[0]

        image = self.image_processor.postprocess(
            image, output_type=output_type, do_denormalize=do_denormalize
        )

        return image[0]
1070
+
1071
class LatentConsistencyEngineAdvanced(DiffusionPipeline):
    """
    Latent Consistency Model pipeline with text-to-image and image-to-image
    support. The text encoder, UNet, VAE decoder and VAE encoder are OpenVINO
    models compiled through `self.core`.
    """

    def __init__(
        self,
        model="SimianLuo/LCM_Dreamshaper_v7",
        tokenizer="openai/clip-vit-large-patch14",
        device=["CPU", "CPU", "CPU"],
    ):
        """
        Load the tokenizer and the four sub-models.

        Parameters:
            model (str):
                Directory holding the model files; also used for the tokenizer and the compile cache
            tokenizer (str):
                Tokenizer identifier used when `model` holds no local tokenizer
            device (list of str):
                Devices for the text encoder, the UNet and both VAE models, in that order
        """
        super().__init__()
        try:
            self.tokenizer = CLIPTokenizer.from_pretrained(model, local_files_only=True)
        except Exception:
            # No usable local copy: fetch the tokenizer and store it next to the model.
            self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer)
            self.tokenizer.save_pretrained(model)

        self.core = Core()
        self.core.set_property({'CACHE_DIR': os.path.join(model, 'cache')})  # adding caching to reduce init time

        # Compile the sub-models concurrently; both VAE models share device[2].
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            text_future = executor.submit(self.load_model, model, "text_encoder", device[0])
            unet_future = executor.submit(self.load_model, model, "unet", device[1])
            vae_de_future = executor.submit(self.load_model, model, "vae_decoder", device[2])
            vae_encoder_future = executor.submit(self.load_model, model, "vae_encoder", device[2])

        print("Text Device:", device[0])
        self.text_encoder = text_future.result()
        self._text_encoder_output = self.text_encoder.output(0)

        print("Unet Device:", device[1])
        self.unet = unet_future.result()
        self._unet_output = self.unet.output(0)
        self.infer_request = self.unet.create_infer_request()

        print(f"VAE Device: {device[2]}")
        self.vae_decoder = vae_de_future.result()
        self.vae_encoder = vae_encoder_future.result()
        self._vae_e_output = self.vae_encoder.output(0) if self.vae_encoder else None

        self.infer_request_vae = self.vae_decoder.create_infer_request()
        self.safety_checker = None
        self.feature_extractor = None
        self.vae_scale_factor = 2 ** 3
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

    def load_model(self, model, model_name, device):
        """Compile `<model_name>.xml` from the `model` directory for `device`."""
        print(f"Compiling the {model_name} to {device} ...")
        return self.core.compile_model(os.path.join(model, f"{model_name}.xml"), device)

    def get_timesteps(self, num_inference_steps: int, strength: float, scheduler):
        """
        Helper function for getting scheduler timesteps for generation
        In case of image-to-image generation, it updates number of steps according to strength

        Parameters:
            num_inference_steps (int):
                number of inference steps for generation
            strength (float):
                value between 0.0 and 1.0, that controls the amount of noise that is added to the input image.
                Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input.
        Returns:
            tuple of the remaining timesteps and the number of steps that will be run
        """
        # get the original timestep using init_timestep
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)

        t_start = max(num_inference_steps - init_timestep, 0)
        timesteps = scheduler.timesteps[t_start:]

        return timesteps, num_inference_steps - t_start

    def _encode_prompt(
        self,
        prompt,
        num_images_per_prompt,
        prompt_embeds: Optional[torch.FloatTensor] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.
        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
        """
        if prompt_embeds is None:
            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            untruncated_ids = self.tokenizer(
                prompt, padding="longest", return_tensors="pt"
            ).input_ids

            # Warn when the tokenizer had to drop part of the prompt.
            if untruncated_ids.shape[-1] >= text_input_ids.shape[
                -1
            ] and not torch.equal(text_input_ids, untruncated_ids):
                removed_text = self.tokenizer.batch_decode(
                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                )

            prompt_embeds = self.text_encoder(text_input_ids, share_inputs=True, share_outputs=True)
            prompt_embeds = torch.from_numpy(prompt_embeds[0])

        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(
            bs_embed * num_images_per_prompt, seq_len, -1
        )

        # Don't need to get uncond prompt embedding because of LCM Guided Distillation
        return prompt_embeds

    def run_safety_checker(self, image, dtype):
        """
        Run the optional safety checker. Returns `(image, has_nsfw_concept)`;
        the flag is None when no safety checker is configured.
        """
        if self.safety_checker is None:
            has_nsfw_concept = None
        else:
            if torch.is_tensor(image):
                feature_extractor_input = self.image_processor.postprocess(
                    image, output_type="pil"
                )
            else:
                feature_extractor_input = self.image_processor.numpy_to_pil(image)
            safety_checker_input = self.feature_extractor(
                feature_extractor_input, return_tensors="pt"
            )
            image, has_nsfw_concept = self.safety_checker(
                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
            )
        return image, has_nsfw_concept

    def prepare_latents(
        self, image, timestep, batch_size, num_channels_latents, height, width, dtype, scheduler, latents=None,
    ):
        """
        Build the starting latents.

        With an `image`, it is preprocessed to `height` x `width`, encoded with
        the VAE encoder, sampled, scaled by 0.18215 and mixed with random noise
        through `scheduler.add_noise` at `timestep`. Without an image,
        caller-supplied `latents` are returned as-is, or random normal latents
        are drawn when none were supplied.

        Returns:
            latents (torch.Tensor)
        """
        shape = (
            batch_size,
            num_channels_latents,
            height // self.vae_scale_factor,
            width // self.vae_scale_factor,
        )
        if image is not None:
            input_image, _ = preprocess(image, height, width)
            moments = self.vae_encoder(input_image)[self._vae_e_output]
            mean, logvar = np.split(moments, 2, axis=1)
            std = np.exp(logvar * 0.5)
            init_latents = (mean + std * np.random.randn(*mean.shape)) * 0.18215
            noise = torch.randn(shape, dtype=dtype)
            # np.random.randn yields float64; cast so the latents and the noise share `dtype`
            latents = scheduler.add_noise(torch.from_numpy(init_latents).to(dtype), noise, timestep)
        elif latents is None:
            latents = torch.randn(shape, dtype=dtype)
        # scaling by the scheduler's initial sigma is done by the caller
        return latents

    def get_w_embedding(self, w, embedding_dim=512, dtype=torch.float32):
        """
        see https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
        Args:
        w: torch.Tensor: 1-D tensor of guidance scale values to embed
        embedding_dim: int: dimension of the embeddings to generate
        dtype: data type of the generated embeddings
        Returns:
        embedding vectors with shape `(len(w), embedding_dim)`
        """
        assert len(w.shape) == 1
        w = w * 1000.0

        half_dim = embedding_dim // 2
        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
        emb = w.to(dtype)[:, None] * emb[None, :]
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
        if embedding_dim % 2 == 1:  # zero pad
            emb = torch.nn.functional.pad(emb, (0, 1))
        assert emb.shape == (w.shape[0], embedding_dim)
        return emb

    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        init_image: Optional[PIL.Image.Image] = None,
        strength: Optional[float] = 0.8,
        height: Optional[int] = 512,
        width: Optional[int] = 512,
        guidance_scale: float = 7.5,
        scheduler = None,
        num_images_per_prompt: Optional[int] = 1,
        latents: Optional[torch.FloatTensor] = None,
        num_inference_steps: int = 4,
        lcm_origin_steps: int = 50,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        model: Optional[Dict[str, Any]] = None,
        seed: Optional[int] = 1234567,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        callback = None,
        callback_userdata = None
    ):
        """
        Generate an image from `prompt` (or from pre-computed `prompt_embeds`),
        optionally starting from `init_image`.

        `scheduler` is required despite its None default; a ValueError is raised
        when it is missing. With an `init_image`, `strength` selects how many of
        the scheduled steps are run (see `get_timesteps`). `callback`, when
        given, is invoked as `callback(step_number, callback_userdata)` with a
        1-based step number before every step. `return_dict`, `model` and
        `cross_attention_kwargs` are accepted but not used by this method.

        Returns:
            The first item of `self.image_processor.postprocess(...)`
        """
        if scheduler is None:
            raise ValueError("A scheduler instance is required to run the pipeline.")

        # 1. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if seed is not None:
            torch.manual_seed(seed)

        # In LCM Implementation: cfg_noise = noise_cond + cfg_scale * (noise_cond - noise_uncond) , (cfg_scale > 0.0 using CFG)

        # 2. Encode input prompt
        prompt_embeds = self._encode_prompt(
            prompt,
            num_images_per_prompt,
            prompt_embeds=prompt_embeds,
        )

        # 3. Prepare timesteps
        scheduler.set_timesteps(num_inference_steps, original_inference_steps=lcm_origin_steps)
        latent_timestep = None
        if init_image is not None:
            # image-to-image: run only the tail of the schedule selected by `strength`
            timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, scheduler)
            latent_timestep = timesteps[:1]
        else:
            timesteps = scheduler.timesteps

        # 4. Prepare latent variable
        num_channels_latents = 4
        latents = self.prepare_latents(
            init_image,
            latent_timestep,
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            scheduler,
            latents,
        )

        latents = latents * scheduler.init_noise_sigma

        bs = batch_size * num_images_per_prompt

        # 5. Get Guidance Scale Embedding
        w = torch.tensor(guidance_scale).repeat(bs)
        w_embedding = self.get_w_embedding(w, embedding_dim=256)

        # 6. LCM MultiStep Sampling Loop:
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                if callback:
                    callback(i+1, callback_userdata)

                ts = torch.full((bs,), t, dtype=torch.long)

                # model prediction (v-prediction, eps, x)
                model_pred = self.unet([latents, ts, prompt_embeds, w_embedding], share_inputs=True, share_outputs=True)[0]

                # compute the previous noisy sample x_t -> x_t-1
                latents, denoised = scheduler.step(
                    torch.from_numpy(model_pred), t, latents, return_dict=False
                )
                progress_bar.update()

        vae_start = time.time()

        if not output_type == "latent":
            image = torch.from_numpy(self.vae_decoder(denoised / 0.18215, share_inputs=True, share_outputs=True)[0])
        else:
            image = denoised

        print("Decoder Ended: ", time.time() - vae_start)

        # no safety checker is run here, so every image is denormalized
        do_denormalize = [True] * image.shape[0]

        image = self.image_processor.postprocess(
            image, output_type=output_type, do_denormalize=do_denormalize
        )

        return image[0]
1394
+
1395
class StableDiffusionEngineReferenceOnly(DiffusionPipeline):
    """Reference-only Stable Diffusion pipeline built on OpenVINO compiled models.

    Every denoising step runs two UNet graphs: a "write" UNet evaluated on the
    noised reference-image latents, whose attention-block LayerNorm outputs are
    collected, and a "read" UNet that receives those tensors together with the
    latents being denoised and returns the noise prediction.
    """

    # Output tensor names of the "write" UNet, listed in the exact order in
    # which the "read" UNet expects them after (latents, timestep, text
    # embeddings).
    _REFERENCE_ATTENTION_OUTPUTS = (
        "/unet/down_blocks.0/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0",
        "/unet/down_blocks.0/attentions.1/transformer_blocks.0/norm1/LayerNormalization_output_0",
        "/unet/down_blocks.1/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0",
        "/unet/down_blocks.1/attentions.1/transformer_blocks.0/norm1/LayerNormalization_output_0",
        "/unet/down_blocks.2/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0",
        "/unet/down_blocks.2/attentions.1/transformer_blocks.0/norm1/LayerNormalization_output_0",
        "/unet/mid_block/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0",
        "/unet/up_blocks.1/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0",
        "/unet/up_blocks.1/attentions.1/transformer_blocks.0/norm1/LayerNormalization_output_0",
        "/unet/up_blocks.1/attentions.2/transformer_blocks.0/norm1/LayerNormalization_output_0",
        "/unet/up_blocks.2/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0",
        "/unet/up_blocks.2/attentions.1/transformer_blocks.0/norm1/LayerNormalization_output_0",
        "/unet/up_blocks.2/attentions.2/transformer_blocks.0/norm1/LayerNormalization_output_0",
        "/unet/up_blocks.3/attentions.0/transformer_blocks.0/norm1/LayerNormalization_output_0",
        "/unet/up_blocks.3/attentions.1/transformer_blocks.0/norm1/LayerNormalization_output_0",
        "/unet/up_blocks.3/attentions.2/transformer_blocks.0/norm1/LayerNormalization_output_0",
    )

    def __init__(
        self,
        model="bes-dev/stable-diffusion-v1-4-openvino",
        tokenizer="openai/clip-vit-large-patch14",
        device=("CPU", "CPU", "CPU"),
    ):
        """Load the tokenizer and compile every OpenVINO sub-model.

        Parameters:
            model (str):
                Directory holding the OpenVINO IR files (``text_encoder.xml``,
                ``unet_reference_write.xml``, ``unet_reference_read.xml``,
                ``vae_decoder.xml``, ``vae_encoder.xml``). A ``cache``
                sub-directory is used as the OpenVINO compile cache.
            tokenizer (str):
                Tokenizer identifier used as a fallback when no tokenizer can
                be loaded from ``model``.
            device (sequence of str):
                Target devices, indexed as ``[text encoder, both UNets, VAE
                encoder and decoder]``.
        """
        try:
            self.tokenizer = CLIPTokenizer.from_pretrained(model, local_files_only=True)
        except Exception:
            # No tokenizer stored next to the model yet: fetch the fallback one
            # and save it into the model directory so the next start-up can
            # load it locally.
            self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer)
            self.tokenizer.save_pretrained(model)

        self.core = Core()
        # Cache compiled blobs to reduce initialisation time on later runs.
        self.core.set_property({'CACHE_DIR': os.path.join(model, 'cache')})

        # text features
        print("Text Device:", device[0])
        self.text_encoder = self.core.compile_model(os.path.join(model, "text_encoder.xml"), device[0])
        self._text_encoder_output = self.text_encoder.output(0)

        # diffusion ("write" pass over the reference image)
        print("unet_w Device:", device[1])
        self.unet_w = self.core.compile_model(os.path.join(model, "unet_reference_write.xml"), device[1])
        self._unet_w_output = self.unet_w.output(0)
        self.latent_shape = tuple(self.unet_w.inputs[0].shape)[1:]

        # diffusion ("read" pass over the latents being denoised)
        print("unet_r Device:", device[1])
        self.unet_r = self.core.compile_model(os.path.join(model, "unet_reference_read.xml"), device[1])
        self._unet_r_output = self.unet_r.output(0)

        # decoder
        print("Vae Device:", device[2])
        self.vae_decoder = self.core.compile_model(os.path.join(model, "vae_decoder.xml"), device[2])

        # encoder
        self.vae_encoder = self.core.compile_model(os.path.join(model, "vae_encoder.xml"), device[2])
        self.init_image_shape = tuple(self.vae_encoder.inputs[0].shape)[2:]

        self._vae_d_output = self.vae_decoder.output(0)
        self._vae_e_output = self.vae_encoder.output(0)

        # Output resolution is derived from the UNet latent input (8x upscale).
        self.height = self.unet_w.input(0).shape[2] * 8
        self.width = self.unet_w.input(0).shape[3] * 8

    def __call__(
        self,
        prompt,
        image=None,
        negative_prompt=None,
        scheduler=None,
        strength=1.0,
        num_inference_steps=32,
        guidance_scale=7.5,
        eta=0.0,
        create_gif=False,
        model=None,
        callback=None,
        callback_userdata=None,
    ):
        """Generate an image from ``prompt`` guided by a reference ``image``.

        Parameters:
            prompt (str): Text prompt.
            image: Reference image (PIL image, list of PIL images or ndarray).
            negative_prompt (str or list of str, *optional*): Negative prompt
                used when classifier-free guidance is active.
            scheduler: Diffusers scheduler instance driving the denoising loop.
            strength (float): Fraction of ``num_inference_steps`` actually run.
            num_inference_steps (int): Number of scheduler steps to configure.
            guidance_scale (float): Values above 1.0 enable classifier-free
                guidance.
            eta (float): Forwarded to ``scheduler.step`` when it accepts ``eta``.
            create_gif (bool): Also dump every intermediate step as PNG frames
                and an animated GIF.
            model (str, *optional*): Model directory; required when
                ``create_gif`` is set because the GIF folder is located
                relative to it.
            callback (callable, *optional*): Called as ``callback(step,
                callback_userdata)`` before each step and once after the loop.
            callback_userdata: Opaque value handed to ``callback``.

        Returns:
            np.ndarray: The array produced by :meth:`postprocess_image`.

        Raises:
            ValueError: If ``scheduler`` or ``image`` is missing, or if
                ``create_gif`` is requested without ``model``.
        """
        # Fail early with a clear message instead of an AttributeError or
        # TypeError deep inside (or after) the denoising loop.
        if scheduler is None:
            raise ValueError("`scheduler` must be provided.")
        if image is None:
            raise ValueError("`image` (the reference image) must be provided.")
        if create_gif and model is None:
            raise ValueError("`model` must be provided when `create_gif` is True.")

        # extract condition
        text_input = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="np",
        )
        text_embeddings = self.text_encoder(text_input.input_ids)[self._text_encoder_output]

        # do classifier free guidance
        do_classifier_free_guidance = guidance_scale > 1.0
        if do_classifier_free_guidance:
            if negative_prompt is None:
                uncond_tokens = [""]
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt]
            else:
                uncond_tokens = negative_prompt

            # Truncate exactly like the positive prompt so an over-long
            # negative prompt cannot yield more tokens than model_max_length.
            tokens_uncond = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="np",
            )
            uncond_embeddings = self.text_encoder(tokens_uncond.input_ids)[self._text_encoder_output]
            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])

        # set timesteps
        extra_set_kwargs = {}
        if "offset" in inspect.signature(scheduler.set_timesteps).parameters:
            extra_set_kwargs["offset"] = 1
        scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)

        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, scheduler)
        latent_timestep = timesteps[:1]

        # NOTE(review): the reference image is always resized to 512x512 here
        # rather than to self.width / self.height -- confirm this matches the
        # compiled VAE encoder input.
        ref_image = self.prepare_image(
            image=image,
            width=512,
            height=512,
        )
        # Start from pure noise; the reference image only enters through the
        # "write" UNet pass below.
        latents, meta = self.prepare_latents(None, latent_timestep, scheduler)
        ref_image_latents = self.ov_prepare_ref_latents(ref_image)

        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
        # and should be between [0, 1]
        extra_step_kwargs = {}
        if "eta" in inspect.signature(scheduler.step).parameters:
            extra_step_kwargs["eta"] = eta

        frames = []
        for i, t in enumerate(self.progress_bar(timesteps)):
            if callback:
                callback(i, callback_userdata)

            # expand the latents if we are doing classifier free guidance
            if do_classifier_free_guidance:
                latent_model_input = np.concatenate([latents] * 2)
            else:
                latent_model_input = latents
            latent_model_input = scheduler.scale_model_input(latent_model_input, t)

            # ref only part: noise the reference latents to the current timestep
            noise = randn_tensor(ref_image_latents.shape)
            ref_xt = scheduler.add_noise(
                torch.from_numpy(ref_image_latents),
                noise,
                t.reshape(1),
            ).numpy()
            if do_classifier_free_guidance:
                ref_xt = np.concatenate([ref_xt] * 2)
            ref_xt = scheduler.scale_model_input(ref_xt, t)

            # MODE = "write": collect the attention inputs of the reference pass
            result_w_dict = self.unet_w([ref_xt, t, text_embeddings])
            reference_states = [result_w_dict[name] for name in self._REFERENCE_ATTENTION_OUTPUTS]

            # MODE = "read": predict noise for the real latents using them
            noise_pred = self.unet_r(
                [latent_model_input, t, text_embeddings, *reference_states]
            )[0]

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1]
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            latents = scheduler.step(
                torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs
            )["prev_sample"].numpy()

            if create_gif:
                frames.append(latents)

        if callback:
            callback(num_inference_steps, callback_userdata)

        # decode the image latents with vae
        image = self.vae_decoder(latents)[self._vae_d_output]
        image = self.postprocess_image(image, meta)

        if create_gif:
            self._save_denoising_gif(frames, meta, model, prompt)

        return image

    def _save_denoising_gif(self, frames, meta, model, prompt):
        """Decode every intermediate latent and write PNG frames, prompt.json and a GIF."""
        gif_folder = os.path.join(model, "../../../gif")
        os.makedirs(gif_folder, exist_ok=True)

        frame_paths = []
        for index, frame_latents in enumerate(frames):
            decoded = self.vae_decoder(frame_latents)[self._vae_d_output]
            frame_path = gif_folder + "/" + str(index).zfill(3) + ".png"
            cv2.imwrite(frame_path, self.postprocess_image(decoded, meta))
            frame_paths.append(frame_path)

        with open(os.path.join(gif_folder, "prompt.json"), "w") as file:
            json.dump({"prompt": prompt}, file)

        if not frame_paths:
            return

        # Use only the frames written by this run, in step order: globbing the
        # folder would return them in arbitrary order and pick up stale frames
        # left behind by earlier runs.
        frames_image = [Image.open(path) for path in frame_paths]
        try:
            gif_file = os.path.join(gif_folder, "stable_diffusion.gif")
            # append_images holds the frames *after* the first one; passing the
            # full list would show the first frame twice.
            frames_image[0].save(
                gif_file,
                format="GIF",
                append_images=frames_image[1:],
                save_all=True,
                duration=100,
                loop=0,
            )
        finally:
            for frame in frames_image:
                frame.close()

    def ov_prepare_ref_latents(self, refimage, vae_scaling_factor=0.18215):
        """Encode the reference image with the VAE encoder and sample scaled latents.

        Parameters:
            refimage (np.ndarray): Preprocessed reference image fed to the VAE
                encoder.
            vae_scaling_factor (float): Multiplier applied to the sampled
                latents.
        Returns:
            np.ndarray: Latents sampled from the encoder's mean / log-variance
            output, multiplied by ``vae_scaling_factor``.
        """
        # The encoder output stacks mean and log-variance along axis 1.
        moments = self.vae_encoder(refimage)[0]
        mean, logvar = np.split(moments, 2, axis=1)
        std = np.exp(logvar * 0.5)
        ref_image_latents = mean + std * np.random.randn(*mean.shape)
        return vae_scaling_factor * ref_image_latents

    def prepare_latents(self, image: PIL.Image.Image = None, latent_timestep: torch.Tensor = None, scheduler=LMSDiscreteScheduler):
        """
        Function for getting initial latents for starting generation

        Parameters:
            image (PIL.Image.Image, *optional*, None):
                Input image for generation, if not provided random noise will be used as starting point
            latent_timestep (torch.Tensor, *optional*, None):
                Initial step predicted by the scheduler, required for mixing the latent image with noise
            scheduler:
                Scheduler used to scale the initial noise or to noise the encoded image
        Returns:
            latents (np.ndarray):
                Image encoded in latent space (or pure noise when no image is given)
            meta (Dict):
                Preprocessing metadata; empty when no image is given
        """
        latents_shape = (1, 4, self.height // 8, self.width // 8)
        noise = np.random.randn(*latents_shape).astype(np.float32)

        if image is None:
            # Sigma-based schedulers expect the initial noise to be scaled by
            # their starting sigma; every other scheduler gets unit noise.
            if isinstance(scheduler, LMSDiscreteScheduler):
                noise = noise * scheduler.sigmas[0].numpy()
            elif isinstance(scheduler, EulerDiscreteScheduler):
                noise = noise * scheduler.sigmas.max().numpy()
            return noise, {}

        input_image, meta = preprocess(image, self.height, self.width)
        moments = self.vae_encoder(input_image)[self._vae_e_output]
        mean, logvar = np.split(moments, 2, axis=1)
        std = np.exp(logvar * 0.5)
        latents = (mean + std * np.random.randn(*mean.shape)) * 0.18215

        latents = scheduler.add_noise(torch.from_numpy(latents), torch.from_numpy(noise), latent_timestep).numpy()
        return latents, meta

    def postprocess_image(self, image: np.ndarray, meta: Dict):
        """
        Postprocessing for a decoded image. Takes the output of the VAE decoder, removes padding
        (if recorded in meta), maps values from [-1, 1] to the [0, 255] pixel range and, when the
        source size is recorded in meta, resizes back to it.

        Parameters:
            image (np.ndarray):
                Generated image batch as returned by the VAE decoder
            meta (Dict):
                Metadata obtained on the latents preparing step, can be empty
        Returns:
            image (np.ndarray):
                First image of the batch as a height x width x channel uint8 array with the
                channel axis reversed
        """
        if "padding" in meta:
            pad = meta["padding"]
            (_, end_h), (_, end_w) = pad[1:3]
            h, w = image.shape[2:]
            unpad_h = h - end_h
            unpad_w = w - end_w
            image = image[:, :, :unpad_h, :unpad_w]

        image = np.clip(image / 2 + 0.5, 0, 1)
        # Keep only the first batch item, move channels last and reverse the
        # channel order before scaling to 8-bit.
        image = (image[0].transpose(1, 2, 0)[:, :, ::-1] * 255).astype(np.uint8)

        if "src_height" in meta:
            orig_height, orig_width = meta["src_height"], meta["src_width"]
            image = cv2.resize(image, (orig_width, orig_height))

        return image

    def get_timesteps(self, num_inference_steps: int, strength: float, scheduler):
        """
        Helper function for getting scheduler timesteps for generation
        In case of image-to-image generation, it updates number of steps according to strength

        Parameters:
            num_inference_steps (int):
                number of inference steps for generation
            strength (float):
                value between 0.0 and 1.0, that controls the amount of noise that is added to the input image.
                Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input.
            scheduler:
                scheduler whose ``timesteps`` have already been set
        Returns:
            timesteps:
                tail of ``scheduler.timesteps`` that will actually be run
            num_inference_steps (int):
                number of steps remaining after skipping the first ``t_start``
        """
        # get the original timestep using init_timestep
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)

        t_start = max(num_inference_steps - init_timestep, 0)
        timesteps = scheduler.timesteps[t_start:]

        return timesteps, num_inference_steps - t_start

    def prepare_image(
        self,
        image,
        width,
        height,
        do_classifier_free_guidance=False,
        guess_mode=False,
    ):
        """Convert the reference image(s) into a single batched array.

        Parameters:
            image: A PIL image, a list of PIL images, a list of ndarrays, or an
                ndarray (returned unchanged apart from optional duplication).
            width (int): Target width PIL images are resized to.
            height (int): Target height PIL images are resized to.
            do_classifier_free_guidance (bool): Duplicate the batch for
                classifier-free guidance, unless ``guess_mode`` is set.
            guess_mode (bool): Suppresses the duplication above.
        Returns:
            np.ndarray: For PIL input, a float32 batch x channel x height x
            width array scaled to [-1, 1]. ndarray inputs are only
            concatenated, not rescaled.
        """
        if not isinstance(image, np.ndarray):
            if isinstance(image, PIL.Image.Image):
                image = [image]

            if isinstance(image[0], PIL.Image.Image):
                images = []
                for image_ in image:
                    image_ = image_.convert("RGB")
                    image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
                    # Add a leading batch axis so the images can be concatenated.
                    images.append(np.array(image_)[None, :])

                image = np.concatenate(images, axis=0)
                image = np.array(image).astype(np.float32) / 255.0
                image = (image - 0.5) / 0.5
                image = image.transpose(0, 3, 1, 2)
            elif isinstance(image[0], np.ndarray):
                # NumPy's keyword is `axis`; `dim` (the torch spelling) raises
                # TypeError.
                image = np.concatenate(image, axis=0)

        if do_classifier_free_guidance and not guess_mode:
            image = np.concatenate([image] * 2)

        return image
1779
+
1780
def print_npu_turbo_art():
    """Print one of three ASCII-art banners, chosen at random.

    The banner lines are raw strings: the art is full of backslashes that are
    not valid escape sequences, which otherwise trigger a SyntaxWarning on
    current Python versions. The printed text is unchanged.
    """
    banners = (
        (
            r" ",
            r" ___ ___ ___ ___ ___ ___ ",
            r" /\ \ /\ \ /\ \ /\ \ /\ \ _____ /\ \ ",
            r" \:\ \ /::\ \ \:\ \ ___ \:\ \ /::\ \ /::\ \ /::\ \ ",
            r" \:\ \ /:/\:\__\ \:\ \ /\__\ \:\ \ /:/\:\__\ /:/\:\ \ /:/\:\ \ ",
            r" _____\:\ \ /:/ /:/ / ___ \:\ \ /:/ / ___ \:\ \ /:/ /:/ / /:/ /::\__\ /:/ \:\ \ ",
            r" /::::::::\__\ /:/_/:/ / /\ \ \:\__\ /:/__/ /\ \ \:\__\ /:/_/:/__/___ /:/_/:/\:|__| /:/__/ \:\__\ ",
            r" \:\~~\~~\/__/ \:\/:/ / \:\ \ /:/ / /::\ \ \:\ \ /:/ / \:\/:::::/ / \:\/:/ /:/ / \:\ \ /:/ / ",
            r" \:\ \ \::/__/ \:\ /:/ / /:/\:\ \ \:\ /:/ / \::/~~/~~~~ \::/_/:/ / \:\ /:/ / ",
            r" \:\ \ \:\ \ \:\/:/ / \/__\:\ \ \:\/:/ / \:\~~\ \:\/:/ / \:\/:/ / ",
            r" \:\__\ \:\__\ \::/ / \:\__\ \::/ / \:\__\ \::/ / \::/ / ",
            r" \/__/ \/__/ \/__/ \/__/ \/__/ \/__/ \/__/ \/__/ ",
            r" ",
        ),
        (
            r" _ _ ____ _ _ _____ _ _ ____ ____ ___ ",
            r"| \ | | | _ \ | | | | |_ _| | | | | | _ \ | __ ) / _ \ ",
            r"| \| | | |_) | | | | | | | | | | | | |_) | | _ \ | | | |",
            r"| |\ | | __/ | |_| | | | | |_| | | _ < | |_) | | |_| |",
            r"|_| \_| |_| \___/ |_| \___/ |_| \_\ |____/ \___/ ",
            r" ",
        ),
        (
            r"",
            r" ) ( ( ) ",
            r" ( /( )\ ) * ) )\ ) ( ( /( ",
            r" )\()) (()/( ( ` ) /( ( (()/( ( )\ )\()) ",
            r"((_)\ /(_)) )\ ( )(_)) )\ /(_)) )((_) ((_)\ ",
            r" _((_) (_)) _ ((_) (_(_()) _ ((_) (_)) ((_)_ ((_) ",
            r"| \| | | _ \ | | | | |_ _| | | | | | _ \ | _ ) / _ \ ",
            r"| .` | | _/ | |_| | | | | |_| | | / | _ \ | (_) | ",
            r"|_|\_| |_| \___/ |_| \___/ |_|_\ |___/ \___/ ",
            r" ",
        ),
    )

    # randint is inclusive on both ends, so the draw maps directly onto the
    # three banners above.
    random_number = random.randint(1, 3)
    for line in banners[random_number - 1]:
        print(line)
1815
+
1816
+
1817
+