Upload folder using huggingface_hub
- model_index.json +40 -0
- pipeline_mvdiffusion.py +578 -0
- scheduler/scheduler_config.json +19 -0
- text_encoder/config.json +25 -0
- text_encoder/model.safetensors +3 -0
- tokenizer/merges.txt +0 -0
- tokenizer/special_tokens_map.json +24 -0
- tokenizer/tokenizer_config.json +38 -0
- tokenizer/vocab.json +0 -0
- unet/config.json +77 -0
- unet/diffusion_pytorch_model.safetensors +3 -0
- vae/config.json +36 -0
- vae/diffusion_pytorch_model.safetensors +3 -0
model_index.json
ADDED
@@ -0,0 +1,40 @@
{
  "_class_name": [
    "pipeline_mvdiffusion",
    "MVDiffusionPipeline"
  ],
  "_diffusers_version": "0.29.2",
  "feature_extractor": [
    null,
    null
  ],
  "image_encoder": [
    null,
    null
  ],
  "requires_safety_checker": false,
  "safety_checker": [
    null,
    null
  ],
  "scheduler": [
    "diffusers",
    "DDIMScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}
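Each component entry above is a `[library, class]` pair that diffusers instantiates from the matching subfolder, while `_class_name` is a `[module, class]` pair pointing at the custom pipeline_mvdiffusion.py uploaded in this commit. A minimal loading sketch, assuming diffusers' remote-code mechanism resolves the custom class from that file and that the repo is published under the id used as the IP-Adapter default in the pipeline code (both are assumptions, not part of the upload):

import torch
from diffusers import DiffusionPipeline

# Assumed repo id, taken from the `load_ip_adapter` default in pipeline_mvdiffusion.py.
pipe = DiffusionPipeline.from_pretrained(
    "kiigii/imagedream-ipmv-diffusers",
    torch_dtype=torch.float16,
    trust_remote_code=True,  # assumed: resolves MVDiffusionPipeline from pipeline_mvdiffusion.py
)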
pipeline_mvdiffusion.py
ADDED
@@ -0,0 +1,578 @@
from typing import Any, Callable, Dict, List, Optional, Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

try:
    from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
except ImportError:
    # Older diffusers versions do not ship diffusers.callbacks; fall back to placeholders.

    class MultiPipelineCallbacks:
        ...

    class PipelineCallback:
        ...


from diffusers.image_processor import PipelineImageInput
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.models.attention import Attention
from diffusers.models.attention_processor import AttnProcessor2_0
from diffusers.pipelines.stable_diffusion.pipeline_output import (
    StableDiffusionPipelineOutput,
)
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
    StableDiffusionPipeline,
    rescale_noise_cfg,
    retrieve_timesteps,
)
from diffusers.pipelines.stable_diffusion.safety_checker import (
    StableDiffusionSafetyChecker,
)
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import deprecate
from transformers import (
    CLIPImageProcessor,
    CLIPTextModel,
    CLIPTokenizer,
    CLIPVisionModel,
)


class MVDiffusionPipeline(StableDiffusionPipeline):
    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: Optional[CLIPImageProcessor] = None,
        image_encoder: Optional[CLIPVisionModel] = None,
        requires_safety_checker: bool = False,
    ) -> None:
        super().__init__(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=add_mv_attn_processor(unet),
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
            image_encoder=image_encoder,
            requires_safety_checker=requires_safety_checker,
        )
        self.num_views = 4

    def load_ip_adapter(
        self,
        pretrained_model_name_or_path_or_dict: Union[
            str, List[str], Dict[str, torch.Tensor]
        ] = "kiigii/imagedream-ipmv-diffusers",
        subfolder: Union[str, List[str]] = "ip_adapter",
        weight_name: Union[str, List[str]] = "ip-adapter-plus_imagedream.bin",
        image_encoder_folder: Optional[str] = "image_encoder",
        **kwargs,
    ) -> None:
        super().load_ip_adapter(
            pretrained_model_name_or_path_or_dict=pretrained_model_name_or_path_or_dict,
            subfolder=subfolder,
            weight_name=weight_name,
            image_encoder_folder=image_encoder_folder,
            **kwargs,
        )
        print("IP-Adapter Loaded.")

        if weight_name == "ip-adapter-plus_imagedream.bin":
            setattr(self.image_encoder, "visual_projection", nn.Identity())
            add_mv_attn_processor(self.unet)
            set_num_views(self.unet, self.num_views + 1)

    def unload_ip_adapter(self) -> None:
        super().unload_ip_adapter()
        set_num_views(self.unet, self.num_views)

    def encode_image_to_latents(
        self,
        image: PipelineImageInput,
        height: int,
        width: int,
        device: torch.device,
        num_images_per_prompt: int = 1,
    ):
        dtype = next(self.vae.parameters()).dtype

        if isinstance(image, torch.Tensor):
            image = F.interpolate(
                image,
                (height, width),
                mode="bilinear",
                align_corners=False,
                antialias=True,
            )
        else:
            image = self.image_processor.preprocess(image, height, width)

        # image should be in range [-1, 1]
        image = image.to(device=device, dtype=dtype)

        def vae_encode(image):
            posterior = self.vae.encode(image).latent_dist
            latents = posterior.sample() * self.vae.config.scaling_factor
            latents = latents.repeat_interleave(num_images_per_prompt, dim=0)
            return latents

        latents = vae_encode(image)
        uncond_latents = vae_encode(torch.zeros_like(image))
        return latents, uncond_latents
    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        elevation: float = 0.0,
        timesteps: List[int] = None,
        sigmas: List[float] = None,
        guidance_scale: float = 5.0,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        ip_adapter_image: Optional[PipelineImageInput] = None,
        # StableDiffusionPipeline supports `ip_adapter_image_embeds`, but this pipeline does not; passing it raises a ValueError.
        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        guidance_rescale: float = 0.0,
        clip_skip: Optional[int] = None,
        callback_on_step_end: Optional[
            Union[
                Callable[[int, int, Dict], None],
                PipelineCallback,
                MultiPipelineCallbacks,
            ]
        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        **kwargs,
    ):
        if ip_adapter_image_embeds is not None:
            raise ValueError(
                "do not use `ip_adapter_image_embeds` in ImageDream, use `ip_adapter_image`"
            )

        callback = kwargs.pop("callback", None)
        callback_steps = kwargs.pop("callback_steps", None)

        if callback is not None:
            deprecate(
                "callback",
                "1.0.0",
                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
            )
        if callback_steps is not None:
            deprecate(
                "callback_steps",
                "1.0.0",
                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
            )

        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

        # ImageDream number of views
        if cross_attention_kwargs is None:
            num_views = self.num_views
        else:
            num_views = cross_attention_kwargs.pop("num_views", self.num_views)

        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor
        # to deal with lora scaling and other possible forward hooks

        # 1. Check inputs. Raise error if not correct
        if prompt is None:
            prompt = ""
        self.check_inputs(
            prompt,
            height,
            width,
            callback_steps,
            negative_prompt,
            prompt_embeds,
            negative_prompt_embeds,
            ip_adapter_image,
            None,  # ip_adapter_image_embeds,
            callback_on_step_end_tensor_inputs,
        )

        self._guidance_scale = guidance_scale
        self._guidance_rescale = guidance_rescale
        self._clip_skip = clip_skip
        self._cross_attention_kwargs = cross_attention_kwargs
        self._interrupt = False

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        # 3. Encode input prompt
        lora_scale = (
            self.cross_attention_kwargs.get("scale", None)
            if self.cross_attention_kwargs is not None
            else None
        )

        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            self.do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            lora_scale=lora_scale,
            clip_skip=self.clip_skip,
        )

        # camera parameter for ImageDream
        camera = get_camera(
            num_views, elevation=elevation, extra_view=ip_adapter_image is not None
        ).to(dtype=prompt_embeds.dtype, device=device)
        camera = camera.repeat(batch_size * num_images_per_prompt, 1)

        if ip_adapter_image is not None:
            image_embeds = self.prepare_ip_adapter_image_embeds(
                ip_adapter_image,
                None,  # ip_adapter_image_embeds,
                device,
                batch_size * num_images_per_prompt,
                self.do_classifier_free_guidance,
            )
            # ImageDream
            image_latents, negative_image_latents = self.encode_image_to_latents(
                ip_adapter_image,
                height,
                width,
                device,
                batch_size * num_images_per_prompt,
            )
            num_views += 1

        # For classifier free guidance, we need to do two forward passes.
        # Here we concatenate the unconditional and text embeddings into a single batch
        # to avoid doing two forward passes
        if self.do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
            camera = torch.cat([camera] * 2)
            if ip_adapter_image is not None:
                image_latents = torch.cat([negative_image_latents, image_latents])

        # Multi-view inputs for ImageDream.
        prompt_embeds = prompt_embeds.repeat_interleave(num_views, dim=0)
        if ip_adapter_image is not None:
            image_embeds = [i.repeat_interleave(num_views, dim=0) for i in image_embeds]

        # 4. Prepare timesteps
        timesteps, num_inference_steps = retrieve_timesteps(
            self.scheduler, num_inference_steps, device, timesteps, sigmas
        )

        # 5. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt * num_views,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 6.1 Add image embeds for IP-Adapter
        if ip_adapter_image is not None:
            added_cond_kwargs = {"image_embeds": image_embeds}
        else:
            added_cond_kwargs = None

        # 6.2 Optionally get Guidance Scale Embedding
        timestep_cond = None
        if self.unet.config.time_cond_proj_dim is not None:
            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(
                batch_size * num_images_per_prompt
            )
            timestep_cond = self.get_guidance_scale_embedding(
                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
            ).to(device=device, dtype=latents.dtype)

        set_num_views(self.unet, num_views)

        # fmt: off
        # 7. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        self._num_timesteps = len(timesteps)
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                if self.interrupt:
                    continue

                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                if ip_adapter_image is not None:
                    latent_model_input[num_views - 1 :: num_views, :, :, :] = image_latents
                # predict the noise residual
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    class_labels=camera,
                    encoder_hidden_states=prompt_embeds,
                    timestep_cond=timestep_cond,
                    cross_attention_kwargs=self.cross_attention_kwargs,
                    added_cond_kwargs=added_cond_kwargs,
                    return_dict=False,
                )[0]

                # perform guidance
                if self.do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = torch.lerp(noise_pred_uncond, noise_pred_text, self.guidance_scale)

                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        step_idx = i // getattr(self.scheduler, "order", 1)
                        callback(step_idx, t, latents)
        # fmt: on
        if not output_type == "latent":
            image = self.vae.decode(
                latents / self.vae.config.scaling_factor,
                return_dict=False,
                generator=generator,
            )[0]
            image, has_nsfw_concept = self.run_safety_checker(
                image, device, prompt_embeds.dtype
            )
        else:
            image = latents
            has_nsfw_concept = None

        if has_nsfw_concept is None:
            do_denormalize = [True] * image.shape[0]
        else:
            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

        image = self.image_processor.postprocess(
            image, output_type=output_type, do_denormalize=do_denormalize
        )

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(
            images=image, nsfw_content_detected=has_nsfw_concept
        )


# fmt: off
# Copied from ImageDream
# https://github.com/bytedance/ImageDream/blob/main/extern/ImageDream/imagedream/camera_utils.py


def create_camera_to_world_matrix(elevation, azimuth):
    elevation = np.radians(elevation)
    azimuth = np.radians(azimuth)
    # Convert elevation and azimuth angles to Cartesian coordinates on a unit sphere
    x = np.cos(elevation) * np.sin(azimuth)
    y = np.sin(elevation)
    z = np.cos(elevation) * np.cos(azimuth)

    # Calculate camera position, target, and up vectors
    camera_pos = np.array([x, y, z])
    target = np.array([0, 0, 0])
    up = np.array([0, 1, 0])

    # Construct view matrix
    forward = target - camera_pos
    forward /= np.linalg.norm(forward)
    right = np.cross(forward, up)
    right /= np.linalg.norm(right)
    new_up = np.cross(right, forward)
    new_up /= np.linalg.norm(new_up)
    cam2world = np.eye(4)
    cam2world[:3, :3] = np.array([right, new_up, -forward]).T
    cam2world[:3, 3] = camera_pos
    return cam2world


def convert_opengl_to_blender(camera_matrix):
    if isinstance(camera_matrix, np.ndarray):
        # Construct transformation matrix to convert from OpenGL space to Blender space
        flip_yz = np.array([[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0], [0, 0, 0, 1]])
        camera_matrix_blender = np.dot(flip_yz, camera_matrix)
    else:
        # Construct transformation matrix to convert from OpenGL space to Blender space
        flip_yz = torch.tensor(
            [[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]
        )
        if camera_matrix.ndim == 3:
            flip_yz = flip_yz.unsqueeze(0)
        camera_matrix_blender = torch.matmul(flip_yz.to(camera_matrix), camera_matrix)
    return camera_matrix_blender


def normalize_camera(camera_matrix):
    """normalize the camera location onto a unit-sphere"""
    if isinstance(camera_matrix, np.ndarray):
        camera_matrix = camera_matrix.reshape(-1, 4, 4)
        translation = camera_matrix[:, :3, 3]
        translation = translation / (
            np.linalg.norm(translation, axis=1, keepdims=True) + 1e-8
        )
        camera_matrix[:, :3, 3] = translation
    else:
        camera_matrix = camera_matrix.reshape(-1, 4, 4)
        translation = camera_matrix[:, :3, 3]
        translation = translation / (
            torch.norm(translation, dim=1, keepdim=True) + 1e-8
        )
        camera_matrix[:, :3, 3] = translation
    return camera_matrix.reshape(-1, 16)


def get_camera(
    num_frames,
    elevation=15,
    azimuth_start=0,
    azimuth_span=360,
    blender_coord=True,
    extra_view=False,
):
    angle_gap = azimuth_span / num_frames
    cameras = []
    for azimuth in np.arange(azimuth_start, azimuth_span + azimuth_start, angle_gap):
        camera_matrix = create_camera_to_world_matrix(elevation, azimuth)
        if blender_coord:
            camera_matrix = convert_opengl_to_blender(camera_matrix)
        cameras.append(camera_matrix.flatten())

    if extra_view:
        dim = len(cameras[0])
        cameras.append(np.zeros(dim))
    return torch.tensor(np.stack(cameras, 0)).float()
# fmt: on


def add_mv_attn_processor(unet: UNet2DConditionModel, num_views: int = 4) -> UNet2DConditionModel:
    # Swap only the self-attention ("attn1") processors for multi-view attention;
    # cross-attention processors are left untouched.
    attn_procs = {}
    for key, attn_processor in unet.attn_processors.items():
        if "attn1" in key:
            attn_procs[key] = MVAttnProcessor2_0(num_views)
        else:
            attn_procs[key] = attn_processor
    unet.set_attn_processor(attn_procs)
    return unet


def set_num_views(unet: UNet2DConditionModel, num_views: int) -> UNet2DConditionModel:
    for key, attn_processor in unet.attn_processors.items():
        if isinstance(attn_processor, MVAttnProcessor2_0):
            attn_processor.num_views = num_views
    return unet


class MVAttnProcessor2_0(AttnProcessor2_0):
    """Self-attention processor that folds the view axis of the batch into the token axis,
    so self-attention spans all views of a sample (multi-view attention)."""

    def __init__(self, num_views: int = 4):
        super().__init__()
        self.num_views = num_views

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        temb: Optional[torch.Tensor] = None,
        *args,
        **kwargs,
    ):
        if self.num_views == 1:
            return super().__call__(
                attn=attn,
                hidden_states=hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=attention_mask,
                temb=temb,
                *args,
                **kwargs,
            )

        input_ndim = hidden_states.ndim
        B = hidden_states.size(0)
        if B % self.num_views:
            raise ValueError(
                f"`batch_size`(got {B}) must be a multiple of `num_views`(got {self.num_views})."
            )
        real_B = B // self.num_views
        # Merge the views of each sample before attention, then split them back afterwards.
        if input_ndim == 4:
            H, W = hidden_states.shape[2:]
            hidden_states = hidden_states.reshape(real_B, -1, H, W).transpose(1, 2)
        else:
            hidden_states = hidden_states.reshape(real_B, -1, hidden_states.size(-1))
        hidden_states = super().__call__(
            attn=attn,
            hidden_states=hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            attention_mask=attention_mask,
            temb=temb,
            *args,
            **kwargs,
        )
        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(B, -1, H, W)
        else:
            hidden_states = hidden_states.reshape(B, -1, hidden_states.size(-1))
        return hidden_states
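A usage sketch for the pipeline above, continuing from the loading sketch after model_index.json (the `pipe` object); the input file name and sampling settings are illustrative. `load_ip_adapter()` switches the self-attention processors to num_views + 1 = 5 views, and during denoising the extra view's latents are overwritten with the VAE-encoded reference image:

from diffusers.utils import load_image

pipe = pipe.to("cuda")
pipe.load_ip_adapter()  # defaults to ip-adapter-plus_imagedream.bin from the default repo's ip_adapter folder

image = load_image("input.png")  # illustrative path to the reference image
result = pipe(
    prompt="",               # an empty prompt is allowed; the reference image drives the content
    ip_adapter_image=image,
    num_inference_steps=50,
    guidance_scale=5.0,
)
# With `ip_adapter_image`, each prompt yields num_views + 1 = 5 images;
# the last image of each group corresponds to the reference view.
views = result.images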
scheduler/scheduler_config.json
ADDED
@@ -0,0 +1,19 @@
{
  "_class_name": "DDIMScheduler",
  "_diffusers_version": "0.29.2",
  "beta_end": 0.012,
  "beta_schedule": "scaled_linear",
  "beta_start": 0.00085,
  "clip_sample": false,
  "clip_sample_range": 1.0,
  "dynamic_thresholding_ratio": 0.995,
  "num_train_timesteps": 1000,
  "prediction_type": "epsilon",
  "rescale_betas_zero_snr": false,
  "sample_max_value": 1.0,
  "set_alpha_to_one": false,
  "steps_offset": 1,
  "thresholding": false,
  "timestep_spacing": "leading",
  "trained_betas": null
}
text_encoder/config.json
ADDED
@@ -0,0 +1,25 @@
{
  "_name_or_path": "stabilityai/stable-diffusion-2-1",
  "architectures": [
    "CLIPTextModel"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "dropout": 0.0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_size": 1024,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 77,
  "model_type": "clip_text_model",
  "num_attention_heads": 16,
  "num_hidden_layers": 23,
  "pad_token_id": 1,
  "projection_dim": 512,
  "torch_dtype": "float16",
  "transformers_version": "4.42.3",
  "vocab_size": 49408
}
text_encoder/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bc1827c465450322616f06dea41596eac7d493f4e95904dcb51f0fc745c4e13f
size 680820392
tokenizer/merges.txt
ADDED
The diff for this file is too large to render.
tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,24 @@
{
  "bos_token": {
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "!",
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,38 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "0": {
      "content": "!",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "49406": {
      "content": "<|startoftext|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "49407": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<|startoftext|>",
  "clean_up_tokenization_spaces": true,
  "do_lower_case": true,
  "eos_token": "<|endoftext|>",
  "errors": "replace",
  "model_max_length": 77,
  "pad_token": "!",
  "tokenizer_class": "CLIPTokenizer",
  "unk_token": "<|endoftext|>"
}
tokenizer/vocab.json
ADDED
The diff for this file is too large to render.
unet/config.json
ADDED
@@ -0,0 +1,77 @@
{
  "_class_name": "UNet2DConditionModel",
  "_diffusers_version": "0.29.2",
  "act_fn": "silu",
  "addition_embed_type": null,
  "addition_embed_type_num_heads": 64,
  "addition_time_embed_dim": null,
  "attention_head_dim": [
    5,
    10,
    20,
    20
  ],
  "attention_type": "default",
  "block_out_channels": [
    320,
    640,
    1280,
    1280
  ],
  "center_input_sample": false,
  "class_embed_type": "projection",
  "class_embeddings_concat": false,
  "conv_in_kernel": 3,
  "conv_out_kernel": 3,
  "cross_attention_dim": 1024,
  "cross_attention_norm": null,
  "down_block_types": [
    "CrossAttnDownBlock2D",
    "CrossAttnDownBlock2D",
    "CrossAttnDownBlock2D",
    "DownBlock2D"
  ],
  "downsample_padding": 1,
  "dropout": 0.0,
  "dual_cross_attention": false,
  "encoder_hid_dim": null,
  "encoder_hid_dim_type": null,
  "flip_sin_to_cos": true,
  "freq_shift": 0,
  "in_channels": 4,
  "layers_per_block": [
    2,
    2,
    2,
    2
  ],
  "mid_block_only_cross_attention": null,
  "mid_block_scale_factor": 1,
  "mid_block_type": "UNetMidBlock2DCrossAttn",
  "norm_eps": 1e-05,
  "norm_num_groups": 32,
  "num_attention_heads": null,
  "num_class_embeds": null,
  "only_cross_attention": false,
  "out_channels": 4,
  "projection_class_embeddings_input_dim": 16,
  "resnet_out_scale_factor": 1.0,
  "resnet_skip_time_act": false,
  "resnet_time_scale_shift": "default",
  "reverse_transformer_layers_per_block": null,
  "sample_size": 32,
  "time_cond_proj_dim": null,
  "time_embedding_act_fn": null,
  "time_embedding_dim": null,
  "time_embedding_type": "positional",
  "timestep_post_act": null,
  "transformer_layers_per_block": 1,
  "up_block_types": [
    "UpBlock2D",
    "CrossAttnUpBlock2D",
    "CrossAttnUpBlock2D",
    "CrossAttnUpBlock2D"
  ],
  "upcast_attention": false,
  "use_linear_projection": true
}
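Note that "class_embed_type": "projection" with "projection_class_embeddings_input_dim": 16 matches the flattened 4x4 camera matrices that pipeline_mvdiffusion.py passes to the UNet as `class_labels`. A small sanity-check sketch, assuming pipeline_mvdiffusion.py is importable from the working directory:

from pipeline_mvdiffusion import get_camera

# 4 orbit views plus the zero "extra" view reserved for the reference image.
cams = get_camera(4, elevation=0.0, extra_view=True)
print(cams.shape)  # torch.Size([5, 16]) -> fed to the UNet as `class_labels`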
unet/diffusion_pytorch_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:628974a46d456a99f24569ed08924f74c30229f0a5a8fd2318dd4be4363c71d0
size 1735228080
vae/config.json
ADDED
@@ -0,0 +1,36 @@
{
  "_class_name": "AutoencoderKL",
  "_diffusers_version": "0.29.2",
  "act_fn": "silu",
  "block_out_channels": [
    128,
    256,
    512,
    512
  ],
  "down_block_types": [
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D"
  ],
  "force_upcast": true,
  "in_channels": 3,
  "latent_channels": 4,
  "latents_mean": null,
  "latents_std": null,
  "layers_per_block": 2,
  "norm_num_groups": 32,
  "out_channels": 3,
  "sample_size": 256,
  "scaling_factor": 0.18215,
  "shift_factor": null,
  "up_block_types": [
    "UpDecoderBlock2D",
    "UpDecoderBlock2D",
    "UpDecoderBlock2D",
    "UpDecoderBlock2D"
  ],
  "use_post_quant_conv": true,
  "use_quant_conv": true
}
vae/diffusion_pytorch_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3e4c08995484ee61270175e9e7a072b66a6e4eeb5f0c266667fe1f45b90daf9a
size 167335342