kiigii committed (verified) · Commit dbb8b4c · 1 Parent(s): 00c1780

Upload folder using huggingface_hub

model_index.json ADDED
@@ -0,0 +1,40 @@
{
  "_class_name": [
    "pipeline_mvdiffusion",
    "MVDiffusionPipeline"
  ],
  "_diffusers_version": "0.29.2",
  "feature_extractor": [
    null,
    null
  ],
  "image_encoder": [
    null,
    null
  ],
  "requires_safety_checker": false,
  "safety_checker": [
    null,
    null
  ],
  "scheduler": [
    "diffusers",
    "DDIMScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}
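
Note: `_class_name` points at the custom module `pipeline_mvdiffusion.py` shipped in this repo, so the checkpoint is meant to be loaded through diffusers' remote-code path. A minimal loading sketch (not part of the uploaded files), assuming this repo id and a diffusers release that can execute Hub-hosted pipeline code via `trust_remote_code`:

    import torch
    from diffusers import DiffusionPipeline

    pipe = DiffusionPipeline.from_pretrained(
        "kiigii/imagedream-ipmv-diffusers",  # repo id assumed from the load_ip_adapter default below
        torch_dtype=torch.float16,
        trust_remote_code=True,  # runs pipeline_mvdiffusion.py from the repo
    )
    pipe.to("cuda")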
pipeline_mvdiffusion.py ADDED
@@ -0,0 +1,578 @@
from typing import Any, Callable, Dict, List, Optional, Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

try:
    from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
except ImportError:
    # Older diffusers releases do not provide diffusers.callbacks; fall back to stub classes.

    class MultiPipelineCallbacks:
        ...

    class PipelineCallback:
        ...


from diffusers.image_processor import PipelineImageInput
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.models.attention import Attention
from diffusers.models.attention_processor import AttnProcessor2_0
from diffusers.pipelines.stable_diffusion.pipeline_output import (
    StableDiffusionPipelineOutput,
)
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
    StableDiffusionPipeline,
    rescale_noise_cfg,
    retrieve_timesteps,
)
from diffusers.pipelines.stable_diffusion.safety_checker import (
    StableDiffusionSafetyChecker,
)
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import deprecate
from transformers import (
    CLIPImageProcessor,
    CLIPTextModel,
    CLIPTokenizer,
    CLIPVisionModel,
)


class MVDiffusionPipeline(StableDiffusionPipeline):
    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: Optional[CLIPImageProcessor] = None,
        image_encoder: Optional[CLIPVisionModel] = None,
        requires_safety_checker: bool = False,
    ) -> None:
        super().__init__(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=add_mv_attn_processor(unet),
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
            image_encoder=image_encoder,
            requires_safety_checker=requires_safety_checker,
        )
        self.num_views = 4

    def load_ip_adapter(
        self,
        pretrained_model_name_or_path_or_dict: Union[
            str, List[str], Dict[str, torch.Tensor]
        ] = "kiigii/imagedream-ipmv-diffusers",
        subfolder: Union[str, List[str]] = "ip_adapter",
        weight_name: Union[str, List[str]] = "ip-adapter-plus_imagedream.bin",
        image_encoder_folder: Optional[str] = "image_encoder",
        **kwargs,
    ) -> None:
        super().load_ip_adapter(
            pretrained_model_name_or_path_or_dict=pretrained_model_name_or_path_or_dict,
            subfolder=subfolder,
            weight_name=weight_name,
            image_encoder_folder=image_encoder_folder,
            **kwargs,
        )
        print("IP-Adapter Loaded.")

        if weight_name == "ip-adapter-plus_imagedream.bin":
            setattr(self.image_encoder, "visual_projection", nn.Identity())
            add_mv_attn_processor(self.unet)
            set_num_views(self.unet, self.num_views + 1)

    def unload_ip_adapter(self) -> None:
        super().unload_ip_adapter()
        set_num_views(self.unet, self.num_views)

    def encode_image_to_latents(
        self,
        image: PipelineImageInput,
        height: int,
        width: int,
        device: torch.device,
        num_images_per_prompt: int = 1,
    ):
        dtype = next(self.vae.parameters()).dtype

        if isinstance(image, torch.Tensor):
            image = F.interpolate(
                image,
                (height, width),
                mode="bilinear",
                align_corners=False,
                antialias=True,
            )
        else:
            image = self.image_processor.preprocess(image, height, width)

        # image should be in range [-1, 1]
        image = image.to(device=device, dtype=dtype)

        def vae_encode(image):
            posterior = self.vae.encode(image).latent_dist
            latents = posterior.sample() * self.vae.config.scaling_factor
            latents = latents.repeat_interleave(num_images_per_prompt, dim=0)
            return latents

        latents = vae_encode(image)
        uncond_latents = vae_encode(torch.zeros_like(image))
        return latents, uncond_latents

    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        elevation: float = 0.0,
        timesteps: List[int] = None,
        sigmas: List[float] = None,
        guidance_scale: float = 5.0,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        ip_adapter_image: Optional[PipelineImageInput] = None,
        # `StableDiffusionPipeline` supports `ip_adapter_image_embeds`, but this pipeline does not use it and raises a ValueError if it is passed.
        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        guidance_rescale: float = 0.0,
        clip_skip: Optional[int] = None,
        callback_on_step_end: Optional[
            Union[
                Callable[[int, int, Dict], None],
                PipelineCallback,
                MultiPipelineCallbacks,
            ]
        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        **kwargs,
    ):
        if ip_adapter_image_embeds is not None:
            raise ValueError(
                "do not use `ip_adapter_image_embeds` in ImageDream, use `ip_adapter_image`"
            )

        callback = kwargs.pop("callback", None)
        callback_steps = kwargs.pop("callback_steps", None)

        if callback is not None:
            deprecate(
                "callback",
                "1.0.0",
                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
            )
        if callback_steps is not None:
            deprecate(
                "callback_steps",
                "1.0.0",
                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
            )

        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

        # ImageDream number of views
        if cross_attention_kwargs is None:
            num_views = self.num_views
        else:
            num_views = cross_attention_kwargs.pop("num_views", self.num_views)

        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor
        # to deal with lora scaling and other possible forward hooks

        # 1. Check inputs. Raise error if not correct
        if prompt is None:
            prompt = ""
        self.check_inputs(
            prompt,
            height,
            width,
            callback_steps,
            negative_prompt,
            prompt_embeds,
            negative_prompt_embeds,
            ip_adapter_image,
            None,  # ip_adapter_image_embeds,
            callback_on_step_end_tensor_inputs,
        )

        self._guidance_scale = guidance_scale
        self._guidance_rescale = guidance_rescale
        self._clip_skip = clip_skip
        self._cross_attention_kwargs = cross_attention_kwargs
        self._interrupt = False

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        # 3. Encode input prompt
        lora_scale = (
            self.cross_attention_kwargs.get("scale", None)
            if self.cross_attention_kwargs is not None
            else None
        )

        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            self.do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            lora_scale=lora_scale,
            clip_skip=self.clip_skip,
        )

        # camera parameter for ImageDream
        camera = get_camera(
            num_views, elevation=elevation, extra_view=ip_adapter_image is not None
        ).to(dtype=prompt_embeds.dtype, device=device)
        camera = camera.repeat(batch_size * num_images_per_prompt, 1)

        if ip_adapter_image is not None:
            image_embeds = self.prepare_ip_adapter_image_embeds(
                ip_adapter_image,
                None,  # ip_adapter_image_embeds,
                device,
                batch_size * num_images_per_prompt,
                self.do_classifier_free_guidance,
            )
            # ImageDream
            image_latents, negative_image_latents = self.encode_image_to_latents(
                ip_adapter_image,
                height,
                width,
                device,
                batch_size * num_images_per_prompt,
            )
            num_views += 1

        # For classifier free guidance, we need to do two forward passes.
        # Here we concatenate the unconditional and text embeddings into a single batch
        # to avoid doing two forward passes
        if self.do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
            camera = torch.cat([camera] * 2)
            if ip_adapter_image is not None:
                image_latents = torch.cat([negative_image_latents, image_latents])

        # Multi-view inputs for ImageDream.
        prompt_embeds = prompt_embeds.repeat_interleave(num_views, dim=0)
        if ip_adapter_image is not None:
            image_embeds = [i.repeat_interleave(num_views, dim=0) for i in image_embeds]

        # 4. Prepare timesteps
        timesteps, num_inference_steps = retrieve_timesteps(
            self.scheduler, num_inference_steps, device, timesteps, sigmas
        )

        # 5. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt * num_views,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 6.1 Add image embeds for IP-Adapter
        if ip_adapter_image is not None:
            added_cond_kwargs = {"image_embeds": image_embeds}
        else:
            added_cond_kwargs = None

        # 6.2 Optionally get Guidance Scale Embedding
        timestep_cond = None
        if self.unet.config.time_cond_proj_dim is not None:
            guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(
                batch_size * num_images_per_prompt
            )
            timestep_cond = self.get_guidance_scale_embedding(
                guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
            ).to(device=device, dtype=latents.dtype)

        set_num_views(self.unet, num_views)

        # fmt: off
        # 7. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        self._num_timesteps = len(timesteps)
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                if self.interrupt:
                    continue

                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                if ip_adapter_image is not None:
                    latent_model_input[num_views - 1 :: num_views, :, :, :] = image_latents
                # predict the noise residual
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    class_labels=camera,
                    encoder_hidden_states=prompt_embeds,
                    timestep_cond=timestep_cond,
                    cross_attention_kwargs=self.cross_attention_kwargs,
                    added_cond_kwargs=added_cond_kwargs,
                    return_dict=False,
                )[0]

                # perform guidance
                if self.do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = torch.lerp(noise_pred_uncond, noise_pred_text, self.guidance_scale)

                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        step_idx = i // getattr(self.scheduler, "order", 1)
                        callback(step_idx, t, latents)
        # fmt: on
        if not output_type == "latent":
            image = self.vae.decode(
                latents / self.vae.config.scaling_factor,
                return_dict=False,
                generator=generator,
            )[0]
            image, has_nsfw_concept = self.run_safety_checker(
                image, device, prompt_embeds.dtype
            )
        else:
            image = latents
            has_nsfw_concept = None

        if has_nsfw_concept is None:
            do_denormalize = [True] * image.shape[0]
        else:
            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

        image = self.image_processor.postprocess(
            image, output_type=output_type, do_denormalize=do_denormalize
        )

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(
            images=image, nsfw_content_detected=has_nsfw_concept
        )


# fmt: off
# Copied from ImageDream
# https://github.com/bytedance/ImageDream/blob/main/extern/ImageDream/imagedream/camera_utils.py


def create_camera_to_world_matrix(elevation, azimuth):
    elevation = np.radians(elevation)
    azimuth = np.radians(azimuth)
    # Convert elevation and azimuth angles to Cartesian coordinates on a unit sphere
    x = np.cos(elevation) * np.sin(azimuth)
    y = np.sin(elevation)
    z = np.cos(elevation) * np.cos(azimuth)

    # Calculate camera position, target, and up vectors
    camera_pos = np.array([x, y, z])
    target = np.array([0, 0, 0])
    up = np.array([0, 1, 0])

    # Construct view matrix
    forward = target - camera_pos
    forward /= np.linalg.norm(forward)
    right = np.cross(forward, up)
    right /= np.linalg.norm(right)
    new_up = np.cross(right, forward)
    new_up /= np.linalg.norm(new_up)
    cam2world = np.eye(4)
    cam2world[:3, :3] = np.array([right, new_up, -forward]).T
    cam2world[:3, 3] = camera_pos
    return cam2world


def convert_opengl_to_blender(camera_matrix):
    if isinstance(camera_matrix, np.ndarray):
        # Construct transformation matrix to convert from OpenGL space to Blender space
        flip_yz = np.array([[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0], [0, 0, 0, 1]])
        camera_matrix_blender = np.dot(flip_yz, camera_matrix)
    else:
        # Construct transformation matrix to convert from OpenGL space to Blender space
        flip_yz = torch.tensor(
            [[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]
        )
        if camera_matrix.ndim == 3:
            flip_yz = flip_yz.unsqueeze(0)
        camera_matrix_blender = torch.matmul(flip_yz.to(camera_matrix), camera_matrix)
    return camera_matrix_blender


def normalize_camera(camera_matrix):
    """normalize the camera location onto a unit-sphere"""
    if isinstance(camera_matrix, np.ndarray):
        camera_matrix = camera_matrix.reshape(-1, 4, 4)
        translation = camera_matrix[:, :3, 3]
        translation = translation / (
            np.linalg.norm(translation, axis=1, keepdims=True) + 1e-8
        )
        camera_matrix[:, :3, 3] = translation
    else:
        camera_matrix = camera_matrix.reshape(-1, 4, 4)
        translation = camera_matrix[:, :3, 3]
        translation = translation / (
            torch.norm(translation, dim=1, keepdim=True) + 1e-8
        )
        camera_matrix[:, :3, 3] = translation
    return camera_matrix.reshape(-1, 16)


def get_camera(
    num_frames,
    elevation=15,
    azimuth_start=0,
    azimuth_span=360,
    blender_coord=True,
    extra_view=False,
):
    angle_gap = azimuth_span / num_frames
    cameras = []
    for azimuth in np.arange(azimuth_start, azimuth_span + azimuth_start, angle_gap):
        camera_matrix = create_camera_to_world_matrix(elevation, azimuth)
        if blender_coord:
            camera_matrix = convert_opengl_to_blender(camera_matrix)
        cameras.append(camera_matrix.flatten())

    if extra_view:
        dim = len(cameras[0])
        cameras.append(np.zeros(dim))
    return torch.tensor(np.stack(cameras, 0)).float()
# fmt: on


def add_mv_attn_processor(unet: UNet2DConditionModel, num_views: int = 4) -> UNet2DConditionModel:
    # Swap the self-attention ("attn1") processors for multi-view ones so the views attend to each other.
    attn_procs = {}
    for key, attn_processor in unet.attn_processors.items():
        if "attn1" in key:
            attn_procs[key] = MVAttnProcessor2_0(num_views)
        else:
            attn_procs[key] = attn_processor
    unet.set_attn_processor(attn_procs)
    return unet


def set_num_views(unet: UNet2DConditionModel, num_views: int) -> UNet2DConditionModel:
    for key, attn_processor in unet.attn_processors.items():
        if isinstance(attn_processor, MVAttnProcessor2_0):
            attn_processor.num_views = num_views
    return unet


class MVAttnProcessor2_0(AttnProcessor2_0):
    def __init__(self, num_views: int = 4):
        super().__init__()
        self.num_views = num_views

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        temb: Optional[torch.Tensor] = None,
        *args,
        **kwargs,
    ):
        if self.num_views == 1:
            return super().__call__(
                attn=attn,
                hidden_states=hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=attention_mask,
                temb=temb,
                *args,
                **kwargs,
            )

        # Fold the view axis into the token/channel axis so self-attention runs jointly over all views of a sample.
        input_ndim = hidden_states.ndim
        B = hidden_states.size(0)
        if B % self.num_views:
            raise ValueError(
                f"`batch_size`(got {B}) must be a multiple of `num_views`(got {self.num_views})."
            )
        real_B = B // self.num_views
        if input_ndim == 4:
            H, W = hidden_states.shape[2:]
            hidden_states = hidden_states.reshape(real_B, -1, H, W).transpose(1, 2)
        else:
            hidden_states = hidden_states.reshape(real_B, -1, hidden_states.size(-1))
        hidden_states = super().__call__(
            attn=attn,
            hidden_states=hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            attention_mask=attention_mask,
            temb=temb,
            *args,
            **kwargs,
        )
        # Split the views back out to the original batch layout.
        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(B, -1, H, W)
        else:
            hidden_states = hidden_states.reshape(B, -1, hidden_states.size(-1))
        return hidden_states
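
Usage sketch (not part of the uploaded files): the pipeline renders `self.num_views` (4) views around the subject and, when `ip_adapter_image` is given, appends an extra reference view and guides generation with the IP-Adapter weights named in `load_ip_adapter`. Assuming `pipe` was loaded as in the earlier sketch and the default IP-Adapter repo is reachable:

    from diffusers.utils import load_image

    pipe.load_ip_adapter()  # defaults: kiigii/imagedream-ipmv-diffusers, ip-adapter-plus_imagedream.bin

    image = load_image("input.png")  # hypothetical path to the reference image
    result = pipe(
        prompt="a toy robot",
        ip_adapter_image=image,
        num_inference_steps=50,
        guidance_scale=5.0,
        elevation=0.0,
    )
    for i, view in enumerate(result.images):  # one image per view
        view.save(f"view_{i}.png")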
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,19 @@
{
  "_class_name": "DDIMScheduler",
  "_diffusers_version": "0.29.2",
  "beta_end": 0.012,
  "beta_schedule": "scaled_linear",
  "beta_start": 0.00085,
  "clip_sample": false,
  "clip_sample_range": 1.0,
  "dynamic_thresholding_ratio": 0.995,
  "num_train_timesteps": 1000,
  "prediction_type": "epsilon",
  "rescale_betas_zero_snr": false,
  "sample_max_value": 1.0,
  "set_alpha_to_one": false,
  "steps_offset": 1,
  "thresholding": false,
  "timestep_spacing": "leading",
  "trained_betas": null
}
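
This is the standard Stable Diffusion 2.x DDIM setup (scaled-linear betas, epsilon prediction, leading timestep spacing). If needed, the scheduler can also be instantiated on its own; a small sketch (not part of the uploaded files), assuming the same repo id as above:

    from diffusers import DDIMScheduler

    scheduler = DDIMScheduler.from_pretrained(
        "kiigii/imagedream-ipmv-diffusers",  # assumed repo id
        subfolder="scheduler",
    )
    print(scheduler.config.num_train_timesteps)  # 1000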
text_encoder/config.json ADDED
@@ -0,0 +1,25 @@
{
  "_name_or_path": "stabilityai/stable-diffusion-2-1",
  "architectures": [
    "CLIPTextModel"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "dropout": 0.0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_size": 1024,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 77,
  "model_type": "clip_text_model",
  "num_attention_heads": 16,
  "num_hidden_layers": 23,
  "pad_token_id": 1,
  "projection_dim": 512,
  "torch_dtype": "float16",
  "transformers_version": "4.42.3",
  "vocab_size": 49408
}
text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bc1827c465450322616f06dea41596eac7d493f4e95904dcb51f0fc745c4e13f
size 680820392
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
{
  "bos_token": {
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "!",
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,38 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "0": {
      "content": "!",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "49406": {
      "content": "<|startoftext|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "49407": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<|startoftext|>",
  "clean_up_tokenization_spaces": true,
  "do_lower_case": true,
  "eos_token": "<|endoftext|>",
  "errors": "replace",
  "model_max_length": 77,
  "pad_token": "!",
  "tokenizer_class": "CLIPTokenizer",
  "unk_token": "<|endoftext|>"
}
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
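
The tokenizer and text encoder are the stock components from stabilityai/stable-diffusion-2-1 (hidden size 1024, 23 hidden layers), matching the `cross_attention_dim` of 1024 in the UNet config below. A quick sanity-check sketch (not part of the uploaded files), assuming the same repo id:

    from transformers import CLIPTextModel, CLIPTokenizer

    repo = "kiigii/imagedream-ipmv-diffusers"  # assumed repo id
    tokenizer = CLIPTokenizer.from_pretrained(repo, subfolder="tokenizer")
    text_encoder = CLIPTextModel.from_pretrained(repo, subfolder="text_encoder")

    tokens = tokenizer("a toy robot", padding="max_length", max_length=77, return_tensors="pt")
    embeds = text_encoder(tokens.input_ids).last_hidden_state
    print(embeds.shape)  # torch.Size([1, 77, 1024])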
 
unet/config.json ADDED
@@ -0,0 +1,77 @@
{
  "_class_name": "UNet2DConditionModel",
  "_diffusers_version": "0.29.2",
  "act_fn": "silu",
  "addition_embed_type": null,
  "addition_embed_type_num_heads": 64,
  "addition_time_embed_dim": null,
  "attention_head_dim": [
    5,
    10,
    20,
    20
  ],
  "attention_type": "default",
  "block_out_channels": [
    320,
    640,
    1280,
    1280
  ],
  "center_input_sample": false,
  "class_embed_type": "projection",
  "class_embeddings_concat": false,
  "conv_in_kernel": 3,
  "conv_out_kernel": 3,
  "cross_attention_dim": 1024,
  "cross_attention_norm": null,
  "down_block_types": [
    "CrossAttnDownBlock2D",
    "CrossAttnDownBlock2D",
    "CrossAttnDownBlock2D",
    "DownBlock2D"
  ],
  "downsample_padding": 1,
  "dropout": 0.0,
  "dual_cross_attention": false,
  "encoder_hid_dim": null,
  "encoder_hid_dim_type": null,
  "flip_sin_to_cos": true,
  "freq_shift": 0,
  "in_channels": 4,
  "layers_per_block": [
    2,
    2,
    2,
    2
  ],
  "mid_block_only_cross_attention": null,
  "mid_block_scale_factor": 1,
  "mid_block_type": "UNetMidBlock2DCrossAttn",
  "norm_eps": 1e-05,
  "norm_num_groups": 32,
  "num_attention_heads": null,
  "num_class_embeds": null,
  "only_cross_attention": false,
  "out_channels": 4,
  "projection_class_embeddings_input_dim": 16,
  "resnet_out_scale_factor": 1.0,
  "resnet_skip_time_act": false,
  "resnet_time_scale_shift": "default",
  "reverse_transformer_layers_per_block": null,
  "sample_size": 32,
  "time_cond_proj_dim": null,
  "time_embedding_act_fn": null,
  "time_embedding_dim": null,
  "time_embedding_type": "positional",
  "timestep_post_act": null,
  "transformer_layers_per_block": 1,
  "up_block_types": [
    "UpBlock2D",
    "CrossAttnUpBlock2D",
    "CrossAttnUpBlock2D",
    "CrossAttnUpBlock2D"
  ],
  "upcast_attention": false,
  "use_linear_projection": true
}
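
Two entries here carry the camera conditioning: `class_embed_type: "projection"` together with `projection_class_embeddings_input_dim: 16`, which is why the pipeline passes flattened 4x4 camera matrices as `class_labels`. A standalone forward-pass sketch (not part of the uploaded files; the pipeline's `__call__` normally does this and also installs the multi-view attention processors), assuming the same repo id:

    import torch
    from diffusers import UNet2DConditionModel

    repo = "kiigii/imagedream-ipmv-diffusers"  # assumed repo id
    unet = UNet2DConditionModel.from_pretrained(repo, subfolder="unet", torch_dtype=torch.float32)

    latents = torch.randn(4, 4, 32, 32)  # 4 views x (in_channels=4, sample_size=32)
    camera = torch.randn(4, 16)          # one flattened 4x4 camera matrix per view
    text = torch.randn(4, 77, 1024)      # encoder hidden states, cross_attention_dim=1024
    out = unet(latents, 0, encoder_hidden_states=text, class_labels=camera).sample
    print(out.shape)  # torch.Size([4, 4, 32, 32])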
unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:628974a46d456a99f24569ed08924f74c30229f0a5a8fd2318dd4be4363c71d0
size 1735228080
vae/config.json ADDED
@@ -0,0 +1,36 @@
{
  "_class_name": "AutoencoderKL",
  "_diffusers_version": "0.29.2",
  "act_fn": "silu",
  "block_out_channels": [
    128,
    256,
    512,
    512
  ],
  "down_block_types": [
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D"
  ],
  "force_upcast": true,
  "in_channels": 3,
  "latent_channels": 4,
  "latents_mean": null,
  "latents_std": null,
  "layers_per_block": 2,
  "norm_num_groups": 32,
  "out_channels": 3,
  "sample_size": 256,
  "scaling_factor": 0.18215,
  "shift_factor": null,
  "up_block_types": [
    "UpDecoderBlock2D",
    "UpDecoderBlock2D",
    "UpDecoderBlock2D",
    "UpDecoderBlock2D"
  ],
  "use_post_quant_conv": true,
  "use_quant_conv": true
}
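
The VAE is the stock SD 2.x autoencoder with `scaling_factor` 0.18215; the pipeline multiplies by it after `encode` and divides before `decode` (see `encode_image_to_latents` and the decode step in `__call__`). A small round-trip sketch (not part of the uploaded files), assuming the same repo id:

    import torch
    from diffusers import AutoencoderKL

    repo = "kiigii/imagedream-ipmv-diffusers"  # assumed repo id
    vae = AutoencoderKL.from_pretrained(repo, subfolder="vae", torch_dtype=torch.float32)

    image = torch.rand(1, 3, 256, 256) * 2 - 1  # dummy image in [-1, 1]
    latents = vae.encode(image).latent_dist.sample() * vae.config.scaling_factor
    decoded = vae.decode(latents / vae.config.scaling_factor).sample
    print(latents.shape, decoded.shape)  # (1, 4, 32, 32) (1, 3, 256, 256)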
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3e4c08995484ee61270175e9e7a072b66a6e4eeb5f0c266667fe1f45b90daf9a
size 167335342