Update models/pipeline_consisid.py

models/pipeline_consisid.py  (+55 -41)  CHANGED
@@ -14,7 +14,7 @@
 
 import inspect
 import math
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Callable, Any, Dict, List, Optional, Tuple, Union
 
 import cv2
 import numpy as np
@@ -24,15 +24,13 @@ from transformers import T5EncoderModel, T5Tokenizer
 
 from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
 from diffusers.image_processor import PipelineImageInput
+from diffusers.loaders import CogVideoXLoraLoaderMixin
 from diffusers.models import AutoencoderKLCogVideoX, ConsisIDTransformer3DModel
 from diffusers.models.embeddings import get_3d_rotary_pos_embed
 from diffusers.pipelines.consisid.pipeline_output import ConsisIDPipelineOutput
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
-from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
-from diffusers.utils import (
-    logging,
-    replace_example_docstring,
-)
+from diffusers.schedulers import CogVideoXDPMScheduler
+from diffusers.utils import logging, replace_example_docstring
 from diffusers.utils.torch_utils import randn_tensor
 from diffusers.video_processor import VideoProcessor
 
@@ -241,7 +239,7 @@ def retrieve_latents(
         raise AttributeError("Could not access latents of provided encoder_output")
 
 
-class ConsisIDPipeline(DiffusionPipeline):
+class ConsisIDPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
     r"""
     Pipeline for image-to-video generation using ConsisID.
 
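Note on this hunk: adding `CogVideoXLoraLoaderMixin` to the base classes gives the pipeline the standard diffusers LoRA entry points (`load_lora_weights`, `unload_lora_weights`, and related helpers). A minimal sketch of what that enables, assuming the public `BestWishYsh/ConsisID-preview` checkpoint and a hypothetical LoRA path:

```python
import torch
from diffusers import ConsisIDPipeline

pipe = ConsisIDPipeline.from_pretrained(
    "BestWishYsh/ConsisID-preview", torch_dtype=torch.bfloat16
)

# These methods come from CogVideoXLoraLoaderMixin; the path is hypothetical.
pipe.load_lora_weights("path/to/consisid_lora", adapter_name="example")
pipe.unload_lora_weights()
```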
@@ -278,8 +276,8 @@ class ConsisIDPipeline(DiffusionPipeline):
         tokenizer: T5Tokenizer,
         text_encoder: T5EncoderModel,
         vae: AutoencoderKLCogVideoX,
-        transformer: ConsisIDTransformer3DModel,
-        scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler],
+        transformer: ConsisIDTransformer3DModel,
+        scheduler: CogVideoXDPMScheduler,
     ):
         super().__init__()
 
@@ -611,21 +609,6 @@ class ConsisIDPipeline(DiffusionPipeline):
                 f" {negative_prompt_embeds.shape}."
             )
 
-    # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.fuse_qkv_projections
-    def fuse_qkv_projections(self) -> None:
-        r"""Enables fused QKV projections."""
-        self.fusing_transformer = True
-        self.transformer.fuse_qkv_projections()
-
-    # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.unfuse_qkv_projections
-    def unfuse_qkv_projections(self) -> None:
-        r"""Disable QKV projection fusion if enabled."""
-        if not self.fusing_transformer:
-            logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.")
-        else:
-            self.transformer.unfuse_qkv_projections()
-            self.fusing_transformer = False
-
     def _prepare_rotary_positional_embeddings(
         self,
         height: int,
@@ -635,8 +618,8 @@ class ConsisIDPipeline(DiffusionPipeline):
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
         grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
-        base_size_width = 720 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
-        base_size_height = 480 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
+        base_size_width = self.transformer.config.sample_width // self.transformer.config.patch_size
+        base_size_height = self.transformer.config.sample_height // self.transformer.config.patch_size
 
         grid_crops_coords = get_resize_crop_region_for_grid(
             (grid_height, grid_width), base_size_width, base_size_height
@@ -646,10 +629,9 @@ class ConsisIDPipeline(DiffusionPipeline):
             crops_coords=grid_crops_coords,
             grid_size=(grid_height, grid_width),
             temporal_size=num_frames,
+            device=device,
         )
 
-        freqs_cos = freqs_cos.to(device=device)
-        freqs_sin = freqs_sin.to(device=device)
         return freqs_cos, freqs_sin
 
     @property
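Note on the two rotary-embedding hunks above: deriving `base_size_*` from the transformer config instead of fixed divisions, and passing `device=` into `get_3d_rotary_pos_embed`, creates the frequencies on the right device without the explicit `.to(device=...)` calls. A quick sanity check of the arithmetic, assuming the usual CogVideoX-style config values (`patch_size=2`, `vae_scale_factor_spatial=8`, `sample_height=60`, `sample_width=90`):

```python
# Assumed config values; the real ones live on self.transformer.config.
patch_size = 2
vae_scale_factor_spatial = 8
sample_height, sample_width = 60, 90

height, width = 480, 720  # default generation size
grid_height = height // (vae_scale_factor_spatial * patch_size)  # 480 // 16 = 30
grid_width = width // (vae_scale_factor_spatial * patch_size)    # 720 // 16 = 45
base_size_height = sample_height // patch_size                   # 60 // 2 = 30
base_size_width = sample_width // patch_size                     # 90 // 2 = 45

# At the default resolution the grid matches the base size exactly.
assert (grid_height, grid_width) == (base_size_height, base_size_width)
```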
@@ -660,6 +642,10 @@ class ConsisIDPipeline(DiffusionPipeline):
     def num_timesteps(self):
         return self._num_timesteps
 
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
     @property
     def interrupt(self):
         return self._interrupt
@@ -675,8 +661,7 @@ class ConsisIDPipeline(DiffusionPipeline):
         width: int = 720,
         num_frames: int = 49,
         num_inference_steps: int = 50,
-        timesteps: Optional[List[int]] = None,
-        guidance_scale: float = 6,
+        guidance_scale: float = 6.0,
         use_dynamic_cfg: bool = False,
         num_videos_per_prompt: int = 1,
         eta: float = 0.0,
@@ -686,6 +671,7 @@ class ConsisIDPipeline(DiffusionPipeline):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: str = "pil",
         return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
         callback_on_step_end: Optional[
             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
         ] = None,
@@ -720,16 +706,17 @@ class ConsisIDPipeline(DiffusionPipeline):
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
-            timesteps (`List[int]`, *optional*):
-                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
-                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
-                passed will be used. Must be in descending order.
             guidance_scale (`float`, *optional*, defaults to 6):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                 usually at the expense of lower image quality.
+            use_dynamic_cfg (`bool`, *optional*, defaults to `False`):
+                If True, dynamically adjusts the guidance scale during inference. This allows the model to use a
+                progressive guidance scale, improving the balance between text-guided generation and image quality over
+                the course of the inference steps. Typically, early inference steps use a higher guidance scale for
+                more faithful image generation, while later steps reduce it for more diverse and natural results.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of videos to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -752,6 +739,10 @@ class ConsisIDPipeline(DiffusionPipeline):
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                 of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             callback_on_step_end (`Callable`, *optional*):
                 A function that calls at the end of each denoising steps during the inference. The function is called
                 with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
@@ -764,6 +755,19 @@ class ConsisIDPipeline(DiffusionPipeline):
             max_sequence_length (`int`, defaults to `226`):
                 Maximum sequence length in encoded prompt. Must be consistent with
                 `self.transformer.config.max_text_seq_length` otherwise may lead to poor results.
+            id_vit_hidden (`Optional[torch.Tensor]`, *optional*):
+                The tensor representing the hidden features extracted from the face model, which are used to condition
+                the local facial extractor. This is crucial for the model to obtain high-frequency information of the
+                face. If not provided, the local facial extractor will not run normally.
+            id_cond (`Optional[torch.Tensor]`, *optional*):
+                The tensor representing the hidden features extracted from the clip model, which are used to condition
+                the local facial extractor. This is crucial for the model to edit facial features. If not provided, the
+                local facial extractor will not run normally.
+            kps_cond (`Optional[torch.Tensor]`, *optional*):
+                A tensor that determines whether the global facial extractor uses keypoint information for
+                conditioning. If provided, this tensor controls whether facial keypoints such as eyes, nose, and mouth
+                landmarks are used during the generation process. This helps ensure the model retains more facial
+                low-frequency information.
 
         Examples:
 
@@ -772,14 +776,14 @@ class ConsisIDPipeline(DiffusionPipeline):
             [`~pipelines.consisid.pipeline_output.ConsisIDPipelineOutput`] if `return_dict` is True, otherwise a
             `tuple`. When returning a tuple, the first element is a list with the generated images.
         """
-        if num_frames > 49:
-            raise ValueError(
-                "The number of frames must be less than 49 for now due to static positional embeddings. This will be updated in the future to remove this limitation."
-            )
 
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
 
+        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
+        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
+        num_frames = num_frames or self.transformer.config.sample_frames
+
         num_videos_per_prompt = 1
 
         # 1. Check inputs. Raise error if not correct
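Note on this hunk: the hard `num_frames > 49` ValueError is replaced by config-derived fallbacks. Under the same assumed config values as in the earlier sketch, they resolve to the documented 480x720, 49-frame defaults:

```python
# Assumed transformer config values, as in the sketch above.
sample_height, sample_width, sample_frames = 60, 90, 49
vae_scale_factor_spatial = 8

height = None or sample_height * vae_scale_factor_spatial  # 60 * 8 = 480
width = None or sample_width * vae_scale_factor_spatial    # 90 * 8 = 720
num_frames = None or sample_frames                         # 49
```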
@@ -795,6 +799,7 @@ class ConsisIDPipeline(DiffusionPipeline):
             negative_prompt_embeds=negative_prompt_embeds,
         )
         self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
         self._interrupt = False
 
         # 2. Default call parameters
@@ -827,7 +832,7 @@ class ConsisIDPipeline(DiffusionPipeline):
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
 
         # 4. Prepare timesteps
-        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device)
         self._num_timesteps = len(timesteps)
 
         # 5. Prepare latents
@@ -874,6 +879,7 @@ class ConsisIDPipeline(DiffusionPipeline):
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             # for DPM-solver++
             old_pred_original_sample = None
+            timesteps_cpu = timesteps.cpu()
             for i, t in enumerate(timesteps):
                 if self.interrupt:
                     continue
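Note on this hunk: caching `timesteps.cpu()` once before the loop matters because the dynamic-CFG branch below reads a scalar out of the tensor every step; calling `.item()` on a CUDA tensor inside the loop would force a host-device synchronization per step. A small illustration:

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
timesteps = torch.linspace(999, 0, 50, device=device)

timesteps_cpu = timesteps.cpu()  # one transfer, before the loop
for i, t in enumerate(timesteps):
    # Reading from the cached CPU copy avoids the per-step sync that
    # t.item() on a CUDA tensor would trigger.
    current = timesteps_cpu[i].item()
```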
@@ -893,6 +899,7 @@ class ConsisIDPipeline(DiffusionPipeline):
                     encoder_hidden_states=prompt_embeds,
                     timestep=timestep,
                     image_rotary_emb=image_rotary_emb,
+                    attention_kwargs=attention_kwargs,
                     return_dict=False,
                     id_vit_hidden=id_vit_hidden,
                     id_cond=id_cond,
@@ -902,7 +909,14 @@ class ConsisIDPipeline(DiffusionPipeline):
                 # perform guidance
                 if use_dynamic_cfg:
                     self._guidance_scale = 1 + guidance_scale * (
-                        (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
+                        (
+                            1
+                            - math.cos(
+                                math.pi
+                                * ((num_inference_steps - timesteps_cpu[i].item()) / num_inference_steps) ** 5.0
+                            )
+                        )
+                        / 2
                     )
                 if do_classifier_free_guidance:
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
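Note on this hunk: the expression is the same cosine guidance schedule as before, only reformatted and switched from `t.item()` to the cached `timesteps_cpu[i].item()`. A standalone sketch of the curve, with `t` treated here as a value running from `num_inference_steps` down to 0 purely for illustration:

```python
import math

def dynamic_guidance(t, num_inference_steps=50, guidance_scale=6.0):
    # Same cosine ramp as the diff: close to 1 for most of the run,
    # rising sharply toward 1 + guidance_scale near the end because of
    # the ** 5.0 exponent.
    progress = (num_inference_steps - t) / num_inference_steps
    return 1 + guidance_scale * ((1 - math.cos(math.pi * progress**5.0)) / 2)

for t in (50, 25, 0):
    print(t, round(dynamic_guidance(t), 3))  # 1.0, 1.014, 7.0
```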
@@ -949,4 +963,4 @@ class ConsisIDPipeline(DiffusionPipeline):
         if not return_dict:
             return (video,)
 
-        return ConsisIDPipelineOutput(frames=video)
+        return ConsisIDPipelineOutput(frames=video)
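For context, a hedged end-to-end sketch of calling the updated pipeline. The checkpoint name is the public ConsisID preview repo; `id_cond` and `id_vit_hidden` are left as placeholders since they come from the Space's separate face-feature extraction step, and the input image path is hypothetical:

```python
import torch
from diffusers import ConsisIDPipeline
from diffusers.utils import export_to_video, load_image

pipe = ConsisIDPipeline.from_pretrained(
    "BestWishYsh/ConsisID-preview", torch_dtype=torch.bfloat16
).to("cuda")

# Placeholders: normally produced by the Space's face encoder before the call.
id_cond = None
id_vit_hidden = None

video = pipe(
    image=load_image("face.png"),  # hypothetical input image
    prompt="A person smiles at the camera.",
    num_frames=49,
    guidance_scale=6.0,
    use_dynamic_cfg=True,
    attention_kwargs=None,  # new pass-through added in this commit
    id_cond=id_cond,
    id_vit_hidden=id_vit_hidden,
).frames[0]
export_to_video(video, "output.mp4", fps=8)
```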