FramePack_rotate_landscape

Running on Zero

App Files Files Community

tori29umai committed on Apr 29

Commit

92b4d2f

verified ·

1 Parent(s): 77e9bab

Delete diffusers_helper

Browse files

Files changed (14) hide show

diffusers_helper/__init__.py +0 -1
diffusers_helper/bucket_tools.py +0 -30
diffusers_helper/clip_vision.py +0 -12
diffusers_helper/dit_common.py +0 -53
diffusers_helper/gradio/progress_bar.py +0 -86
diffusers_helper/hf_login.py +0 -25
diffusers_helper/hunyuan.py +0 -111
diffusers_helper/k_diffusion/uni_pc_fm.py +0 -155
diffusers_helper/k_diffusion/wrapper.py +0 -51
diffusers_helper/memory.py +0 -210
diffusers_helper/models/hunyuan_video_packed.py +0 -1032
diffusers_helper/pipelines/k_diffusion_hunyuan.py +0 -120
diffusers_helper/thread_utils.py +0 -123
diffusers_helper/utils.py +0 -613

diffusers_helper/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- # diffusers_helper package

diffusers_helper/bucket_tools.py DELETED Viewed

@@ -1,30 +0,0 @@
-# Supported (height, width) buckets, keyed by base resolution.
-bucket_options = {
-    640: [
-        (416, 960), (448, 864), (480, 832), (512, 768), (544, 704),
-        (576, 672), (608, 640), (640, 608), (672, 576), (704, 544),
-        (768, 512), (832, 480), (864, 448), (960, 416),
-    ],
-}
-def find_nearest_bucket(h, w, resolution=640):
-    """Return the (height, width) bucket whose aspect ratio is closest to h x w.
-
-    The mismatch is measured by cross-multiplication, |h * bucket_w - w * bucket_h|,
-    so no division is needed. When several buckets are equally close, the one
-    listed last in ``bucket_options[resolution]`` wins. Raises ``KeyError`` for a
-    resolution that has no bucket table.
-    """
-    def aspect_mismatch(bucket):
-        bucket_h, bucket_w = bucket
-        return abs(h * bucket_w - w * bucket_h)
-    # min() keeps the first minimum it sees; scanning in reverse therefore
-    # selects the last of several equally good buckets.
-    return min(reversed(bucket_options[resolution]), key=aspect_mismatch, default=None)

diffusers_helper/clip_vision.py DELETED Viewed

@@ -1,12 +0,0 @@
-import numpy as np
-def hf_clip_vision_encode(image, feature_extractor, image_encoder):
-    """Run one uint8 H x W x 3 numpy image through the feature extractor and image encoder.
-
-    The preprocessed batch is moved to the encoder's device and dtype before the
-    encoder is called; the encoder's output object is returned unchanged.
-    """
-    assert isinstance(image, np.ndarray)
-    is_three_channel_image = image.ndim == 3 and image.shape[2] == 3
-    assert is_three_channel_image
-    assert image.dtype == np.uint8
-    batch = feature_extractor.preprocess(images=image, return_tensors="pt")
-    batch = batch.to(device=image_encoder.device, dtype=image_encoder.dtype)
-    return image_encoder(**batch)

diffusers_helper/dit_common.py DELETED Viewed

@@ -1,53 +0,0 @@
-import torch
-import accelerate.accelerator
-from diffusers.models.normalization import RMSNorm, LayerNorm, FP32LayerNorm, AdaLayerNormContinuous
-# Replace accelerate's convert_outputs_to_fp32 with an identity function so that
-# outputs pass through unchanged.
-accelerate.accelerator.convert_outputs_to_fp32 = lambda x: x
-def LayerNorm_forward(self, x):
-    """Apply layer normalisation and cast the result back to the dtype/device of ``x``."""
-    normed = torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
-    return normed.to(x)
-# Install the replacement on both the diffusers and the torch LayerNorm classes.
-LayerNorm.forward = LayerNorm_forward
-torch.nn.LayerNorm.forward = LayerNorm_forward
-def FP32LayerNorm_forward(self, x):
-    """Layer-normalise in float32, then cast back to the input's original dtype."""
-    weight = None if self.weight is None else self.weight.float()
-    bias = None if self.bias is None else self.bias.float()
-    normed = torch.nn.functional.layer_norm(x.float(), self.normalized_shape, weight, bias, self.eps)
-    return normed.to(x.dtype)
-FP32LayerNorm.forward = FP32LayerNorm_forward
-def RMSNorm_forward(self, hidden_states):
-    """RMS-normalise over the last dimension using a float32 mean of squares.
-
-    The result is cast back to the input dtype and, when the module has a
-    weight, multiplied by that weight (also cast to the input dtype).
-    """
-    out_dtype = hidden_states.dtype
-    mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
-    normalized = (hidden_states * torch.rsqrt(mean_square + self.eps)).to(out_dtype)
-    if self.weight is not None:
-        normalized = normalized * self.weight.to(out_dtype)
-    return normalized
-RMSNorm.forward = RMSNorm_forward
-def AdaLayerNormContinuous_forward(self, x, conditioning_embedding):
-    """Normalise ``x`` and modulate it with a scale/shift derived from the conditioning.
-
-    The conditioning goes through SiLU and a linear layer, and is split in two
-    along dim 1 into ``scale`` and ``shift``; both are broadcast over dim 1 of ``x``.
-    """
-    modulation = self.linear(self.silu(conditioning_embedding))
-    scale, shift = modulation.chunk(2, dim=1)
-    scale = (1 + scale)[:, None, :]
-    shift = shift[:, None, :]
-    return self.norm(x) * scale + shift
-AdaLayerNormContinuous.forward = AdaLayerNormContinuous_forward

diffusers_helper/gradio/progress_bar.py DELETED Viewed

@@ -1,86 +0,0 @@
-# HTML template for the progress widget. The literal placeholders *number* and
-# *text* are substituted by make_progress_bar_html().
-progress_html = '''
-<div class="loader-container">
-  <div class="loader"></div>
-  <div class="progress-container">
-    <progress value="*number*" max="100"></progress>
-  </div>
-  <span>*text*</span>
-</div>
-'''
-# Stylesheet for the progress widget (spinner, progress bar and caption).
-# The ::after rule uses `attr(value) '%'`: attr() takes the attribute name, and
-# the '%' suffix must be a separate string in the content list. The former
-# `attr(value '%')` is invalid CSS, so the whole declaration was discarded.
-css = '''
-.loader-container {
-  display: flex; /* Use flex to align items horizontally */
-  align-items: center; /* Center items vertically within the container */
-  white-space: nowrap; /* Prevent line breaks within the container */
-}
-.loader {
-  border: 8px solid #f3f3f3; /* Light grey */
-  border-top: 8px solid #3498db; /* Blue */
-  border-radius: 50%;
-  width: 30px;
-  height: 30px;
-  animation: spin 2s linear infinite;
-}
-@keyframes spin {
-  0% { transform: rotate(0deg); }
-  100% { transform: rotate(360deg); }
-}
-/* Style the progress bar */
-progress {
-  appearance: none; /* Remove default styling */
-  height: 20px; /* Set the height of the progress bar */
-  border-radius: 5px; /* Round the corners of the progress bar */
-  background-color: #f3f3f3; /* Light grey background */
-  width: 100%;
-  vertical-align: middle !important;
-}
-/* Style the progress bar container */
-.progress-container {
-  margin-left: 20px;
-  margin-right: 20px;
-  flex-grow: 1; /* Allow the progress container to take up remaining space */
-}
-/* Set the color of the progress bar fill */
-progress::-webkit-progress-value {
-  background-color: #3498db; /* Blue color for the fill */
-}
-progress::-moz-progress-bar {
-  background-color: #3498db; /* Blue color for the fill in Firefox */
-}
-/* Style the text on the progress bar */
-progress::after {
-  content: attr(value) '%'; /* Display the progress value followed by '%' */
-  position: absolute;
-  top: 50%;
-  left: 50%;
-  transform: translate(-50%, -50%);
-  color: white; /* Set text color */
-  font-size: 14px; /* Set font size */
-}
-/* Style other texts */
-.loader-container > span {
-  margin-left: 5px; /* Add spacing between the progress bar and the text */
-}
-.no-generating-animation > .generating {
-  display: none !important;
-}
-'''
-def make_progress_bar_html(number, text):
-    """Fill the progress template with a value and a caption.
-
-    ``number`` is converted with ``str()``; ``text`` is inserted verbatim.
-    """
-    filled = progress_html.replace('*number*', str(number))
-    return filled.replace('*text*', text)
-def make_progress_bar_css():
-    """Return the stylesheet string for the progress widget."""
-    return css

diffusers_helper/hf_login.py DELETED Viewed

@@ -1,25 +0,0 @@
-import os
-from huggingface_hub import login
-def login():
-    """Authenticate with the Hugging Face Hub when a token is available.
-
-    Resolution order:
-      1. Inside a Hugging Face Space (``SPACE_ID`` is set): do nothing.
-      2. ``HF_TOKEN`` is set: log in with that token.
-      3. A cached token file exists: report it and do nothing else.
-      4. Otherwise: report that public access will be used.
-
-    Always returns ``None``.
-    """
-    # Inside a Space the runtime already has access; no explicit login is needed.
-    if os.environ.get('SPACE_ID') is not None:
-        print("Running in Hugging Face Space, using environment HF_TOKEN")
-        return
-    # Locally, prefer a token supplied through the environment.
-    hf_token = os.environ.get('HF_TOKEN')
-    if hf_token:
-        print("Logging in with HF_TOKEN from environment")
-        # This function is itself named ``login`` and therefore shadows the
-        # module-level ``from huggingface_hub import login``. Import the hub
-        # function under an alias so the call does not recurse into this
-        # function (which would fail with an unexpected ``token`` argument).
-        from huggingface_hub import login as hub_login
-        hub_login(token=hf_token)
-        return
-    # Look for a cached token: the current default location first, then the legacy one.
-    cache_candidates = (
-        os.path.expanduser('~/.cache/huggingface/token'),
-        os.path.expanduser('~/.huggingface/token'),
-    )
-    if any(os.path.exists(path) for path in cache_candidates):
-        print("Found cached Hugging Face token")
-        return
-    # Without a token only public access is available; it may be slower and rate-limited.
-    print("No Hugging Face token found. Using public access.")

diffusers_helper/hunyuan.py DELETED Viewed

@@ -1,111 +0,0 @@
-import torch
-from diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video import DEFAULT_PROMPT_TEMPLATE
-from diffusers_helper.utils import crop_or_pad_yield_mask
-@torch.no_grad()
-def encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2, max_length=256):
-    """Encode one prompt string with both text encoders.
-
-    Returns ``(llama_vec, clip_l_pooler)``: the third-from-last hidden state of
-    ``text_encoder`` cropped to the attended tokens after the template prefix,
-    and the pooled output of ``text_encoder_2``.
-    """
-    assert isinstance(prompt, str)
-    prompts = [prompt]
-    # LLAMA: wrap the prompt in the default template, then drop the template prefix.
-    template = DEFAULT_PROMPT_TEMPLATE["template"]
-    templated_prompts = [template.format(p) for p in prompts]
-    crop_start = DEFAULT_PROMPT_TEMPLATE["crop_start"]
-    llama_tokens = tokenizer(
-        templated_prompts,
-        padding="max_length",
-        max_length=max_length + crop_start,
-        truncation=True,
-        return_tensors="pt",
-        return_length=False,
-        return_overflowing_tokens=False,
-        return_attention_mask=True,
-    )
-    input_ids = llama_tokens.input_ids.to(text_encoder.device)
-    attention_mask = llama_tokens.attention_mask.to(text_encoder.device)
-    attended_length = int(attention_mask.sum())
-    hidden_states = text_encoder(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        output_hidden_states=True,
-    ).hidden_states
-    kept_tokens = slice(crop_start, attended_length)
-    llama_vec = hidden_states[-3][:, kept_tokens]
-    # Every token that is kept must be an attended (non-padding) token.
-    assert torch.all(attention_mask[:, kept_tokens].bool())
-    # CLIP: only the pooled output is used.
-    clip_tokens = tokenizer_2(
-        prompts,
-        padding="max_length",
-        max_length=77,
-        truncation=True,
-        return_overflowing_tokens=False,
-        return_length=False,
-        return_tensors="pt",
-    )
-    clip_input_ids = clip_tokens.input_ids.to(text_encoder_2.device)
-    clip_l_pooler = text_encoder_2(clip_input_ids, output_hidden_states=False).pooler_output
-    return llama_vec, clip_l_pooler
-@torch.no_grad()
-def vae_decode_fake(latents):
-    """Produce a cheap RGB preview of 16-channel latents without running the VAE.
-
-    A fixed 16 -> 3 linear projection plus bias is applied as a 1x1x1 3D
-    convolution and the result is clamped to [0, 1].
-    """
-    tensor_kwargs = dict(device=latents.device, dtype=latents.dtype)
-    # Per-latent-channel RGB factors (from comfyui).
-    rgb_factors = torch.tensor([
-        [-0.0395, -0.0331, 0.0445],
-        [0.0696, 0.0795, 0.0518],
-        [0.0135, -0.0945, -0.0282],
-        [0.0108, -0.0250, -0.0765],
-        [-0.0209, 0.0032, 0.0224],
-        [-0.0804, -0.0254, -0.0639],
-        [-0.0991, 0.0271, -0.0669],
-        [-0.0646, -0.0422, -0.0400],
-        [-0.0696, -0.0595, -0.0894],
-        [-0.0799, -0.0208, -0.0375],
-        [0.1166, 0.1627, 0.0962],
-        [0.1165, 0.0432, 0.0407],
-        [-0.2315, -0.1920, -0.1355],
-        [-0.0270, 0.0401, -0.0821],
-        [-0.0616, -0.0997, -0.0727],
-        [0.0249, -0.0469, -0.1703],
-    ], **tensor_kwargs)
-    rgb_bias = torch.tensor([0.0259, -0.0192, -0.0761], **tensor_kwargs)
-    # (16, 3) -> (3, 16, 1, 1, 1): out_channels x in_channels x 1x1x1 kernel.
-    kernel = rgb_factors.transpose(0, 1)[:, :, None, None, None]
-    preview = torch.nn.functional.conv3d(latents, kernel, bias=rgb_bias, stride=1, padding=0, dilation=1, groups=1)
-    return preview.clamp(0.0, 1.0)
-@torch.no_grad()
-def vae_decode(latents, vae, image_mode=False):
-    """Decode latents with ``vae`` after dividing by its scaling factor.
-
-    With ``image_mode=True`` each slice along dim 2 is decoded separately and
-    the results are concatenated along dim 2; otherwise the whole tensor is
-    decoded in a single call.
-    """
-    unscaled = latents / vae.config.scaling_factor
-    unscaled = unscaled.to(device=vae.device, dtype=vae.dtype)
-    if image_mode:
-        decoded_slices = [vae.decode(piece.unsqueeze(2)).sample for piece in unscaled.unbind(2)]
-        return torch.cat(decoded_slices, dim=2)
-    return vae.decode(unscaled).sample
-@torch.no_grad()
-def vae_encode(image, vae):
-    """Encode ``image`` with ``vae``: sample the latent distribution and apply the scaling factor."""
-    pixels = image.to(device=vae.device, dtype=vae.dtype)
-    posterior = vae.encode(pixels).latent_dist
-    return posterior.sample() * vae.config.scaling_factor

diffusers_helper/k_diffusion/uni_pc_fm.py DELETED Viewed

@@ -1,155 +0,0 @@
-# Better Flow Matching UniPC by Lvmin Zhang
-# (c) 2025
-# CC BY-SA 4.0
-# Attribution-ShareAlike 4.0 International Licence
-import torch
-from tqdm.auto import trange
-def expand_dims(v, dims):
-    """Append ``dims - 1`` trailing singleton axes to ``v``."""
-    trailing_axes = (None,) * (dims - 1)
-    index = (Ellipsis,) + trailing_axes
-    return v[index]
-class FlowMatchUniPC:
-    """UniPC predictor-corrector sampler built around a ``model(x, t, **extra_args)`` callable."""
-    def __init__(self, model, extra_args, variant='bh1'):
-        """Store the model, sampler variant ('bh1' or 'bh2') and extra model keyword arguments.
-
-        ``extra_args`` may be ``None``; it is then treated as an empty mapping so
-        that ``model_fn`` can always unpack it.
-        """
-        self.model = model
-        self.variant = variant
-        self.extra_args = {} if extra_args is None else extra_args
-    def model_fn(self, x, t):
-        """Evaluate the wrapped model at ``(x, t)`` with the stored extra arguments."""
-        return self.model(x, t, **self.extra_args)
-    def update_fn(self, x, model_prev_list, t_prev_list, t, order):
-        """Advance ``x`` from the latest stored time to ``t`` with one predictor-corrector step.
-
-        ``model_prev_list`` / ``t_prev_list`` hold previous model outputs and their
-        times (newest last); ``order`` must not exceed their length. The model is
-        evaluated once, at the predicted point. Returns ``(x_t, model_t)``.
-        """
-        assert order <= len(model_prev_list)
-        dims = x.dim()
-        t_prev_0 = t_prev_list[-1]
-        # The step is measured in lambda = -log(t).
-        lambda_prev_0 = - torch.log(t_prev_0)
-        lambda_t = - torch.log(t)
-        model_prev_0 = model_prev_list[-1]
-        h = lambda_t - lambda_prev_0
-        rks = []
-        D1s = []
-        # Differences between older model outputs and the newest one, each scaled
-        # by its lambda offset relative to the current step size.
-        for i in range(1, order):
-            t_prev_i = t_prev_list[-(i + 1)]
-            model_prev_i = model_prev_list[-(i + 1)]
-            lambda_prev_i = - torch.log(t_prev_i)
-            rk = ((lambda_prev_i - lambda_prev_0) / h)[0]
-            rks.append(rk)
-            D1s.append((model_prev_i - model_prev_0) / rk)
-        # The final ratio (1.0) corresponds to the new point at time t.
-        rks.append(1.)
-        rks = torch.tensor(rks, device=x.device)
-        R = []
-        b = []
-        # Only the first batch element's step size is used for the coefficients.
-        hh = -h[0]
-        h_phi_1 = torch.expm1(hh)
-        h_phi_k = h_phi_1 / hh - 1
-        factorial_i = 1
-        if self.variant == 'bh1':
-            B_h = hh
-        elif self.variant == 'bh2':
-            B_h = torch.expm1(hh)
-        else:
-            raise NotImplementedError('Bad variant!')
-        # Build the linear system R @ rho = b whose solution gives the combination weights.
-        for i in range(1, order + 1):
-            R.append(torch.pow(rks, i - 1))
-            b.append(h_phi_k * factorial_i / B_h)
-            factorial_i *= (i + 1)
-            h_phi_k = h_phi_k / hh - 1 / factorial_i
-        R = torch.stack(R)
-        b = torch.tensor(b, device=x.device)
-        use_predictor = len(D1s) > 0
-        if use_predictor:
-            D1s = torch.stack(D1s, dim=1)
-            if order == 2:
-                rhos_p = torch.tensor([0.5], device=b.device)
-            else:
-                rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1])
-        else:
-            D1s = None
-            rhos_p = None
-        if order == 1:
-            rhos_c = torch.tensor([0.5], device=b.device)
-        else:
-            rhos_c = torch.linalg.solve(R, b)
-        # Base update shared by the predictor and the corrector.
-        x_t_ = expand_dims(t / t_prev_0, dims) * x - expand_dims(h_phi_1, dims) * model_prev_0
-        if use_predictor:
-            pred_res = torch.tensordot(D1s, rhos_p, dims=([1], [0]))
-        else:
-            pred_res = 0
-        # Predictor: uses only the stored history.
-        x_t = x_t_ - expand_dims(B_h, dims) * pred_res
-        model_t = self.model_fn(x_t, t)
-        if D1s is not None:
-            corr_res = torch.tensordot(D1s, rhos_c[:-1], dims=([1], [0]))
-        else:
-            corr_res = 0
-        # Corrector: additionally uses the model output at the predicted point.
-        D1_t = (model_t - model_prev_0)
-        x_t = x_t_ - expand_dims(B_h, dims) * (corr_res + rhos_c[-1] * D1_t)
-        return x_t, model_t
-    def sample(self, x, sigmas, callback=None, disable_pbar=False):
-        """Run the sampler over ``sigmas[:-1]`` and return the last model output.
-
-        The first step only evaluates the model; the following steps ramp the
-        order up to ``min(3, len(sigmas) - 2)``. After every step ``callback``
-        (if given) receives ``{'x', 'i', 'denoised'}``. A ``KeyboardInterrupt``
-        returns the most recent model output when one exists, otherwise it is
-        re-raised.
-        """
-        order = min(3, len(sigmas) - 2)
-        model_prev_list, t_prev_list = [], []
-        try:
-            for i in trange(len(sigmas) - 1, disable=disable_pbar):
-                vec_t = sigmas[i].expand(x.shape[0])
-                if i == 0:
-                    model_prev_list = [self.model_fn(x, vec_t)]
-                    t_prev_list = [vec_t]
-                elif i < order:
-                    # Warm-up: not enough history yet for the full order.
-                    init_order = i
-                    x, model_x = self.update_fn(x, model_prev_list, t_prev_list, vec_t, init_order)
-                    model_prev_list.append(model_x)
-                    t_prev_list.append(vec_t)
-                else:
-                    x, model_x = self.update_fn(x, model_prev_list, t_prev_list, vec_t, order)
-                    model_prev_list.append(model_x)
-                    t_prev_list.append(vec_t)
-                # Keep only as much history as the order requires.
-                model_prev_list = model_prev_list[-order:]
-                t_prev_list = t_prev_list[-order:]
-                if callback is not None:
-                    try:
-                        callback({'x': x, 'i': i, 'denoised': model_prev_list[-1]})
-                    except KeyboardInterrupt as e:
-                        print(f"User interruption detected: {e}")
-                        # Return the last available result
-                        return model_prev_list[-1]
-        except KeyboardInterrupt as e:
-            print(f"Process interrupted: {e}")
-            # Return the last available result if we have one
-            if model_prev_list:
-                return model_prev_list[-1]
-            else:
-                # If no results yet, re-raise the exception
-                raise
-        return model_prev_list[-1]
-def sample_unipc(model, noise, sigmas, extra_args=None, callback=None, disable=False, variant='bh1'):
-    """Convenience wrapper: build a ``FlowMatchUniPC`` sampler and run it from ``noise``.
-
-    ``variant`` must be 'bh1' or 'bh2'; ``disable`` turns the progress bar off.
-    """
-    assert variant in ['bh1', 'bh2']
-    sampler = FlowMatchUniPC(model, extra_args=extra_args, variant=variant)
-    return sampler.sample(noise, sigmas=sigmas, callback=callback, disable_pbar=disable)

diffusers_helper/k_diffusion/wrapper.py DELETED Viewed

@@ -1,51 +0,0 @@
-import torch
-def append_dims(x, target_dims):
-    """Append trailing singleton axes to ``x`` until it has ``target_dims`` dimensions."""
-    missing_axes = target_dims - x.ndim
-    index = (Ellipsis,) + (None,) * missing_axes
-    return x[index]
-def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=1.0):
-    """Blend ``noise_cfg`` with a copy rescaled to the per-sample std of ``noise_pred_text``.
-
-    ``guidance_rescale == 0`` returns ``noise_cfg`` untouched; otherwise the
-    result is ``guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg``.
-    """
-    if guidance_rescale == 0:
-        return noise_cfg
-    # Standard deviations are taken over every axis except the first.
-    text_axes = list(range(1, noise_pred_text.ndim))
-    cfg_axes = list(range(1, noise_cfg.ndim))
-    std_text = noise_pred_text.std(dim=text_axes, keepdim=True)
-    std_cfg = noise_cfg.std(dim=cfg_axes, keepdim=True)
-    rescaled = noise_cfg * (std_text / std_cfg)
-    return guidance_rescale * rescaled + (1.0 - guidance_rescale) * noise_cfg
-def fm_wrapper(transformer, t_scale=1000.0):
-    """Wrap ``transformer`` as a ``k_model(x, sigma, **extra_args)`` callable returning an x0 estimate.
-
-    ``extra_args`` must contain 'dtype', 'cfg_scale', 'cfg_rescale',
-    'concat_latent' and 'positive' (plus 'negative' unless ``cfg_scale == 1.0``).
-    """
-    def k_model(x, sigma, **extra_args):
-        dtype = extra_args['dtype']
-        cfg_scale = extra_args['cfg_scale']
-        cfg_rescale = extra_args['cfg_rescale']
-        concat_latent = extra_args['concat_latent']
-        original_dtype = x.dtype
-        sigma = sigma.float()
-        x = x.to(dtype)
-        timestep = (sigma * t_scale).to(dtype)
-        # Optional conditioning latents are concatenated along dim 1.
-        model_input = x if concat_latent is None else torch.cat([x, concat_latent.to(x)], dim=1)
-        def predict(conditioning):
-            output = transformer(hidden_states=model_input, timestep=timestep, return_dict=False, **conditioning)
-            return output[0].float()
-        pred_positive = predict(extra_args['positive'])
-        # With cfg_scale == 1.0 the negative branch is skipped entirely.
-        if cfg_scale == 1.0:
-            pred_negative = torch.zeros_like(pred_positive)
-        else:
-            pred_negative = predict(extra_args['negative'])
-        pred_cfg = pred_negative + cfg_scale * (pred_positive - pred_negative)
-        pred = rescale_noise_cfg(pred_cfg, pred_positive, guidance_rescale=cfg_rescale)
-        x0 = x.float() - pred.float() * append_dims(sigma, x.ndim)
-        return x0.to(dtype=original_dtype)
-    return k_model

diffusers_helper/memory.py DELETED Viewed

@@ -1,210 +0,0 @@
-# By lllyasviel
-import torch
-import os
-# Detect whether we are running inside a Hugging Face Space (SPACE_ID is set there).
-IN_HF_SPACE = os.environ.get('SPACE_ID') is not None
-# CPU device handle.
-cpu = torch.device('cpu')
-# In a stateless-GPU environment, CUDA must not be initialised in the main process.
-def get_gpu_device():
-    """Pick the default accelerator device.
-
-    Inside a Space the plain string 'cuda' is returned so that no CUDA call is
-    made here. Elsewhere the current CUDA device is returned when available,
-    and the CPU device when CUDA is unavailable or querying it fails.
-    """
-    if IN_HF_SPACE:
-        # Return a string rather than touching the device; initialisation is deferred.
-        return 'cuda'
-    # Outside Spaces, query CUDA directly.
-    try:
-        if not torch.cuda.is_available():
-            print("CUDA不可用，使用CPU作为默认设备")
-            return torch.device('cpu')
-        return torch.device(f'cuda:{torch.cuda.current_device()}')
-    except Exception as e:
-        print(f"初始化CUDA设备时出错: {e}")
-        print("回退到CPU设备")
-        return torch.device('cpu')
-# Default accelerator: the string 'cuda' inside a Space (not an actual device
-# object), otherwise whatever torch.device get_gpu_device() selected.
-gpu = get_gpu_device()
-# Models currently loaded in full; emptied by unload_complete_models().
-gpu_complete_modules = []
-class DynamicSwapInstaller:
-    """Patch modules so that parameter/buffer attribute reads return copies made with ``.to(**kwargs)``.
-
-    The stored tensors themselves are not modified; only the value returned by
-    attribute lookup is converted.
-    """
-    @staticmethod
-    def _install_module(module: torch.nn.Module, **kwargs):
-        """Swap ``module``'s class for a dynamic subclass with a converting ``__getattr__``."""
-        original_class = module.__class__
-        # Remember the real class so that _uninstall_module can restore it.
-        module.__dict__['forge_backup_original_class'] = original_class
-        def hacked_get_attr(self, name: str):
-            # Parameters: return a converted copy, re-wrapped as a Parameter when
-            # the stored object is exactly torch.nn.Parameter.
-            if '_parameters' in self.__dict__:
-                _parameters = self.__dict__['_parameters']
-                if name in _parameters:
-                    p = _parameters[name]
-                    if p is None:
-                        return None
-                    if p.__class__ == torch.nn.Parameter:
-                        return torch.nn.Parameter(p.to(**kwargs), requires_grad=p.requires_grad)
-                    else:
-                        return p.to(**kwargs)
-            # Buffers: return a converted copy.
-            if '_buffers' in self.__dict__:
-                _buffers = self.__dict__['_buffers']
-                if name in _buffers:
-                    return _buffers[name].to(**kwargs)
-            # Anything else goes through the next __getattr__ after the original class.
-            return super(original_class, self).__getattr__(name)
-        module.__class__ = type('DynamicSwap_' + original_class.__name__, (original_class,), {
-            '__getattr__': hacked_get_attr,
-        })
-        return
-    @staticmethod
-    def _uninstall_module(module: torch.nn.Module):
-        """Restore the class saved by ``_install_module``; a no-op for unpatched modules."""
-        if 'forge_backup_original_class' in module.__dict__:
-            module.__class__ = module.__dict__.pop('forge_backup_original_class')
-        return
-    @staticmethod
-    def install_model(model: torch.nn.Module, **kwargs):
-        """Patch every module yielded by ``model.modules()``."""
-        for m in model.modules():
-            DynamicSwapInstaller._install_module(m, **kwargs)
-        return
-    @staticmethod
-    def uninstall_model(model: torch.nn.Module):
-        """Undo ``install_model`` for every module yielded by ``model.modules()``."""
-        for m in model.modules():
-            DynamicSwapInstaller._uninstall_module(m)
-        return
-def fake_diffusers_current_device(model: torch.nn.Module, target_device):
-    """Move a single piece of ``model`` to ``target_device``.
-
-    If the model has a ``scale_shift_table`` attribute, only that tensor's data
-    is moved. Otherwise the first module in ``named_modules()`` that has a
-    ``weight`` attribute is moved. ``target_device`` may be a string.
-    """
-    # Accept a device string as well as a torch.device.
-    if isinstance(target_device, str):
-        target_device = torch.device(target_device)
-    if hasattr(model, 'scale_shift_table'):
-        table = model.scale_shift_table
-        table.data = table.data.to(target_device)
-        return
-    first_weighted = next((m for _, m in model.named_modules() if hasattr(m, 'weight')), None)
-    if first_weighted is not None:
-        first_weighted.to(target_device)
-def get_cuda_free_memory_gb(device=None):
-    """Estimate the memory available on ``device`` in GiB.
-
-    The estimate is the free CUDA memory plus the memory reserved by the
-    allocator but not currently active. ``device`` defaults to the module-level
-    ``gpu`` and may be a string. For a non-CUDA device, or when the query
-    fails, the fixed default 6.0 is returned.
-    """
-    device = gpu if device is None else device
-    # Accept a device string as well as a torch.device.
-    if isinstance(device, str):
-        device = torch.device(device)
-    # Memory statistics only exist for CUDA devices.
-    if device.type != 'cuda':
-        print("无法获取非CUDA设备的内存信息，返回默认值")
-        return 6.0
-    try:
-        stats = torch.cuda.memory_stats(device)
-        active = stats['active_bytes.all.current']
-        reserved = stats['reserved_bytes.all.current']
-        free_on_device, _ = torch.cuda.mem_get_info(device)
-        reusable = reserved - active
-        return (free_on_device + reusable) / (1024 ** 3)
-    except Exception as e:
-        print(f"获取CUDA内存信息时出错: {e}")
-        return 6.0
-def move_model_to_device_with_memory_preservation(model, target_device, preserved_memory_gb=0):
-    """Move ``model`` to ``target_device`` module by module while memory allows.
-
-    Before each module the free-memory estimate is checked; once it is at or
-    below ``preserved_memory_gb`` the move stops and the remaining modules stay
-    where they are. When the target, or the module-level ``gpu``, is the CPU
-    the whole model is moved in a single call.
-    """
-    print(f'Moving {model.__class__.__name__} to {target_device} with preserved memory: {preserved_memory_gb} GB')
-    # Accept a device string as well as a torch.device.
-    if isinstance(target_device, str):
-        target_device = torch.device(target_device)
-    # The module-level gpu may itself be a string.
-    gpu_device = torch.device(gpu) if isinstance(gpu, str) else gpu
-    # CPU target, or no accelerator at all: move everything directly.
-    if target_device.type == 'cpu' or gpu_device.type == 'cpu':
-        model.to(device=target_device)
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        return
-    for m in model.modules():
-        if get_cuda_free_memory_gb(target_device) <= preserved_memory_gb:
-            break
-        if hasattr(m, 'weight'):
-            m.to(device=target_device)
-    else:
-        # The memory budget was never exhausted: finish with a whole-model move.
-        model.to(device=target_device)
-    torch.cuda.empty_cache()
-    return
-def offload_model_from_device_for_memory_preservation(model, target_device, preserved_memory_gb=0):
-    """Move ``model`` to the CPU module by module until enough memory is free.
-
-    Before each module the free-memory estimate for ``target_device`` is
-    checked; once it reaches ``preserved_memory_gb`` the offload stops. When
-    the target, or the module-level ``gpu``, is the CPU the whole model is
-    moved to the CPU in a single call.
-    """
-    print(f'Offloading {model.__class__.__name__} from {target_device} to preserve memory: {preserved_memory_gb} GB')
-    # Accept a device string as well as a torch.device.
-    if isinstance(target_device, str):
-        target_device = torch.device(target_device)
-    # The module-level gpu may itself be a string.
-    gpu_device = torch.device(gpu) if isinstance(gpu, str) else gpu
-    # CPU target, or no accelerator at all: move everything directly.
-    if target_device.type == 'cpu' or gpu_device.type == 'cpu':
-        model.to(device=cpu)
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        return
-    for m in model.modules():
-        if get_cuda_free_memory_gb(target_device) >= preserved_memory_gb:
-            break
-        if hasattr(m, 'weight'):
-            m.to(device=cpu)
-    else:
-        # The target was never reached: finish with a whole-model move.
-        model.to(device=cpu)
-    torch.cuda.empty_cache()
-    return
-def unload_complete_models(*args):
-    """Move every tracked complete model, plus any models passed in, to the CPU.
-
-    The tracking list is cleared afterwards and the CUDA cache is emptied when
-    CUDA is available.
-    """
-    for m in [*gpu_complete_modules, *args]:
-        m.to(device=cpu)
-        print(f'Unloaded {m.__class__.__name__} as complete.')
-    gpu_complete_modules.clear()
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-    return
-def load_model_as_complete(model, target_device, unload=True):
-    """Move the whole ``model`` to ``target_device`` and track it as a complete model.
-
-    With ``unload=True`` every previously tracked model is moved to the CPU
-    first. ``target_device`` may be a string.
-    """
-    # Accept a device string as well as a torch.device.
-    device = torch.device(target_device) if isinstance(target_device, str) else target_device
-    if unload:
-        unload_complete_models()
-    model.to(device=device)
-    print(f'Loaded {model.__class__.__name__} to {device} as complete.')
-    gpu_complete_modules.append(model)
-    return

diffusers_helper/models/hunyuan_video_packed.py DELETED Viewed

@@ -1,1032 +0,0 @@
-from typing import Any, Dict, List, Optional, Tuple, Union
-import torch
-import einops
-import torch.nn as nn
-import numpy as np
-from diffusers.loaders import FromOriginalModelMixin
-from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.loaders import PeftAdapterMixin
-from diffusers.utils import logging
-from diffusers.models.attention import FeedForward
-from diffusers.models.attention_processor import Attention
-from diffusers.models.embeddings import TimestepEmbedding, Timesteps, PixArtAlphaTextProjection
-from diffusers.models.modeling_outputs import Transformer2DModelOutput
-from diffusers.models.modeling_utils import ModelMixin
-from diffusers_helper.dit_common import LayerNorm
-from diffusers_helper.utils import zero_module
-# Report which native scaled-dot-product-attention backends torch has enabled.
-enabled_backends = []
-if torch.backends.cuda.flash_sdp_enabled():
-    enabled_backends.append("flash")
-if torch.backends.cuda.math_sdp_enabled():
-    enabled_backends.append("math")
-if torch.backends.cuda.mem_efficient_sdp_enabled():
-    enabled_backends.append("mem_efficient")
-if torch.backends.cuda.cudnn_sdp_enabled():
-    enabled_backends.append("cudnn")
-print("Currently enabled native sdp backends:", enabled_backends)
-# Optional attention kernels. Each import is best-effort: on failure the
-# corresponding names are set to None and callers fall back to another
-# implementation. ``except Exception`` is used instead of a bare ``except`` so
-# that KeyboardInterrupt and SystemExit are not swallowed, while a package that
-# is installed but broken is still tolerated.
-try:
-    from xformers.ops import memory_efficient_attention as xformers_attn_func
-    print('Xformers is installed!')
-except Exception:
-    print('Xformers is not installed!')
-    xformers_attn_func = None
-try:
-    from flash_attn import flash_attn_varlen_func, flash_attn_func
-    print('Flash Attn is installed!')
-except Exception:
-    print('Flash Attn is not installed!')
-    flash_attn_varlen_func = None
-    flash_attn_func = None
-try:
-    from sageattention import sageattn_varlen, sageattn
-    print('Sage Attn is installed!')
-except Exception:
-    print('Sage Attn is not installed!')
-    sageattn_varlen = None
-    sageattn = None
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-def pad_for_3d_conv(x, kernel_size):
-    """Replicate-pad the last three axes of a 5-D tensor up to multiples of ``kernel_size``.
-
-    Padding is added only at the end of each axis; an axis that is already a
-    multiple of its kernel size is left unchanged.
-    """
-    _, _, t, h, w = x.shape
-    pt, ph, pw = kernel_size
-    # (-n) % k is the amount needed to reach the next multiple of k (0 if already aligned).
-    pad_t, pad_h, pad_w = (-t) % pt, (-h) % ph, (-w) % pw
-    # F.pad lists the padding starting from the last axis: (w, h, t).
-    padding = (0, pad_w, 0, pad_h, 0, pad_t)
-    return torch.nn.functional.pad(x, padding, mode='replicate')
-def center_down_sample_3d(x, kernel_size):
-    """Downsample ``x`` by average pooling with window and stride both equal to ``kernel_size``.
-
-    Despite the name, the current implementation averages each window rather
-    than picking its centre element.
-    """
-    pool = torch.nn.functional.avg_pool3d
-    return pool(x, kernel_size, stride=kernel_size)
-def get_cu_seqlens(text_mask, img_len):
-    """Build the cumulative sequence-length tensor for variable-length attention.
-
-    Each sample occupies a fixed slot of ``text_mask.shape[1] + img_len``
-    positions. For sample ``i`` two boundaries are written: the end of its
-    valid tokens (``img_len`` plus the sum of its text mask) and the end of its
-    slot. The result is an int32 tensor of length ``2 * batch_size + 1`` that
-    starts at 0.
-
-    The tensor is created on ``text_mask``'s device rather than on a hard-coded
-    "cuda" device, so the function also works on CPU and on non-default GPUs.
-    """
-    batch_size = text_mask.shape[0]
-    device = text_mask.device
-    max_len = text_mask.shape[1] + img_len
-    valid_len = text_mask.sum(dim=1) + img_len
-    # Start offset of every sample's slot in the flattened batch.
-    slot_start = torch.arange(batch_size, device=device) * max_len
-    cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device=device)
-    # Odd entries: end of valid tokens; even entries (from index 2): end of slot.
-    cu_seqlens[1::2] = slot_start + valid_len
-    cu_seqlens[2::2] = slot_start + max_len
-    return cu_seqlens
-def apply_rotary_emb_transposed(x, freqs_cis):
-    """Apply rotary position embedding to ``x``.
-
-    ``freqs_cis`` holds the cosine half followed by the sine half on its last
-    axis. Adjacent element pairs ``(a, b)`` of ``x`` are rotated to ``(-b, a)``
-    for the sine term. The arithmetic runs in float32 and the result is cast
-    back to the dtype/device of ``x``.
-    """
-    cos, sin = freqs_cis.unsqueeze(-2).chunk(2, dim=-1)
-    first, second = x.unflatten(-1, (-1, 2)).unbind(-1)
-    rotated = torch.stack([-second, first], dim=-1).flatten(3)
-    return (x.float() * cos + rotated.float() * sin).to(x)
-def attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv):
-    """Run attention with the best backend that was importable.
-
-    When all four sequence-length arguments are ``None`` the dense path is
-    used, trying SageAttention, FlashAttention, xformers and finally torch's
-    ``scaled_dot_product_attention``. Otherwise q/k/v are flattened over their
-    first two axes and a variable-length kernel (SageAttention or
-    FlashAttention) is required.
-
-    Raises:
-        NotImplementedError: variable-length attention was requested but no
-            variable-length kernel is installed.
-    """
-    if cu_seqlens_q is None and cu_seqlens_kv is None and max_seqlen_q is None and max_seqlen_kv is None:
-        if sageattn is not None:
-            return sageattn(q, k, v, tensor_layout='NHD')
-        if flash_attn_func is not None:
-            return flash_attn_func(q, k, v)
-        if xformers_attn_func is not None:
-            return xformers_attn_func(q, k, v)
-        # torch's SDPA takes heads before sequence, so swap axes 1 and 2 around the call.
-        x = torch.nn.functional.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2))
-        return x.transpose(1, 2)
-    batch_size = q.shape[0]
-    # Merge the first two axes so each tensor becomes (tokens, ...).
-    q = q.view(q.shape[0] * q.shape[1], *q.shape[2:])
-    k = k.view(k.shape[0] * k.shape[1], *k.shape[2:])
-    v = v.view(v.shape[0] * v.shape[1], *v.shape[2:])
-    if sageattn_varlen is not None:
-        x = sageattn_varlen(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
-    elif flash_attn_varlen_func is not None:
-        x = flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
-    else:
-        raise NotImplementedError('No Attn Installed!')
-    # Undo the merge. Every axis after the merged token axis must be kept
-    # (x.shape[1:]); the previous x.shape[2:] dropped one axis, so the view
-    # failed whenever that axis was larger than 1.
-    # NOTE(review): assumes the kernel output keeps q's merged layout — confirm.
-    x = x.view(batch_size, max_seqlen_q, *x.shape[1:])
-    return x
-class HunyuanAttnProcessorFlashAttnDouble:
-    """Attention processor with separate projections for the two input streams.
-
-    ``hidden_states`` and ``encoder_hidden_states`` are projected separately,
-    attend jointly, and are split apart again before their output projections.
-    """
-    def __call__(self, attn, hidden_states, encoder_hidden_states, attention_mask, image_rotary_emb):
-        cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv = attention_mask
-        head_shape = (attn.heads, -1)
-        # Main stream: project, split into heads, normalise q/k, apply rotary embedding.
-        query = attn.norm_q(attn.to_q(hidden_states).unflatten(2, head_shape))
-        key = attn.norm_k(attn.to_k(hidden_states).unflatten(2, head_shape))
-        value = attn.to_v(hidden_states).unflatten(2, head_shape)
-        query = apply_rotary_emb_transposed(query, image_rotary_emb)
-        key = apply_rotary_emb_transposed(key, image_rotary_emb)
-        # Encoder stream: its own projections and norms, no rotary embedding.
-        encoder_query = attn.norm_added_q(attn.add_q_proj(encoder_hidden_states).unflatten(2, head_shape))
-        encoder_key = attn.norm_added_k(attn.add_k_proj(encoder_hidden_states).unflatten(2, head_shape))
-        encoder_value = attn.add_v_proj(encoder_hidden_states).unflatten(2, head_shape)
-        # Joint attention over the concatenated sequence (main tokens first).
-        joint = attn_varlen_func(
-            torch.cat([query, encoder_query], dim=1),
-            torch.cat([key, encoder_key], dim=1),
-            torch.cat([value, encoder_value], dim=1),
-            cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv,
-        )
-        joint = joint.flatten(-2)
-        # Split the streams apart again; the encoder tokens are the trailing ones.
-        txt_length = encoder_hidden_states.shape[1]
-        main_out = joint[:, :-txt_length]
-        encoder_out = joint[:, -txt_length:]
-        main_out = attn.to_out[1](attn.to_out[0](main_out))
-        encoder_out = attn.to_add_out(encoder_out)
-        return main_out, encoder_out
-class HunyuanAttnProcessorFlashAttnSingle:
-    """Attention processor with one shared set of projections for both input streams.
-
-    The two streams are concatenated before projection; rotary embedding is
-    applied only to the leading (``hidden_states``) tokens.
-    """
-    def __call__(self, attn, hidden_states, encoder_hidden_states, attention_mask, image_rotary_emb):
-        cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv = attention_mask
-        txt_length = encoder_hidden_states.shape[1]
-        head_shape = (attn.heads, -1)
-        joint = torch.cat([hidden_states, encoder_hidden_states], dim=1)
-        query = attn.norm_q(attn.to_q(joint).unflatten(2, head_shape))
-        key = attn.norm_k(attn.to_k(joint).unflatten(2, head_shape))
-        value = attn.to_v(joint).unflatten(2, head_shape)
-        def rotate_leading_tokens(tokens):
-            # Rotary embedding for everything except the trailing encoder tokens.
-            rotated = apply_rotary_emb_transposed(tokens[:, :-txt_length], image_rotary_emb)
-            return torch.cat([rotated, tokens[:, -txt_length:]], dim=1)
-        query = rotate_leading_tokens(query)
-        key = rotate_leading_tokens(key)
-        attended = attn_varlen_func(query, key, value, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
-        attended = attended.flatten(-2)
-        return attended[:, :-txt_length], attended[:, -txt_length:]
-class CombinedTimestepGuidanceTextProjEmbeddings(nn.Module):
-    def __init__(self, embedding_dim, pooled_projection_dim):
-        super().__init__()
-        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
-        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
-        self.guidance_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
-        self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
-    def forward(self, timestep, guidance, pooled_projection):
-        timesteps_proj = self.time_proj(timestep)
-        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype))
-        guidance_proj = self.time_proj(guidance)
-        guidance_emb = self.guidance_embedder(guidance_proj.to(dtype=pooled_projection.dtype))
-        time_guidance_emb = timesteps_emb + guidance_emb
-        pooled_projections = self.text_embedder(pooled_projection)
-        conditioning = time_guidance_emb + pooled_projections
-        return conditioning
-class CombinedTimestepTextProjEmbeddings(nn.Module):
-    def __init__(self, embedding_dim, pooled_projection_dim):
-        super().__init__()
-        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
-        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
-        self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
-    def forward(self, timestep, pooled_projection):
-        timesteps_proj = self.time_proj(timestep)
-        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype))
-        pooled_projections = self.text_embedder(pooled_projection)
-        conditioning = timesteps_emb + pooled_projections
-        return conditioning
-class HunyuanVideoAdaNorm(nn.Module):
-    def __init__(self, in_features: int, out_features: Optional[int] = None) -> None:
-        super().__init__()
-        out_features = out_features or 2 * in_features
-        self.linear = nn.Linear(in_features, out_features)
-        self.nonlinearity = nn.SiLU()
-    def forward(
-        self, temb: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        temb = self.linear(self.nonlinearity(temb))
-        gate_msa, gate_mlp = temb.chunk(2, dim=-1)
-        gate_msa, gate_mlp = gate_msa.unsqueeze(1), gate_mlp.unsqueeze(1)
-        return gate_msa, gate_mlp
-class HunyuanVideoIndividualTokenRefinerBlock(nn.Module):
-    def __init__(
-        self,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        mlp_width_ratio: str = 4.0,
-        mlp_drop_rate: float = 0.0,
-        attention_bias: bool = True,
-    ) -> None:
-        super().__init__()
-        hidden_size = num_attention_heads * attention_head_dim
-        self.norm1 = LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6)
-        self.attn = Attention(
-            query_dim=hidden_size,
-            cross_attention_dim=None,
-            heads=num_attention_heads,
-            dim_head=attention_head_dim,
-            bias=attention_bias,
-        )
-        self.norm2 = LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6)
-        self.ff = FeedForward(hidden_size, mult=mlp_width_ratio, activation_fn="linear-silu", dropout=mlp_drop_rate)
-        self.norm_out = HunyuanVideoAdaNorm(hidden_size, 2 * hidden_size)
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        temb: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        norm_hidden_states = self.norm1(hidden_states)
-        attn_output = self.attn(
-            hidden_states=norm_hidden_states,
-            encoder_hidden_states=None,
-            attention_mask=attention_mask,
-        )
-        gate_msa, gate_mlp = self.norm_out(temb)
-        hidden_states = hidden_states + attn_output * gate_msa
-        ff_output = self.ff(self.norm2(hidden_states))
-        hidden_states = hidden_states + ff_output * gate_mlp
-        return hidden_states
-class HunyuanVideoIndividualTokenRefiner(nn.Module):
-    def __init__(
-        self,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        num_layers: int,
-        mlp_width_ratio: float = 4.0,
-        mlp_drop_rate: float = 0.0,
-        attention_bias: bool = True,
-    ) -> None:
-        super().__init__()
-        self.refiner_blocks = nn.ModuleList(
-            [
-                HunyuanVideoIndividualTokenRefinerBlock(
-                    num_attention_heads=num_attention_heads,
-                    attention_head_dim=attention_head_dim,
-                    mlp_width_ratio=mlp_width_ratio,
-                    mlp_drop_rate=mlp_drop_rate,
-                    attention_bias=attention_bias,
-                )
-                for _ in range(num_layers)
-            ]
-        )
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        temb: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-    ) -> None:
-        self_attn_mask = None
-        if attention_mask is not None:
-            batch_size = attention_mask.shape[0]
-            seq_len = attention_mask.shape[1]
-            attention_mask = attention_mask.to(hidden_states.device).bool()
-            self_attn_mask_1 = attention_mask.view(batch_size, 1, 1, seq_len).repeat(1, 1, seq_len, 1)
-            self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
-            self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
-            self_attn_mask[:, :, :, 0] = True
-        for block in self.refiner_blocks:
-            hidden_states = block(hidden_states, temb, self_attn_mask)
-        return hidden_states
-class HunyuanVideoTokenRefiner(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        num_layers: int,
-        mlp_ratio: float = 4.0,
-        mlp_drop_rate: float = 0.0,
-        attention_bias: bool = True,
-    ) -> None:
-        super().__init__()
-        hidden_size = num_attention_heads * attention_head_dim
-        self.time_text_embed = CombinedTimestepTextProjEmbeddings(
-            embedding_dim=hidden_size, pooled_projection_dim=in_channels
-        )
-        self.proj_in = nn.Linear(in_channels, hidden_size, bias=True)
-        self.token_refiner = HunyuanVideoIndividualTokenRefiner(
-            num_attention_heads=num_attention_heads,
-            attention_head_dim=attention_head_dim,
-            num_layers=num_layers,
-            mlp_width_ratio=mlp_ratio,
-            mlp_drop_rate=mlp_drop_rate,
-            attention_bias=attention_bias,
-        )
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        timestep: torch.LongTensor,
-        attention_mask: Optional[torch.LongTensor] = None,
-    ) -> torch.Tensor:
-        if attention_mask is None:
-            pooled_projections = hidden_states.mean(dim=1)
-        else:
-            original_dtype = hidden_states.dtype
-            mask_float = attention_mask.float().unsqueeze(-1)
-            pooled_projections = (hidden_states * mask_float).sum(dim=1) / mask_float.sum(dim=1)
-            pooled_projections = pooled_projections.to(original_dtype)
-        temb = self.time_text_embed(timestep, pooled_projections)
-        hidden_states = self.proj_in(hidden_states)
-        hidden_states = self.token_refiner(hidden_states, temb, attention_mask)
-        return hidden_states
-class HunyuanVideoRotaryPosEmbed(nn.Module):
-    def __init__(self, rope_dim, theta):
-        super().__init__()
-        self.DT, self.DY, self.DX = rope_dim
-        self.theta = theta
-    @torch.no_grad()
-    def get_frequency(self, dim, pos):
-        T, H, W = pos.shape
-        freqs = 1.0 / (self.theta ** (torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device)[: (dim // 2)] / dim))
-        freqs = torch.outer(freqs, pos.reshape(-1)).unflatten(-1, (T, H, W)).repeat_interleave(2, dim=0)
-        return freqs.cos(), freqs.sin()
-    @torch.no_grad()
-    def forward_inner(self, frame_indices, height, width, device):
-        GT, GY, GX = torch.meshgrid(
-            frame_indices.to(device=device, dtype=torch.float32),
-            torch.arange(0, height, device=device, dtype=torch.float32),
-            torch.arange(0, width, device=device, dtype=torch.float32),
-            indexing="ij"
-        )
-        FCT, FST = self.get_frequency(self.DT, GT)
-        FCY, FSY = self.get_frequency(self.DY, GY)
-        FCX, FSX = self.get_frequency(self.DX, GX)
-        result = torch.cat([FCT, FCY, FCX, FST, FSY, FSX], dim=0)
-        return result.to(device)
-    @torch.no_grad()
-    def forward(self, frame_indices, height, width, device):
-        frame_indices = frame_indices.unbind(0)
-        results = [self.forward_inner(f, height, width, device) for f in frame_indices]
-        results = torch.stack(results, dim=0)
-        return results
-class AdaLayerNormZero(nn.Module):
-    def __init__(self, embedding_dim: int, norm_type="layer_norm", bias=True):
-        super().__init__()
-        self.silu = nn.SiLU()
-        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=bias)
-        if norm_type == "layer_norm":
-            self.norm = LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)
-        else:
-            raise ValueError(f"unknown norm_type {norm_type}")
-    def forward(
-        self,
-        x: torch.Tensor,
-        emb: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        emb = emb.unsqueeze(-2)
-        emb = self.linear(self.silu(emb))
-        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=-1)
-        x = self.norm(x) * (1 + scale_msa) + shift_msa
-        return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
-class AdaLayerNormZeroSingle(nn.Module):
-    def __init__(self, embedding_dim: int, norm_type="layer_norm", bias=True):
-        super().__init__()
-        self.silu = nn.SiLU()
-        self.linear = nn.Linear(embedding_dim, 3 * embedding_dim, bias=bias)
-        if norm_type == "layer_norm":
-            self.norm = LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)
-        else:
-            raise ValueError(f"unknown norm_type {norm_type}")
-    def forward(
-        self,
-        x: torch.Tensor,
-        emb: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        emb = emb.unsqueeze(-2)
-        emb = self.linear(self.silu(emb))
-        shift_msa, scale_msa, gate_msa = emb.chunk(3, dim=-1)
-        x = self.norm(x) * (1 + scale_msa) + shift_msa
-        return x, gate_msa
-class AdaLayerNormContinuous(nn.Module):
-    def __init__(
-        self,
-        embedding_dim: int,
-        conditioning_embedding_dim: int,
-        elementwise_affine=True,
-        eps=1e-5,
-        bias=True,
-        norm_type="layer_norm",
-    ):
-        super().__init__()
-        self.silu = nn.SiLU()
-        self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
-        if norm_type == "layer_norm":
-            self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias)
-        else:
-            raise ValueError(f"unknown norm_type {norm_type}")
-    def forward(self, x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor:
-        emb = emb.unsqueeze(-2)
-        emb = self.linear(self.silu(emb))
-        scale, shift = emb.chunk(2, dim=-1)
-        x = self.norm(x) * (1 + scale) + shift
-        return x
-class HunyuanVideoSingleTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        mlp_ratio: float = 4.0,
-        qk_norm: str = "rms_norm",
-    ) -> None:
-        super().__init__()
-        hidden_size = num_attention_heads * attention_head_dim
-        mlp_dim = int(hidden_size * mlp_ratio)
-        self.attn = Attention(
-            query_dim=hidden_size,
-            cross_attention_dim=None,
-            dim_head=attention_head_dim,
-            heads=num_attention_heads,
-            out_dim=hidden_size,
-            bias=True,
-            processor=HunyuanAttnProcessorFlashAttnSingle(),
-            qk_norm=qk_norm,
-            eps=1e-6,
-            pre_only=True,
-        )
-        self.norm = AdaLayerNormZeroSingle(hidden_size, norm_type="layer_norm")
-        self.proj_mlp = nn.Linear(hidden_size, mlp_dim)
-        self.act_mlp = nn.GELU(approximate="tanh")
-        self.proj_out = nn.Linear(hidden_size + mlp_dim, hidden_size)
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: torch.Tensor,
-        temb: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-    ) -> torch.Tensor:
-        text_seq_length = encoder_hidden_states.shape[1]
-        hidden_states = torch.cat([hidden_states, encoder_hidden_states], dim=1)
-        residual = hidden_states
-        # 1. Input normalization
-        norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
-        mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
-        norm_hidden_states, norm_encoder_hidden_states = (
-            norm_hidden_states[:, :-text_seq_length, :],
-            norm_hidden_states[:, -text_seq_length:, :],
-        )
-        # 2. Attention
-        attn_output, context_attn_output = self.attn(
-            hidden_states=norm_hidden_states,
-            encoder_hidden_states=norm_encoder_hidden_states,
-            attention_mask=attention_mask,
-            image_rotary_emb=image_rotary_emb,
-        )
-        attn_output = torch.cat([attn_output, context_attn_output], dim=1)
-        # 3. Modulation and residual connection
-        hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
-        hidden_states = gate * self.proj_out(hidden_states)
-        hidden_states = hidden_states + residual
-        hidden_states, encoder_hidden_states = (
-            hidden_states[:, :-text_seq_length, :],
-            hidden_states[:, -text_seq_length:, :],
-        )
-        return hidden_states, encoder_hidden_states
-class HunyuanVideoTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        mlp_ratio: float,
-        qk_norm: str = "rms_norm",
-    ) -> None:
-        super().__init__()
-        hidden_size = num_attention_heads * attention_head_dim
-        self.norm1 = AdaLayerNormZero(hidden_size, norm_type="layer_norm")
-        self.norm1_context = AdaLayerNormZero(hidden_size, norm_type="layer_norm")
-        self.attn = Attention(
-            query_dim=hidden_size,
-            cross_attention_dim=None,
-            added_kv_proj_dim=hidden_size,
-            dim_head=attention_head_dim,
-            heads=num_attention_heads,
-            out_dim=hidden_size,
-            context_pre_only=False,
-            bias=True,
-            processor=HunyuanAttnProcessorFlashAttnDouble(),
-            qk_norm=qk_norm,
-            eps=1e-6,
-        )
-        self.norm2 = LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-        self.ff = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")
-        self.norm2_context = LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-        self.ff_context = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: torch.Tensor,
-        temb: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        freqs_cis: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        # 1. Input normalization
-        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
-        norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(encoder_hidden_states, emb=temb)
-        # 2. Joint attention
-        attn_output, context_attn_output = self.attn(
-            hidden_states=norm_hidden_states,
-            encoder_hidden_states=norm_encoder_hidden_states,
-            attention_mask=attention_mask,
-            image_rotary_emb=freqs_cis,
-        )
-        # 3. Modulation and residual connection
-        hidden_states = hidden_states + attn_output * gate_msa
-        encoder_hidden_states = encoder_hidden_states + context_attn_output * c_gate_msa
-        norm_hidden_states = self.norm2(hidden_states)
-        norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
-        norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
-        norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp) + c_shift_mlp
-        # 4. Feed-forward
-        ff_output = self.ff(norm_hidden_states)
-        context_ff_output = self.ff_context(norm_encoder_hidden_states)
-        hidden_states = hidden_states + gate_mlp * ff_output
-        encoder_hidden_states = encoder_hidden_states + c_gate_mlp * context_ff_output
-        return hidden_states, encoder_hidden_states
-class ClipVisionProjection(nn.Module):
-    def __init__(self, in_channels, out_channels):
-        super().__init__()
-        self.up = nn.Linear(in_channels, out_channels * 3)
-        self.down = nn.Linear(out_channels * 3, out_channels)
-    def forward(self, x):
-        projected_x = self.down(nn.functional.silu(self.up(x)))
-        return projected_x
-class HunyuanVideoPatchEmbed(nn.Module):
-    def __init__(self, patch_size, in_chans, embed_dim):
-        super().__init__()
-        self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
-class HunyuanVideoPatchEmbedForCleanLatents(nn.Module):
-    def __init__(self, inner_dim):
-        super().__init__()
-        self.proj = nn.Conv3d(16, inner_dim, kernel_size=(1, 2, 2), stride=(1, 2, 2))
-        self.proj_2x = nn.Conv3d(16, inner_dim, kernel_size=(2, 4, 4), stride=(2, 4, 4))
-        self.proj_4x = nn.Conv3d(16, inner_dim, kernel_size=(4, 8, 8), stride=(4, 8, 8))
-    @torch.no_grad()
-    def initialize_weight_from_another_conv3d(self, another_layer):
-        weight = another_layer.weight.detach().clone()
-        bias = another_layer.bias.detach().clone()
-        sd = {
-            'proj.weight': weight.clone(),
-            'proj.bias': bias.clone(),
-            'proj_2x.weight': einops.repeat(weight, 'b c t h w -> b c (t tk) (h hk) (w wk)', tk=2, hk=2, wk=2) / 8.0,
-            'proj_2x.bias': bias.clone(),
-            'proj_4x.weight': einops.repeat(weight, 'b c t h w -> b c (t tk) (h hk) (w wk)', tk=4, hk=4, wk=4) / 64.0,
-            'proj_4x.bias': bias.clone(),
-        }
-        sd = {k: v.clone() for k, v in sd.items()}
-        self.load_state_dict(sd)
-        return
-class HunyuanVideoTransformer3DModelPacked(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
-    # Video transformer assembled from: a patch embedder, a text token refiner, a combined
-    # timestep/guidance/pooled-text embedder, `num_layers` dual-stream blocks, `num_single_layers`
-    # single-stream blocks, and a final modulated norm plus linear projection back to patches.
-    # Optional extras (image projection, clean-latent embedders) are installed on demand.
-    @register_to_config
-    def __init__(
-        self,
-        in_channels: int = 16,
-        out_channels: int = 16,
-        num_attention_heads: int = 24,
-        attention_head_dim: int = 128,
-        num_layers: int = 20,
-        num_single_layers: int = 40,
-        num_refiner_layers: int = 2,
-        mlp_ratio: float = 4.0,
-        patch_size: int = 2,
-        patch_size_t: int = 1,
-        qk_norm: str = "rms_norm",
-        guidance_embeds: bool = True,
-        text_embed_dim: int = 4096,
-        pooled_projection_dim: int = 768,
-        rope_theta: float = 256.0,
-        rope_axes_dim: Tuple[int] = (16, 56, 56),
-        has_image_proj=False,
-        image_proj_dim=1152,
-        has_clean_x_embedder=False,
-    ) -> None:
-        super().__init__()
-        inner_dim = num_attention_heads * attention_head_dim
-        # A falsy out_channels (None or 0) falls back to in_channels.
-        out_channels = out_channels or in_channels
-        # NOTE(review): `guidance_embeds` is accepted but not read anywhere in this constructor;
-        # the guidance embedder below is always created.
-        # 1. Latent and condition embedders
-        self.x_embedder = HunyuanVideoPatchEmbed((patch_size_t, patch_size, patch_size), in_channels, inner_dim)
-        self.context_embedder = HunyuanVideoTokenRefiner(
-            text_embed_dim, num_attention_heads, attention_head_dim, num_layers=num_refiner_layers
-        )
-        self.time_text_embed = CombinedTimestepGuidanceTextProjEmbeddings(inner_dim, pooled_projection_dim)
-        # Optional modules start as None; install_image_projection / install_clean_x_embedder fill them in.
-        self.clean_x_embedder = None
-        self.image_projection = None
-        # 2. RoPE
-        self.rope = HunyuanVideoRotaryPosEmbed(rope_axes_dim, rope_theta)
-        # 3. Dual stream transformer blocks
-        self.transformer_blocks = nn.ModuleList(
-            [
-                HunyuanVideoTransformerBlock(
-                    num_attention_heads, attention_head_dim, mlp_ratio=mlp_ratio, qk_norm=qk_norm
-                )
-                for _ in range(num_layers)
-            ]
-        )
-        # 4. Single stream transformer blocks
-        self.single_transformer_blocks = nn.ModuleList(
-            [
-                HunyuanVideoSingleTransformerBlock(
-                    num_attention_heads, attention_head_dim, mlp_ratio=mlp_ratio, qk_norm=qk_norm
-                )
-                for _ in range(num_single_layers)
-            ]
-        )
-        # 5. Output projection
-        self.norm_out = AdaLayerNormContinuous(inner_dim, inner_dim, elementwise_affine=False, eps=1e-6)
-        # One output vector per token holds a full patch: patch_size_t * patch_size**2 * out_channels values.
-        self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels)
-        self.inner_dim = inner_dim
-        # Runtime switches, all off by default.
-        self.use_gradient_checkpointing = False
-        self.enable_teacache = False
-        # The install_* helpers read self.inner_dim, so they run only after it has been set above.
-        if has_image_proj:
-            self.install_image_projection(image_proj_dim)
-        if has_clean_x_embedder:
-            self.install_clean_x_embedder()
-        # When True, forward() casts the final projection (input and layer) to float32.
-        self.high_quality_fp32_output_for_inference = False
-    def install_image_projection(self, in_channels):
-        # Create the image-embedding projection (in_channels -> self.inner_dim) and record it in the
-        # config, so that the `has_image_proj` / `image_proj_dim` constructor arguments reflect it.
-        self.image_projection = ClipVisionProjection(in_channels=in_channels, out_channels=self.inner_dim)
-        self.config['has_image_proj'] = True
-        self.config['image_proj_dim'] = in_channels
-    def install_clean_x_embedder(self):
-        # Create the clean-latent patch embedders at width self.inner_dim and record that in the config.
-        self.clean_x_embedder = HunyuanVideoPatchEmbedForCleanLatents(self.inner_dim)
-        self.config['has_clean_x_embedder'] = True
-    def enable_gradient_checkpointing(self):
-        # Make gradient_checkpointing_method() route calls through torch.utils.checkpoint; prints the new state.
-        self.use_gradient_checkpointing = True
-        print('self.use_gradient_checkpointing = True')
-    def disable_gradient_checkpointing(self):
-        # Make gradient_checkpointing_method() call blocks directly again; prints the new state.
-        self.use_gradient_checkpointing = False
-        print('self.use_gradient_checkpointing = False')
-    def initialize_teacache(self, enable_teacache=True, num_steps=25, rel_l1_thresh=0.15):
-        # (Re)initialise the TeaCache state that forward() reads when `enable_teacache` is set:
-        # a step counter, the accumulated rescaled relative-L1 distance, and the cached input and residual.
-        self.enable_teacache = enable_teacache
-        # Index of the current forward call within a run of `num_steps` calls; forward() wraps it back to 0.
-        self.cnt = 0
-        self.num_steps = num_steps
-        self.rel_l1_thresh = rel_l1_thresh  # 0.1 for 1.6x speedup, 0.15 for 2.1x speedup
-        self.accumulated_rel_l1_distance = 0
-        self.previous_modulated_input = None
-        self.previous_residual = None
-        # Degree-4 polynomial applied to each step's relative-L1 change before it is accumulated.
-        self.teacache_rescale_func = np.poly1d([7.33226126e+02, -4.01131952e+02, 6.75869174e+01, -3.14987800e+00, 9.61237896e-02])
-    def gradient_checkpointing_method(self, block, *args):
-        if self.use_gradient_checkpointing:
-            result = torch.utils.checkpoint.checkpoint(block, *args, use_reentrant=False)
-        else:
-            result = block(*args)
-        return result
-    def process_input_hidden_states(
-            self,
-            latents, latent_indices=None,
-            clean_latents=None, clean_latent_indices=None,
-            clean_latents_2x=None, clean_latent_2x_indices=None,
-            clean_latents_4x=None, clean_latent_4x_indices=None
-    ):
-        # Patch-embed `latents` into a token sequence and build the matching rotary table. Each optional
-        # clean-latent group (1x, 2x, 4x) is embedded by its own projection and PREPENDED, so the final
-        # order is [4x, 2x, 1x, latents] and the `latents` tokens always sit at the end.
-        # A group is used only when both its tensor and its indices are given.
-        # Returns (hidden_states, rope_freqs), both flattened to (batch, tokens, channels).
-        hidden_states = self.gradient_checkpointing_method(self.x_embedder.proj, latents)
-        B, C, T, H, W = hidden_states.shape
-        if latent_indices is None:
-            # Default frame indices 0..T-1, shared by every batch element.
-            latent_indices = torch.arange(0, T).unsqueeze(0).expand(B, -1)
-        # (B, C, T, H, W) -> (B, T*H*W, C)
-        hidden_states = hidden_states.flatten(2).transpose(1, 2)
-        rope_freqs = self.rope(frame_indices=latent_indices, height=H, width=W, device=hidden_states.device)
-        rope_freqs = rope_freqs.flatten(2).transpose(1, 2)
-        if clean_latents is not None and clean_latent_indices is not None:
-            # .to(hidden_states) matches the dtype and device of the embedded latents.
-            clean_latents = clean_latents.to(hidden_states)
-            clean_latents = self.gradient_checkpointing_method(self.clean_x_embedder.proj, clean_latents)
-            clean_latents = clean_latents.flatten(2).transpose(1, 2)
-            clean_latent_rope_freqs = self.rope(frame_indices=clean_latent_indices, height=H, width=W, device=clean_latents.device)
-            clean_latent_rope_freqs = clean_latent_rope_freqs.flatten(2).transpose(1, 2)
-            hidden_states = torch.cat([clean_latents, hidden_states], dim=1)
-            rope_freqs = torch.cat([clean_latent_rope_freqs, rope_freqs], dim=1)
-        if clean_latents_2x is not None and clean_latent_2x_indices is not None:
-            clean_latents_2x = clean_latents_2x.to(hidden_states)
-            # pad_for_3d_conv is defined elsewhere; presumably pads each axis up to a multiple of the kernel (confirm).
-            clean_latents_2x = pad_for_3d_conv(clean_latents_2x, (2, 4, 4))
-            clean_latents_2x = self.gradient_checkpointing_method(self.clean_x_embedder.proj_2x, clean_latents_2x)
-            clean_latents_2x = clean_latents_2x.flatten(2).transpose(1, 2)
-            # The rotary table is built at full (H, W) resolution and then reduced by 2 per axis with
-            # center_down_sample_3d (defined elsewhere), presumably to match the coarser token grid (confirm).
-            clean_latent_2x_rope_freqs = self.rope(frame_indices=clean_latent_2x_indices, height=H, width=W, device=clean_latents_2x.device)
-            clean_latent_2x_rope_freqs = pad_for_3d_conv(clean_latent_2x_rope_freqs, (2, 2, 2))
-            clean_latent_2x_rope_freqs = center_down_sample_3d(clean_latent_2x_rope_freqs, (2, 2, 2))
-            clean_latent_2x_rope_freqs = clean_latent_2x_rope_freqs.flatten(2).transpose(1, 2)
-            hidden_states = torch.cat([clean_latents_2x, hidden_states], dim=1)
-            rope_freqs = torch.cat([clean_latent_2x_rope_freqs, rope_freqs], dim=1)
-        if clean_latents_4x is not None and clean_latent_4x_indices is not None:
-            # Same procedure as the 2x group, with kernel (4, 8, 8) and a factor-4 reduction of the rotary table.
-            clean_latents_4x = clean_latents_4x.to(hidden_states)
-            clean_latents_4x = pad_for_3d_conv(clean_latents_4x, (4, 8, 8))
-            clean_latents_4x = self.gradient_checkpointing_method(self.clean_x_embedder.proj_4x, clean_latents_4x)
-            clean_latents_4x = clean_latents_4x.flatten(2).transpose(1, 2)
-            clean_latent_4x_rope_freqs = self.rope(frame_indices=clean_latent_4x_indices, height=H, width=W, device=clean_latents_4x.device)
-            clean_latent_4x_rope_freqs = pad_for_3d_conv(clean_latent_4x_rope_freqs, (4, 4, 4))
-            clean_latent_4x_rope_freqs = center_down_sample_3d(clean_latent_4x_rope_freqs, (4, 4, 4))
-            clean_latent_4x_rope_freqs = clean_latent_4x_rope_freqs.flatten(2).transpose(1, 2)
-            hidden_states = torch.cat([clean_latents_4x, hidden_states], dim=1)
-            rope_freqs = torch.cat([clean_latent_4x_rope_freqs, rope_freqs], dim=1)
-        return hidden_states, rope_freqs
-    def forward(
-            self,
-            hidden_states, timestep, encoder_hidden_states, encoder_attention_mask, pooled_projections, guidance,
-            latent_indices=None,
-            clean_latents=None, clean_latent_indices=None,
-            clean_latents_2x=None, clean_latent_2x_indices=None,
-            clean_latents_4x=None, clean_latent_4x_indices=None,
-            image_embeddings=None,
-            attention_kwargs=None, return_dict=True
-    ):
-        # Run the full transformer on a 5-D latent tensor (batch, channels, frames, height, width).
-        # Returns Transformer2DModelOutput(sample=...) when return_dict is true, otherwise a 1-tuple.
-        # The output is un-patchified back to a 5-D tensor and covers only the `hidden_states` tokens;
-        # tokens contributed by the clean-latent arguments are dropped before the final projection.
-        # NOTE(review): `attention_kwargs` is normalised to {} here but not used further in this method.
-        if attention_kwargs is None:
-            attention_kwargs = {}
-        batch_size, num_channels, num_frames, height, width = hidden_states.shape
-        p, p_t = self.config['patch_size'], self.config['patch_size_t']
-        post_patch_num_frames = num_frames // p_t
-        post_patch_height = height // p
-        post_patch_width = width // p
-        # Number of tokens contributed by `hidden_states` itself; used below to slice them back out.
-        original_context_length = post_patch_num_frames * post_patch_height * post_patch_width
-        hidden_states, rope_freqs = self.process_input_hidden_states(hidden_states, latent_indices, clean_latents, clean_latent_indices, clean_latents_2x, clean_latent_2x_indices, clean_latents_4x, clean_latent_4x_indices)
-        temb = self.gradient_checkpointing_method(self.time_text_embed, timestep, guidance, pooled_projections)
-        encoder_hidden_states = self.gradient_checkpointing_method(self.context_embedder, encoder_hidden_states, timestep, encoder_attention_mask)
-        if self.image_projection is not None:
-            assert image_embeddings is not None, 'You must use image embeddings!'
-            extra_encoder_hidden_states = self.gradient_checkpointing_method(self.image_projection, image_embeddings)
-            # Image tokens are always valid, so their mask is all ones.
-            extra_attention_mask = torch.ones((batch_size, extra_encoder_hidden_states.shape[1]), dtype=encoder_attention_mask.dtype, device=encoder_attention_mask.device)
-            # must cat before (not after) encoder_hidden_states, due to attn masking
-            encoder_hidden_states = torch.cat([extra_encoder_hidden_states, encoder_hidden_states], dim=1)
-            encoder_attention_mask = torch.cat([extra_attention_mask, encoder_attention_mask], dim=1)
-        with torch.no_grad():
-            if batch_size == 1:
-                # When batch size is 1, we do not need any masks or var-len funcs since cropping is mathematically same to what we want
-                # If they are not same, then their impls are wrong. Ours are always the correct one.
-                # Cropping keeps the first `text_len` tokens, i.e. it assumes the valid tokens are the leading ones.
-                text_len = encoder_attention_mask.sum().item()
-                encoder_hidden_states = encoder_hidden_states[:, :text_len]
-                attention_mask = None, None, None, None
-            else:
-                img_seq_len = hidden_states.shape[1]
-                txt_seq_len = encoder_hidden_states.shape[1]
-                # get_cu_seqlens is defined elsewhere; its result is used for both queries and keys/values.
-                cu_seqlens_q = get_cu_seqlens(encoder_attention_mask, img_seq_len)
-                cu_seqlens_kv = cu_seqlens_q
-                max_seqlen_q = img_seq_len + txt_seq_len
-                max_seqlen_kv = max_seqlen_q
-                attention_mask = cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv
-        if self.enable_teacache:
-            # TeaCache: compare the first block's modulated input with the previous call's. While the
-            # accumulated (rescaled) relative change stays below the threshold, skip every block and
-            # re-apply the residual cached from the last full computation.
-            modulated_inp = self.transformer_blocks[0].norm1(hidden_states, emb=temb)[0]
-            # The first and last steps of a run are always computed in full.
-            if self.cnt == 0 or self.cnt == self.num_steps-1:
-                should_calc = True
-                self.accumulated_rel_l1_distance = 0
-            else:
-                curr_rel_l1 = ((modulated_inp - self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean()).cpu().item()
-                self.accumulated_rel_l1_distance += self.teacache_rescale_func(curr_rel_l1)
-                should_calc = self.accumulated_rel_l1_distance >= self.rel_l1_thresh
-                if should_calc:
-                    self.accumulated_rel_l1_distance = 0
-            self.previous_modulated_input = modulated_inp
-            self.cnt += 1
-            # Wrap the step counter so the next run starts again from step 0.
-            if self.cnt == self.num_steps:
-                self.cnt = 0
-            if not should_calc:
-                hidden_states = hidden_states + self.previous_residual
-            else:
-                ori_hidden_states = hidden_states.clone()
-                for block_id, block in enumerate(self.transformer_blocks):
-                    hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
-                        block,
-                        hidden_states,
-                        encoder_hidden_states,
-                        temb,
-                        attention_mask,
-                        rope_freqs
-                    )
-                for block_id, block in enumerate(self.single_transformer_blocks):
-                    hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
-                        block,
-                        hidden_states,
-                        encoder_hidden_states,
-                        temb,
-                        attention_mask,
-                        rope_freqs
-                    )
-                # Cache the total change made by all blocks so that skipped steps can reuse it.
-                self.previous_residual = hidden_states - ori_hidden_states
-        else:
-            # No caching: run the dual-stream blocks, then the single-stream blocks.
-            for block_id, block in enumerate(self.transformer_blocks):
-                hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
-                    block,
-                    hidden_states,
-                    encoder_hidden_states,
-                    temb,
-                    attention_mask,
-                    rope_freqs
-                )
-            for block_id, block in enumerate(self.single_transformer_blocks):
-                hidden_states, encoder_hidden_states = self.gradient_checkpointing_method(
-                    block,
-                    hidden_states,
-                    encoder_hidden_states,
-                    temb,
-                    attention_mask,
-                    rope_freqs
-                )
-        hidden_states = self.gradient_checkpointing_method(self.norm_out, hidden_states, temb)
-        # Keep only the trailing tokens that came from `hidden_states`; clean-latent tokens were prepended.
-        hidden_states = hidden_states[:, -original_context_length:, :]
-        if self.high_quality_fp32_output_for_inference:
-            hidden_states = hidden_states.to(dtype=torch.float32)
-            # This converts proj_out in place, so the layer stays float32 for all later calls.
-            if self.proj_out.weight.dtype != torch.float32:
-                self.proj_out.to(dtype=torch.float32)
-        hidden_states = self.gradient_checkpointing_method(self.proj_out, hidden_states)
-        # Un-patchify: (b, t*h*w, c*pt*ph*pw) -> (b, c, t*pt, h*ph, w*pw)
-        hidden_states = einops.rearrange(hidden_states, 'b (t h w) (c pt ph pw) -> b c (t pt) (h ph) (w pw)',
-                                         t=post_patch_num_frames, h=post_patch_height, w=post_patch_width,
-                                         pt=p_t, ph=p, pw=p)
-        if return_dict:
-            return Transformer2DModelOutput(sample=hidden_states)
-        # The trailing comma makes this a 1-tuple.
-        return hidden_states,

diffusers_helper/pipelines/k_diffusion_hunyuan.py DELETED Viewed

@@ -1,120 +0,0 @@
-import torch
-import math
-from diffusers_helper.k_diffusion.uni_pc_fm import sample_unipc
-from diffusers_helper.k_diffusion.wrapper import fm_wrapper
-from diffusers_helper.utils import repeat_to_batch_size
-def flux_time_shift(t, mu=1.15, sigma=1.0):
-    return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
-def calculate_flux_mu(context_length, x1=256, y1=0.5, x2=4096, y2=1.15, exp_max=7.0):
-    k = (y2 - y1) / (x2 - x1)
-    b = y1 - k * x1
-    mu = k * context_length + b
-    mu = min(mu, math.log(exp_max))
-    return mu
-def get_flux_sigmas_from_mu(n, mu):
-    sigmas = torch.linspace(1, 0, steps=n + 1)
-    sigmas = flux_time_shift(sigmas, mu=mu)
-    return sigmas
-@torch.inference_mode()
-def sample_hunyuan(
-        transformer,
-        sampler='unipc',
-        initial_latent=None,
-        concat_latent=None,
-        strength=1.0,
-        width=512,
-        height=512,
-        frames=16,
-        real_guidance_scale=1.0,
-        distilled_guidance_scale=6.0,
-        guidance_rescale=0.0,
-        shift=None,
-        num_inference_steps=25,
-        batch_size=None,
-        generator=None,
-        prompt_embeds=None,
-        prompt_embeds_mask=None,
-        prompt_poolers=None,
-        negative_prompt_embeds=None,
-        negative_prompt_embeds_mask=None,
-        negative_prompt_poolers=None,
-        dtype=torch.bfloat16,
-        device=None,
-        negative_kwargs=None,
-        callback=None,
-        **kwargs,
-):
-    device = device or transformer.device
-    if batch_size is None:
-        batch_size = int(prompt_embeds.shape[0])
-    latents = torch.randn((batch_size, 16, (frames + 3) // 4, height // 8, width // 8), generator=generator, device=generator.device).to(device=device, dtype=torch.float32)
-    B, C, T, H, W = latents.shape
-    seq_length = T * H * W // 4
-    if shift is None:
-        mu = calculate_flux_mu(seq_length, exp_max=7.0)
-    else:
-        mu = math.log(shift)
-    sigmas = get_flux_sigmas_from_mu(num_inference_steps, mu).to(device)
-    k_model = fm_wrapper(transformer)
-    if initial_latent is not None:
-        sigmas = sigmas * strength
-        first_sigma = sigmas[0].to(device=device, dtype=torch.float32)
-        initial_latent = initial_latent.to(device=device, dtype=torch.float32)
-        latents = initial_latent.float() * (1.0 - first_sigma) + latents.float() * first_sigma
-    if concat_latent is not None:
-        concat_latent = concat_latent.to(latents)
-    distilled_guidance = torch.tensor([distilled_guidance_scale * 1000.0] * batch_size).to(device=device, dtype=dtype)
-    prompt_embeds = repeat_to_batch_size(prompt_embeds, batch_size)
-    prompt_embeds_mask = repeat_to_batch_size(prompt_embeds_mask, batch_size)
-    prompt_poolers = repeat_to_batch_size(prompt_poolers, batch_size)
-    negative_prompt_embeds = repeat_to_batch_size(negative_prompt_embeds, batch_size)
-    negative_prompt_embeds_mask = repeat_to_batch_size(negative_prompt_embeds_mask, batch_size)
-    negative_prompt_poolers = repeat_to_batch_size(negative_prompt_poolers, batch_size)
-    concat_latent = repeat_to_batch_size(concat_latent, batch_size)
-    sampler_kwargs = dict(
-        dtype=dtype,
-        cfg_scale=real_guidance_scale,
-        cfg_rescale=guidance_rescale,
-        concat_latent=concat_latent,
-        positive=dict(
-            pooled_projections=prompt_poolers,
-            encoder_hidden_states=prompt_embeds,
-            encoder_attention_mask=prompt_embeds_mask,
-            guidance=distilled_guidance,
-            **kwargs,
-        ),
-        negative=dict(
-            pooled_projections=negative_prompt_poolers,
-            encoder_hidden_states=negative_prompt_embeds,
-            encoder_attention_mask=negative_prompt_embeds_mask,
-            guidance=distilled_guidance,
-            **(kwargs if negative_kwargs is None else {**kwargs, **negative_kwargs}),
-        )
-    )
-    if sampler == 'unipc':
-        results = sample_unipc(k_model, latents, sigmas, extra_args=sampler_kwargs, disable=False, callback=callback)
-    else:
-        raise NotImplementedError(f'Sampler {sampler} is not supported.')
-    return results

diffusers_helper/thread_utils.py DELETED Viewed

@@ -1,123 +0,0 @@
-import time
-from threading import Thread, Lock
-class Listener:
-    task_queue = []
-    lock = Lock()
-    thread = None
-    @classmethod
-    def _process_tasks(cls):
-        while True:
-            task = None
-            with cls.lock:
-                if cls.task_queue:
-                    task = cls.task_queue.pop(0)
-            if task is None:
-                time.sleep(0.001)
-                continue
-            func, args, kwargs = task
-            try:
-                func(*args, **kwargs)
-            except Exception as e:
-                print(f"Error in listener thread: {e}")
-    @classmethod
-    def add_task(cls, func, *args, **kwargs):
-        with cls.lock:
-            cls.task_queue.append((func, args, kwargs))
-        if cls.thread is None:
-            cls.thread = Thread(target=cls._process_tasks, daemon=True)
-            cls.thread.start()
-def async_run(func, *args, **kwargs):
-    Listener.add_task(func, *args, **kwargs)
-class FIFOQueue:
-    def __init__(self):
-        self.queue = []
-        self.lock = Lock()
-        print("【调试】创建新的FIFOQueue")
-    def push(self, item):
-        print(f"【调试】FIFOQueue.push: 准备添加项目: {item}")
-        with self.lock:
-            self.queue.append(item)
-            print(f"【调试】FIFOQueue.push: 成功添加项目: {item}, 当前队列长度: {len(self.queue)}")
-    def pop(self):
-        print("【调试】FIFOQueue.pop: 准备弹出队列首项")
-        with self.lock:
-            if self.queue:
-                item = self.queue.pop(0)
-                print(f"【调试】FIFOQueue.pop: 成功弹出项目: {item}, 剩余队列长度: {len(self.queue)}")
-                return item
-            print("【调试】FIFOQueue.pop: 队列为空，返回None")
-            return None
-    def top(self):
-        print("【调试】FIFOQueue.top: 准备查看队列首项")
-        with self.lock:
-            if self.queue:
-                item = self.queue[0]
-                print(f"【调试】FIFOQueue.top: 队列首项为: {item}, 当前队列长度: {len(self.queue)}")
-                return item
-            print("【调试】FIFOQueue.top: 队列为空，返回None")
-            return None
-    def next(self):
-        print("【调试】FIFOQueue.next: 等待弹出队列首项")
-        while True:
-            with self.lock:
-                if self.queue:
-                    item = self.queue.pop(0)
-                    print(f"【调试】FIFOQueue.next: 成功弹出项目: {item}, 剩余队列长度: {len(self.queue)}")
-                    return item
-            time.sleep(0.001)
-class AsyncStream:
-    def __init__(self):
-        self.input_queue = FIFOQueue()
-        self.output_queue = FIFOQueue()
-class InterruptibleStreamData:
-    def __init__(self):
-        self.input_queue = FIFOQueue()
-        self.output_queue = FIFOQueue()
-        print("【调试】创建新的InterruptibleStreamData，初始化输入输出队列")
-    # 推送数据至输出队列
-    def push_output(self, item):
-        print(f"【调试】InterruptibleStreamData.push_output: 准备推送输出: {type(item)}")
-        self.output_queue.push(item)
-        print(f"【调试】InterruptibleStreamData.push_output: 成功推送输出")
-    # 获取下一个输出数据
-    def get_output(self):
-        print("【调试】InterruptibleStreamData.get_output: 准备获取下一个输出数据")
-        item = self.output_queue.next()
-        print(f"【调试】InterruptibleStreamData.get_output: 获取到输出数据: {type(item)}")
-        return item
-    # 推送数据至输入队列
-    def push_input(self, item):
-        print(f"【调试】InterruptibleStreamData.push_input: 准备推送输入: {type(item)}")
-        self.input_queue.push(item)
-        print(f"【调试】InterruptibleStreamData.push_input: 成功推送输入")
-    # 获取下一个输入数据
-    def get_input(self):
-        print("【调试】InterruptibleStreamData.get_input: 准备获取下一个输入数据")
-        item = self.input_queue.next()
-        print(f"【调试】InterruptibleStreamData.get_input: 获取到输入数据: {type(item)}")
-        return item

diffusers_helper/utils.py DELETED Viewed

@@ -1,613 +0,0 @@
-import os
-import cv2
-import json
-import random
-import glob
-import torch
-import einops
-import numpy as np
-import datetime
-import torchvision
-import safetensors.torch as sf
-from PIL import Image
-def min_resize(x, m):
-    if x.shape[0] < x.shape[1]:
-        s0 = m
-        s1 = int(float(m) / float(x.shape[0]) * float(x.shape[1]))
-    else:
-        s0 = int(float(m) / float(x.shape[1]) * float(x.shape[0]))
-        s1 = m
-    new_max = max(s1, s0)
-    raw_max = max(x.shape[0], x.shape[1])
-    if new_max < raw_max:
-        interpolation = cv2.INTER_AREA
-    else:
-        interpolation = cv2.INTER_LANCZOS4
-    y = cv2.resize(x, (s1, s0), interpolation=interpolation)
-    return y
-def d_resize(x, y):
-    H, W, C = y.shape
-    new_min = min(H, W)
-    raw_min = min(x.shape[0], x.shape[1])
-    if new_min < raw_min:
-        interpolation = cv2.INTER_AREA
-    else:
-        interpolation = cv2.INTER_LANCZOS4
-    y = cv2.resize(x, (W, H), interpolation=interpolation)
-    return y
-def resize_and_center_crop(image, target_width, target_height):
-    if target_height == image.shape[0] and target_width == image.shape[1]:
-        return image
-    pil_image = Image.fromarray(image)
-    original_width, original_height = pil_image.size
-    scale_factor = max(target_width / original_width, target_height / original_height)
-    resized_width = int(round(original_width * scale_factor))
-    resized_height = int(round(original_height * scale_factor))
-    resized_image = pil_image.resize((resized_width, resized_height), Image.LANCZOS)
-    left = (resized_width - target_width) / 2
-    top = (resized_height - target_height) / 2
-    right = (resized_width + target_width) / 2
-    bottom = (resized_height + target_height) / 2
-    cropped_image = resized_image.crop((left, top, right, bottom))
-    return np.array(cropped_image)
-def resize_and_center_crop_pytorch(image, target_width, target_height):
-    B, C, H, W = image.shape
-    if H == target_height and W == target_width:
-        return image
-    scale_factor = max(target_width / W, target_height / H)
-    resized_width = int(round(W * scale_factor))
-    resized_height = int(round(H * scale_factor))
-    resized = torch.nn.functional.interpolate(image, size=(resized_height, resized_width), mode='bilinear', align_corners=False)
-    top = (resized_height - target_height) // 2
-    left = (resized_width - target_width) // 2
-    cropped = resized[:, :, top:top + target_height, left:left + target_width]
-    return cropped
-def resize_without_crop(image, target_width, target_height):
-    if target_height == image.shape[0] and target_width == image.shape[1]:
-        return image
-    pil_image = Image.fromarray(image)
-    resized_image = pil_image.resize((target_width, target_height), Image.LANCZOS)
-    return np.array(resized_image)
-def just_crop(image, w, h):
-    if h == image.shape[0] and w == image.shape[1]:
-        return image
-    original_height, original_width = image.shape[:2]
-    k = min(original_height / h, original_width / w)
-    new_width = int(round(w * k))
-    new_height = int(round(h * k))
-    x_start = (original_width - new_width) // 2
-    y_start = (original_height - new_height) // 2
-    cropped_image = image[y_start:y_start + new_height, x_start:x_start + new_width]
-    return cropped_image
-def write_to_json(data, file_path):
-    temp_file_path = file_path + ".tmp"
-    with open(temp_file_path, 'wt', encoding='utf-8') as temp_file:
-        json.dump(data, temp_file, indent=4)
-    os.replace(temp_file_path, file_path)
-    return
-def read_from_json(file_path):
-    with open(file_path, 'rt', encoding='utf-8') as file:
-        data = json.load(file)
-    return data
-def get_active_parameters(m):
-    return {k: v for k, v in m.named_parameters() if v.requires_grad}
-def cast_training_params(m, dtype=torch.float32):
-    result = {}
-    for n, param in m.named_parameters():
-        if param.requires_grad:
-            param.data = param.to(dtype)
-            result[n] = param
-    return result
-def separate_lora_AB(parameters, B_patterns=None):
-    parameters_normal = {}
-    parameters_B = {}
-    if B_patterns is None:
-        B_patterns = ['.lora_B.', '__zero__']
-    for k, v in parameters.items():
-        if any(B_pattern in k for B_pattern in B_patterns):
-            parameters_B[k] = v
-        else:
-            parameters_normal[k] = v
-    return parameters_normal, parameters_B
-def set_attr_recursive(obj, attr, value):
-    attrs = attr.split(".")
-    for name in attrs[:-1]:
-        obj = getattr(obj, name)
-    setattr(obj, attrs[-1], value)
-    return
-def print_tensor_list_size(tensors):
-    total_size = 0
-    total_elements = 0
-    if isinstance(tensors, dict):
-        tensors = tensors.values()
-    for tensor in tensors:
-        total_size += tensor.nelement() * tensor.element_size()
-        total_elements += tensor.nelement()
-    total_size_MB = total_size / (1024 ** 2)
-    total_elements_B = total_elements / 1e9
-    print(f"Total number of tensors: {len(tensors)}")
-    print(f"Total size of tensors: {total_size_MB:.2f} MB")
-    print(f"Total number of parameters: {total_elements_B:.3f} billion")
-    return
-@torch.no_grad()
-def batch_mixture(a, b=None, probability_a=0.5, mask_a=None):
-    batch_size = a.size(0)
-    if b is None:
-        b = torch.zeros_like(a)
-    if mask_a is None:
-        mask_a = torch.rand(batch_size) < probability_a
-    mask_a = mask_a.to(a.device)
-    mask_a = mask_a.reshape((batch_size,) + (1,) * (a.dim() - 1))
-    result = torch.where(mask_a, a, b)
-    return result
-@torch.no_grad()
-def zero_module(module):
-    for p in module.parameters():
-        p.detach().zero_()
-    return module
-@torch.no_grad()
-def supress_lower_channels(m, k, alpha=0.01):
-    data = m.weight.data.clone()
-    assert int(data.shape[1]) >= k
-    data[:, :k] = data[:, :k] * alpha
-    m.weight.data = data.contiguous().clone()
-    return m
-def freeze_module(m):
-    if not hasattr(m, '_forward_inside_frozen_module'):
-        m._forward_inside_frozen_module = m.forward
-    m.requires_grad_(False)
-    m.forward = torch.no_grad()(m.forward)
-    return m
-def get_latest_safetensors(folder_path):
-    safetensors_files = glob.glob(os.path.join(folder_path, '*.safetensors'))
-    if not safetensors_files:
-        raise ValueError('No file to resume!')
-    latest_file = max(safetensors_files, key=os.path.getmtime)
-    latest_file = os.path.abspath(os.path.realpath(latest_file))
-    return latest_file
-def generate_random_prompt_from_tags(tags_str, min_length=3, max_length=32):
-    tags = tags_str.split(', ')
-    tags = random.sample(tags, k=min(random.randint(min_length, max_length), len(tags)))
-    prompt = ', '.join(tags)
-    return prompt
-def interpolate_numbers(a, b, n, round_to_int=False, gamma=1.0):
-    numbers = a + (b - a) * (np.linspace(0, 1, n) ** gamma)
-    if round_to_int:
-        numbers = np.round(numbers).astype(int)
-    return numbers.tolist()
-def uniform_random_by_intervals(inclusive, exclusive, n, round_to_int=False):
-    edges = np.linspace(0, 1, n + 1)
-    points = np.random.uniform(edges[:-1], edges[1:])
-    numbers = inclusive + (exclusive - inclusive) * points
-    if round_to_int:
-        numbers = np.round(numbers).astype(int)
-    return numbers.tolist()
-def soft_append_bcthw(history, current, overlap=0):
-    if overlap <= 0:
-        return torch.cat([history, current], dim=2)
-    assert history.shape[2] >= overlap, f"History length ({history.shape[2]}) must be >= overlap ({overlap})"
-    assert current.shape[2] >= overlap, f"Current length ({current.shape[2]}) must be >= overlap ({overlap})"
-    weights = torch.linspace(1, 0, overlap, dtype=history.dtype, device=history.device).view(1, 1, -1, 1, 1)
-    blended = weights * history[:, :, -overlap:] + (1 - weights) * current[:, :, :overlap]
-    output = torch.cat([history[:, :, :-overlap], blended, current[:, :, overlap:]], dim=2)
-    return output.to(history)
-def save_bcthw_as_mp4(x, output_filename, fps=10):
-    b, c, t, h, w = x.shape
-    per_row = b
-    for p in [6, 5, 4, 3, 2]:
-        if b % p == 0:
-            per_row = p
-            break
-    os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
-    x = torch.clamp(x.float(), -1., 1.) * 127.5 + 127.5
-    x = x.detach().cpu().to(torch.uint8)
-    x = einops.rearrange(x, '(m n) c t h w -> t (m h) (n w) c', n=per_row)
-    torchvision.io.write_video(output_filename, x, fps=fps, video_codec='libx264', options={'crf': '0'})
-    return x
-def save_bcthw_as_png(x, output_filename):
-    os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
-    x = torch.clamp(x.float(), -1., 1.) * 127.5 + 127.5
-    x = x.detach().cpu().to(torch.uint8)
-    x = einops.rearrange(x, 'b c t h w -> c (b h) (t w)')
-    torchvision.io.write_png(x, output_filename)
-    return output_filename
-def save_bchw_as_png(x, output_filename):
-    os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
-    x = torch.clamp(x.float(), -1., 1.) * 127.5 + 127.5
-    x = x.detach().cpu().to(torch.uint8)
-    x = einops.rearrange(x, 'b c h w -> c h (b w)')
-    torchvision.io.write_png(x, output_filename)
-    return output_filename
-def add_tensors_with_padding(tensor1, tensor2):
-    if tensor1.shape == tensor2.shape:
-        return tensor1 + tensor2
-    shape1 = tensor1.shape
-    shape2 = tensor2.shape
-    new_shape = tuple(max(s1, s2) for s1, s2 in zip(shape1, shape2))
-    padded_tensor1 = torch.zeros(new_shape)
-    padded_tensor2 = torch.zeros(new_shape)
-    padded_tensor1[tuple(slice(0, s) for s in shape1)] = tensor1
-    padded_tensor2[tuple(slice(0, s) for s in shape2)] = tensor2
-    result = padded_tensor1 + padded_tensor2
-    return result
-def print_free_mem():
-    torch.cuda.empty_cache()
-    free_mem, total_mem = torch.cuda.mem_get_info(0)
-    free_mem_mb = free_mem / (1024 ** 2)
-    total_mem_mb = total_mem / (1024 ** 2)
-    print(f"Free memory: {free_mem_mb:.2f} MB")
-    print(f"Total memory: {total_mem_mb:.2f} MB")
-    return
-def print_gpu_parameters(device, state_dict, log_count=1):
-    summary = {"device": device, "keys_count": len(state_dict)}
-    logged_params = {}
-    for i, (key, tensor) in enumerate(state_dict.items()):
-        if i >= log_count:
-            break
-        logged_params[key] = tensor.flatten()[:3].tolist()
-    summary["params"] = logged_params
-    print(str(summary))
-    return
-def visualize_txt_as_img(width, height, text, font_path='font/DejaVuSans.ttf', size=18):
-    from PIL import Image, ImageDraw, ImageFont
-    txt = Image.new("RGB", (width, height), color="white")
-    draw = ImageDraw.Draw(txt)
-    font = ImageFont.truetype(font_path, size=size)
-    if text == '':
-        return np.array(txt)
-    # Split text into lines that fit within the image width
-    lines = []
-    words = text.split()
-    current_line = words[0]
-    for word in words[1:]:
-        line_with_word = f"{current_line} {word}"
-        if draw.textbbox((0, 0), line_with_word, font=font)[2] <= width:
-            current_line = line_with_word
-        else:
-            lines.append(current_line)
-            current_line = word
-    lines.append(current_line)
-    # Draw the text line by line
-    y = 0
-    line_height = draw.textbbox((0, 0), "A", font=font)[3]
-    for line in lines:
-        if y + line_height > height:
-            break  # stop drawing if the next line will be outside the image
-        draw.text((0, y), line, fill="black", font=font)
-        y += line_height
-    return np.array(txt)
-def blue_mark(x):
-    x = x.copy()
-    c = x[:, :, 2]
-    b = cv2.blur(c, (9, 9))
-    x[:, :, 2] = ((c - b) * 16.0 + b).clip(-1, 1)
-    return x
-def green_mark(x):
-    x = x.copy()
-    x[:, :, 2] = -1
-    x[:, :, 0] = -1
-    return x
-def frame_mark(x):
-    x = x.copy()
-    x[:64] = -1
-    x[-64:] = -1
-    x[:, :8] = 1
-    x[:, -8:] = 1
-    return x
-@torch.inference_mode()
-def pytorch2numpy(imgs):
-    results = []
-    for x in imgs:
-        y = x.movedim(0, -1)
-        y = y * 127.5 + 127.5
-        y = y.detach().float().cpu().numpy().clip(0, 255).astype(np.uint8)
-        results.append(y)
-    return results
-@torch.inference_mode()
-def numpy2pytorch(imgs):
-    h = torch.from_numpy(np.stack(imgs, axis=0)).float() / 127.5 - 1.0
-    h = h.movedim(-1, 1)
-    return h
-@torch.no_grad()
-def duplicate_prefix_to_suffix(x, count, zero_out=False):
-    if zero_out:
-        return torch.cat([x, torch.zeros_like(x[:count])], dim=0)
-    else:
-        return torch.cat([x, x[:count]], dim=0)
-def weighted_mse(a, b, weight):
-    return torch.mean(weight.float() * (a.float() - b.float()) ** 2)
-def clamped_linear_interpolation(x, x_min, y_min, x_max, y_max, sigma=1.0):
-    x = (x - x_min) / (x_max - x_min)
-    x = max(0.0, min(x, 1.0))
-    x = x ** sigma
-    return y_min + x * (y_max - y_min)
-def expand_to_dims(x, target_dims):
-    return x.view(*x.shape, *([1] * max(0, target_dims - x.dim())))
-def repeat_to_batch_size(tensor: torch.Tensor, batch_size: int):
-    if tensor is None:
-        return None
-    first_dim = tensor.shape[0]
-    if first_dim == batch_size:
-        return tensor
-    if batch_size % first_dim != 0:
-        raise ValueError(f"Cannot evenly repeat first dim {first_dim} to match batch_size {batch_size}.")
-    repeat_times = batch_size // first_dim
-    return tensor.repeat(repeat_times, *[1] * (tensor.dim() - 1))
-def dim5(x):
-    return expand_to_dims(x, 5)
-def dim4(x):
-    return expand_to_dims(x, 4)
-def dim3(x):
-    return expand_to_dims(x, 3)
-def crop_or_pad_yield_mask(x, length):
-    B, F, C = x.shape
-    device = x.device
-    dtype = x.dtype
-    if F < length:
-        y = torch.zeros((B, length, C), dtype=dtype, device=device)
-        mask = torch.zeros((B, length), dtype=torch.bool, device=device)
-        y[:, :F, :] = x
-        mask[:, :F] = True
-        return y, mask
-    return x[:, :length, :], torch.ones((B, length), dtype=torch.bool, device=device)
-def extend_dim(x, dim, minimal_length, zero_pad=False):
-    original_length = int(x.shape[dim])
-    if original_length >= minimal_length:
-        return x
-    if zero_pad:
-        padding_shape = list(x.shape)
-        padding_shape[dim] = minimal_length - original_length
-        padding = torch.zeros(padding_shape, dtype=x.dtype, device=x.device)
-    else:
-        idx = (slice(None),) * dim + (slice(-1, None),) + (slice(None),) * (len(x.shape) - dim - 1)
-        last_element = x[idx]
-        padding = last_element.repeat_interleave(minimal_length - original_length, dim=dim)
-    return torch.cat([x, padding], dim=dim)
-def lazy_positional_encoding(t, repeats=None):
-    if not isinstance(t, list):
-        t = [t]
-    from diffusers.models.embeddings import get_timestep_embedding
-    te = torch.tensor(t)
-    te = get_timestep_embedding(timesteps=te, embedding_dim=256, flip_sin_to_cos=True, downscale_freq_shift=0.0, scale=1.0)
-    if repeats is None:
-        return te
-    te = te[:, None, :].expand(-1, repeats, -1)
-    return te
-def state_dict_offset_merge(A, B, C=None):
-    result = {}
-    keys = A.keys()
-    for key in keys:
-        A_value = A[key]
-        B_value = B[key].to(A_value)
-        if C is None:
-            result[key] = A_value + B_value
-        else:
-            C_value = C[key].to(A_value)
-            result[key] = A_value + B_value - C_value
-    return result
-def state_dict_weighted_merge(state_dicts, weights):
-    if len(state_dicts) != len(weights):
-        raise ValueError("Number of state dictionaries must match number of weights")
-    if not state_dicts:
-        return {}
-    total_weight = sum(weights)
-    if total_weight == 0:
-        raise ValueError("Sum of weights cannot be zero")
-    normalized_weights = [w / total_weight for w in weights]
-    keys = state_dicts[0].keys()
-    result = {}
-    for key in keys:
-        result[key] = state_dicts[0][key] * normalized_weights[0]
-        for i in range(1, len(state_dicts)):
-            state_dict_value = state_dicts[i][key].to(result[key])
-            result[key] += state_dict_value * normalized_weights[i]
-    return result
-def group_files_by_folder(all_files):
-    grouped_files = {}
-    for file in all_files:
-        folder_name = os.path.basename(os.path.dirname(file))
-        if folder_name not in grouped_files:
-            grouped_files[folder_name] = []
-        grouped_files[folder_name].append(file)
-    list_of_lists = list(grouped_files.values())
-    return list_of_lists
-def generate_timestamp():
-    now = datetime.datetime.now()
-    timestamp = now.strftime('%y%m%d_%H%M%S')
-    milliseconds = f"{int(now.microsecond / 1000):03d}"
-    random_number = random.randint(0, 9999)
-    return f"{timestamp}_{milliseconds}_{random_number}"
-def write_PIL_image_with_png_info(image, metadata, path):
-    from PIL.PngImagePlugin import PngInfo
-    png_info = PngInfo()
-    for key, value in metadata.items():
-        png_info.add_text(key, value)
-    image.save(path, "PNG", pnginfo=png_info)
-    return image
-def torch_safe_save(content, path):
-    torch.save(content, path + '_tmp')
-    os.replace(path + '_tmp', path)
-    return path
-def move_optimizer_to_device(optimizer, device):
-    for state in optimizer.state.values():
-        for k, v in state.items():
-            if isinstance(v, torch.Tensor):
-                state[k] = v.to(device)