import os import imageio import numpy as np from typing import Union import torch import torchvision from tqdm import tqdm from einops import rearrange def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=4, fps=8): videos = rearrange(videos, "b c t h w -> t b c h w") outputs = [] for x in videos: x = torchvision.utils.make_grid(x, nrow=n_rows) x = x.transpose(0, 1).transpose(1, 2).squeeze(-1) if rescale: x = (x + 1.0) / 2.0 # -1,1 -> 0,1 x = (x * 255).numpy().astype(np.uint8) outputs.append(x) os.makedirs(os.path.dirname(path), exist_ok=True) imageio.mimsave(path, outputs, fps=fps) # DDIM Inversion @torch.no_grad() def init_prompt(prompt, pipeline): uncond_input = pipeline.tokenizer( [""], padding="max_length", max_length=pipeline.tokenizer.model_max_length, return_tensors="pt" ) uncond_embeddings = pipeline.text_encoder(uncond_input.input_ids.to(pipeline.device))[0] text_input = pipeline.tokenizer( [prompt], padding="max_length", max_length=pipeline.tokenizer.model_max_length, truncation=True, return_tensors="pt", ) text_embeddings = pipeline.text_encoder(text_input.input_ids.to(pipeline.device))[0] context = torch.cat([uncond_embeddings, text_embeddings]) return context def next_step(model_output: Union[torch.FloatTensor, np.ndarray], timestep: int, sample: Union[torch.FloatTensor, np.ndarray], ddim_scheduler): timestep, next_timestep = min( timestep - ddim_scheduler.config.num_train_timesteps // ddim_scheduler.num_inference_steps, 999), timestep alpha_prod_t = ddim_scheduler.alphas_cumprod[timestep] if timestep >= 0 else ddim_scheduler.final_alpha_cumprod alpha_prod_t_next = ddim_scheduler.alphas_cumprod[next_timestep] beta_prod_t = 1 - alpha_prod_t next_original_sample = (sample - beta_prod_t ** 0.5 * model_output) / alpha_prod_t ** 0.5 next_sample_direction = (1 - alpha_prod_t_next) ** 0.5 * model_output next_sample = alpha_prod_t_next ** 0.5 * next_original_sample + next_sample_direction return next_sample def get_noise_pred_single(latents, t, context, unet): noise_pred = unet(latents, t, encoder_hidden_states=context)["sample"] return noise_pred @torch.no_grad() def ddim_loop(pipeline, ddim_scheduler, latent, num_inv_steps, prompt): context = init_prompt(prompt, pipeline) uncond_embeddings, cond_embeddings = context.chunk(2) all_latent = [latent] latent = latent.clone().detach() for i in tqdm(range(num_inv_steps)): t = ddim_scheduler.timesteps[len(ddim_scheduler.timesteps) - i - 1] noise_pred = get_noise_pred_single(latent, t, cond_embeddings, pipeline.unet) latent = next_step(noise_pred, t, latent, ddim_scheduler) all_latent.append(latent) return all_latent @torch.no_grad() def ddim_inversion(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt=""): ddim_latents = ddim_loop(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt) return ddim_latents def rendering(): pass def force_zero_snr(betas): alphas = 1 - betas alphas_bar = torch.cumprod(alphas, dim=0) alphas_bar_sqrt = alphas_bar ** (1/2) alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() - 1e-6 alphas_bar_sqrt -= alphas_bar_sqrt_T alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) alphas_bar = alphas_bar_sqrt ** 2 alphas = alphas_bar[1:] / alphas_bar[:-1] alphas = torch.cat([alphas_bar[0:1], alphas], 0) betas = 1 - alphas return betas def make_beta_schedule(schedule="scaled_linear", n_timestep=1000, linear_start=0.00085, linear_end=0.012, cosine_s=8e-3, shift_scale=None): if schedule == "scaled_linear": betas = ( torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2 ) elif schedule == 'linear': betas = ( torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ) elif schedule == "cosine": timesteps = ( torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s ) alphas = timesteps / (1 + cosine_s) * np.pi / 2 alphas = torch.cos(alphas).pow(2) alphas = alphas / alphas[0] betas = 1 - alphas[1:] / alphas[:-1] betas = np.clip(betas, a_min=0, a_max=0.999) elif schedule == "sqrt_linear": betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) elif schedule == "sqrt": betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ** 0.5 elif schedule == 'linear_force_zero_snr': betas = ( torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2 ) betas = force_zero_snr(betas) elif schedule == 'linear_100': betas = ( torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2 ) betas = betas[:100] else: raise ValueError(f"schedule '{schedule}' unknown.") if shift_scale is not None: print("shift_scale") betas = shift_schedule(betas, shift_scale) return betas.numpy() def shift_schedule(base_betas, shift_scale): alphas = 1 - base_betas alphas_bar = torch.cumprod(alphas, dim=0) snr = alphas_bar / (1 - alphas_bar) # snr(1-ab)=ab; snr-snr*ab=ab; snr=(1+snr)ab; ab=snr/(1+snr) shifted_snr = snr * ((1 / shift_scale) ** 2) shifted_alphas_bar = shifted_snr / (1 + shifted_snr) shifted_alphas = shifted_alphas_bar[1:] / shifted_alphas_bar[:-1] shifted_alphas = torch.cat([shifted_alphas_bar[0:1], shifted_alphas], 0) shifted_betas = 1 - shifted_alphas return shifted_betas def shift_dim(x, src_dim=-1, dest_dim=-1, make_contiguous=True): n_dims = len(x.shape) if src_dim < 0: src_dim = n_dims + src_dim if dest_dim < 0: dest_dim = n_dims + dest_dim assert 0 <= src_dim < n_dims and 0 <= dest_dim < n_dims dims = list(range(n_dims)) del dims[src_dim] permutation = [] ctr = 0 for i in range(n_dims): if i == dest_dim: permutation.append(src_dim) else: permutation.append(dims[ctr]) ctr += 1 x = x.permute(permutation) if make_contiguous: x = x.contiguous() return x # reshapes tensor start from dim i (inclusive) # to dim j (exclusive) to the desired shape # e.g. if x.shape = (b, thw, c) then # view_range(x, 1, 2, (t, h, w)) returns # x of shape (b, t, h, w, c) def view_range(x, i, j, shape): shape = tuple(shape) n_dims = len(x.shape) if i < 0: i = n_dims + i if j is None: j = n_dims elif j < 0: j = n_dims + j assert 0 <= i < j <= n_dims x_shape = x.shape target_shape = x_shape[:i] + shape + x_shape[j:] return x.view(target_shape) def tensor_slice(x, begin, size): assert all([b >= 0 for b in begin]) size = [l - b if s == -1 else s for s, b, l in zip(size, begin, x.shape)] assert all([s >= 0 for s in size]) slices = [slice(b, b + s) for b, s in zip(begin, size)] return x[slices]