Linoy Tsaban committed · commit 8623f65 · parent 32fdae0

Update preprocess_utils.py

change in dm components loading

Files changed: preprocess_utils.py (+22 -42)

preprocess_utils.py  CHANGED
@@ -10,7 +10,7 @@ import torch.nn as nn
 import argparse
 from torchvision.io import write_video
 from pathlib import Path
-from
+from util import *
 import torchvision.transforms as T


@@ -25,7 +25,7 @@ def get_timesteps(scheduler, num_inference_steps, strength, device):


 class Preprocess(nn.Module):
-    def __init__(self, device, opt, hf_key=None):
+    def __init__(self, device, opt, vae, tokenizer, text_encoder, unet,scheduler, hf_key=None):
         super().__init__()

         self.device = device
@@ -47,15 +47,23 @@ class Preprocess(nn.Module):
             model_key = "stabilityai/stable-diffusion-2-depth"
         else:
             raise ValueError(f'Stable-diffusion version {self.sd_version} not supported.')
+
         self.model_key = model_key
+
         # Create model
-        self.vae = AutoencoderKL.from_pretrained(model_key, subfolder="vae", revision="fp16",
-                                                 torch_dtype=torch.float16).to(self.device)
-        self.tokenizer = CLIPTokenizer.from_pretrained(model_key, subfolder="tokenizer")
-        self.text_encoder = CLIPTextModel.from_pretrained(model_key, subfolder="text_encoder", revision="fp16",
-                                                          torch_dtype=torch.float16).to(self.device)
-        self.unet = UNet2DConditionModel.from_pretrained(model_key, subfolder="unet", revision="fp16",
-                                                         torch_dtype=torch.float16).to(self.device)
+        # self.vae = AutoencoderKL.from_pretrained(model_key, subfolder="vae", revision="fp16",
+        #                                          torch_dtype=torch.float16).to(self.device)
+        # self.tokenizer = CLIPTokenizer.from_pretrained(model_key, subfolder="tokenizer")
+        # self.text_encoder = CLIPTextModel.from_pretrained(model_key, subfolder="text_encoder", revision="fp16",
+        #                                                   torch_dtype=torch.float16).to(self.device)
+        # self.unet = UNet2DConditionModel.from_pretrained(model_key, subfolder="unet", revision="fp16",
+        #                                                  torch_dtype=torch.float16).to(self.device)
+
+        self.vae = vae
+        self.tokenizer = tokenizer
+        self.text_encoder = text_encoder
+        self.unet = unet
+        self.scheduler=scheduler
         self.total_inverted_latents = {}

         self.paths, self.frames, self.latents = self.get_data(self.config["data_path"], self.config["n_frames"])
@@ -74,11 +82,12 @@ class Preprocess(nn.Module):
             self.canny_cond = self.get_canny_cond()
         elif self.sd_version == 'depth':
             self.depth_maps = self.prepare_depth_maps()
-        self.scheduler =
+        self.scheduler = scheduler

-
+        self.unet.enable_xformers_memory_efficient_attention()
         print(f'[INFO] loaded stable diffusion!')
-
+
+
     @torch.no_grad()
     def prepare_depth_maps(self, model_type='DPT_Large', device='cuda'):
         depth_maps = []
@@ -363,33 +372,4 @@ def prep(opt):


     return frames, latents, total_inverted_latents, rgb_reconstruction
-
-    # os.mkdir(os.path.join(save_path, f'frames'))
-    # for i, frame in enumerate(recon_frames):
-    #     T.ToPILImage()(frame).save(os.path.join(save_path, f'frames', f'{i:05d}.png'))
-    # frames = (recon_frames * 255).to(torch.uint8).cpu().permute(0, 2, 3, 1)
-    # write_video(os.path.join(save_path, f'inverted.mp4'), frames, fps=10)
-
-
-# if __name__ == "__main__":
-#     device = 'cuda'
-#     parser = argparse.ArgumentParser()
-#     parser.add_argument('--data_path', type=str,
-#                         default='data/woman-running.mp4')
-#     parser.add_argument('--H', type=int, default=512,
-#                         help='for non-square videos, we recommand using 672 x 384 or 384 x 672, aspect ratio 1.75')
-#     parser.add_argument('--W', type=int, default=512,
-#                         help='for non-square videos, we recommand using 672 x 384 or 384 x 672, aspect ratio 1.75')
-#     parser.add_argument('--save_dir', type=str, default='latents')
-#     parser.add_argument('--sd_version', type=str, default='2.1', choices=['1.5', '2.0', '2.1', 'ControlNet', 'depth'],
-#                         help="stable diffusion version")
-#     parser.add_argument('--steps', type=int, default=500)
-#     parser.add_argument('--batch_size', type=int, default=40)
-#     parser.add_argument('--save_steps', type=int, default=50)
-#     parser.add_argument('--n_frames', type=int, default=40)
-#     parser.add_argument('--inversion_prompt', type=str, default='a woman running')
-#     opt = parser.parse_args()
-#     video_path = opt.data_path
-#     save_video_frames(video_path, img_size=(opt.H, opt.W))
-#     opt.data_path = os.path.join('data', Path(video_path).stem)
-#     prep(opt)
+
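For context on what this change means for callers: Preprocess no longer builds the diffusion model components itself, so they have to be created once outside and passed into the new constructor. The sketch below shows that wiring under stated assumptions: the from_pretrained calls mirror the code commented out above, while the model key, the DDIMScheduler choice, and the `opt` placeholder are assumptions, since the removed scheduler line is truncated in this view and the surrounding config setup is not part of this diff.

# Sketch only: load the Stable Diffusion components once and hand them to Preprocess,
# instead of letting Preprocess call from_pretrained internally (the pre-commit behavior).
import torch
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, DDIMScheduler
from preprocess_utils import Preprocess

device = 'cuda'
model_key = "stabilityai/stable-diffusion-2-depth"  # assumed; taken from the depth branch shown above

vae = AutoencoderKL.from_pretrained(model_key, subfolder="vae", revision="fp16",
                                    torch_dtype=torch.float16).to(device)
tokenizer = CLIPTokenizer.from_pretrained(model_key, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(model_key, subfolder="text_encoder", revision="fp16",
                                             torch_dtype=torch.float16).to(device)
unet = UNet2DConditionModel.from_pretrained(model_key, subfolder="unet", revision="fp16",
                                            torch_dtype=torch.float16).to(device)
# Assumption: a DDIM scheduler; the original scheduler line is truncated in this diff view.
scheduler = DDIMScheduler.from_pretrained(model_key, subfolder="scheduler")

# `opt` stands in for the preprocessing config built elsewhere in the repo
# (it must at least supply data_path and n_frames); it is not reconstructed here.
model = Preprocess(device, opt, vae, tokenizer, text_encoder, unet, scheduler)

Presumably the point of the refactor is that the Space can load one set of weights up front and share it, rather than reloading the components every time Preprocess is constructed.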