seawolf2357 committed
Commit eb70557 · verified · 1 Parent(s): feda343

Update app.py

Files changed (1):
  app.py (+9, -12)
app.py CHANGED
@@ -108,11 +108,9 @@ def generate_image(prompt: str):
 # @spaces.GPU(duration=300, gpu_type="l40s")
 def infer(prompt, steps=50, cfg_scale=7.5, eta=1.0, fs=3, seed=123, frames=64):
     try:
-        # Generate the image
         image_path = generate_image(prompt)
         image = torchvision.io.read_image(image_path).float() / 255.0
 
-        # Check for Korean input and translate it
         if any('\u3131' <= char <= '\u318E' or '\uAC00' <= char <= '\uD7A3' for char in prompt):
             translated = translator(prompt, max_length=512)
             prompt = translated[0]['translation_text']
@@ -120,9 +118,7 @@ def infer(prompt, steps=50, cfg_scale=7.5, eta=1.0, fs=3, seed=123, frames=64):
         resolution = (576, 1024)
         save_fps = 8
         seed_everything(seed)
-        transform = transforms.Compose([
-            transforms.Resize(resolution, antialias=True),
-        ])
+        transform = transforms.Compose([transforms.Resize(resolution, antialias=True)])
 
         print('start:', prompt, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
         start = time.time()
@@ -130,30 +126,30 @@ def infer(prompt, steps=50, cfg_scale=7.5, eta=1.0, fs=3, seed=123, frames=64):
         steps = 60
 
         batch_size = 1
-        channels = model.module.model.diffusion_model.out_channels
+        channels = model.diffusion_model.out_channels  # model.module removed
         h, w = resolution[0] // 8, resolution[1] // 8
         noise_shape = [batch_size, channels, frames, h, w]
 
         with torch.no_grad(), torch.cuda.amp.autocast():
-            text_emb = model.module.get_learned_conditioning([prompt])
+            text_emb = model.get_learned_conditioning([prompt])  # model.module removed
 
             img_tensor = image.to(torch.cuda.current_device())
             img_tensor = (img_tensor - 0.5) * 2
             image_tensor_resized = transform(img_tensor)
             videos = image_tensor_resized.unsqueeze(0)
 
-            z = get_latent_z(model.module, videos.unsqueeze(2))
+            z = get_latent_z(model, videos.unsqueeze(2))  # model.module removed
             img_tensor_repeat = repeat(z, 'b c t h w -> b c (repeat t) h w', repeat=frames)
 
-            cond_images = model.module.embedder(img_tensor.unsqueeze(0))
-            img_emb = model.module.image_proj_model(cond_images)
+            cond_images = model.embedder(img_tensor.unsqueeze(0))  # model.module removed
+            img_emb = model.image_proj_model(cond_images)  # model.module removed
 
             imtext_cond = torch.cat([text_emb, img_emb], dim=1)
 
             fs = torch.tensor([fs], dtype=torch.long, device=torch.cuda.current_device())
             cond = {"c_crossattn": [imtext_cond], "fs": fs, "c_concat": [img_tensor_repeat]}
 
-            batch_samples = batch_ddim_sampling(model.module, cond, noise_shape, n_samples=1, ddim_steps=steps, ddim_eta=eta, cfg_scale=cfg_scale)
+            batch_samples = batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=steps, ddim_eta=eta, cfg_scale=cfg_scale)  # model.module removed
 
         video_path = './output.mp4'
         save_videos(batch_samples, './', filenames=['output'], fps=save_fps)
@@ -168,7 +164,8 @@ def infer(prompt, steps=50, cfg_scale=7.5, eta=1.0, fs=3, seed=123, frames=64):
         print(f"Error occurred: {e}")
         return None
     finally:
-        torch.cuda.empty_cache()
+        torch.cuda.empty_cache()
+
 
 i2v_examples = [
     ['우주인 복장으로 기타를 치는 남자', 30, 7.5, 1.0, 6, 123, 64],
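The substance of this commit is the removal of the model.module indirection throughout infer(). The .module attribute exists only when a network is wrapped in torch.nn.DataParallel or DistributedDataParallel, so the edit suggests the Space now loads the model unwrapped. A minimal sketch of a defensive accessor that tolerates both cases (the unwrap name is hypothetical, not part of this repo):

    import torch.nn as nn

    def unwrap(model: nn.Module) -> nn.Module:
        # DataParallel / DistributedDataParallel keep the real network in
        # .module; a bare nn.Module does not, so fall back to the model itself.
        return getattr(model, "module", model)

Routing every attribute access through a helper like this keeps the code indifferent to how the checkpoint was loaded.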
 
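For reference, the Hangul check that decides whether a prompt is routed through the translator tests two Unicode blocks: U+3131..U+318E (Hangul Compatibility Jamo) and U+AC00..U+D7A3 (Hangul Syllables). A self-contained sketch of the same predicate (the contains_korean name is illustrative):

    def contains_korean(text: str) -> bool:
        # U+3131..U+318E: Hangul Compatibility Jamo (bare consonants and vowels)
        # U+AC00..U+D7A3: precomposed Hangul syllables
        return any('\u3131' <= ch <= '\u318E' or '\uAC00' <= ch <= '\uD7A3'
                   for ch in text)

    print(contains_korean('우주인 복장으로 기타를 치는 남자'))  # True, gets translated
    print(contains_korean('a man in a space suit playing guitar'))  # False, used as-is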
 
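The sampling geometry is unchanged by this commit: with resolution (576, 1024) and the usual 8x latent downsampling, the latent grid is 576 // 8 = 72 by 1024 // 8 = 128, and the single image latent is tiled across the time axis with einops before DDIM sampling. A toy reproduction of that tiling step; the channel count of 4 is an assumption for illustration:

    import torch
    from einops import repeat

    frames = 64
    # Single-frame latent (b, c, t, h, w); c=4 is an assumed latent depth.
    z = torch.randn(1, 4, 1, 576 // 8, 1024 // 8)
    # Tile the one frame across the whole clip, as infer() does.
    img_tensor_repeat = repeat(z, 'b c t h w -> b c (repeat t) h w', repeat=frames)
    print(img_tensor_repeat.shape)  # torch.Size([1, 4, 64, 72, 128])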