openfree committed
Commit e10969c · verified · 1 Parent(s): 6ab32bd

Update sonic.py

Files changed (1):
  1. sonic.py +39 -33
sonic.py CHANGED
@@ -1,9 +1,7 @@
-import os, math
-import torch
+import os, math, torch, cv2
 from PIL import Image
 from omegaconf import OmegaConf
 from tqdm import tqdm
-import cv2
 
 from diffusers import AutoencoderKLTemporalDecoder
 from diffusers.schedulers import EulerDiscreteScheduler
@@ -40,15 +38,15 @@ def test(
     face_mask    = batch["face_mask"]
     image_embeds = image_encoder(clip_img).image_embeds
 
-    audio_feature = batch["audio_feature"]        # (1,80,T)
-    audio_len     = int(batch["audio_len"])       # Python int
+    audio_feature = batch["audio_feature"]        # (1,80,T)
+    audio_len     = int(batch["audio_len"])       # Python int
     step = int(config.step)
 
-    # --- [★ fix] clamp step to at least 1 ---------------------------------
+    # --- clamp step to at least 1 -----------------------------------------
     if audio_len < step:
         step = max(1, audio_len)
 
-    window = 16000                                # one segment
+    window = 16000                                # 1-second chunk
     audio_prompts, last_prompts = [], []
 
     # --- Whisper encoding per window --------------------------------------
@@ -56,26 +54,28 @@ def test(
         chunk = audio_feature[:, :, i : i + window]
 
         prompt_layers = wav_enc.encoder(chunk, output_hidden_states=True).hidden_states
-        last_hidden   = wav_enc.encoder(chunk).last_hidden_state.unsqueeze(-2)
+        last_hidden   = wav_enc.encoder(chunk).last_hidden_state.unsqueeze(-2)  # (1,t,1,384)
 
-        audio_prompts.append(torch.stack(prompt_layers, dim=2))
-        last_prompts.append(last_hidden)
+        audio_prompts.append(torch.stack(prompt_layers, dim=2))                 # (1,L,12,80)
+        last_prompts.append(last_hidden)                                        # (1,L,1,384)
 
     if len(audio_prompts) == 0:
         raise ValueError("[ERROR] No speech recognised in the provided audio.")
 
     audio_prompts = torch.cat(audio_prompts, dim=1)
-    last_prompts  = torch.cat(last_prompts, dim=1)
+    last_prompts  = torch.cat(last_prompts, dim=1)
 
     # padding rule
     audio_prompts = torch.cat(
-        [torch.zeros_like(audio_prompts[:, :4]), audio_prompts,
+        [torch.zeros_like(audio_prompts[:, :4]),
+         audio_prompts,
          torch.zeros_like(audio_prompts[:, :6])], dim=1)
     last_prompts = torch.cat(
-        [torch.zeros_like(last_prompts[:, :24]), last_prompts,
+        [torch.zeros_like(last_prompts[:, :24]),
+         last_prompts,
          torch.zeros_like(last_prompts[:, :26])], dim=1)
 
-    # --- [★ fix] always ≥ 1 chunk ------------------------------------------
+    # --- always ≥ 1 chunk --------------------------------------------------
     total_tokens = audio_prompts.shape[1]
     num_chunks   = max(1, math.ceil(total_tokens / (2 * step)))
 
@@ -84,19 +84,25 @@ def test(
     for i in tqdm(range(num_chunks)):
        start = i * 2 * step
 
-        cond_clip = audio_prompts[:, start : start + 10]
-        if cond_clip.shape[2] < 10:                       # [★ fix] padding
-            pad = torch.zeros_like(cond_clip[:, :, : 10 - cond_clip.shape[2]])
-            cond_clip = torch.cat([cond_clip, pad], dim=2)
+        cond_clip = audio_prompts[:, start : start + 10]  # (1,10,12,80)
+        if cond_clip.shape[1] < 10:                       # pad if too short
+            pad = torch.zeros_like(cond_clip[:, : 10 - cond_clip.shape[1]])
+            cond_clip = torch.cat([cond_clip, pad], dim=1)
 
-        bucket_clip = last_prompts[:, start : start + 50]      # (1, 50, 384)
-        bucket_clip = bucket_clip.unsqueeze(0).unsqueeze(-2)   # (1, 1, 50, 1, 384)
-        motion = audio2bucket(bucket_clip, image_embeds) * 16 + 16
+        # ------------------ (★) match bucket_clip dimensions ---------------
+        bucket_clip = last_prompts[:, start : start + 50]  # (1,50,1,384)
+        if bucket_clip.shape[1] < 50:                      # pad if too short
+            pad = torch.zeros_like(bucket_clip[:, : 50 - bucket_clip.shape[1]])
+            bucket_clip = torch.cat([bucket_clip, pad], dim=1)
+
+        bucket_clip = bucket_clip.unsqueeze(1)             # → (1,1,50,1,384) ✔ 5-D
+        # --------------------------------------------------------------------
 
+        motion = audio2bucket(bucket_clip, image_embeds) * 16 + 16
 
        ref_list.append(ref_img[0])
-        audio_list.append(audio_pe(cond_clip).squeeze(0)[0])
-        uncond_list.append(audio_pe(torch.zeros_like(cond_clip)).squeeze(0)[0])
+        audio_list.append(audio_pe(cond_clip.unsqueeze(1)).squeeze(0)[0])    # (1,10,…) → 4-D after unsqueeze
+        uncond_list.append(audio_pe(torch.zeros_like(cond_clip).unsqueeze(1)).squeeze(0)[0])
        motion_buckets.append(motion[0])
 
     # ----------------------------------------------------------------------
@@ -125,7 +131,7 @@ def test(
 
 
 # ------------------------------------------------------------------
-# Sonic class
+# Sonic class
 # ------------------------------------------------------------------
 class Sonic:
     config_file = os.path.join(BASE_DIR, "config/inference/sonic.yaml")
@@ -144,18 +150,18 @@ class Sonic:
     def _load_models(self, cfg):
        dtype = {"fp16": torch.float16, "fp32": torch.float32, "bf16": torch.bfloat16}[cfg.weight_dtype]
 
-        vae   = AutoencoderKLTemporalDecoder.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="vae", variant="fp16")
+        vae   = AutoencoderKLTemporalDecoder.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="vae", variant="fp16")
        sched = EulerDiscreteScheduler.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="scheduler")
-        image_enc = CLIPVisionModelWithProjection.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="image_encoder", variant="fp16")
-        unet      = UNetSpatioTemporalConditionModel.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="unet", variant="fp16")
+        imgE  = CLIPVisionModelWithProjection.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="image_encoder", variant="fp16")
+        unet  = UNetSpatioTemporalConditionModel.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="unet", variant="fp16")
        add_ip_adapters(unet, [32], [cfg.ip_audio_scale])
 
        a2t = AudioProjModel(10, 5, 384, 1024, 1024, 32).to(self.device)
        a2b = Audio2bucketModel(50, 1, 384, 1024, 1024, 1, 2).to(self.device)
 
-        unet.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.unet_checkpoint_path), map_location="cpu"))
-        a2t.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2token_checkpoint_path), map_location="cpu"))
-        a2b.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2bucket_checkpoint_path), map_location="cpu"))
+        unet.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.unet_checkpoint_path), map_location="cpu"))
+        a2t.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2token_checkpoint_path), map_location="cpu"))
+        a2b.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2bucket_checkpoint_path), map_location="cpu"))
 
        whisper = WhisperModel.from_pretrained(os.path.join(BASE_DIR, "checkpoints/whisper-tiny")).to(self.device).eval()
        whisper.requires_grad_(False)
@@ -166,11 +172,11 @@ class Sonic:
        self.rife = RIFEModel(device=self.device)
        self.rife.load_model(os.path.join(BASE_DIR, "checkpoints/RIFE/"))
 
-        for m in (image_enc, vae, unet):
+        for m in (imgE, vae, unet):
            m.to(dtype)
 
-        self.pipe = SonicPipeline(unet=unet, image_encoder=image_enc, vae=vae, scheduler=sched).to(device=self.device, dtype=dtype)
-        self.image_encoder = image_enc
+        self.pipe = SonicPipeline(unet=unet, image_encoder=imgE, vae=vae, scheduler=sched).to(device=self.device, dtype=dtype)
+        self.image_encoder = imgE
        self.audio2token  = a2t
        self.audio2bucket = a2b
        self.whisper      = whisper
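
The chunk loop above keeps every clip at a fixed token length and feeds 5-D inputs to audio_pe and audio2bucket. The following is a minimal, standalone sketch of that pad-and-reshape pattern with dummy tensors; the pad_to helper, the example shapes, and the 2*step stride are illustrative assumptions, not code from this repository.

import math
import torch

def pad_to(x: torch.Tensor, length: int) -> torch.Tensor:
    """Zero-pad x along dim 1 up to `length` (no-op if already long enough)."""
    missing = length - x.shape[1]
    if missing <= 0:
        return x
    pad = x.new_zeros(x.shape[0], missing, *x.shape[2:])
    return torch.cat([x, pad], dim=1)

step          = 2                               # assumed small value for the demo
audio_prompts = torch.randn(1, 17, 12, 80)      # dummy stand-in for stacked hidden states
last_prompts  = torch.randn(1, 17, 1, 384)      # dummy stand-in for last_hidden_state tokens

num_chunks = max(1, math.ceil(audio_prompts.shape[1] / (2 * step)))

for i in range(num_chunks):
    start = i * 2 * step

    # Slice a fixed-size window, pad the tail chunk, then add a leading
    # singleton dim so the conditioning tensors arrive as 5-D batches.
    cond_clip   = pad_to(audio_prompts[:, start : start + 10], 10).unsqueeze(1)  # (1,1,10,12,80)
    bucket_clip = pad_to(last_prompts[:, start : start + 50], 50).unsqueeze(1)   # (1,1,50,1,384)

    assert cond_clip.shape   == (1, 1, 10, 12, 80)
    assert bucket_clip.shape == (1, 1, 50, 1, 384)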