Update sonic.py
sonic.py CHANGED
@@ -25,24 +25,24 @@ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 def test(pipe, cfg, wav_enc, audio_pe, audio2bucket, image_encoder,
          width, height, batch):
 
-    #
+    # -------- tidy up batch dimensions -----------------------------------
     for k, v in batch.items():
         if isinstance(v, torch.Tensor):
             batch[k] = v.unsqueeze(0).to(pipe.device).float()
 
-    ref_img = batch["ref_img"]
+    ref_img = batch["ref_img"]                      # (1,C,H,W)
     clip_img = batch["clip_images"]
     face_mask = batch["face_mask"]
     image_embeds = image_encoder(clip_img).image_embeds
 
-    audio_feature = batch["audio_feature"]
+    audio_feature = batch["audio_feature"]          # (1,80,T)
     audio_len = int(batch["audio_len"])
-    step = max(1, int(cfg.step))
+    step = max(1, int(cfg.step))                    # ensure at least 1
 
-
+    # -------- Whisper encoding --------------------------------------------
+    window = 16_000                                 # 1-second chunks
     audio_prompts, last_prompts = [], []
 
-    # ---------- Whisper encoding ------------------------------------------
     for i in range(0, audio_feature.shape[-1], window):
         chunk = audio_feature[:, :, i:i+window]
 
@@ -58,7 +58,7 @@ def test(pipe, cfg, wav_enc, audio_pe, audio2bucket, image_encoder,
     audio_prompts = torch.cat(audio_prompts, dim=1)   # (1,T,12,384)
     last_prompts = torch.cat(last_prompts, dim=1)     # (1,T,1,384)
 
-    #
+    # -------- leading/trailing padding ------------------------------------
     audio_prompts = torch.cat(
         [torch.zeros_like(audio_prompts[:, :4]),
          audio_prompts,
@@ -76,28 +76,29 @@ def test(pipe, cfg, wav_enc, audio_pe, audio2bucket, image_encoder,
     for i in tqdm(range(num_chunks)):
         start = i * 2 * step
 
-        #
-        cond_clip = audio_prompts[:, start
-        if cond_clip.shape[1] < 10:
+        # ------ cond_clip : (bz=1, f=1, w=10, b=5, c=384) ----------------
+        cond_clip = audio_prompts[:, start:start+10]          # (1,≤10,12,384)
+        if cond_clip.shape[1] < 10:                           # pad up to length w
             pad = torch.zeros_like(cond_clip[:, :10-cond_clip.shape[1]])
             cond_clip = torch.cat([cond_clip, pad], dim=1)
-        cond_clip = cond_clip
+        cond_clip = cond_clip.unsqueeze(1)                    # insert f dim → (1,1,10,12,384)
+        cond_clip = cond_clip[:, :, :, :5, :]                 # cut b dim to 5 → (1,1,10,5,384)
 
-        #
-        bucket_clip = last_prompts[:, start
-        if bucket_clip.shape[1] < 50:
+        # ------ bucket_clip : (1,1,50,1,384) -----------------------------
+        bucket_clip = last_prompts[:, start:start+50]         # (1,≤50,1,384)
+        if bucket_clip.shape[1] < 50:
             pad = torch.zeros_like(bucket_clip[:, :50-bucket_clip.shape[1]])
             bucket_clip = torch.cat([bucket_clip, pad], dim=1)
-        bucket_clip = bucket_clip.unsqueeze(
+        bucket_clip = bucket_clip.unsqueeze(1)                # (1,1,50,1,384)
 
         motion = audio2bucket(bucket_clip, image_embeds) * 16 + 16
 
         ref_list.append(ref_img[0])
-        audio_list.append(audio_pe(cond_clip).squeeze(0)[0])
+        audio_list.append(audio_pe(cond_clip).squeeze(0)[0])  # (tokens,1024)
         uncond_list.append(audio_pe(torch.zeros_like(cond_clip)).squeeze(0)[0])
         motion_buckets.append(motion[0])
 
-        #
+        # -------- diffusion ----------------------------------------------
         video = pipe(
             ref_img, clip_img, face_mask,
             audio_list, uncond_list, motion_buckets,
@@ -123,7 +124,7 @@ def test(pipe, cfg, wav_enc, audio_pe, audio2bucket, image_encoder,
 
 
 # ------------------------------------------------------------------
-#
+# Sonic class
 # ------------------------------------------------------------------
 class Sonic:
     config_file = os.path.join(BASE_DIR, "config/inference/sonic.yaml")
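
For reference, the pattern the loop uses to build fixed-length windows (slice ten or fifty prompt frames, zero-pad the tail, then reshape) can be exercised in isolation. The sketch below is illustrative only: fixed_window, the dummy length T = 37, and the explicit-size zero pad are assumptions, not part of sonic.py (the in-loop version pads with zeros_like on a slice of the clip itself, which only covers shortfalls up to the clip's current length).

import torch

def fixed_window(seq, start, length):
    # Slice seq[:, start:start+length] along dim 1 and zero-pad up to `length`.
    # Hypothetical helper (not in sonic.py): the pad is built with an explicit
    # size so it also covers shortfalls larger than the remaining clip.
    clip = seq[:, start:start + length]
    short = length - clip.shape[1]
    if short > 0:
        pad = torch.zeros(clip.shape[0], short, *clip.shape[2:],
                          dtype=clip.dtype, device=clip.device)
        clip = torch.cat([clip, pad], dim=1)
    return clip

audio_prompts = torch.randn(1, 37, 12, 384)    # (1, T, 12, 384), per the diff comments
last_prompts  = torch.randn(1, 37, 1, 384)     # (1, T, 1, 384)

cond_clip = fixed_window(audio_prompts, start=30, length=10)   # (1, 10, 12, 384)
cond_clip = cond_clip.unsqueeze(1)[:, :, :, :5, :]             # (1, 1, 10, 5, 384)

bucket_clip = fixed_window(last_prompts, start=30, length=50)  # (1, 50, 1, 384)
bucket_clip = bucket_clip.unsqueeze(1)                         # (1, 1, 50, 1, 384)

print(cond_clip.shape, bucket_clip.shape)

With T = 37 and start = 30 the slice is only 7 frames long, so the zero pad supplies the remaining 3 or 43 frames before the unsqueeze adds the f dimension.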
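
The Whisper-encoding loop simply steps through the feature tensor in window-sized strides, so only the final chunk can come up short, which is one reason for the zero padding above. A minimal sketch of that iteration, assuming a feature length of 40,000 purely for illustration:

import torch

audio_feature = torch.randn(1, 80, 40_000)   # (1, 80, T), shape per the diff comment
window = 16_000                              # 1-second chunks, as in the new code

chunk_lengths = []
for i in range(0, audio_feature.shape[-1], window):
    chunk = audio_feature[:, :, i:i + window]
    chunk_lengths.append(chunk.shape[-1])

print(chunk_lengths)   # [16000, 16000, 8000] -- only the last chunk is ragged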