Update sonic.py

sonic.py CHANGED
@@ -9,9 +9,7 @@ from transformers import WhisperModel, CLIPVisionModelWithProjection, AutoFeatur

 from src.utils.util import save_videos_grid, seed_everything
 from src.dataset.test_preprocess import process_bbox, image_audio_to_tensor
-from src.models.base.unet_spatio_temporal_condition import (
-    UNetSpatioTemporalConditionModel, add_ip_adapters,
-)
+from src.models.base.unet_spatio_temporal_condition import UNetSpatioTemporalConditionModel, add_ip_adapters
 from src.pipelines.pipeline_sonic import SonicPipeline
 from src.models.audio_adapter.audio_proj import AudioProjModel
 from src.models.audio_adapter.audio_to_bucket import Audio2bucketModel
@@ -22,13 +20,12 @@ BASE_DIR = os.path.dirname(os.path.abspath(__file__))


 # ------------------------------------------------------------------
-#
+# single image + speech → video-tensor generator
 # ------------------------------------------------------------------
-def test(
-
-
-
-    # --- match the batch dimensions ----------------------------------------
+def test(pipe, cfg, wav_enc, audio_pe, audio2bucket, image_encoder,
+         width, height, batch):
+
+    # ---------- match the batch dimensions --------------------------------
     for k, v in batch.items():
         if isinstance(v, torch.Tensor):
             batch[k] = v.unsqueeze(0).to(pipe.device).float()
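This hunk only normalises the preprocessed sample: every tensor gets a leading batch axis and is moved onto the pipeline device, while non-tensor entries pass through untouched. A minimal standalone sketch of that pattern, using a hypothetical sample dict rather than the real output of image_audio_to_tensor:

import torch

# Hypothetical sample dict, standing in for what image_audio_to_tensor returns.
sample = {"ref_img": torch.rand(3, 512, 512), "audio_len": 120}

device = "cuda:0" if torch.cuda.is_available() else "cpu"
for k, v in sample.items():
    if isinstance(v, torch.Tensor):
        # (C,H,W) -> (1,C,H,W): the pipeline expects a leading batch axis.
        sample[k] = v.unsqueeze(0).to(device).float()

print(sample["ref_img"].shape)   # torch.Size([1, 3, 512, 512])
print(sample["audio_len"])       # non-tensor entries are left as-is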
@@ -39,33 +36,29 @@ def test(
     image_embeds = image_encoder(clip_img).image_embeds

     audio_feature = batch["audio_feature"]  # (1,80,T)
-    audio_len = int(batch["audio_len"])
-    step = int(
-
-    # --- clamp step (minimum 1) ---------------------------------------------
-    if audio_len < step:
-        step = max(1, audio_len)
+    audio_len = int(batch["audio_len"])
+    step = max(1, int(cfg.step))                       # guarantee at least 1

-    window =
+    window = 16_000                                    # 1-second chunk
     audio_prompts, last_prompts = [], []

-    #
+    # ---------- Whisper encoding -------------------------------------------
     for i in range(0, audio_feature.shape[-1], window):
-        chunk = audio_feature[:, :, i
+        chunk = audio_feature[:, :, i:i+window]

-
-
+        hs_all   = wav_enc.encoder(chunk, output_hidden_states=True).hidden_states
+        last_hid = wav_enc.encoder(chunk).last_hidden_state.unsqueeze(-2)  # (1,t,1,384)

-        audio_prompts.append(torch.stack(
-        last_prompts.append(
+        audio_prompts.append(torch.stack(hs_all, dim=2))   # (1,t,12,384)
+        last_prompts.append(last_hid)                      # (1,t,1,384)

-    if
+    if not audio_prompts:
         raise ValueError("[ERROR] No speech recognised in the provided audio.")

-    audio_prompts = torch.cat(audio_prompts, dim=1)
-    last_prompts = torch.cat(last_prompts, dim=1)
+    audio_prompts = torch.cat(audio_prompts, dim=1)    # (1,T,12,384)
+    last_prompts  = torch.cat(last_prompts, dim=1)     # (1,T,1,384)

-    # padding rule
+    # ---------- padding rule -----------------------------------------------
     audio_prompts = torch.cat(
         [torch.zeros_like(audio_prompts[:, :4]),
          audio_prompts,
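The Whisper-encoding loop walks the log-mel features in fixed windows; the final window is simply shorter, which the later zero-padding compensates for. A small sketch of just the windowing behaviour with a dummy tensor (no Whisper model involved), reusing the 16_000 window constant from the hunk:

import torch

# Dummy stand-in for batch["audio_feature"]: (1, 80, T) log-mel features.
feat = torch.rand(1, 80, 37_500)
window = 16_000

# Slicing past the end of the tensor just truncates, so no bounds check is needed.
chunks = [feat[:, :, i:i + window] for i in range(0, feat.shape[-1], window)]
print([c.shape[-1] for c in chunks])   # [16000, 16000, 5500] – last chunk is shorter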
@@ -75,7 +68,6 @@ def test(
          last_prompts,
          torch.zeros_like(last_prompts[:, :26])], dim=1)

-    # --- always at least 1 chunk ---------------------------------------------
     total_tokens = audio_prompts.shape[1]
     num_chunks = max(1, math.ceil(total_tokens / (2 * step)))

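num_chunks is clamped to at least one, so very short clips still produce a single diffusion chunk. A quick worked example with illustrative numbers (the real values come from cfg.step and the padded prompt length):

import math

step = 2                                   # assumed cfg.step value, for illustration
total_tokens = 9                           # e.g. audio_prompts.shape[1] after padding
num_chunks = max(1, math.ceil(total_tokens / (2 * step)))
print(num_chunks)                          # 3 – and never 0, even for very short audio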
@@ -84,46 +76,46 @@ def test(
     for i in tqdm(range(num_chunks)):
         start = i * 2 * step

-
-
-
+        # --------- cond_clip : (1,10,12,384) → (1,10,5,384) --------------
+        cond_clip = audio_prompts[:, start : start + 10]    # (1,≤10,12,384)
+        if cond_clip.shape[1] < 10:                         # pad seq_len
+            pad = torch.zeros_like(cond_clip[:, :10-cond_clip.shape[1]])
             cond_clip = torch.cat([cond_clip, pad], dim=1)
+        cond_clip = cond_clip[:, :, :5, :]                  # select 5 blocks

-        #
-        bucket_clip = last_prompts[:, start : start + 50]
-        if bucket_clip.shape[1] < 50:
-            pad = torch.zeros_like(bucket_clip[:, :
+        # --------- bucket_clip : (1,50,1,384) → unsqueeze(0) -----
+        bucket_clip = last_prompts[:, start : start + 50]   # (1,≤50,1,384)
+        if bucket_clip.shape[1] < 50:                       # pad length
+            pad = torch.zeros_like(bucket_clip[:, :50-bucket_clip.shape[1]])
             bucket_clip = torch.cat([bucket_clip, pad], dim=1)
-
-        bucket_clip = bucket_clip.unsqueeze(1)  # → (1,1,50,1,384) ✔ 5-D
-        # -----------------------------------------------------------------
+        bucket_clip = bucket_clip.unsqueeze(0)              # (1,1,50,1,384)

         motion = audio2bucket(bucket_clip, image_embeds) * 16 + 16

         ref_list.append(ref_img[0])
-        audio_list.append(audio_pe(cond_clip
-        uncond_list.append(audio_pe(torch.zeros_like(cond_clip)
+        audio_list.append(audio_pe(cond_clip).squeeze(0)[0])        # (10,*)
+        uncond_list.append(audio_pe(torch.zeros_like(cond_clip)).squeeze(0)[0])
         motion_buckets.append(motion[0])

-    #
+    # ---------- diffusion ---------------------------------------------------
     video = pipe(
         ref_img, clip_img, face_mask,
         audio_list, uncond_list, motion_buckets,
         height=height, width=width,
         num_frames=len(audio_list),
-        decode_chunk_size=
-        motion_bucket_scale=
-        fps=
-        noise_aug_strength=
-        min_guidance_scale1=
-        max_guidance_scale1=
-        min_guidance_scale2=
-        max_guidance_scale2=
-        overlap=
-        shift_offset=
-        frames_per_batch=
-        num_inference_steps=
-        i2i_noise_strength=
+        decode_chunk_size=cfg.decode_chunk_size,
+        motion_bucket_scale=cfg.motion_bucket_scale,
+        fps=cfg.fps,
+        noise_aug_strength=cfg.noise_aug_strength,
+        min_guidance_scale1=cfg.min_appearance_guidance_scale,
+        max_guidance_scale1=cfg.max_appearance_guidance_scale,
+        min_guidance_scale2=cfg.audio_guidance_scale,
+        max_guidance_scale2=cfg.audio_guidance_scale,
+        overlap=cfg.overlap,
+        shift_offset=cfg.shift_offset,
+        frames_per_batch=cfg.n_sample_frames,
+        num_inference_steps=cfg.num_inference_steps,
+        i2i_noise_strength=cfg.i2i_noise_strength,
     ).frames

     video = (video * 0.5 + 0.5).clamp(0, 1)
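Both cond_clip and bucket_clip are zero-padded up to a fixed window length before being handed to the audio adapters. A generic sketch of that pad-to-length pattern with hypothetical shapes; pad_window is an illustrative helper, not a function from the repository:

import torch

def pad_window(x: torch.Tensor, length: int) -> torch.Tensor:
    # Zero-pad dim 1 of x up to `length`, mirroring the cond_clip/bucket_clip logic.
    if x.shape[1] < length:
        pad = torch.zeros_like(x[:, : length - x.shape[1]])
        x = torch.cat([x, pad], dim=1)
    return x

clip = torch.rand(1, 7, 12, 384)       # a short final window (hypothetical shapes)
print(pad_window(clip, 10).shape)      # torch.Size([1, 10, 12, 384])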
@@ -131,16 +123,16 @@ def test(


 # ------------------------------------------------------------------
-#
+# Sonic class
 # ------------------------------------------------------------------
 class Sonic:
     config_file = os.path.join(BASE_DIR, "config/inference/sonic.yaml")
     config = OmegaConf.load(config_file)

     def __init__(self, device_id: int = 0, enable_interpolate_frame: bool = True):
-        cfg
+        cfg = self.config
         cfg.use_interframe = enable_interpolate_frame
-        self.device
+        self.device = f"cuda:{device_id}" if device_id >= 0 and torch.cuda.is_available() else "cpu"
         cfg.pretrained_model_name_or_path = os.path.join(BASE_DIR, cfg.pretrained_model_name_or_path)

         self._load_models(cfg)
@@ -159,9 +151,9 @@ class Sonic:
         a2t = AudioProjModel(10, 5, 384, 1024, 1024, 32).to(self.device)
         a2b = Audio2bucketModel(50, 1, 384, 1024, 1024, 1, 2).to(self.device)

-        unet.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.unet_checkpoint_path),
-        a2t.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2token_checkpoint_path),
-        a2b.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2bucket_checkpoint_path),
+        unet.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.unet_checkpoint_path), map_location="cpu"))
+        a2t.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2token_checkpoint_path), map_location="cpu"))
+        a2b.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2bucket_checkpoint_path), map_location="cpu"))

         whisper = WhisperModel.from_pretrained(os.path.join(BASE_DIR, "checkpoints/whisper-tiny")).to(self.device).eval()
         whisper.requires_grad_(False)
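This hunk loads the checkpoints with map_location="cpu": the serialized tensors are deserialized on the CPU and then copied into the parameters of modules that were already placed with .to(self.device). A self-contained toy example of the same pattern:

import torch
import torch.nn as nn

net = nn.Linear(4, 4)                        # stand-in for the real modules
torch.save(net.state_dict(), "toy_state.pth")

# map_location="cpu" keeps the deserialized tensors on the CPU even if they
# were saved from CUDA; load_state_dict then copies them element-wise into
# the parameters of the already-placed module.
state = torch.load("toy_state.pth", map_location="cpu")
net.load_state_dict(state)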
@@ -188,78 +180,50 @@ class Sonic:
         _, _, bboxes = self.face_det(img, maxface=True)
         if bboxes:
             x1, y1, ww, hh = bboxes[0]
-            return {"face_num": 1, "crop_bbox": process_bbox((x1, y1, x1
+            return {"face_num": 1, "crop_bbox": process_bbox((x1, y1, x1+ww, y1+hh), expand_ratio, h, w)}
         return {"face_num": 0, "crop_bbox": None}

     # --------------------------------------------------------------
     @torch.no_grad()
-    def process(
-
-
-
-
-        min_resolution: int = 512,
-        inference_steps: int = 25,
-        dynamic_scale: float = 1.0,
-        keep_resolution: bool = False,
-        seed: int | None = None,
-    ):
+    def process(self, image_path: str, audio_path: str, output_path: str,
+                min_resolution: int = 512, inference_steps: int = 25,
+                dynamic_scale: float = 1.0, keep_resolution: bool = False,
+                seed: int | None = None):
+
         cfg = self.config
         if seed is not None:
             cfg.seed = seed
-        cfg.num_inference_steps
-        cfg.motion_bucket_scale
+        cfg.num_inference_steps = inference_steps
+        cfg.motion_bucket_scale = dynamic_scale
         seed_everything(cfg.seed)

-        # image/audio → tensor
         test_data = image_audio_to_tensor(
-            self.face_det,
-
-
-            audio_path,
-            limit=-1,
-            image_size=min_resolution,
-            area=cfg.area,
+            self.face_det, self.feature_extractor,
+            image_path, audio_path, limit=-1,
+            image_size=min_resolution, area=cfg.area,
         )
         if test_data is None:
             return -1

         h, w = test_data["ref_img"].shape[-2:]
-        resolution = (
-
-            if keep_resolution
-            else f"{w}x{h}"
-        )
+        resolution = (f"{(Image.open(image_path).width//2)*2}x{(Image.open(image_path).height//2)*2}"
+                      if keep_resolution else f"{w}x{h}")

-
-
-            self.pipe,
-            cfg,
-            wav_enc=self.whisper,
-            audio_pe=self.audio2token,
-            audio2bucket=self.audio2bucket,
-            image_encoder=self.image_encoder,
-            width=w,
-            height=h,
-            batch=test_data,
-        )
+        video = test(self.pipe, cfg, self.whisper, self.audio2token,
+                     self.audio2bucket, self.image_encoder, w, h, test_data)

-        # intermediate-frame interpolation
         if cfg.use_interframe:
             out = video.to(self.device)
             frames = []
-            for i in tqdm(range(out.shape[2]
-                mid = self.rife.inference(out[
-                frames.extend([out[
-            frames.append(out[
+            for i in tqdm(range(out.shape[2]-1), ncols=0):
+                mid = self.rife.inference(out[:,:,i], out[:,:,i+1]).clamp(0,1).detach()
+                frames.extend([out[:,:,i], mid])
+            frames.append(out[:,:,-1])
             video = torch.stack(frames, 2).cpu()

-        # save
         tmp = output_path.replace(".mp4", "_noaudio.mp4")
-        save_videos_grid(video, tmp, n_rows=video.shape[0], fps=cfg.fps
-        os.system(
-
-            f"-vcodec libx264 -acodec aac -crf 18 -shortest '{output_path}' -y -loglevel error"
-        )
+        save_videos_grid(video, tmp, n_rows=video.shape[0], fps=cfg.fps*(2 if cfg.use_interframe else 1))
+        os.system(f"ffmpeg -i '{tmp}' -i '{audio_path}' -s {resolution} "
+                  f"-vcodec libx264 -acodec aac -crf 18 -shortest '{output_path}' -y -loglevel error")
         os.remove(tmp)
         return 0
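Putting the pieces together, a hypothetical driver script for the updated class could look like the following; the input and output paths are placeholders, and the checkpoints referenced by config/inference/sonic.yaml are assumed to be in place next to sonic.py:

from sonic import Sonic   # assumes this file is importable as sonic.py

model = Sonic(device_id=0, enable_interpolate_frame=True)
status = model.process(
    image_path="examples/face.png",      # placeholder inputs
    audio_path="examples/speech.wav",
    output_path="out/result.mp4",
    inference_steps=25,
    dynamic_scale=1.0,
)
print("done" if status == 0 else "preprocessing failed (no usable face/audio)")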