Spaces: Running on Zero

Update sonic.py

sonic.py CHANGED
@@ -1,9 +1,7 @@
-import os, math
-import torch
+import os, math, torch, cv2
 from PIL import Image
 from omegaconf import OmegaConf
 from tqdm import tqdm
-import cv2

 from diffusers import AutoencoderKLTemporalDecoder
 from diffusers.schedulers import EulerDiscreteScheduler
@@ -22,10 +20,6 @@ from src.dataset.face_align.align import AlignImage

 BASE_DIR = os.path.dirname(os.path.abspath(__file__))

-# ------------------------------------------------------------------
-# single image + speech → video-tensor generator
-# ------------------------------------------------------------------
-# …(imports at the top and other definitions unchanged)…

 # ------------------------------------------------------------------
 # single image + speech → video-tensor generator
@@ -42,20 +36,20 @@ def test(
     ref_img = batch["ref_img"]
     clip_img = batch["clip_images"]
     face_mask = batch["face_mask"]
-    image_embeds = image_encoder(clip_img).image_embeds
+    image_embeds = image_encoder(clip_img).image_embeds  # (1,1024)

-    audio_feature = batch["audio_feature"]
+    audio_feature = batch["audio_feature"]  # (1, 80, T)
     audio_len = int(batch["audio_len"])
     step = int(config.step)

-    window = 16_000
+    window = 16_000  # 1-sec chunks
     audio_prompts, last_prompts = [], []

     for i in range(0, audio_feature.shape[-1], window):
-        chunk = audio_feature[:, :, i : i + window]
+        chunk = audio_feature[:, :, i : i + window]  # (1, 80, win)
         layers = wav_enc.encoder(chunk, output_hidden_states=True).hidden_states
         last = wav_enc.encoder(chunk).last_hidden_state.unsqueeze(-2)
-        audio_prompts.append(torch.stack(layers, dim=2))
+        audio_prompts.append(torch.stack(layers, dim=2))  # (1, w, L, 384)
         last_prompts.append(last)

     if not audio_prompts:
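As an aside, a minimal shape check for the stacking above — a sketch with dummy tensors, assuming the whisper-tiny encoder's five hidden states (embedding output plus 4 layers) of width 384; the frame count is illustrative only:

import torch

# Dummy stand-ins for wav_enc.encoder(chunk, output_hidden_states=True).hidden_states:
# 5 hidden states, each (batch, frames, 384).
frames = 100  # illustrative
layers = [torch.randn(1, frames, 384) for _ in range(5)]

stacked = torch.stack(layers, dim=2)  # (1, frames, 5, 384) -> an "audio_prompts" entry
last = layers[-1].unsqueeze(-2)       # (1, frames, 1, 384) -> a "last_prompts" entry
print(stacked.shape, last.shape)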
@@ -64,6 +58,7 @@ def test(
     audio_prompts = torch.cat(audio_prompts, dim=1)
     last_prompts = torch.cat(last_prompts, dim=1)

+    # padding rules
     audio_prompts = torch.cat(
         [torch.zeros_like(audio_prompts[:, :4]), audio_prompts,
          torch.zeros_like(audio_prompts[:, :6])], dim=1)
@@ -80,34 +75,35 @@ def test(
         start = i * 2 * step

         # ------------ cond_clip : (1,1,10,5,384) ------------------
-        clip_raw = audio_prompts[:, start : start + 10]
-
-
+        clip_raw = audio_prompts[:, start : start + 10]  # (1, ≤10, L, 384)
+
+        # ★ W-padding must be along dim=1!
+        if clip_raw.shape[1] < 10:
+            pad_w = torch.zeros_like(clip_raw[:, : 10 - clip_raw.shape[1]])
             clip_raw = torch.cat([clip_raw, pad_w], dim=1)

-        # ★ L-
+        # ★ L-padding along dim=2
         while clip_raw.shape[2] < 5:
             clip_raw = torch.cat([clip_raw, clip_raw[:, :, -1:]], dim=2)
-        clip_raw = clip_raw[:, :, :5]
+        clip_raw = clip_raw[:, :, :5]  # (1,10,5,384)

-        cond_clip = clip_raw.unsqueeze(1)
+        cond_clip = clip_raw.unsqueeze(1)  # (1,1,10,5,384)

         # ------------ bucket_clip : (1,1,50,1,384) -----------------
         bucket_raw = last_prompts[:, start : start + 50]
-        if bucket_raw.shape[1] < 50:
-            pad_w = torch.zeros_like(bucket_raw[:, :50 - bucket_raw.shape[1]])
+        if bucket_raw.shape[1] < 50:  # ★ dim=1
+            pad_w = torch.zeros_like(bucket_raw[:, : 50 - bucket_raw.shape[1]])
             bucket_raw = torch.cat([bucket_raw, pad_w], dim=1)
-        bucket_clip = bucket_raw.unsqueeze(1)
+        bucket_clip = bucket_raw.unsqueeze(1)  # (1,1,50,1,384)

         motion = audio2bucket(bucket_clip, image_embeds) * 16 + 16

         ref_list.append(ref_img[0])
-
-        audio_list.append(audio_pe(cond_clip).squeeze(0)) # (50,1024)
+        audio_list.append(audio_pe(cond_clip).squeeze(0))  # (50,1024)
         uncond_list.append(audio_pe(torch.zeros_like(cond_clip)).squeeze(0))
         motion_buckets.append(motion[0])

-    # ---- Stable
+    # ---- Stable-Video-Diffusion call ------------------------------
     video = pipe(
         ref_img, clip_img, face_mask,
         audio_list, uncond_list, motion_buckets,
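A self-contained check of the corrected padding — a sketch with random tensors, not part of the commit — showing that a short tail chunk still ends up with the shapes AudioProjModel and Audio2bucketModel expect:

import torch

clip_raw = torch.randn(1, 7, 3, 384)          # e.g. only 7 of 10 frames, 3 of 5 layers
if clip_raw.shape[1] < 10:                    # W-padding on dim=1
    pad_w = torch.zeros_like(clip_raw[:, : 10 - clip_raw.shape[1]])
    clip_raw = torch.cat([clip_raw, pad_w], dim=1)
while clip_raw.shape[2] < 5:                  # L-padding on dim=2
    clip_raw = torch.cat([clip_raw, clip_raw[:, :, -1:]], dim=2)
cond_clip = clip_raw[:, :, :5].unsqueeze(1)   # (1, 1, 10, 5, 384)

bucket_raw = torch.randn(1, 30, 1, 384)       # e.g. only 30 of 50 frames
if bucket_raw.shape[1] < 50:
    pad_w = torch.zeros_like(bucket_raw[:, : 50 - bucket_raw.shape[1]])
    bucket_raw = torch.cat([bucket_raw, pad_w], dim=1)
bucket_clip = bucket_raw.unsqueeze(1)         # (1, 1, 50, 1, 384)
print(cond_clip.shape, bucket_clip.shape)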
@@ -132,20 +128,17 @@ def test(
     return video.to(pipe.device).unsqueeze(0).cpu()


-
-
-
 # ------------------------------------------------------------------
-#
+# Sonic class
 # ------------------------------------------------------------------
 class Sonic:
     config_file = os.path.join(BASE_DIR, "config/inference/sonic.yaml")
     config = OmegaConf.load(config_file)

     def __init__(self, device_id: int = 0, enable_interpolate_frame: bool = True):
-        cfg
+        cfg = self.config
         cfg.use_interframe = enable_interpolate_frame
-        self.device
+        self.device = f"cuda:{device_id}" if device_id >= 0 and torch.cuda.is_available() else "cpu"
         cfg.pretrained_model_name_or_path = os.path.join(BASE_DIR, cfg.pretrained_model_name_or_path)

         self._load_models(cfg)
@@ -155,18 +148,18 @@ class Sonic:
     def _load_models(self, cfg):
         dtype = {"fp16": torch.float16, "fp32": torch.float32, "bf16": torch.bfloat16}[cfg.weight_dtype]

-        vae = AutoencoderKLTemporalDecoder.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="vae",
-        sched = EulerDiscreteScheduler
-
+        vae   = AutoencoderKLTemporalDecoder.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="vae", variant="fp16")
+        sched = EulerDiscreteScheduler.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="scheduler")
+        img_e = CLIPVisionModelWithProjection.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="image_encoder", variant="fp16")
         unet = UNetSpatioTemporalConditionModel.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="unet", variant="fp16")
         add_ip_adapters(unet, [32], [cfg.ip_audio_scale])

         a2t = AudioProjModel(10, 5, 384, 1024, 1024, 32).to(self.device)
         a2b = Audio2bucketModel(50, 1, 384, 1024, 1024, 1, 2).to(self.device)

-        unet
-        a2t
-        a2b
+        unet.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.unet_checkpoint_path), map_location="cpu"))
+        a2t.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2token_checkpoint_path), map_location="cpu"))
+        a2b.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2bucket_checkpoint_path), map_location="cpu"))

         whisper = WhisperModel.from_pretrained(os.path.join(BASE_DIR, "checkpoints/whisper-tiny")).to(self.device).eval()
         whisper.requires_grad_(False)
@@ -177,22 +170,21 @@ class Sonic:
         self.rife = RIFEModel(device=self.device)
         self.rife.load_model(os.path.join(BASE_DIR, "checkpoints/RIFE/"))

-
-        m.to(dtype)
+        img_e.to(dtype); vae.to(dtype); unet.to(dtype)

-        self.pipe = SonicPipeline(unet=unet, image_encoder=
-        self.image_encoder =
+        self.pipe = SonicPipeline(unet=unet, image_encoder=img_e, vae=vae, scheduler=sched).to(device=self.device, dtype=dtype)
+        self.image_encoder = img_e
         self.audio2token = a2t
         self.audio2bucket = a2b
         self.whisper = whisper

     # --------------------------------------------------------------
-    def preprocess(self,
-        img = cv2.imread(
+    def preprocess(self, img_path: str, expand_ratio: float = 1.0):
+        img = cv2.imread(img_path)
         h, w = img.shape[:2]
-        _, _,
-        if
-        x1, y1, ww, hh =
+        _, _, faces = self.face_det(img, maxface=True)
+        if faces:
+            x1, y1, ww, hh = faces[0]
             return {"face_num": 1, "crop_bbox": process_bbox((x1, y1, x1 + ww, y1 + hh), expand_ratio, h, w)}
         return {"face_num": 0, "crop_bbox": None}

@@ -200,50 +192,39 @@ class Sonic:
     @torch.no_grad()
     def process(
         self,
-
-        audio_path:
-
+        img_path: str,
+        audio_path: str,
+        out_path: str,
         min_resolution: int = 512,
-        inference_steps:
+        inference_steps: int = 25,
         dynamic_scale: float = 1.0,
         keep_resolution: bool = False,
         seed: int | None = None,
     ):
         cfg = self.config
-        if seed is not None:
-
-        cfg.
-        cfg.motion_bucket_scale = dynamic_scale
+        if seed is not None: cfg.seed = seed
+        cfg.num_inference_steps = inference_steps
+        cfg.motion_bucket_scale = dynamic_scale
         seed_everything(cfg.seed)

-
-
-
-
-            image_path,
-            audio_path,
-            limit=-1,
-            image_size=min_resolution,
-            area=cfg.area,
+        sample = image_audio_to_tensor(
+            self.face_det, self.feature_extractor,
+            img_path, audio_path,
+            limit=-1, image_size=min_resolution, area=cfg.area,
         )
-        if
+        if sample is None:
             return -1

-        h, w =
-        resolution = (
-
-        if keep_resolution
-        else f"{w}x{h}"
-        )
+        h, w = sample["ref_img"].shape[-2:]
+        resolution = (f"{(Image.open(img_path).width // 2) * 2}x{(Image.open(img_path).height // 2) * 2}"
+                      if keep_resolution else f"{w}x{h}")

-        # generate video frames
         video = test(
             self.pipe, cfg, self.whisper, self.audio2token,
             self.audio2bucket, self.image_encoder,
-
+            w, h, sample,
         )

-        # interpolate intermediate frames
         if cfg.use_interframe:
             out = video.to(self.device)
             frames = []
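One detail worth noting in the hunk above: the `// 2 * 2` arithmetic in the resolution string rounds odd image dimensions down to the nearest even value, which the libx264/yuv420p encode in the ffmpeg step below requires. A trivial illustration:

w, h = 511, 769                            # odd source dimensions
print(f"{(w // 2) * 2}x{(h // 2) * 2}")    # -> 510x768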
@@ -253,12 +234,11 @@ class Sonic:
             frames.append(out[:, :, -1])
             video = torch.stack(frames, 2).cpu()

-
-
-        save_videos_grid(video, tmp_mp4, n_rows=video.shape[0], fps=cfg.fps * (2 if cfg.use_interframe else 1))
+        tmp = out_path.replace(".mp4", "_noaudio.mp4")
+        save_videos_grid(video, tmp, n_rows=video.shape[0], fps=cfg.fps * (2 if cfg.use_interframe else 1))
         os.system(
-            f"ffmpeg -i '{
-            f"-vcodec libx264 -acodec aac -crf 18 -shortest '{
+            f"ffmpeg -i '{tmp}' -i '{audio_path}' -s {resolution} "
+            f"-vcodec libx264 -acodec aac -crf 18 -shortest '{out_path}' -y -loglevel error"
         )
-        os.remove(
+        os.remove(tmp)
         return 0
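For context, a minimal driver for the updated class might look like the following. This is a sketch only: the image, audio, and output paths are placeholders, the import assumes the module is named sonic.py as in this Space, and the checkpoints under checkpoints/ must already be downloaded.

from sonic import Sonic

pipe = Sonic(device_id=0, enable_interpolate_frame=True)

# Quick face check on the reference image before running the full pipeline.
info = pipe.preprocess("face.jpg", expand_ratio=1.0)
if info["face_num"] > 0:
    rc = pipe.process(
        "face.jpg", "speech.wav", "out.mp4",
        min_resolution=512, inference_steps=25,
        dynamic_scale=1.0, keep_resolution=False, seed=42,
    )
    print("ok" if rc == 0 else "failed")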