openfree committed on
Commit 85ad908 · verified · 1 Parent(s): 1fb410d

Update sonic.py
Files changed (1):
  1. sonic.py +91 -142
sonic.py CHANGED
@@ -1,7 +1,5 @@
- import os
- import math                      # [★ fix] for ceil calculation
  import torch
- import torch.utils.checkpoint
  from PIL import Image
  from omegaconf import OmegaConf
  from tqdm import tqdm
@@ -26,109 +24,89 @@ BASE_DIR = os.path.dirname(os.path.abspath(__file__))


  # ------------------------------------------------------------------
- # test() : single face image + a sequence of audio-frame tensors
  # ------------------------------------------------------------------
  def test(
-     pipe,
-     config,
-     wav_enc,
-     audio_pe,
-     audio2bucket,
-     image_encoder,
-     width,
-     height,
-     batch,
  ):
-     # (B,C,H,W) -> (1,B,C,H,W)
      for k, v in batch.items():
          if isinstance(v, torch.Tensor):
              batch[k] = v.unsqueeze(0).to(pipe.device).float()

-     ref_img = batch["ref_img"]
      clip_img = batch["clip_images"]
      face_mask = batch["face_mask"]
      image_embeds = image_encoder(clip_img).image_embeds

-     audio_feature = batch["audio_feature"]  # (C,T)
-     audio_len = batch["audio_len"]          # number of whisper tokens
      step = int(config.step)

-     # ----------------------------- [★ fix] -------------------------------
-     # ① window = 16000 → 1 second of audio for whisper-tiny
-     # ② if audio_len < step, shrink step to avoid an empty list
-     # ---------------------------------------------------------------------
-     window = 16000
      if audio_len < step:
          step = max(1, audio_len)

-     # ── Whisper-encode the audio in 1-second slices
-     audio_prompts, last_audio_prompts = [], []
      for i in range(0, audio_feature.shape[-1], window):
-         chunk = audio_feature[:, :, i : i + window]  # (B,C,window)

-         # whisper encoder
-         prompt_layers = wav_enc.encoder(chunk, output_hidden_states=True).hidden_states
-         last_hidden = wav_enc.encoder(chunk).last_hidden_state.unsqueeze(-2)

          audio_prompts.append(torch.stack(prompt_layers, dim=2))
-         last_audio_prompts.append(last_hidden)

-     # ── exception: abort if nothing was produced
      if len(audio_prompts) == 0:
-         raise ValueError(
-             "[ERROR] No speech recognized from the audio. "
-             "Please provide a valid speech recording."
-         )

-     # reconstruct the Whisper token sequence (+ model padding rule)
-     audio_prompts = torch.cat(audio_prompts, dim=1)[:, : audio_len * 2]
-     audio_prompts = torch.cat(
-         [torch.zeros_like(audio_prompts[:, :4]), audio_prompts, torch.zeros_like(audio_prompts[:, :6])],
-         dim=1,
-     )
-
-     last_audio_prompts = torch.cat(last_audio_prompts, dim=1)[:, : audio_len * 2]
-     last_audio_prompts = torch.cat(
-         [torch.zeros_like(last_audio_prompts[:, :24]), last_audio_prompts, torch.zeros_like(last_audio_prompts[:, :26])],
-         dim=1,
-     )
-
-     # ---------------------------------------------------------------------
-     # total number of chunks (ceil), reflecting the adjusted step
-     # ---------------------------------------------------------------------
-     num_chunks = math.ceil(audio_len / step)
-
-     ref_tensor_list, audio_tensor_list, uncond_audio_tensor_list, motion_buckets = [], [], [], []
      for i in tqdm(range(num_chunks)):
          start = i * 2 * step
-         audio_clip = audio_prompts[:, start : start + 10].unsqueeze(0)
-         audio_clip_for_bucket = last_audio_prompts[:, start : start + 50].unsqueeze(0)

-         motion_bucket = audio2bucket(audio_clip_for_bucket, image_embeds) * 16 + 16
-         motion_buckets.append(motion_bucket[0])

-         cond_audio = audio_pe(audio_clip).squeeze(0)
-         uncond_audio = audio_pe(torch.zeros_like(audio_clip)).squeeze(0)

-         ref_tensor_list.append(ref_img[0])
-         audio_tensor_list.append(cond_audio[0])
-         uncond_audio_tensor_list.append(uncond_audio[0])

-     # guard against an empty list
-     if len(audio_tensor_list) == 0:
-         raise ValueError("[ERROR] Audio too short for the configured 'step' size; no frames produced.")

-     # ---------------------------------------------------------------------
      video = pipe(
-         ref_img,
-         clip_img,
-         face_mask,
-         audio_tensor_list,
-         uncond_audio_tensor_list,
-         motion_buckets,
-         height=height,
-         width=width,
-         num_frames=len(audio_tensor_list),
          decode_chunk_size=config.decode_chunk_size,
          motion_bucket_scale=config.motion_bucket_scale,
          fps=config.fps,
@@ -143,81 +121,60 @@ def test(
          num_inference_steps=config.num_inference_steps,
          i2i_noise_strength=config.i2i_noise_strength,
      ).frames
-     # ---------------------------------------------------------------------

      video = (video * 0.5 + 0.5).clamp(0, 1)
      return video.to(pipe.device).unsqueeze(0).cpu()


  # ------------------------------------------------------------------
- # Sonic class
  # ------------------------------------------------------------------
  class Sonic:
      config_file = os.path.join(BASE_DIR, "config/inference/sonic.yaml")
      config = OmegaConf.load(config_file)

      def __init__(self, device_id: int = 0, enable_interpolate_frame: bool = True):
-         cfg = self.config
-         cfg.use_interframe = enable_interpolate_frame
-         self.device = f"cuda:{device_id}" if device_id >= 0 and torch.cuda.is_available() else "cpu"
          cfg.pretrained_model_name_or_path = os.path.join(BASE_DIR, cfg.pretrained_model_name_or_path)

-         # ───────────── load models
          self._load_models(cfg)
          print("Sonic init done")

-     # --------------------------------------------------------------
-     # model / pipeline loader
      # --------------------------------------------------------------
      def _load_models(self, cfg):
-         dtype_map = {"fp16": torch.float16, "fp32": torch.float32, "bf16": torch.bfloat16}
-         weight_dtype = dtype_map.get(cfg.weight_dtype, torch.float32)

-         # backbone
-         vae = AutoencoderKLTemporalDecoder.from_pretrained(
-             cfg.pretrained_model_name_or_path, subfolder="vae", variant="fp16"
-         )
-         scheduler = EulerDiscreteScheduler.from_pretrained(
-             cfg.pretrained_model_name_or_path, subfolder="scheduler"
-         )
-         image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-             cfg.pretrained_model_name_or_path, subfolder="image_encoder", variant="fp16"
-         )
-         unet = UNetSpatioTemporalConditionModel.from_pretrained(
-             cfg.pretrained_model_name_or_path, subfolder="unet", variant="fp16"
-         )
          add_ip_adapters(unet, [32], [cfg.ip_audio_scale])

-         # audio adapters
-         audio2token = AudioProjModel(10, 5, 384, 1024, 1024, 32).to(self.device)
-         audio2bucket = Audio2bucketModel(50, 1, 384, 1024, 1024, 1, 2).to(self.device)

-         # checkpoints
          unet.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.unet_checkpoint_path), map_location="cpu"))
-         audio2token.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2token_checkpoint_path), map_location="cpu"))
-         audio2bucket.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2bucket_checkpoint_path), map_location="cpu"))

-         # whisper
          whisper = WhisperModel.from_pretrained(os.path.join(BASE_DIR, "checkpoints/whisper-tiny")).to(self.device).eval()
          whisper.requires_grad_(False)

-         # extras
          self.feature_extractor = AutoFeatureExtractor.from_pretrained(os.path.join(BASE_DIR, "checkpoints/whisper-tiny"))
          self.face_det = AlignImage(self.device, det_path=os.path.join(BASE_DIR, "checkpoints/yoloface_v5m.pt"))
          if cfg.use_interframe:
              self.rife = RIFEModel(device=self.device)
              self.rife.load_model(os.path.join(BASE_DIR, "checkpoints/RIFE/"))

-         # dtype
-         for m in (image_encoder, vae, unet):
-             m.to(weight_dtype)

-         # pipeline
-         pipe = SonicPipeline(unet=unet, image_encoder=image_encoder, vae=vae, scheduler=scheduler)
-         self.pipe = pipe.to(device=self.device, dtype=weight_dtype)
-         self.audio2token = audio2token
-         self.audio2bucket = audio2bucket
-         self.image_encoder = image_encoder
          self.whisper = whisper

      # --------------------------------------------------------------
@@ -227,9 +184,7 @@ class Sonic:
          _, _, bboxes = self.face_det(img, maxface=True)
          if bboxes:
              x1, y1, ww, hh = bboxes[0]
-             bbox = (x1, y1, x1 + ww, y1 + hh)
-             crop_bbox = process_bbox(bbox, expand_radio=expand_ratio, height=h, width=w)
-             return {"face_num": len(bboxes), "crop_bbox": crop_bbox}
          return {"face_num": 0, "crop_bbox": None}

      # --------------------------------------------------------------
@@ -248,19 +203,17 @@ class Sonic:
          cfg = self.config
          if seed is not None:
              cfg.seed = seed
-         cfg.num_inference_steps = inference_steps
-         cfg.motion_bucket_scale = dynamic_scale
          seed_everything(cfg.seed)

-         # ----------------------------------------------------------
-         # image · audio → tensor
-         # ----------------------------------------------------------
          test_data = image_audio_to_tensor(
              self.face_det,
              self.feature_extractor,
              image_path,
              audio_path,
-             limit=-1,  # use the full audio
              image_size=min_resolution,
              area=cfg.area,
          )
@@ -269,14 +222,12 @@ class Sonic:

          h, w = test_data["ref_img"].shape[-2:]
          resolution = (
-             f"{(Image.open(image_path).width // 2)*2}x{(Image.open(image_path).height // 2)*2}"
              if keep_resolution
              else f"{w}x{h}"
          )

-         # ----------------------------------------------------------
-         # generate frames
-         # ----------------------------------------------------------
          video = test(
              self.pipe,
              cfg,
@@ -291,22 +242,20 @@ class Sonic:

          # intermediate-frame interpolation
          if cfg.use_interframe:
-             out, results = video.to(self.device), []
              for i in tqdm(range(out.shape[2] - 1), ncols=0):
-                 I1, I2 = out[:, :, i], out[:, :, i + 1]
-                 middle = self.rife.inference(I1, I2).clamp(0, 1).detach()
-                 results.extend([out[:, :, i], middle])
-             results.append(out[:, :, -1])
-             video = torch.stack(results, 2).cpu()
-
-         # ----------------------------------------------------------
-         # save files
-         # ----------------------------------------------------------
-         tmp_video = output_path.replace(".mp4", "_noaudio.mp4")
-         save_videos_grid(video, tmp_video, n_rows=video.shape[0], fps=cfg.fps * (2 if cfg.use_interframe else 1))
          os.system(
-             f"ffmpeg -i '{tmp_video}' -i '{audio_path}' -s {resolution} "
              f"-vcodec libx264 -acodec aac -crf 18 -shortest '{output_path}' -y -loglevel error"
          )
-         os.remove(tmp_video)
          return 0
 
+ import os, math
  import torch
  from PIL import Image
  from omegaconf import OmegaConf
  from tqdm import tqdm


  # ------------------------------------------------------------------
+ # single image + speech video-tensor generator
  # ------------------------------------------------------------------
  def test(
+     pipe, config, wav_enc, audio_pe, audio2bucket, image_encoder,
+     width, height, batch,
  ):
+     # --- match batch dimensions --------------------------------------------
      for k, v in batch.items():
          if isinstance(v, torch.Tensor):
              batch[k] = v.unsqueeze(0).to(pipe.device).float()

+     ref_img = batch["ref_img"]               # (1,C,H,W)
      clip_img = batch["clip_images"]
      face_mask = batch["face_mask"]
      image_embeds = image_encoder(clip_img).image_embeds

+     audio_feature = batch["audio_feature"]   # (1,80,T)
+     audio_len = int(batch["audio_len"])      # Python int
      step = int(config.step)

+     # --- [★ fix] clamp step (minimum 1) -------------------------------------
      if audio_len < step:
          step = max(1, audio_len)

+     window = 16000                           # 1-second window
+     audio_prompts, last_prompts = [], []
+
+     # --- Whisper encoding per window -----------------------------------------
      for i in range(0, audio_feature.shape[-1], window):
+         chunk = audio_feature[:, :, i : i + window]

+         prompt_layers = wav_enc.encoder(chunk, output_hidden_states=True).hidden_states
+         last_hidden = wav_enc.encoder(chunk).last_hidden_state.unsqueeze(-2)

          audio_prompts.append(torch.stack(prompt_layers, dim=2))
+         last_prompts.append(last_hidden)

      if len(audio_prompts) == 0:
+         raise ValueError("[ERROR] No speech recognised in the provided audio.")
+
+     audio_prompts = torch.cat(audio_prompts, dim=1)
+     last_prompts = torch.cat(last_prompts, dim=1)
+
+     # padding rule
+     audio_prompts = torch.cat(
+         [torch.zeros_like(audio_prompts[:, :4]), audio_prompts,
+          torch.zeros_like(audio_prompts[:, :6])], dim=1)
+     last_prompts = torch.cat(
+         [torch.zeros_like(last_prompts[:, :24]), last_prompts,
+          torch.zeros_like(last_prompts[:, :26])], dim=1)
+
+     # --- [★ fix] always ≥ 1 chunk --------------------------------------------
+     total_tokens = audio_prompts.shape[1]
+     num_chunks = max(1, math.ceil(total_tokens / (2 * step)))
+
+     ref_list, audio_list, uncond_list, motion_buckets = [], [], [], []

      for i in tqdm(range(num_chunks)):
          start = i * 2 * step

+         cond_clip = audio_prompts[:, start : start + 10]
+         if cond_clip.shape[2] < 10:          # [★ fix] pad
+             pad = torch.zeros_like(cond_clip[:, :, : 10 - cond_clip.shape[2]])
+             cond_clip = torch.cat([cond_clip, pad], dim=2)

+         bucket_clip = last_prompts[:, start : start + 50]
+         if bucket_clip.shape[2] < 50:        # [★ fix] pad
+             pad = torch.zeros_like(bucket_clip[:, :, : 50 - bucket_clip.shape[2]])
+             bucket_clip = torch.cat([bucket_clip, pad], dim=2)

+         motion = audio2bucket(bucket_clip, image_embeds) * 16 + 16

+         ref_list.append(ref_img[0])
+         audio_list.append(audio_pe(cond_clip).squeeze(0)[0])
+         uncond_list.append(audio_pe(torch.zeros_like(cond_clip)).squeeze(0)[0])
+         motion_buckets.append(motion[0])

+     # -----------------------------------------------------------------------
      video = pipe(
+         ref_img, clip_img, face_mask,
+         audio_list, uncond_list, motion_buckets,
+         height=height, width=width,
+         num_frames=len(audio_list),
          decode_chunk_size=config.decode_chunk_size,
          motion_bucket_scale=config.motion_bucket_scale,
          fps=config.fps,

          num_inference_steps=config.num_inference_steps,
          i2i_noise_strength=config.i2i_noise_strength,
      ).frames

      video = (video * 0.5 + 0.5).clamp(0, 1)
      return video.to(pipe.device).unsqueeze(0).cpu()
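
The chunk count and tail padding introduced above can be sanity-checked in isolation. The snippet below is a minimal, illustrative sketch with dummy tensors and hypothetical sizes (and it pads along the token axis for simplicity); it is not part of sonic.py.

import math
import torch

# Dummy stand-in for the padded Whisper features: (batch, tokens, dim).
total_tokens, step = 37, 4
audio_prompts = torch.zeros(1, total_tokens, 384)

# Same guard as in the commit: even a very short clip yields at least one chunk.
num_chunks = max(1, math.ceil(total_tokens / (2 * step)))

clips = []
for i in range(num_chunks):
    start = i * 2 * step
    clip = audio_prompts[:, start : start + 10]       # the last slice may be shorter
    if clip.shape[1] < 10:                            # pad to a fixed length
        pad = torch.zeros_like(clip[:, : 10 - clip.shape[1]])
        clip = torch.cat([clip, pad], dim=1)
    clips.append(clip)

print(num_chunks, clips[-1].shape)                    # 5 torch.Size([1, 10, 384])
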


  # ------------------------------------------------------------------
+ # Sonic class
  # ------------------------------------------------------------------
  class Sonic:
      config_file = os.path.join(BASE_DIR, "config/inference/sonic.yaml")
      config = OmegaConf.load(config_file)

      def __init__(self, device_id: int = 0, enable_interpolate_frame: bool = True):
+         cfg = self.config
+         cfg.use_interframe = enable_interpolate_frame
+         self.device = f"cuda:{device_id}" if device_id >= 0 and torch.cuda.is_available() else "cpu"
          cfg.pretrained_model_name_or_path = os.path.join(BASE_DIR, cfg.pretrained_model_name_or_path)

          self._load_models(cfg)
          print("Sonic init done")

      # --------------------------------------------------------------
      def _load_models(self, cfg):
+         dtype = {"fp16": torch.float16, "fp32": torch.float32, "bf16": torch.bfloat16}[cfg.weight_dtype]

+         vae = AutoencoderKLTemporalDecoder.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="vae", variant="fp16")
+         sched = EulerDiscreteScheduler.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="scheduler")
+         image_enc = CLIPVisionModelWithProjection.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="image_encoder", variant="fp16")
+         unet = UNetSpatioTemporalConditionModel.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="unet", variant="fp16")
          add_ip_adapters(unet, [32], [cfg.ip_audio_scale])

+         a2t = AudioProjModel(10, 5, 384, 1024, 1024, 32).to(self.device)
+         a2b = Audio2bucketModel(50, 1, 384, 1024, 1024, 1, 2).to(self.device)

          unet.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.unet_checkpoint_path), map_location="cpu"))
+         a2t.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2token_checkpoint_path), map_location="cpu"))
+         a2b.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2bucket_checkpoint_path), map_location="cpu"))

          whisper = WhisperModel.from_pretrained(os.path.join(BASE_DIR, "checkpoints/whisper-tiny")).to(self.device).eval()
          whisper.requires_grad_(False)

          self.feature_extractor = AutoFeatureExtractor.from_pretrained(os.path.join(BASE_DIR, "checkpoints/whisper-tiny"))
          self.face_det = AlignImage(self.device, det_path=os.path.join(BASE_DIR, "checkpoints/yoloface_v5m.pt"))
          if cfg.use_interframe:
              self.rife = RIFEModel(device=self.device)
              self.rife.load_model(os.path.join(BASE_DIR, "checkpoints/RIFE/"))

+         for m in (image_enc, vae, unet):
+             m.to(dtype)

+         self.pipe = SonicPipeline(unet=unet, image_encoder=image_enc, vae=vae, scheduler=sched).to(device=self.device, dtype=dtype)
+         self.image_encoder = image_enc
+         self.audio2token = a2t
+         self.audio2bucket = a2b
          self.whisper = whisper

      # --------------------------------------------------------------

          _, _, bboxes = self.face_det(img, maxface=True)
          if bboxes:
              x1, y1, ww, hh = bboxes[0]
+             return {"face_num": 1, "crop_bbox": process_bbox((x1, y1, x1 + ww, y1 + hh), expand_ratio, h, w)}
          return {"face_num": 0, "crop_bbox": None}

      # --------------------------------------------------------------

          cfg = self.config
          if seed is not None:
              cfg.seed = seed
+         cfg.num_inference_steps = inference_steps
+         cfg.motion_bucket_scale = dynamic_scale
          seed_everything(cfg.seed)

+         # image · audio → tensor
          test_data = image_audio_to_tensor(
              self.face_det,
              self.feature_extractor,
              image_path,
              audio_path,
+             limit=-1,
              image_size=min_resolution,
              area=cfg.area,
          )

          h, w = test_data["ref_img"].shape[-2:]
          resolution = (
+             f"{(Image.open(image_path).width // 2) * 2}x{(Image.open(image_path).height // 2) * 2}"
              if keep_resolution
              else f"{w}x{h}"
          )

+         # generate video frames
          video = test(
              self.pipe,
              cfg,

          # intermediate-frame interpolation
          if cfg.use_interframe:
+             out = video.to(self.device)
+             frames = []
              for i in tqdm(range(out.shape[2] - 1), ncols=0):
+                 mid = self.rife.inference(out[:, :, i], out[:, :, i + 1]).clamp(0, 1).detach()
+                 frames.extend([out[:, :, i], mid])
+             frames.append(out[:, :, -1])
+             video = torch.stack(frames, 2).cpu()
+
+         # save
+         tmp = output_path.replace(".mp4", "_noaudio.mp4")
+         save_videos_grid(video, tmp, n_rows=video.shape[0], fps=cfg.fps * (2 if cfg.use_interframe else 1))
          os.system(
+             f"ffmpeg -i '{tmp}' -i '{audio_path}' -s {resolution} "
              f"-vcodec libx264 -acodec aac -crf 18 -shortest '{output_path}' -y -loglevel error"
          )
+         os.remove(tmp)
          return 0
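
For context, a minimal driver for the updated class could look like the sketch below. The driver method name and exact signature are assumptions inferred from the parameters visible in this diff (the definition lines fall outside the shown hunks), so they may differ from the actual sonic.py API.

# Hypothetical usage sketch -- method name and signature assumed, not shown in this diff.
from sonic import Sonic                  # assumes sonic.py is importable from the working directory

pipeline = Sonic(device_id=0, enable_interpolate_frame=True)

pipeline.process(                        # assumed driver method
    image_path="examples/face.png",      # hypothetical input paths
    audio_path="examples/speech.wav",
    output_path="outputs/result.mp4",
    min_resolution=512,
    inference_steps=25,
    dynamic_scale=1.0,
    keep_resolution=False,
    seed=42,
)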