openfree committed
Commit 9d31513 · verified · 1 Parent(s): f40c908

Update app.py

Files changed (1)
  1. app.py +123 -232
app.py CHANGED
@@ -1,244 +1,135 @@
-import os, math, torch, cv2
 from PIL import Image
-from omegaconf import OmegaConf
-from tqdm import tqdm
 
-from diffusers import AutoencoderKLTemporalDecoder
-from diffusers.schedulers import EulerDiscreteScheduler
-from transformers import WhisperModel, CLIPVisionModelWithProjection, AutoFeatureExtractor
 
-from src.utils.util import save_videos_grid, seed_everything
-from src.dataset.test_preprocess import process_bbox, image_audio_to_tensor
-from src.models.base.unet_spatio_temporal_condition import (
-    UNetSpatioTemporalConditionModel, add_ip_adapters,
 )
-from src.pipelines.pipeline_sonic import SonicPipeline
-from src.models.audio_adapter.audio_proj import AudioProjModel
-from src.models.audio_adapter.audio_to_bucket import Audio2bucketModel
-from src.utils.RIFE.RIFE_HDv3 import RIFEModel
-from src.dataset.face_align.align import AlignImage
-
-BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
 # ------------------------------------------------------------------
-# single image + speech → video-tensor generator
 # ------------------------------------------------------------------
-def test(
-    pipe, config, wav_enc, audio_pe, audio2bucket, image_encoder,
-    width, height, batch,
-):
-    # ---- add a batch dimension ------------------------------------
-    for k, v in batch.items():
-        if isinstance(v, torch.Tensor):
-            batch[k] = v.unsqueeze(0).to(pipe.device).float()
-
-    ref_img = batch["ref_img"]
-    clip_img = batch["clip_images"]
-    face_mask = batch["face_mask"]
-    image_embeds = image_encoder(clip_img).image_embeds  # (1, 1024)
-
-    audio_feature = batch["audio_feature"]  # (1, 80, T)
-    audio_len = int(batch["audio_len"])
-    step = int(config.step)
-
-    window = 16_000  # 1-sec chunks
-    audio_prompts, last_prompts = [], []
-
-    for i in range(0, audio_feature.shape[-1], window):
-        chunk = audio_feature[:, :, i : i + window]  # (1, 80, win)
-        layers = wav_enc.encoder(chunk, output_hidden_states=True).hidden_states
-        last = wav_enc.encoder(chunk).last_hidden_state.unsqueeze(-2)
-        audio_prompts.append(torch.stack(layers, dim=2))  # (1, w, L, 384)
-        last_prompts.append(last)
-
-    if not audio_prompts:
-        raise ValueError("[ERROR] No speech recognised in the provided audio.")
-
-    audio_prompts = torch.cat(audio_prompts, dim=1)
-    last_prompts = torch.cat(last_prompts, dim=1)
-
-    # padding rules
-    audio_prompts = torch.cat(
-        [torch.zeros_like(audio_prompts[:, :4]), audio_prompts,
-         torch.zeros_like(audio_prompts[:, :6])], dim=1)
-    last_prompts = torch.cat(
-        [torch.zeros_like(last_prompts[:, :24]), last_prompts,
-         torch.zeros_like(last_prompts[:, :26])], dim=1)
-
-    total_tokens = audio_prompts.shape[1]
-    num_chunks = max(1, math.ceil(total_tokens / (2 * step)))
-
-    ref_list, audio_list, uncond_list, motion_buckets = [], [], [], []
-
-    for i in tqdm(range(num_chunks)):
-        start = i * 2 * step
-
-        # ------------ cond_clip : (1,1,10,5,384) ------------------
-        clip_raw = audio_prompts[:, start : start + 10]  # (1, ≤10, L, 384)
-
-        # ★ W-padding must be applied along dim=1!
-        if clip_raw.shape[1] < 10:
-            pad_w = torch.zeros_like(clip_raw[:, : 10 - clip_raw.shape[1]])
-            clip_raw = torch.cat([clip_raw, pad_w], dim=1)
-
-        # ★ L-padding goes along dim=2
-        while clip_raw.shape[2] < 5:
-            clip_raw = torch.cat([clip_raw, clip_raw[:, :, -1:]], dim=2)
-        clip_raw = clip_raw[:, :, :5]  # (1,10,5,384)
-
-        cond_clip = clip_raw.unsqueeze(1)  # (1,1,10,5,384)
-
-        # ------------ bucket_clip : (1,1,50,1,384) -----------------
-        bucket_raw = last_prompts[:, start : start + 50]
-        if bucket_raw.shape[1] < 50:  # ★ dim=1
-            pad_w = torch.zeros_like(bucket_raw[:, : 50 - bucket_raw.shape[1]])
-            bucket_raw = torch.cat([bucket_raw, pad_w], dim=1)
-        bucket_clip = bucket_raw.unsqueeze(1)  # (1,1,50,1,384)
-
-        motion = audio2bucket(bucket_clip, image_embeds) * 16 + 16
-
-        ref_list.append(ref_img[0])
-        audio_list.append(audio_pe(cond_clip).squeeze(0))  # (50, 1024)
-        uncond_list.append(audio_pe(torch.zeros_like(cond_clip)).squeeze(0))
-        motion_buckets.append(motion[0])
-
-    # ---- call Stable-Video-Diffusion ------------------------------
-    video = pipe(
-        ref_img, clip_img, face_mask,
-        audio_list, uncond_list, motion_buckets,
-        height=height, width=width,
-        num_frames=len(audio_list),
-        decode_chunk_size=config.decode_chunk_size,
-        motion_bucket_scale=config.motion_bucket_scale,
-        fps=config.fps,
-        noise_aug_strength=config.noise_aug_strength,
-        min_guidance_scale1=config.min_appearance_guidance_scale,
-        max_guidance_scale1=config.max_appearance_guidance_scale,
-        min_guidance_scale2=config.audio_guidance_scale,
-        max_guidance_scale2=config.audio_guidance_scale,
-        overlap=config.overlap,
-        shift_offset=config.shift_offset,
-        frames_per_batch=config.n_sample_frames,
-        num_inference_steps=config.num_inference_steps,
-        i2i_noise_strength=config.i2i_noise_strength,
-    ).frames
-
-    video = (video * 0.5 + 0.5).clamp(0, 1)
-    return video.to(pipe.device).unsqueeze(0).cpu()
 
 # ------------------------------------------------------------------
-# Sonic class
 # ------------------------------------------------------------------
-class Sonic:
-    config_file = os.path.join(BASE_DIR, "config/inference/sonic.yaml")
-    config = OmegaConf.load(config_file)
-
-    def __init__(self, device_id: int = 0, enable_interpolate_frame: bool = True):
-        cfg = self.config
-        cfg.use_interframe = enable_interpolate_frame
-        self.device = f"cuda:{device_id}" if device_id >= 0 and torch.cuda.is_available() else "cpu"
-        cfg.pretrained_model_name_or_path = os.path.join(BASE_DIR, cfg.pretrained_model_name_or_path)
-
-        self._load_models(cfg)
-        print("Sonic init done")
-
-    # --------------------------------------------------------------
-    def _load_models(self, cfg):
-        dtype = {"fp16": torch.float16, "fp32": torch.float32, "bf16": torch.bfloat16}[cfg.weight_dtype]
-
-        vae = AutoencoderKLTemporalDecoder.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="vae", variant="fp16")
-        sched = EulerDiscreteScheduler.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="scheduler")
-        img_e = CLIPVisionModelWithProjection.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="image_encoder", variant="fp16")
-        unet = UNetSpatioTemporalConditionModel.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="unet", variant="fp16")
-        add_ip_adapters(unet, [32], [cfg.ip_audio_scale])
-
-        a2t = AudioProjModel(10, 5, 384, 1024, 1024, 32).to(self.device)
-        a2b = Audio2bucketModel(50, 1, 384, 1024, 1024, 1, 2).to(self.device)
-
-        unet.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.unet_checkpoint_path), map_location="cpu"))
-        a2t.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2token_checkpoint_path), map_location="cpu"))
-        a2b.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2bucket_checkpoint_path), map_location="cpu"))
-
-        whisper = WhisperModel.from_pretrained(os.path.join(BASE_DIR, "checkpoints/whisper-tiny")).to(self.device).eval()
-        whisper.requires_grad_(False)
-
-        self.feature_extractor = AutoFeatureExtractor.from_pretrained(os.path.join(BASE_DIR, "checkpoints/whisper-tiny"))
-        self.face_det = AlignImage(self.device, det_path=os.path.join(BASE_DIR, "checkpoints/yoloface_v5m.pt"))
-        if cfg.use_interframe:
-            self.rife = RIFEModel(device=self.device)
-            self.rife.load_model(os.path.join(BASE_DIR, "checkpoints/RIFE/"))
-
-        img_e.to(dtype); vae.to(dtype); unet.to(dtype)
-
-        self.pipe = SonicPipeline(unet=unet, image_encoder=img_e, vae=vae, scheduler=sched).to(device=self.device, dtype=dtype)
-        self.image_encoder = img_e
-        self.audio2token = a2t
-        self.audio2bucket = a2b
-        self.whisper = whisper
-
-    # --------------------------------------------------------------
-    def preprocess(self, img_path: str, expand_ratio: float = 1.0):
-        img = cv2.imread(img_path)
-        h, w = img.shape[:2]
-        _, _, faces = self.face_det(img, maxface=True)
-        if faces:
-            x1, y1, ww, hh = faces[0]
-            return {"face_num": 1, "crop_bbox": process_bbox((x1, y1, x1 + ww, y1 + hh), expand_ratio, h, w)}
-        return {"face_num": 0, "crop_bbox": None}
-
-    # --------------------------------------------------------------
-    @torch.no_grad()
-    def process(
-        self,
-        img_path: str,
-        audio_path: str,
-        out_path: str,
-        min_resolution: int = 512,
-        inference_steps: int = 25,
-        dynamic_scale: float = 1.0,
-        keep_resolution: bool = False,
-        seed: int | None = None,
-    ):
-        cfg = self.config
-        if seed is not None: cfg.seed = seed
-        cfg.num_inference_steps = inference_steps
-        cfg.motion_bucket_scale = dynamic_scale
-        seed_everything(cfg.seed)
-
-        sample = image_audio_to_tensor(
-            self.face_det, self.feature_extractor,
-            img_path, audio_path,
-            limit=-1, image_size=min_resolution, area=cfg.area,
-        )
-        if sample is None:
-            return -1
-
-        h, w = sample["ref_img"].shape[-2:]
-        resolution = (f"{(Image.open(img_path).width // 2) * 2}x{(Image.open(img_path).height // 2) * 2}"
-                      if keep_resolution else f"{w}x{h}")
-
-        video = test(
-            self.pipe, cfg, self.whisper, self.audio2token,
-            self.audio2bucket, self.image_encoder,
-            w, h, sample,
-        )
-
-        if cfg.use_interframe:
-            out = video.to(self.device)
-            frames = []
-            for i in tqdm(range(out.shape[2] - 1), ncols=0):
-                mid = self.rife.inference(out[:, :, i], out[:, :, i + 1]).clamp(0, 1).detach()
-                frames.extend([out[:, :, i], mid])
-            frames.append(out[:, :, -1])
-            video = torch.stack(frames, 2).cpu()
-
-        tmp = out_path.replace(".mp4", "_noaudio.mp4")
-        save_videos_grid(video, tmp, n_rows=video.shape[0], fps=cfg.fps * (2 if cfg.use_interframe else 1))
-        os.system(
-            f"ffmpeg -i '{tmp}' -i '{audio_path}' -s {resolution} "
-            f"-vcodec libx264 -acodec aac -crf 18 -shortest '{out_path}' -y -loglevel error"
-        )
-        os.remove(tmp)
-        return 0
 
+# ---------------------------------------------------------
+# app.py – Gradio UI + inference wrapper for the revised Sonic
+# ---------------------------------------------------------
+import os, io, hashlib
+import numpy as np
+from pydub import AudioSegment
 from PIL import Image
+import gradio as gr
+import spaces
 
+from sonic import Sonic  # ← uses the newly revised sonic.py
 
+# ------------------------------------------------------------------
+# 1. Auto-download the required resources (models) ── HF Spaces reuses its cache
+# ------------------------------------------------------------------
+os.system(
+    'python3 -m pip install "huggingface_hub[cli]" accelerate -q; '
+    'huggingface-cli download LeonJoe13/Sonic '
+    ' --local-dir checkpoints -q; '
+    'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt '
+    ' --local-dir checkpoints/stable-video-diffusion-img2vid-xt -q; '
+    'huggingface-cli download openai/whisper-tiny '
+    ' --local-dir checkpoints/whisper-tiny -q'
 )
 
+pipe = Sonic()  # claims GPU memory immediately
 
 # ------------------------------------------------------------------
+# 2. Utilities
 # ------------------------------------------------------------------
+def md5(b: bytes) -> str:
+    return hashlib.md5(b).hexdigest()
 
+ TMP_DIR = "./tmp_path"; os.makedirs(TMP_DIR, exist_ok=True)
35
+ RES_DIR = "./res_path"; os.makedirs(RES_DIR, exist_ok=True)
36
+
37
+ # ------------------------------------------------------------------
38
+ # 3. Sonic 실행 (GPU 태그 10 min)
39
+ # ------------------------------------------------------------------
40
+ @spaces.GPU(duration=600)
41
+ def get_video_res(img_path, wav_path, out_path, dyn_scale=1.0):
42
+ """실제 Sonic 파이프라인 실행."""
43
+ audio = AudioSegment.from_file(wav_path)
44
+ dur_s = len(audio) / 1000.0 # 초
45
+
46
+ # 프레임 수 ≈ 초당 12.5 → inference_steps
47
+ inf_steps = max(25, min(int(dur_s * 12.5), 750))
48
+ print(f"[INFO] Audio duration: {dur_s:.2f}s → inference_steps={inf_steps}")
49
+
50
+ # 얼굴 사전 검출(디버그용 로그)
51
+ face_info = pipe.preprocess(img_path)
52
+ print(f"[INFO] Face detection info: {face_info}")
53
+
54
+ if face_info["face_num"] == 0:
55
+ return -1 # 얼굴 없음
56
+
57
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
58
+ pipe.process(
59
+ img_path, wav_path, out_path,
60
+ inference_steps=inf_steps,
61
+ dynamic_scale=dyn_scale,
62
+ min_resolution=512,
63
+ )
64
+ return out_path
65
 
66
  # ------------------------------------------------------------------
67
+ # 4. Gradio 인터페이스
68
  # ------------------------------------------------------------------
69
+ def process_sonic(img: Image.Image, audio_tuple, dyn_scale):
70
+ if img is None:
71
+ raise gr.Error("Please upload an image.")
72
+ if audio_tuple is None:
73
+ raise gr.Error("Please upload an audio file.")
74
+
75
+ # ---- 캐싱 키 ----------------------------------------------------
76
+ img_bytes = io.BytesIO(); img.save(img_bytes, format="PNG")
77
+ img_key = md5(img_bytes.getvalue())
78
+
79
+ rate, arr = audio_tuple
80
+ if arr.ndim == 1:
81
+ arr = arr[:, None]
82
+ segment = AudioSegment(
83
+ arr.tobytes(), frame_rate=rate,
84
+ sample_width=arr.dtype.itemsize, channels=arr.shape[1]
85
+ ).set_channels(1).set_frame_rate(16_000)
86
+
87
+ segment = segment[:60_000] # ≤60 s
88
+ buf_audio = io.BytesIO(); segment.export(buf_audio, format="wav")
89
+ aud_key = md5(buf_audio.getvalue())
90
+
91
+ img_path = os.path.join(TMP_DIR, f"{img_key}.png")
92
+ wav_path = os.path.join(TMP_DIR, f"{aud_key}.wav")
93
+ out_path = os.path.join(RES_DIR, f"{img_key}_{aud_key}_{dyn_scale}.mp4")
94
+
95
+ # ---- 캐시 저장 --------------------------------------------------
96
+ if not os.path.exists(img_path):
97
+ with open(img_path, "wb") as f: f.write(img_bytes.getvalue())
98
+ if not os.path.exists(wav_path):
99
+ with open(wav_path, "wb") as f: f.write(buf_audio.getvalue())
100
+
101
+ if os.path.exists(out_path):
102
+ print(f"[INFO] Using cached result: {out_path}")
103
+ return out_path
104
+
105
+ print(f"[INFO] Generating new video with dynamic_scale={dyn_scale}")
106
+ res = get_video_res(img_path, wav_path, out_path, dyn_scale)
107
+ if res == -1:
108
+ raise gr.Error("No face detected in the image.")
109
+ return res
110
+
111
+ # ---- Gradio UI -----------------------------------------------------
112
+ CSS = """
113
+ .gradio-container {font-family: 'Arial', sans-serif;}
114
+ .main-header {text-align:center;color:#2a2a2a;margin-bottom:2em;}
115
+ """
116
+
117
+ with gr.Blocks(css=CSS) as demo:
118
+ gr.HTML("""
119
+ <div class="main-header">
120
+ <h1>🎭 Sonic Portrait Animation (≤60 s audio)</h1>
121
+ <p>Still image → talking-head video, driven by your voice.</p>
122
+ </div>
123
+ """)
124
+
125
+ with gr.Row():
126
+ with gr.Column():
127
+ img_in = gr.Image(type="pil", label="Portrait Image")
128
+ aud_in = gr.Audio(type="numpy", label="Voice (≤1 min)")
129
+ dyn_sl = gr.Slider(0.5, 2.0, 1.0, 0.1, label="Animation Intensity")
130
+ btn_go = gr.Button("Generate", variant="primary")
131
+ vid_out = gr.Video(label="Result")
132
+
133
+ btn_go.click(process_sonic, inputs=[img_in, aud_in, dyn_sl], outputs=vid_out)
134
+
135
+ demo.launch(share=True)
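
For reference only (not part of the commit): a minimal standalone sketch of the audio-duration → inference-steps mapping and the md5-keyed cache-path scheme introduced in the new app.py, so the constants (12.5 steps per second, the 25–750 clamp, the "{img_key}_{aud_key}_{dyn_scale}.mp4" naming) can be sanity-checked offline. The helper names inference_steps and cache_paths are illustrative and do not exist in the repository.

    # sketch.py – mirrors the step-count and cache-key logic of app.py
    import hashlib
    import os

    STEPS_PER_SECOND = 12.5   # "frames ≈ 12.5 per second → inference_steps"
    MIN_STEPS, MAX_STEPS = 25, 750

    def inference_steps(duration_s: float) -> int:
        """Clamp duration * 12.5 into [25, 750], as get_video_res() does."""
        return max(MIN_STEPS, min(int(duration_s * STEPS_PER_SECOND), MAX_STEPS))

    def cache_paths(img_bytes: bytes, wav_bytes: bytes, dyn_scale: float,
                    tmp_dir: str = "./tmp_path", res_dir: str = "./res_path"):
        """Build the md5-keyed temp/result paths used by process_sonic()."""
        img_key = hashlib.md5(img_bytes).hexdigest()
        aud_key = hashlib.md5(wav_bytes).hexdigest()
        return (
            os.path.join(tmp_dir, f"{img_key}.png"),
            os.path.join(tmp_dir, f"{aud_key}.wav"),
            os.path.join(res_dir, f"{img_key}_{aud_key}_{dyn_scale}.mp4"),
        )

    if __name__ == "__main__":
        print(inference_steps(1.0))    # 25  – short clips are floored at 25 steps
        print(inference_steps(30.0))   # 375
        print(inference_steps(60.0))   # 750 – the 60-second audio cap hits the ceiling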