Spaces:

Bils
/

ShortiFoley

Running on Zero

App Files Files Community

Bils committed 14 days ago

Commit

33f355d

verified ·

1 Parent(s): 0cc15e7

Update app.py

Browse files

Files changed (1) hide show

app.py +92 -60

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ import gradio as gr
 import spaces
 from huggingface_hub import snapshot_download
-# ========= Paths & Repo =========
 ROOT = Path(__file__).parent.resolve()
 REPO_DIR = ROOT / "HunyuanVideo-Foley"
 WEIGHTS_DIR = ROOT / "weights"
@@ -16,11 +16,11 @@ ASSETS = ROOT / "assets"
 ASSETS.mkdir(exist_ok=True)
 BILS_BRAND = os.environ.get("BILS_BRAND", "Bilsimaging · Foley Studio")
-PRIMARY_COLOR = os.environ.get("PRIMARY_COLOR", "#6B5BFF")  # purple-ish
-MAX_SECS = int(os.environ.get("MAX_SECS", "22"))  # ZeroGPU-friendly
 TARGET_H = int(os.environ.get("TARGET_H", "480"))  # downscale target height
-SR = int(os.environ.get("TARGET_SR", "48000"))     # target audio sample rate
 def sh(cmd: str):
     print(">>", cmd)
@@ -36,9 +36,62 @@ def ffprobe_duration(path: str) -> float:
     except Exception:
         return 0.0
 def prepare_once():
-    """Clone repo + download weights on cold start."""
-    REPO_DIR.exists() or sh("git clone https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley.git")
     WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)
     snapshot_download(
         repo_id="tencent/HunyuanVideo-Foley",
@@ -47,6 +100,7 @@ def prepare_once():
         repo_type="model",
     )
     os.environ["HIFI_FOLEY_MODEL_PATH"] = str(WEIGHTS_DIR)
     CACHE_DIR.mkdir(exist_ok=True)
     OUT_DIR.mkdir(exist_ok=True)
@@ -55,34 +109,31 @@ prepare_once()
 # ========= Preprocessing =========
 def preprocess_video(in_path: str) -> Tuple[str, float]:
     """
-    - Validates duration (<= MAX_SECS). If longer, auto-trims to MAX_SECS.
-    - Downscales to TARGET_H height (keeping AR), H.264 baseline, AAC passthrough.
-    - Returns path to processed mp4 and final duration.
     """
     dur = ffprobe_duration(in_path)
     temp_dir = Path(tempfile.mkdtemp(prefix="pre_"))
     trimmed = temp_dir / "trim.mp4"
     processed = temp_dir / "proc.mp4"
-    # If longer than budget, trim to MAX_SECS (from start).
-    if dur == 0:
-        raise RuntimeError("Unable to read the video duration.")
-    trim_filter = []
-    if dur > MAX_SECS:
-        trim_filter = ["-t", str(MAX_SECS)]
-    # First, ensure we have a small, uniform container (mp4)
     sh(" ".join([
         "ffmpeg", "-y", "-i", f"\"{in_path}\"",
-        *trim_filter,
-        "-an",                               # remove original audio (we're generating new foley)
         "-vcodec", "libx264", "-preset", "veryfast", "-crf", "23",
         "-movflags", "+faststart",
         f"\"{trimmed}\""
     ]))
-    # Downscale to TARGET_H keeping AR; re-encode efficiently
-    # Use mod2 dimensions for compatibility
     vf = f"scale=-2:{TARGET_H}:flags=bicubic"
     sh(" ".join([
         "ffmpeg", "-y", "-i", f"\"{trimmed}\"",
@@ -99,23 +150,23 @@ def preprocess_video(in_path: str) -> Tuple[str, float]:
     return str(processed), final_dur
 # ========= Inference (ZeroGPU) =========
-@spaces.GPU(duration=240)  # ~4 minutes per call window
 def run_model(video_path: str, prompt_text: str) -> str:
     """
-    Run Tencent's infer.py on ZeroGPU. Returns path to WAV.
     """
     job_id = uuid.uuid4().hex[:8]
     work_out = OUT_DIR / f"job_{job_id}"
     work_out.mkdir(parents=True, exist_ok=True)
     cmd = [
-        "python", f"{REPO_DIR}/infer.py",
         "--model_path", str(WEIGHTS_DIR),
-        "--config_path", f"{REPO_DIR}/configs/hunyuanvideo-foley-xxl.yaml",
         "--single_video", video_path,
         "--single_prompt", json.dumps(prompt_text or ""),
         "--output_dir", str(work_out),
-        "--device", "cuda"
     ]
     sh(" ".join(cmd))
@@ -127,7 +178,7 @@ def run_model(video_path: str, prompt_text: str) -> str:
     if not wav:
         raise RuntimeError("No audio produced by the model.")
-    # Normalize / resample to SR (safeguard)
     fixed = work_out / "foley_48k.wav"
     sh(" ".join([
         "ffmpeg", "-y", "-i", f"\"{str(wav)}\"",
@@ -136,10 +187,9 @@ def run_model(video_path: str, prompt_text: str) -> str:
     ]))
     return str(fixed)
-# ========= Post: optional mux back to the video =========
 def mux_audio_with_video(video_path: str, audio_path: str) -> str:
     out_path = Path(tempfile.mkdtemp(prefix="mux_")) / "with_foley.mp4"
-    # Copy video, add foley audio as AAC
     sh(" ".join([
         "ffmpeg", "-y",
         "-i", f"\"{video_path}\"",
@@ -151,22 +201,16 @@ def mux_audio_with_video(video_path: str, audio_path: str) -> str:
     ]))
     return str(out_path)
-# ========= Gradio UI Logic =========
 def single_generate(video: str, prompt: str, want_mux: bool, project_name: str) -> Tuple[Optional[str], Optional[str], str, list]:
-    """
-    Returns: (wav_path, muxed_video_path_or_None, status_markdown, history_list)
-    """
     history = []
     try:
         if not video:
             return None, None, "⚠️ Please upload a video.", history
-        # Preprocess
         history.append(["Preprocess", "Downscaling / trimming…"])
         pre_path, final_dur = preprocess_video(video)
-        # Run model (ZeroGPU)
         history.append(["Inference", "Generating foley on GPU…"])
         wav = run_model(pre_path, prompt or "")
-        # Optional Mux
         muxed = None
         if want_mux:
             history.append(["Mux", "Combining foley with video…"])
@@ -178,10 +222,6 @@ def single_generate(video: str, prompt: str, want_mux: bool, project_name: str)
         return None, None, f"❌ {type(e).__name__}: {e}", history
 def batch_lite_generate(files: List[str], prompt: str, want_mux: bool) -> Tuple[str, list]:
-    """
-    Run a tiny queue sequentially; ZeroGPU handles each call in series.
-    We enforce 3 items max to stay quota-friendly.
-    """
     log = []
     if not files:
         return "⚠️ Please upload 1–3 videos.", log
@@ -201,7 +241,6 @@ def batch_lite_generate(files: List[str], prompt: str, want_mux: bool) -> Tuple[
             log.append([f"Done {i}", "OK"])
         except Exception as e:
             log.append([f"Error {i}", str(e)])
-    # Write a small manifest to outputs
     manifest = OUT_DIR / f"batchlite_{uuid.uuid4().hex[:6]}.json"
     manifest.write_text(json.dumps(
         [{"wav": w, "video": v} for (w, v) in outputs], ensure_ascii=False, indent=2
@@ -221,13 +260,9 @@ THEME_CSS = f"""
   color: white; padding: 12px 16px; border-radius: 12px;
 }}
 #brandbar strong {{ letter-spacing: .3px; }}
-footer, #footer {{}}
 """
-with gr.Blocks(
-    css=THEME_CSS,
-    title="Foley Studio · ZeroGPU"
-) as demo:
     with gr.Row():
         gr.HTML(f'<div id="brandbar"><strong>{BILS_BRAND}</strong> — HunyuanVideo-Foley on ZeroGPU</div>')
@@ -236,13 +271,13 @@ with gr.Blocks(
             with gr.Group():
                 project_name = gr.Textbox(label="Project name (optional)", placeholder="e.g., JawharaFM Teaser 09-2025")
                 with gr.Row():
-                    v_single = gr.Video(label="Video (≤ ~20s recommended)")
-                    p_single = gr.Textbox(label="Sound prompt (optional)", placeholder="e.g., soft footsteps, indoor reverb, light rain outside")
                 with gr.Row():
                     want_mux_single = gr.Checkbox(value=True, label="Mux foley back into video (MP4)")
                 run_btn = gr.Button("Generate", variant="primary")
                 with gr.Row():
-                    out_audio = gr.Audio(label="Generated Foley (48 kHz WAV)", type="filepath")
                     out_mux = gr.Video(label="Video + Foley (MP4)", visible=True)
                 status_md = gr.Markdown()
                 history_table = gr.Dataframe(headers=["Step", "Note"], datatype=["str","str"], interactive=False, wrap=True, label="Activity")
@@ -269,18 +304,15 @@ with gr.Blocks(
         with gr.Tab("⚙️ Settings / Tips"):
             gr.Markdown(f"""
-**ZeroGPU Budget Tips**
-- Keep clips **≤ {MAX_SECS}s** (tool trims automatically if longer).
-- Video is downscaled to **{TARGET_H}p** to speed up inference.
-- If you hit a quota message, try again later; ZeroGPU limits GPU minutes per visitor.
-**Branding**
-- Change brand name / color via environment variables:
-  - `BILS_BRAND` → header text
-  - `PRIMARY_COLOR` → UI accent hex
-**Outputs**
-- WAV is 48 kHz stereo. Toggle **Mux** to get a ready MP4 with the foley track.
 """)
     demo.queue(max_size=24).launch()

 import spaces
 from huggingface_hub import snapshot_download
+# ========= Paths & Config =========
 ROOT = Path(__file__).parent.resolve()
 REPO_DIR = ROOT / "HunyuanVideo-Foley"
 WEIGHTS_DIR = ROOT / "weights"
 ASSETS.mkdir(exist_ok=True)
 BILS_BRAND = os.environ.get("BILS_BRAND", "Bilsimaging · Foley Studio")
+PRIMARY_COLOR = os.environ.get("PRIMARY_COLOR", "#6B5BFF")  # UI accent
+MAX_SECS = int(os.environ.get("MAX_SECS", "22"))  # ZeroGPU-friendly clip length
 TARGET_H = int(os.environ.get("TARGET_H", "480"))  # downscale target height
+SR = int(os.environ.get("TARGET_SR", "48000"))     # output WAV sample rate
 def sh(cmd: str):
     print(">>", cmd)
     except Exception:
         return 0.0
+def _clone_without_lfs():
+    """
+    Fetch the HunyuanVideo-Foley code into REPO_DIR without pulling LFS payloads.
+
+    Does nothing when REPO_DIR already exists. Otherwise tries a shallow clone
+    with LFS smudging disabled (so demo MP4s stay as pointer files) and drops
+    the cloned ``assets`` folder. If that clone raises
+    ``subprocess.CalledProcessError``, falls back to a sparse checkout of only
+    the paths needed for inference, trying ``main`` first and then ``master``.
+
+    Raises:
+        subprocess.CalledProcessError: propagated from the fallback when the
+            ``master`` fetch/checkout also fails (assumes ``sh`` raises it on a
+            non-zero exit, which the handlers below rely on -- confirm in ``sh``).
+    """
+    import shlex  # local import: only this helper needs shell quoting
+
+    if REPO_DIR.exists():
+        return
+
+    url = "https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley.git"
+    # Quote the path so a checkout location containing spaces survives the shell.
+    repo = shlex.quote(str(REPO_DIR))
+    # Per-command switches that stop git-lfs from swapping pointers for real blobs.
+    skip_env = "GIT_LFS_SKIP_SMUDGE=1 "
+    no_lfs = "-c filter.lfs.smudge= -c filter.lfs.required=false"
+
+    # Attempt 1: shallow clone, disable LFS filters
+    try:
+        sh(f"{skip_env}git {no_lfs} clone --depth 1 {url} {repo}")
+        # Optional: remove assets folder if present
+        assets = REPO_DIR / "assets"
+        if assets.exists():
+            shutil.rmtree(assets, ignore_errors=True)
+        return
+    except subprocess.CalledProcessError as e:
+        print("Shallow clone with LFS skipped failed, trying sparse checkout…", e)
+        # Clear anything a half-finished clone left behind so the fallback
+        # initialises a clean repository.
+        shutil.rmtree(REPO_DIR, ignore_errors=True)
+
+    # Attempt 2: sparse checkout minimal files
+    REPO_DIR.mkdir(parents=True, exist_ok=True)
+    sh(f"git -C {repo} init")
+    sh(f"git -C {repo} remote add origin {url}")
+    sh(f"git -C {repo} config core.sparseCheckout true")
+
+    # Choose only essential paths
+    sparse_file = REPO_DIR / ".git" / "info" / "sparse-checkout"
+    sparse_file.parent.mkdir(parents=True, exist_ok=True)
+    sparse_file.write_text("\n".join([
+        "infer.py",
+        "configs/",
+        # NOTE(review): infer.py presumably imports the repo's Python package;
+        # confirm the directory name upstream. A pattern that matches nothing
+        # is harmless.
+        "hunyuanvideo_foley/",
+        "gradio_app.py",
+        "requirements.txt",
+        "LICENSE",
+        "README.md",
+    ]) + "\n")
+
+    def _fetch_and_checkout(branch):
+        """Shallow-fetch *branch* from origin and check it out with LFS skipped."""
+        sh(f"git -C {repo} fetch --depth 1 origin {branch}")
+        # The LFS overrides must sit on the checkout itself: `-c` options are
+        # per-invocation, and checkout is the step that runs the smudge filter.
+        sh(f"{skip_env}git -C {repo} {no_lfs} checkout {branch}")
+
+    # Branch might be main; fall back to master if needed
+    try:
+        _fetch_and_checkout("main")
+    except subprocess.CalledProcessError:
+        _fetch_and_checkout("master")
 def prepare_once():
+    """Clone code (skipping LFS), download weights, set env, prepare dirs."""
+    _clone_without_lfs()
     WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)
     snapshot_download(
         repo_id="tencent/HunyuanVideo-Foley",
         repo_type="model",
     )
     os.environ["HIFI_FOLEY_MODEL_PATH"] = str(WEIGHTS_DIR)
     CACHE_DIR.mkdir(exist_ok=True)
     OUT_DIR.mkdir(exist_ok=True)
 # ========= Preprocessing =========
 def preprocess_video(in_path: str) -> Tuple[str, float]:
     """
+    - Validate/trim to <= MAX_SECS.
+    - Downscale to TARGET_H (keep AR), strip original audio.
+    - Return processed mp4 path and final duration.
     """
     dur = ffprobe_duration(in_path)
+    if dur == 0:
+        raise RuntimeError("Unable to read the video duration.")
     temp_dir = Path(tempfile.mkdtemp(prefix="pre_"))
     trimmed = temp_dir / "trim.mp4"
     processed = temp_dir / "proc.mp4"
+    trim_args = ["-t", str(MAX_SECS)] if dur > MAX_SECS else []
+    # Normalize container & remove audio
     sh(" ".join([
         "ffmpeg", "-y", "-i", f"\"{in_path}\"",
+        *trim_args,
+        "-an",
         "-vcodec", "libx264", "-preset", "veryfast", "-crf", "23",
         "-movflags", "+faststart",
         f"\"{trimmed}\""
     ]))
+    # Downscale to TARGET_H; ensure mod2 width, baseline profile for compatibility
     vf = f"scale=-2:{TARGET_H}:flags=bicubic"
     sh(" ".join([
         "ffmpeg", "-y", "-i", f"\"{trimmed}\"",
     return str(processed), final_dur
 # ========= Inference (ZeroGPU) =========
+@spaces.GPU(duration=240)  # ~4 minutes per call (fits ZeroGPU window)
 def run_model(video_path: str, prompt_text: str) -> str:
     """
+    Call Tencent's infer.py on GPU and return a 48 kHz WAV path.
     """
     job_id = uuid.uuid4().hex[:8]
     work_out = OUT_DIR / f"job_{job_id}"
     work_out.mkdir(parents=True, exist_ok=True)
     cmd = [
+        "python", str(REPO_DIR / "infer.py"),
         "--model_path", str(WEIGHTS_DIR),
+        "--config_path", str(REPO_DIR / "configs" / "hunyuanvideo-foley-xxl.yaml"),
         "--single_video", video_path,
         "--single_prompt", json.dumps(prompt_text or ""),
         "--output_dir", str(work_out),
+        "--device", "cuda",
     ]
     sh(" ".join(cmd))
     if not wav:
         raise RuntimeError("No audio produced by the model.")
+    # Normalize / resample to SR stereo
     fixed = work_out / "foley_48k.wav"
     sh(" ".join([
         "ffmpeg", "-y", "-i", f"\"{str(wav)}\"",
     ]))
     return str(fixed)
+# ========= Optional: Mux Foley back to video =========
 def mux_audio_with_video(video_path: str, audio_path: str) -> str:
     out_path = Path(tempfile.mkdtemp(prefix="mux_")) / "with_foley.mp4"
     sh(" ".join([
         "ffmpeg", "-y",
         "-i", f"\"{video_path}\"",
     ]))
     return str(out_path)
+# ========= UI Handlers =========
 def single_generate(video: str, prompt: str, want_mux: bool, project_name: str) -> Tuple[Optional[str], Optional[str], str, list]:
     history = []
     try:
         if not video:
             return None, None, "⚠️ Please upload a video.", history
         history.append(["Preprocess", "Downscaling / trimming…"])
         pre_path, final_dur = preprocess_video(video)
         history.append(["Inference", "Generating foley on GPU…"])
         wav = run_model(pre_path, prompt or "")
         muxed = None
         if want_mux:
             history.append(["Mux", "Combining foley with video…"])
         return None, None, f"❌ {type(e).__name__}: {e}", history
 def batch_lite_generate(files: List[str], prompt: str, want_mux: bool) -> Tuple[str, list]:
     log = []
     if not files:
         return "⚠️ Please upload 1–3 videos.", log
             log.append([f"Done {i}", "OK"])
         except Exception as e:
             log.append([f"Error {i}", str(e)])
     manifest = OUT_DIR / f"batchlite_{uuid.uuid4().hex[:6]}.json"
     manifest.write_text(json.dumps(
         [{"wav": w, "video": v} for (w, v) in outputs], ensure_ascii=False, indent=2
   color: white; padding: 12px 16px; border-radius: 12px;
 }}
 #brandbar strong {{ letter-spacing: .3px; }}
 """
+with gr.Blocks(css=THEME_CSS, title="Foley Studio · ZeroGPU") as demo:
     with gr.Row():
         gr.HTML(f'<div id="brandbar"><strong>{BILS_BRAND}</strong> — HunyuanVideo-Foley on ZeroGPU</div>')
             with gr.Group():
                 project_name = gr.Textbox(label="Project name (optional)", placeholder="e.g., JawharaFM Teaser 09-2025")
                 with gr.Row():
+                    v_single = gr.Video(label=f"Video (≤ ~{MAX_SECS}s recommended)")
+                    p_single = gr.Textbox(label="Sound prompt (optional)", placeholder="e.g., soft footsteps, indoor reverb, light rain")
                 with gr.Row():
                     want_mux_single = gr.Checkbox(value=True, label="Mux foley back into video (MP4)")
                 run_btn = gr.Button("Generate", variant="primary")
                 with gr.Row():
+                    out_audio = gr.Audio(label=f"Generated Foley ({SR//1000} kHz WAV)", type="filepath")
                     out_mux = gr.Video(label="Video + Foley (MP4)", visible=True)
                 status_md = gr.Markdown()
                 history_table = gr.Dataframe(headers=["Step", "Note"], datatype=["str","str"], interactive=False, wrap=True, label="Activity")
         with gr.Tab("⚙️ Settings / Tips"):
             gr.Markdown(f"""
+**ZeroGPU Tips**
+- Clips are trimmed to **≤ {MAX_SECS}s** automatically.
+- Video downscaled to **{TARGET_H}p** to fit the GPU time window.
+- If you hit a quota error, try again later (ZeroGPU limits minutes per visitor).
+**Branding via ENV**
+- `BILS_BRAND` → header text
+- `PRIMARY_COLOR` → hex color
+- `MAX_SECS`, `TARGET_H`, `TARGET_SR` → processing behavior
 """)
     demo.queue(max_size=24).launch()