Bils committed on
Commit
33f355d
·
verified ·
1 Parent(s): 0cc15e7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -60
app.py CHANGED
@@ -6,7 +6,7 @@ import gradio as gr
6
  import spaces
7
  from huggingface_hub import snapshot_download
8
 
9
- # ========= Paths & Repo =========
10
  ROOT = Path(__file__).parent.resolve()
11
  REPO_DIR = ROOT / "HunyuanVideo-Foley"
12
  WEIGHTS_DIR = ROOT / "weights"
@@ -16,11 +16,11 @@ ASSETS = ROOT / "assets"
16
  ASSETS.mkdir(exist_ok=True)
17
 
18
  BILS_BRAND = os.environ.get("BILS_BRAND", "Bilsimaging · Foley Studio")
19
- PRIMARY_COLOR = os.environ.get("PRIMARY_COLOR", "#6B5BFF") # purple-ish
20
 
21
- MAX_SECS = int(os.environ.get("MAX_SECS", "22")) # ZeroGPU-friendly
22
  TARGET_H = int(os.environ.get("TARGET_H", "480")) # downscale target height
23
- SR = int(os.environ.get("TARGET_SR", "48000")) # target audio sample rate
24
 
25
  def sh(cmd: str):
26
  print(">>", cmd)
@@ -36,9 +36,62 @@ def ffprobe_duration(path: str) -> float:
36
  except Exception:
37
  return 0.0
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  def prepare_once():
40
- """Clone repo + download weights on cold start."""
41
- REPO_DIR.exists() or sh("git clone https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley.git")
 
42
  WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)
43
  snapshot_download(
44
  repo_id="tencent/HunyuanVideo-Foley",
@@ -47,6 +100,7 @@ def prepare_once():
47
  repo_type="model",
48
  )
49
  os.environ["HIFI_FOLEY_MODEL_PATH"] = str(WEIGHTS_DIR)
 
50
  CACHE_DIR.mkdir(exist_ok=True)
51
  OUT_DIR.mkdir(exist_ok=True)
52
 
@@ -55,34 +109,31 @@ prepare_once()
55
  # ========= Preprocessing =========
56
  def preprocess_video(in_path: str) -> Tuple[str, float]:
57
  """
58
- - Validates duration (<= MAX_SECS). If longer, auto-trims to MAX_SECS.
59
- - Downscales to TARGET_H height (keeping AR), H.264 baseline, AAC passthrough.
60
- - Returns path to processed mp4 and final duration.
61
  """
62
  dur = ffprobe_duration(in_path)
 
 
 
63
  temp_dir = Path(tempfile.mkdtemp(prefix="pre_"))
64
  trimmed = temp_dir / "trim.mp4"
65
  processed = temp_dir / "proc.mp4"
66
 
67
- # If longer than budget, trim to MAX_SECS (from start).
68
- if dur == 0:
69
- raise RuntimeError("Unable to read the video duration.")
70
- trim_filter = []
71
- if dur > MAX_SECS:
72
- trim_filter = ["-t", str(MAX_SECS)]
73
 
74
- # First, ensure we have a small, uniform container (mp4)
75
  sh(" ".join([
76
  "ffmpeg", "-y", "-i", f"\"{in_path}\"",
77
- *trim_filter,
78
- "-an", # remove original audio (we're generating new foley)
79
  "-vcodec", "libx264", "-preset", "veryfast", "-crf", "23",
80
  "-movflags", "+faststart",
81
  f"\"{trimmed}\""
82
  ]))
83
 
84
- # Downscale to TARGET_H keeping AR; re-encode efficiently
85
- # Use mod2 dimensions for compatibility
86
  vf = f"scale=-2:{TARGET_H}:flags=bicubic"
87
  sh(" ".join([
88
  "ffmpeg", "-y", "-i", f"\"{trimmed}\"",
@@ -99,23 +150,23 @@ def preprocess_video(in_path: str) -> Tuple[str, float]:
99
  return str(processed), final_dur
100
 
101
  # ========= Inference (ZeroGPU) =========
102
- @spaces.GPU(duration=240) # ~4 minutes per call window
103
  def run_model(video_path: str, prompt_text: str) -> str:
104
  """
105
- Run Tencent's infer.py on ZeroGPU. Returns path to WAV.
106
  """
107
  job_id = uuid.uuid4().hex[:8]
108
  work_out = OUT_DIR / f"job_{job_id}"
109
  work_out.mkdir(parents=True, exist_ok=True)
110
 
111
  cmd = [
112
- "python", f"{REPO_DIR}/infer.py",
113
  "--model_path", str(WEIGHTS_DIR),
114
- "--config_path", f"{REPO_DIR}/configs/hunyuanvideo-foley-xxl.yaml",
115
  "--single_video", video_path,
116
  "--single_prompt", json.dumps(prompt_text or ""),
117
  "--output_dir", str(work_out),
118
- "--device", "cuda"
119
  ]
120
  sh(" ".join(cmd))
121
 
@@ -127,7 +178,7 @@ def run_model(video_path: str, prompt_text: str) -> str:
127
  if not wav:
128
  raise RuntimeError("No audio produced by the model.")
129
 
130
- # Normalize / resample to SR (safeguard)
131
  fixed = work_out / "foley_48k.wav"
132
  sh(" ".join([
133
  "ffmpeg", "-y", "-i", f"\"{str(wav)}\"",
@@ -136,10 +187,9 @@ def run_model(video_path: str, prompt_text: str) -> str:
136
  ]))
137
  return str(fixed)
138
 
139
- # ========= Post: optional mux back to the video =========
140
  def mux_audio_with_video(video_path: str, audio_path: str) -> str:
141
  out_path = Path(tempfile.mkdtemp(prefix="mux_")) / "with_foley.mp4"
142
- # Copy video, add foley audio as AAC
143
  sh(" ".join([
144
  "ffmpeg", "-y",
145
  "-i", f"\"{video_path}\"",
@@ -151,22 +201,16 @@ def mux_audio_with_video(video_path: str, audio_path: str) -> str:
151
  ]))
152
  return str(out_path)
153
 
154
- # ========= Gradio UI Logic =========
155
  def single_generate(video: str, prompt: str, want_mux: bool, project_name: str) -> Tuple[Optional[str], Optional[str], str, list]:
156
- """
157
- Returns: (wav_path, muxed_video_path_or_None, status_markdown, history_list)
158
- """
159
  history = []
160
  try:
161
  if not video:
162
  return None, None, "⚠️ Please upload a video.", history
163
- # Preprocess
164
  history.append(["Preprocess", "Downscaling / trimming…"])
165
  pre_path, final_dur = preprocess_video(video)
166
- # Run model (ZeroGPU)
167
  history.append(["Inference", "Generating foley on GPU…"])
168
  wav = run_model(pre_path, prompt or "")
169
- # Optional Mux
170
  muxed = None
171
  if want_mux:
172
  history.append(["Mux", "Combining foley with video…"])
@@ -178,10 +222,6 @@ def single_generate(video: str, prompt: str, want_mux: bool, project_name: str)
178
  return None, None, f"❌ {type(e).__name__}: {e}", history
179
 
180
  def batch_lite_generate(files: List[str], prompt: str, want_mux: bool) -> Tuple[str, list]:
181
- """
182
- Run a tiny queue sequentially; ZeroGPU handles each call in series.
183
- We enforce 3 items max to stay quota-friendly.
184
- """
185
  log = []
186
  if not files:
187
  return "⚠️ Please upload 1–3 videos.", log
@@ -201,7 +241,6 @@ def batch_lite_generate(files: List[str], prompt: str, want_mux: bool) -> Tuple[
201
  log.append([f"Done {i}", "OK"])
202
  except Exception as e:
203
  log.append([f"Error {i}", str(e)])
204
- # Write a small manifest to outputs
205
  manifest = OUT_DIR / f"batchlite_{uuid.uuid4().hex[:6]}.json"
206
  manifest.write_text(json.dumps(
207
  [{"wav": w, "video": v} for (w, v) in outputs], ensure_ascii=False, indent=2
@@ -221,13 +260,9 @@ THEME_CSS = f"""
221
  color: white; padding: 12px 16px; border-radius: 12px;
222
  }}
223
  #brandbar strong {{ letter-spacing: .3px; }}
224
- footer, #footer {{}}
225
  """
226
 
227
- with gr.Blocks(
228
- css=THEME_CSS,
229
- title="Foley Studio · ZeroGPU"
230
- ) as demo:
231
  with gr.Row():
232
  gr.HTML(f'<div id="brandbar"><strong>{BILS_BRAND}</strong> — HunyuanVideo-Foley on ZeroGPU</div>')
233
 
@@ -236,13 +271,13 @@ with gr.Blocks(
236
  with gr.Group():
237
  project_name = gr.Textbox(label="Project name (optional)", placeholder="e.g., JawharaFM Teaser 09-2025")
238
  with gr.Row():
239
- v_single = gr.Video(label="Video (≤ ~20s recommended)")
240
- p_single = gr.Textbox(label="Sound prompt (optional)", placeholder="e.g., soft footsteps, indoor reverb, light rain outside")
241
  with gr.Row():
242
  want_mux_single = gr.Checkbox(value=True, label="Mux foley back into video (MP4)")
243
  run_btn = gr.Button("Generate", variant="primary")
244
  with gr.Row():
245
- out_audio = gr.Audio(label="Generated Foley (48 kHz WAV)", type="filepath")
246
  out_mux = gr.Video(label="Video + Foley (MP4)", visible=True)
247
  status_md = gr.Markdown()
248
  history_table = gr.Dataframe(headers=["Step", "Note"], datatype=["str","str"], interactive=False, wrap=True, label="Activity")
@@ -269,18 +304,15 @@ with gr.Blocks(
269
 
270
  with gr.Tab("⚙️ Settings / Tips"):
271
  gr.Markdown(f"""
272
- **ZeroGPU Budget Tips**
273
- - Keep clips **≤ {MAX_SECS}s** (tool trims automatically if longer).
274
- - Video is downscaled to **{TARGET_H}p** to speed up inference.
275
- - If you hit a quota message, try again later; ZeroGPU limits GPU minutes per visitor.
276
-
277
- **Branding**
278
- - Change brand name / color via environment variables:
279
- - `BILS_BRAND` → header text
280
- - `PRIMARY_COLOR` → UI accent hex
281
-
282
- **Outputs**
283
- - WAV is 48 kHz stereo. Toggle **Mux** to get a ready MP4 with the foley track.
284
  """)
285
 
286
  demo.queue(max_size=24).launch()
 
6
  import spaces
7
  from huggingface_hub import snapshot_download
8
 
9
+ # ========= Paths & Config =========
10
  ROOT = Path(__file__).parent.resolve()
11
  REPO_DIR = ROOT / "HunyuanVideo-Foley"
12
  WEIGHTS_DIR = ROOT / "weights"
 
16
  ASSETS.mkdir(exist_ok=True)
17
 
18
  BILS_BRAND = os.environ.get("BILS_BRAND", "Bilsimaging · Foley Studio")
19
+ PRIMARY_COLOR = os.environ.get("PRIMARY_COLOR", "#6B5BFF") # UI accent
20
 
21
+ MAX_SECS = int(os.environ.get("MAX_SECS", "22")) # ZeroGPU-friendly clip length
22
  TARGET_H = int(os.environ.get("TARGET_H", "480")) # downscale target height
23
+ SR = int(os.environ.get("TARGET_SR", "48000")) # output WAV sample rate
24
 
25
  def sh(cmd: str):
26
  print(">>", cmd)
 
36
  except Exception:
37
  return 0.0
38
 
39
+ def _clone_without_lfs():
40
+ """
41
+ Try a shallow clone while skipping LFS smudge so demo MP4s aren't pulled.
42
+ Falls back to sparse checkout with needed paths only.
43
+ """
44
+ if REPO_DIR.exists():
45
+ return
46
+
47
+ # Attempt 1: shallow clone, disable LFS filters
48
+ try:
49
+ sh(
50
+ "GIT_LFS_SKIP_SMUDGE=1 "
51
+ "git -c filter.lfs.smudge= -c filter.lfs.required=false "
52
+ f"clone --depth 1 https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley.git {REPO_DIR}"
53
+ )
54
+ # Optional: remove assets folder if present
55
+ assets = REPO_DIR / "assets"
56
+ if assets.exists():
57
+ shutil.rmtree(assets, ignore_errors=True)
58
+ return
59
+ except subprocess.CalledProcessError as e:
60
+ print("Shallow clone with LFS skipped failed, trying sparse checkout…", e)
61
+
62
+ # Attempt 2: sparse checkout minimal files
63
+ REPO_DIR.mkdir(parents=True, exist_ok=True)
64
+ sh(f"git -C {REPO_DIR} init")
65
+ sh(
66
+ f"git -C {REPO_DIR} -c filter.lfs.smudge= -c filter.lfs.required=false "
67
+ "remote add origin https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley.git"
68
+ )
69
+ sh(f"git -C {REPO_DIR} config core.sparseCheckout true")
70
+
71
+ # Choose only essential paths
72
+ sparse_file = REPO_DIR / ".git" / "info" / "sparse-checkout"
73
+ sparse_file.parent.mkdir(parents=True, exist_ok=True)
74
+ sparse_file.write_text("\n".join([
75
+ "infer.py",
76
+ "configs/",
77
+ "gradio_app.py",
78
+ "requirements.txt",
79
+ "LICENSE",
80
+ "README.md",
81
+ ]) + "\n")
82
+
83
+ # Branch might be main; change to master if needed
84
+ try:
85
+ sh(f"git -C {REPO_DIR} fetch --depth 1 origin main")
86
+ sh(f"git -C {REPO_DIR} checkout main")
87
+ except subprocess.CalledProcessError:
88
+ sh(f"git -C {REPO_DIR} fetch --depth 1 origin master")
89
+ sh(f"git -C {REPO_DIR} checkout master")
90
+
91
  def prepare_once():
92
+ """Clone code (skipping LFS), download weights, set env, prepare dirs."""
93
+ _clone_without_lfs()
94
+
95
  WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)
96
  snapshot_download(
97
  repo_id="tencent/HunyuanVideo-Foley",
 
100
  repo_type="model",
101
  )
102
  os.environ["HIFI_FOLEY_MODEL_PATH"] = str(WEIGHTS_DIR)
103
+
104
  CACHE_DIR.mkdir(exist_ok=True)
105
  OUT_DIR.mkdir(exist_ok=True)
106
 
 
109
  # ========= Preprocessing =========
110
  def preprocess_video(in_path: str) -> Tuple[str, float]:
111
  """
112
+ - Validate/trim to <= MAX_SECS.
113
+ - Downscale to TARGET_H (keep AR), strip original audio.
114
+ - Return processed mp4 path and final duration.
115
  """
116
  dur = ffprobe_duration(in_path)
117
+ if dur == 0:
118
+ raise RuntimeError("Unable to read the video duration.")
119
+
120
  temp_dir = Path(tempfile.mkdtemp(prefix="pre_"))
121
  trimmed = temp_dir / "trim.mp4"
122
  processed = temp_dir / "proc.mp4"
123
 
124
+ trim_args = ["-t", str(MAX_SECS)] if dur > MAX_SECS else []
 
 
 
 
 
125
 
126
+ # Normalize container & remove audio
127
  sh(" ".join([
128
  "ffmpeg", "-y", "-i", f"\"{in_path}\"",
129
+ *trim_args,
130
+ "-an",
131
  "-vcodec", "libx264", "-preset", "veryfast", "-crf", "23",
132
  "-movflags", "+faststart",
133
  f"\"{trimmed}\""
134
  ]))
135
 
136
+ # Downscale to TARGET_H; ensure mod2 width, baseline profile for compatibility
 
137
  vf = f"scale=-2:{TARGET_H}:flags=bicubic"
138
  sh(" ".join([
139
  "ffmpeg", "-y", "-i", f"\"{trimmed}\"",
 
150
  return str(processed), final_dur
151
 
152
  # ========= Inference (ZeroGPU) =========
153
+ @spaces.GPU(duration=240) # ~4 minutes per call (fits ZeroGPU window)
154
  def run_model(video_path: str, prompt_text: str) -> str:
155
  """
156
+ Call Tencent's infer.py on GPU and return a 48 kHz WAV path.
157
  """
158
  job_id = uuid.uuid4().hex[:8]
159
  work_out = OUT_DIR / f"job_{job_id}"
160
  work_out.mkdir(parents=True, exist_ok=True)
161
 
162
  cmd = [
163
+ "python", str(REPO_DIR / "infer.py"),
164
  "--model_path", str(WEIGHTS_DIR),
165
+ "--config_path", str(REPO_DIR / "configs" / "hunyuanvideo-foley-xxl.yaml"),
166
  "--single_video", video_path,
167
  "--single_prompt", json.dumps(prompt_text or ""),
168
  "--output_dir", str(work_out),
169
+ "--device", "cuda",
170
  ]
171
  sh(" ".join(cmd))
172
 
 
178
  if not wav:
179
  raise RuntimeError("No audio produced by the model.")
180
 
181
+ # Normalize / resample to SR stereo
182
  fixed = work_out / "foley_48k.wav"
183
  sh(" ".join([
184
  "ffmpeg", "-y", "-i", f"\"{str(wav)}\"",
 
187
  ]))
188
  return str(fixed)
189
 
190
+ # ========= Optional: Mux Foley back to video =========
191
  def mux_audio_with_video(video_path: str, audio_path: str) -> str:
192
  out_path = Path(tempfile.mkdtemp(prefix="mux_")) / "with_foley.mp4"
 
193
  sh(" ".join([
194
  "ffmpeg", "-y",
195
  "-i", f"\"{video_path}\"",
 
201
  ]))
202
  return str(out_path)
203
 
204
+ # ========= UI Handlers =========
205
  def single_generate(video: str, prompt: str, want_mux: bool, project_name: str) -> Tuple[Optional[str], Optional[str], str, list]:
 
 
 
206
  history = []
207
  try:
208
  if not video:
209
  return None, None, "⚠️ Please upload a video.", history
 
210
  history.append(["Preprocess", "Downscaling / trimming…"])
211
  pre_path, final_dur = preprocess_video(video)
 
212
  history.append(["Inference", "Generating foley on GPU…"])
213
  wav = run_model(pre_path, prompt or "")
 
214
  muxed = None
215
  if want_mux:
216
  history.append(["Mux", "Combining foley with video…"])
 
222
  return None, None, f"❌ {type(e).__name__}: {e}", history
223
 
224
  def batch_lite_generate(files: List[str], prompt: str, want_mux: bool) -> Tuple[str, list]:
 
 
 
 
225
  log = []
226
  if not files:
227
  return "⚠️ Please upload 1–3 videos.", log
 
241
  log.append([f"Done {i}", "OK"])
242
  except Exception as e:
243
  log.append([f"Error {i}", str(e)])
 
244
  manifest = OUT_DIR / f"batchlite_{uuid.uuid4().hex[:6]}.json"
245
  manifest.write_text(json.dumps(
246
  [{"wav": w, "video": v} for (w, v) in outputs], ensure_ascii=False, indent=2
 
260
  color: white; padding: 12px 16px; border-radius: 12px;
261
  }}
262
  #brandbar strong {{ letter-spacing: .3px; }}
 
263
  """
264
 
265
+ with gr.Blocks(css=THEME_CSS, title="Foley Studio · ZeroGPU") as demo:
 
 
 
266
  with gr.Row():
267
  gr.HTML(f'<div id="brandbar"><strong>{BILS_BRAND}</strong> — HunyuanVideo-Foley on ZeroGPU</div>')
268
 
 
271
  with gr.Group():
272
  project_name = gr.Textbox(label="Project name (optional)", placeholder="e.g., JawharaFM Teaser 09-2025")
273
  with gr.Row():
274
+ v_single = gr.Video(label=f"Video (≤ ~{MAX_SECS}s recommended)")
275
+ p_single = gr.Textbox(label="Sound prompt (optional)", placeholder="e.g., soft footsteps, indoor reverb, light rain")
276
  with gr.Row():
277
  want_mux_single = gr.Checkbox(value=True, label="Mux foley back into video (MP4)")
278
  run_btn = gr.Button("Generate", variant="primary")
279
  with gr.Row():
280
+ out_audio = gr.Audio(label=f"Generated Foley ({SR//1000} kHz WAV)", type="filepath")
281
  out_mux = gr.Video(label="Video + Foley (MP4)", visible=True)
282
  status_md = gr.Markdown()
283
  history_table = gr.Dataframe(headers=["Step", "Note"], datatype=["str","str"], interactive=False, wrap=True, label="Activity")
 
304
 
305
  with gr.Tab("⚙️ Settings / Tips"):
306
  gr.Markdown(f"""
307
+ **ZeroGPU Tips**
308
+ - Clips are trimmed to **≤ {MAX_SECS}s** automatically.
309
+ - Video downscaled to **{TARGET_H}p** to fit the GPU time window.
310
+ - If you hit a quota error, try again later (ZeroGPU limits minutes per visitor).
311
+
312
+ **Branding via ENV**
313
+ - `BILS_BRAND` header text
314
+ - `PRIMARY_COLOR` → hex color
315
+ - `MAX_SECS`, `TARGET_H`, `TARGET_SR` processing behavior
 
 
 
316
  """)
317
 
318
  demo.queue(max_size=24).launch()