openfree committed on
Commit b1cb088 · verified · 1 Parent(s): 0178f77

Update sonic.py

Files changed (1)
  1. sonic.py +114 -94
sonic.py CHANGED
@@ -1,7 +1,9 @@
-import os, math, torch, cv2
+import os, math
+import torch
 from PIL import Image
 from omegaconf import OmegaConf
 from tqdm import tqdm
+import cv2
 
 from diffusers import AutoencoderKLTemporalDecoder
 from diffusers.schedulers import EulerDiscreteScheduler
@@ -9,7 +11,9 @@ from transformers import WhisperModel, CLIPVisionModelWithProjection, AutoFeatur
 
 from src.utils.util import save_videos_grid, seed_everything
 from src.dataset.test_preprocess import process_bbox, image_audio_to_tensor
-from src.models.base.unet_spatio_temporal_condition import UNetSpatioTemporalConditionModel, add_ip_adapters
+from src.models.base.unet_spatio_temporal_condition import (
+    UNetSpatioTemporalConditionModel, add_ip_adapters,
+)
 from src.pipelines.pipeline_sonic import SonicPipeline
 from src.models.audio_adapter.audio_proj import AudioProjModel
 from src.models.audio_adapter.audio_to_bucket import Audio2bucketModel
@@ -18,49 +22,49 @@ from src.dataset.face_align.align import AlignImage
 
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
-
 # ------------------------------------------------------------------
-# single image + speech video-tensor generator
+# single image + speech video-tensor generator
 # ------------------------------------------------------------------
-def test(pipe, cfg, wav_enc, audio_pe, audio2bucket, image_encoder,
-         width, height, batch):
-
-    # -------- align the batch dimensions ----------------------------------
+def test(
+    pipe, config, wav_enc, audio_pe, audio2bucket, image_encoder,
+    width, height, batch,
+):
+    # ---------------- align the batch dimensions -----------------------------
     for k, v in batch.items():
         if isinstance(v, torch.Tensor):
             batch[k] = v.unsqueeze(0).to(pipe.device).float()
 
-    ref_img    = batch["ref_img"]             # (1,C,H,W)
+    ref_img   = batch["ref_img"]              # (1,C,H,W)
     clip_img  = batch["clip_images"]
     face_mask = batch["face_mask"]
-    image_embeds = image_encoder(clip_img).image_embeds
+    image_embeds = image_encoder(clip_img).image_embeds   # (1,1024)
 
-    audio_feature = batch["audio_feature"]    # (1,80,T)
-    audio_len     = int(batch["audio_len"])
-    step          = max(1, int(cfg.step))     # guarantee at least 1
+    audio_feature = batch["audio_feature"]    # (1,80,T)
+    audio_len     = int(batch["audio_len"])   # python int
+    step          = int(config.step)
 
-    # -------- Whisper encoding --------------------------------------------
-    window = 16_000                           # 1-second units
+    # ---------- window-wise Whisper encoding --------------------------
+    window = 16_000                           # 1-second units
     audio_prompts, last_prompts = [], []
 
    for i in range(0, audio_feature.shape[-1], window):
-        chunk = audio_feature[:, :, i:i+window]
+        chunk = audio_feature[:, :, i : i + window]
 
-        hs_all   = wav_enc.encoder(chunk, output_hidden_states=True).hidden_states
-        last_hid = wav_enc.encoder(chunk).last_hidden_state.unsqueeze(-2)   # (1,t,1,384)
+        layers = wav_enc.encoder(chunk, output_hidden_states=True).hidden_states
+        last   = wav_enc.encoder(chunk).last_hidden_state.unsqueeze(-2)
 
-        audio_prompts.append(torch.stack(hs_all, dim=2))   # (1,t,12,384)
-        last_prompts.append(last_hid)                      # (1,t,1,384)
+        audio_prompts.append(torch.stack(layers, dim=2))   # (1,?,L,384)
+        last_prompts.append(last)                          # (1,?,1,384)
 
     if not audio_prompts:
         raise ValueError("[ERROR] No speech recognised in the provided audio.")
 
-    audio_prompts = torch.cat(audio_prompts, dim=1)   # (1,T,12,384)
-    last_prompts  = torch.cat(last_prompts,  dim=1)   # (1,T,1,384)
+    audio_prompts = torch.cat(audio_prompts, dim=1)
+    last_prompts  = torch.cat(last_prompts,  dim=1)
 
-    # -------- head/tail padding --------------------------------------------
+    # ---------- padding to match the model's input layout -----------------
     audio_prompts = torch.cat(
-        [torch.zeros_like(audio_prompts[:, :4]),
+        [torch.zeros_like(audio_prompts[:, :4]),   # head pad
         audio_prompts,
         torch.zeros_like(audio_prompts[:, :6])], dim=1)
     last_prompts = torch.cat(
@@ -68,71 +72,65 @@ def test(pipe, cfg, wav_enc, audio_pe, audio2bucket, image_encoder,
         last_prompts,
         torch.zeros_like(last_prompts[:, :26])], dim=1)
 
+    # ---------- number of chunks, derived from the audio length ------------
     total_tokens = audio_prompts.shape[1]
     num_chunks   = max(1, math.ceil(total_tokens / (2 * step)))
 
     ref_list, audio_list, uncond_list, motion_buckets = [], [], [], []
 
-
     for i in tqdm(range(num_chunks)):
         start = i * 2 * step
 
-        # ------------------------------------------------------------
-        # cond_clip  : (bz, f=1, w=10, b=5, c=384)
-        # bucket_clip: (bz, f=1, w=50, b=1, c=384)
-        # Whisper-tiny has 2 hidden-state layers → pad to 5
-        # ------------------------------------------------------------
-        clip_raw = audio_prompts[:, start:start + 10]           # (1, ≤10, L, 384)
-        if clip_raw.shape[1] < 10:                               # w padding
+        # ---------------- cond_clip (w=10, L=5) --------------------
+        clip_raw = audio_prompts[:, start : start + 10]         # (1, ≤10, L, 384)
+
+        # w-pad
+        if clip_raw.shape[1] < 10:
             pad_w = torch.zeros_like(clip_raw[:, :10 - clip_raw.shape[1]])
             clip_raw = torch.cat([clip_raw, pad_w], dim=1)
 
-        # ---- L(=layers) padding: repeat the last layer if short ----------
-        L_now = clip_raw.shape[2]
-        if L_now < 5:
-            pad_L = clip_raw[:, :, -1:].repeat(1, 1, 5 - L_now, 1)
+        # L-pad (expand Whisper-tiny L=2 to 5)
+        if clip_raw.shape[2] < 5:
+            pad_L = clip_raw[:, :, -1:].repeat(1, 1, 5 - clip_raw.shape[2], 1)
             clip_raw = torch.cat([clip_raw, pad_L], dim=2)
-        clip_raw = clip_raw[:, :, :5]                            # (1,10,5,384)
 
-        cond_clip = clip_raw.unsqueeze(1)                        # (1,1,10,5,384)
+        clip_raw  = clip_raw[:, :, :5]        # (1,10,5,384)
+        cond_clip = clip_raw.unsqueeze(1)     # (1,1,10,5,384)
 
-        # ------------------------------------------------------------
-        bucket_raw = last_prompts[:, start:start + 50]           # (1, ≤50, 1, 384)
+        # ---------------- bucket_clip (w=50, L=1) ------------------
+        bucket_raw = last_prompts[:, start : start + 50]        # (1, ≤50, 1, 384)
         if bucket_raw.shape[1] < 50:
             pad_w = torch.zeros_like(bucket_raw[:, :50 - bucket_raw.shape[1]])
             bucket_raw = torch.cat([bucket_raw, pad_w], dim=1)
-        bucket_clip = bucket_raw.unsqueeze(1)                    # (1,1,50,1,384)
+
+        bucket_clip = bucket_raw.unsqueeze(1)                    # (1,1,50,1,384)
 
         motion = audio2bucket(bucket_clip, image_embeds) * 16 + 16
 
         ref_list.append(ref_img[0])
-        audio_list.append(audio_pe(cond_clip).squeeze(0)[0])
+        audio_list.append(audio_pe(cond_clip).squeeze(0)[0])             # (10,1024)
         uncond_list.append(audio_pe(torch.zeros_like(cond_clip)).squeeze(0)[0])
         motion_buckets.append(motion[0])
 
-
-
-
-
-    # -------- diffusion --------------------------------------------------
+    # ---------- Stable-Video-Diffusion call -------------------------
     video = pipe(
         ref_img, clip_img, face_mask,
         audio_list, uncond_list, motion_buckets,
         height=height, width=width,
         num_frames=len(audio_list),
-        decode_chunk_size=cfg.decode_chunk_size,
-        motion_bucket_scale=cfg.motion_bucket_scale,
-        fps=cfg.fps,
-        noise_aug_strength=cfg.noise_aug_strength,
-        min_guidance_scale1=cfg.min_appearance_guidance_scale,
-        max_guidance_scale1=cfg.max_appearance_guidance_scale,
-        min_guidance_scale2=cfg.audio_guidance_scale,
-        max_guidance_scale2=cfg.audio_guidance_scale,
-        overlap=cfg.overlap,
-        shift_offset=cfg.shift_offset,
-        frames_per_batch=cfg.n_sample_frames,
-        num_inference_steps=cfg.num_inference_steps,
-        i2i_noise_strength=cfg.i2i_noise_strength,
+        decode_chunk_size=config.decode_chunk_size,
+        motion_bucket_scale=config.motion_bucket_scale,
+        fps=config.fps,
+        noise_aug_strength=config.noise_aug_strength,
+        min_guidance_scale1=config.min_appearance_guidance_scale,
+        max_guidance_scale1=config.max_appearance_guidance_scale,
+        min_guidance_scale2=config.audio_guidance_scale,
+        max_guidance_scale2=config.audio_guidance_scale,
+        overlap=config.overlap,
+        shift_offset=config.shift_offset,
+        frames_per_batch=config.n_sample_frames,
+        num_inference_steps=config.num_inference_steps,
+        i2i_noise_strength=config.i2i_noise_strength,
    ).frames
 
     video = (video * 0.5 + 0.5).clamp(0, 1)
@@ -140,7 +138,7 @@ def test(pipe, cfg, wav_enc, audio_pe, audio2bucket, image_encoder,
 
 
 # ------------------------------------------------------------------
-# Sonic class
+# Sonic class
 # ------------------------------------------------------------------
 class Sonic:
     config_file = os.path.join(BASE_DIR, "config/inference/sonic.yaml")
@@ -159,18 +157,18 @@ class Sonic:
     def _load_models(self, cfg):
         dtype = {"fp16": torch.float16, "fp32": torch.float32, "bf16": torch.bfloat16}[cfg.weight_dtype]
 
-        vae   = AutoencoderKLTemporalDecoder.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="vae", variant="fp16")
-        sched = EulerDiscreteScheduler.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="scheduler")
-        imgE  = CLIPVisionModelWithProjection.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="image_encoder", variant="fp16")
+        vae    = AutoencoderKLTemporalDecoder.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="vae", variant="fp16")
+        sched  = EulerDiscreteScheduler.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="scheduler")
+        imgenc = CLIPVisionModelWithProjection.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="image_encoder", variant="fp16")
         unet   = UNetSpatioTemporalConditionModel.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="unet", variant="fp16")
         add_ip_adapters(unet, [32], [cfg.ip_audio_scale])
 
         a2t = AudioProjModel(10, 5, 384, 1024, 1024, 32).to(self.device)
         a2b = Audio2bucketModel(50, 1, 384, 1024, 1024, 1, 2).to(self.device)
 
-        unet.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.unet_checkpoint_path), map_location="cpu"))
-        a2t.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2token_checkpoint_path), map_location="cpu"))
-        a2b.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2bucket_checkpoint_path), map_location="cpu"))
+        unet.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.unet_checkpoint_path), map_location="cpu"))
+        a2t.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2token_checkpoint_path), map_location="cpu"))
+        a2b.load_state_dict(torch.load(os.path.join(BASE_DIR, cfg.audio2bucket_checkpoint_path), map_location="cpu"))
 
         whisper = WhisperModel.from_pretrained(os.path.join(BASE_DIR, "checkpoints/whisper-tiny")).to(self.device).eval()
         whisper.requires_grad_(False)
@@ -181,11 +179,11 @@ class Sonic:
         self.rife = RIFEModel(device=self.device)
         self.rife.load_model(os.path.join(BASE_DIR, "checkpoints/RIFE/"))
 
-        for m in (imgE, vae, unet):
+        for m in (imgenc, vae, unet):
             m.to(dtype)
 
-        self.pipe = SonicPipeline(unet=unet, image_encoder=imgE, vae=vae, scheduler=sched).to(device=self.device, dtype=dtype)
-        self.image_encoder = imgE
+        self.pipe = SonicPipeline(unet=unet, image_encoder=imgenc, vae=vae, scheduler=sched).to(device=self.device, dtype=dtype)
+        self.image_encoder = imgenc
         self.audio2token  = a2t
         self.audio2bucket = a2b
         self.whisper      = whisper
@@ -197,50 +195,72 @@ class Sonic:
         _, _, bboxes = self.face_det(img, maxface=True)
         if bboxes:
             x1, y1, ww, hh = bboxes[0]
-            return {"face_num": 1, "crop_bbox": process_bbox((x1, y1, x1+ww, y1+hh), expand_ratio, h, w)}
+            return {"face_num": 1, "crop_bbox": process_bbox((x1, y1, x1 + ww, y1 + hh), expand_ratio, h, w)}
         return {"face_num": 0, "crop_bbox": None}
 
     # --------------------------------------------------------------
     @torch.no_grad()
-    def process(self, image_path: str, audio_path: str, output_path: str,
-                min_resolution: int = 512, inference_steps: int = 25,
-                dynamic_scale: float = 1.0, keep_resolution: bool = False,
-                seed: int | None = None):
-
+    def process(
+        self,
+        image_path: str,
+        audio_path: str,
+        output_path: str,
+        min_resolution: int = 512,
+        inference_steps: int = 25,
+        dynamic_scale: float = 1.0,
+        keep_resolution: bool = False,
+        seed: int | None = None,
+    ):
         cfg = self.config
         if seed is not None:
             cfg.seed = seed
-        cfg.num_inference_steps = inference_steps
-        cfg.motion_bucket_scale = dynamic_scale
+        cfg.num_inference_steps = inference_steps
+        cfg.motion_bucket_scale = dynamic_scale
         seed_everything(cfg.seed)
 
+        # image & audio → tensors
        test_data = image_audio_to_tensor(
-            self.face_det, self.feature_extractor,
-            image_path, audio_path, limit=-1,
-            image_size=min_resolution, area=cfg.area,
+            self.face_det,
+            self.feature_extractor,
+            image_path,
+            audio_path,
+            limit=-1,
+            image_size=min_resolution,
+            area=cfg.area,
        )
         if test_data is None:
             return -1
 
         h, w = test_data["ref_img"].shape[-2:]
-        resolution = (f"{(Image.open(image_path).width//2)*2}x{(Image.open(image_path).height//2)*2}"
-                      if keep_resolution else f"{w}x{h}")
+        resolution = (
+            f"{(Image.open(image_path).width // 2) * 2}x{(Image.open(image_path).height // 2) * 2}"
+            if keep_resolution
+            else f"{w}x{h}"
+        )
 
-        video = test(self.pipe, cfg, self.whisper, self.audio2token,
-                     self.audio2bucket, self.image_encoder, w, h, test_data)
+        # generate the video frames
+        video = test(
+            self.pipe, cfg, self.whisper, self.audio2token,
+            self.audio2bucket, self.image_encoder,
+            width=w, height=h, batch=test_data,
+        )
 
+        # interpolate intermediate frames
         if cfg.use_interframe:
             out = video.to(self.device)
             frames = []
-            for i in tqdm(range(out.shape[2]-1), ncols=0):
-                mid = self.rife.inference(out[:,:,i], out[:,:,i+1]).clamp(0,1).detach()
-                frames.extend([out[:,:,i], mid])
-            frames.append(out[:,:,-1])
+            for i in tqdm(range(out.shape[2] - 1), ncols=0):
+                mid = self.rife.inference(out[:, :, i], out[:, :, i + 1]).clamp(0, 1).detach()
+                frames.extend([out[:, :, i], mid])
+            frames.append(out[:, :, -1])
             video = torch.stack(frames, 2).cpu()
 
-        tmp = output_path.replace(".mp4", "_noaudio.mp4")
-        save_videos_grid(video, tmp, n_rows=video.shape[0], fps=cfg.fps*(2 if cfg.use_interframe else 1))
-        os.system(f"ffmpeg -i '{tmp}' -i '{audio_path}' -s {resolution} "
-                  f"-vcodec libx264 -acodec aac -crf 18 -shortest '{output_path}' -y -loglevel error")
-        os.remove(tmp)
+        # save
+        tmp_mp4 = output_path.replace(".mp4", "_noaudio.mp4")
+        save_videos_grid(video, tmp_mp4, n_rows=video.shape[0], fps=cfg.fps * (2 if cfg.use_interframe else 1))
+        os.system(
+            f"ffmpeg -i '{tmp_mp4}' -i '{audio_path}' -s {resolution} "
+            f"-vcodec libx264 -acodec aac -crf 18 -shortest '{output_path}' -y -loglevel error"
+        )
+        os.remove(tmp_mp4)
         return 0
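
For quick reference, here is a minimal usage sketch of the `Sonic.process()` signature as it reads after this commit. It is an illustration only: the constructor arguments and the example file paths are assumptions and do not appear in this diff.

    # Hypothetical usage of the updated sonic.py API; all paths below are placeholders.
    from sonic import Sonic

    pipeline = Sonic()  # constructor arguments are not shown in this diff

    ret = pipeline.process(
        image_path="examples/face.png",    # portrait image (placeholder path)
        audio_path="examples/speech.wav",  # driving speech audio (placeholder path)
        output_path="outputs/result.mp4",  # ffmpeg muxes the audio into this file
        min_resolution=512,                # defaults below match the new signature
        inference_steps=25,                # copied into cfg.num_inference_steps
        dynamic_scale=1.0,                 # copied into cfg.motion_bucket_scale
        keep_resolution=False,             # True keeps the source image resolution
        seed=42,
    )

    # process() returns 0 on success and -1 when preprocessing yields no usable data.
    print("done" if ret == 0 else "failed")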