roychao19477 committed on
Commit
f28da5d
·
1 Parent(s): 6ca32b7

First commit

Browse files
Files changed (1) hide show
  1. app.py +36 -2
app.py CHANGED
@@ -28,6 +28,9 @@ Upload or record a noisy clip and click **Enhance** to hear + see its spectrogra
28
 
29
 
30
  import torch
 
 
 
31
  import yaml
32
  import librosa
33
  import librosa.display
@@ -51,6 +54,23 @@ from moviepy import ImageSequenceClip
51
  # Load face detector
52
  model = YOLO("yolov8n-face.pt").cuda() # assumes CUDA available
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  @spaces.GPU
55
  def extract_faces(video_file):
56
  cap = cv2.VideoCapture(video_file)
@@ -90,8 +110,22 @@ def extract_faces(video_file):
90
  # Save as video
91
  tmpdir = tempfile.mkdtemp()
92
  output_path = os.path.join(tmpdir, "face_only_video.mp4")
93
- clip = ImageSequenceClip([cv2.cvtColor(f, cv2.COLOR_BGR2RGB) for f in frames], fps=25)
94
- clip.write_videofile(output_path, codec="libx264", audio=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
  return output_path
97
 
 
28
 
29
 
30
  import torch
31
+ import ffmpeg
32
+ import torchaudio
33
+ import torchaudio.transforms as T
34
  import yaml
35
  import librosa
36
  import librosa.display
 
54
  # Load face detector
55
  model = YOLO("yolov8n-face.pt").cuda() # assumes CUDA available
56
 
57
def extract_resampled_audio(video_path, target_sr=16000):
    """Extract the audio track of a video and save a resampled WAV copy.

    The audio is first decoded with the ``ffmpeg`` command-line tool into a
    temporary 44.1 kHz 16-bit PCM WAV file, then loaded with torchaudio,
    resampled to ``target_sr`` if necessary, and written to a new temporary
    WAV file.

    Args:
        video_path: Path of the input video passed to ``ffmpeg -i``.
        target_sr: Sample rate (Hz) of the returned WAV file.

    Returns:
        Path of the resampled WAV file. The caller owns this file and is
        responsible for deleting it. Note the file name always ends in
        ``_16k.wav`` regardless of ``target_sr``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status
            (e.g. unreadable input or no audio stream).
    """
    # Imported locally so this function does not depend on a module-level
    # ``import subprocess`` being present in the file.
    import subprocess

    # Step 1: decode the audio track to an intermediate WAV with the ffmpeg CLI.
    # mkstemp creates the file atomically; mktemp is deprecated and race-prone.
    fd, tmp_audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # ffmpeg reopens the path itself ("-y" allows overwriting).
    try:
        subprocess.run(
            [
                "ffmpeg", "-y", "-i", video_path,
                "-vn", "-acodec", "pcm_s16le", "-ar", "44100",
                tmp_audio_path,
            ],
            check=True,  # fail here rather than with an obscure load error later
        )
        waveform, sr = torchaudio.load(tmp_audio_path)
    finally:
        # The intermediate file is no longer needed once loaded (or on failure).
        try:
            os.remove(tmp_audio_path)
        except FileNotFoundError:
            pass

    # Step 2: resample only when the decoded rate differs from the target.
    if sr != target_sr:
        resampler = T.Resample(orig_freq=sr, new_freq=target_sr)
        waveform = resampler(waveform)

    # Step 3: save the resampled audio to a fresh temporary file.
    fd, resampled_audio_path = tempfile.mkstemp(suffix="_16k.wav")
    os.close(fd)
    torchaudio.save(resampled_audio_path, waveform, sample_rate=target_sr)
    return resampled_audio_path
73
+
74
  @spaces.GPU
75
  def extract_faces(video_file):
76
  cap = cv2.VideoCapture(video_file)
 
110
  # Save as video
111
  tmpdir = tempfile.mkdtemp()
112
  output_path = os.path.join(tmpdir, "face_only_video.mp4")
113
+ #clip = ImageSequenceClip([cv2.cvtColor(f, cv2.COLOR_BGR2RGB) for f in frames], fps=25)
114
+ clip = ImageSequenceClip([cv2.cvtColor(f, cv2.COLOR_BGR2RGB) for f in frames], fps=fps)
115
+ clip.write_videofile(output_path, codec="libx264", audio=False, fps=25)
116
+
117
+ # Save audio from original, resampled to 16kHz
118
+ audio_path = os.path.join(tmpdir, "audio_16k.wav")
119
+
120
+ # Extract audio using ffmpeg-python (more robust than moviepy)
121
+ ffmpeg.input(video_file).output(
122
+ audio_path,
123
+ ar=16000, # resample to 16k
124
+ ac=1, # mono
125
+ format='wav',
126
+ vn=None # no video
127
+ ).run(overwrite_output=True)
128
+
129
 
130
  return output_path
131