fffiloni committed on
Commit
62c47fb
·
verified ·
1 Parent(s): e7708ce

Update gradio_app.py

Browse files
Files changed (1) hide show
  1. gradio_app.py +36 -54
gradio_app.py CHANGED
@@ -32,6 +32,32 @@ def attach_audio_to_video(original_video, audio_path, out_path):
32
  new_video.write_videofile(out_path, audio_codec='aac', verbose=False, logger=None)
33
  return out_path
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  @spaces.GPU()
36
  def separate_dnr(audio_file):
37
  audio, sr = torchaudio.load(audio_file)
@@ -58,27 +84,7 @@ def separate_dnr(audio_file):
58
 
59
  @spaces.GPU()
60
  def separate_speakers(audio_path):
61
- waveform, original_sr = torchaudio.load(audio_path)
62
- if original_sr != TARGET_SR:
63
- waveform = T.Resample(orig_freq=original_sr, new_freq=TARGET_SR)(waveform)
64
-
65
- if waveform.dim() == 1:
66
- waveform = waveform.unsqueeze(0)
67
- audio_input = waveform.unsqueeze(0).to(device)
68
-
69
- with torch.no_grad():
70
- ests_speech = sep_model(audio_input).squeeze(0)
71
-
72
- session_id = uuid.uuid4().hex[:8]
73
- output_dir = os.path.join("output_sep", session_id)
74
- os.makedirs(output_dir, exist_ok=True)
75
-
76
- output_files = []
77
- for i in range(ests_speech.shape[0]):
78
- path = os.path.join(output_dir, f"speaker_{i+1}.wav")
79
- sf.write(path, ests_speech[i].cpu().numpy(), TARGET_SR)
80
- output_files.append(path)
81
-
82
  updates = []
83
  for i in range(MAX_SPEAKERS):
84
  if i < len(output_files):
@@ -102,40 +108,16 @@ def separate_dnr_video(video_path):
102
 
103
  return dialog_video, effect_video, music_video
104
 
105
-
106
-
107
  @spaces.GPU()
108
- def separate_speakers_video(video_path):
109
- audio_path, video = extract_audio_from_video(video_path)
110
- waveform, original_sr = torchaudio.load(audio_path)
111
-
112
- if original_sr != TARGET_SR:
113
- waveform = T.Resample(orig_freq=original_sr, new_freq=TARGET_SR)(waveform)
114
-
115
- if waveform.dim() == 1:
116
- waveform = waveform.unsqueeze(0)
117
- audio_input = waveform.unsqueeze(0).to(device)
118
-
119
- with torch.no_grad():
120
- ests_speech = sep_model(audio_input).squeeze(0)
121
-
122
- session_id = uuid.uuid4().hex[:8]
123
- output_dir = os.path.join("output_sep_video", session_id)
124
- os.makedirs(output_dir, exist_ok=True)
125
-
126
- output_files = []
127
- for i in range(ests_speech.shape[0]):
128
- separated_audio_path = os.path.join(output_dir, f"speaker_{i+1}.wav")
129
- mono_audio = ests_speech[i].cpu().unsqueeze(0) # Shape: [1, time]
130
- torchaudio.save(separated_audio_path, mono_audio.contiguous(), TARGET_SR, format="wav", encoding="PCM_S") # safest combo
131
-
132
- # Attach audio back to video
133
- out_video_path = os.path.join(output_dir, f"speaker_{i+1}.mp4")
134
- attach_audio_to_video(video, separated_audio_path, out_video_path)
135
- output_files.append(out_video_path)
136
-
137
- return output_files + [None] * (MAX_SPEAKERS - len(output_files))
138
-
139
 
140
 
141
  # --- Gradio UI ---
 
32
  new_video.write_videofile(out_path, audio_codec='aac', verbose=False, logger=None)
33
  return out_path
34
 
35
+
36
def separate_speakers_core(audio_path):
    """Run the speaker-separation model on an audio file and save each output.

    The audio is loaded with ``torchaudio.load``, resampled to ``TARGET_SR``
    when its sample rate differs, and passed to ``sep_model`` with a leading
    batch dimension. Every entry along the first axis of the model output
    (after the batch dimension is squeezed away) is written as
    ``speaker_<n>.wav`` inside a fresh ``output_sep/<session id>`` directory.

    Args:
        audio_path: Path of the audio file to separate.

    Returns:
        List of the written WAV file paths, in output order.
    """
    signal, source_sr = torchaudio.load(audio_path)
    needs_resampling = source_sr != TARGET_SR
    if needs_resampling:
        resampler = T.Resample(orig_freq=source_sr, new_freq=TARGET_SR)
        signal = resampler(signal)

    # Promote a 1-D signal to 2-D before the batch dimension is added.
    if signal.dim() == 1:
        signal = signal.unsqueeze(0)
    batch = signal.unsqueeze(0).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        separated = sep_model(batch).squeeze(0)

    # A short random id keeps each call's files in their own directory.
    session_dir = os.path.join("output_sep", uuid.uuid4().hex[:8])
    os.makedirs(session_dir, exist_ok=True)

    written_paths = []
    for speaker_no in range(1, separated.shape[0] + 1):
        wav_path = os.path.join(session_dir, f"speaker_{speaker_no}.wav")
        sf.write(wav_path, separated[speaker_no - 1].cpu().numpy(), TARGET_SR)
        written_paths.append(wav_path)
    return written_paths
59
+
60
+
61
  @spaces.GPU()
62
  def separate_dnr(audio_file):
63
  audio, sr = torchaudio.load(audio_file)
 
84
 
85
  @spaces.GPU()
86
  def separate_speakers(audio_path):
87
+ output_files = separate_speakers_core(audio_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  updates = []
89
  for i in range(MAX_SPEAKERS):
90
  if i < len(output_files):
 
108
 
109
  return dialog_video, effect_video, music_video
110
 
 
 
111
  @spaces.GPU()
112
def separate_speakers(audio_path):
    """Separate speakers in an audio file and build one UI update per slot.

    Calls ``separate_speakers_core`` to obtain the separated file paths, then
    produces exactly ``MAX_SPEAKERS`` ``gr.update`` objects: slots that have a
    file are made visible and labelled ``Speaker <n>``; the remaining slots
    are cleared and hidden.

    Args:
        audio_path: Path of the audio file to separate.

    Returns:
        List of ``MAX_SPEAKERS`` ``gr.update`` results, in slot order.
    """
    # NOTE(review): this diff shows another `separate_speakers` defined
    # earlier in the file, and this definition takes the place of the removed
    # `separate_speakers_video` - confirm the video callback is still defined.
    separated_paths = separate_speakers_core(audio_path)
    available = len(separated_paths)

    def _slot_update(slot):
        # Slots beyond the number of separated files are emptied and hidden.
        if slot >= available:
            return gr.update(value=None, visible=False)
        return gr.update(
            value=separated_paths[slot],
            visible=True,
            label=f"Speaker {slot+1}",
        )

    return [_slot_update(slot) for slot in range(MAX_SPEAKERS)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
 
123
  # --- Gradio UI ---