qqwjq1981 committed on
Commit 8d5a056 · verified · 1 Parent(s): cc355be

Update app.py

Files changed (1):
  1. app.py +51 -24
app.py CHANGED
@@ -125,32 +125,59 @@ def handle_feedback(feedback):
     return "Thank you for your feedback!", None
 
 def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
-    pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_api_key)
-    vad_result = pipeline(audio_path)
-
-    full_audio = AudioSegment.from_wav(audio_path)
-    full_duration_sec = len(full_audio) / 1000.0
-
-    current_time = 0.0
-    result_audio = AudioSegment.empty()
-
-    for segment in vad_result.itersegments():
-        # Background segment before the speech
-        if current_time < segment.start:
-            bg = full_audio[int(current_time * 1000):int(segment.start * 1000)]
-            result_audio += bg
-        # Add silence for the speech duration
-        silence_duration = segment.end - segment.start
-        result_audio += AudioSegment.silent(duration=int(silence_duration * 1000))
-        current_time = segment.end
-
-    # Handle any remaining background after the last speech
-    if current_time < full_duration_sec:
-        result_audio += full_audio[int(current_time * 1000):]
-
-    result_audio.export(background_audio_path, format="wav")
+    """
+    Uses Demucs to separate audio and extract background (non-vocal) parts.
+    Merges drums, bass, and other stems into a single background track.
+    """
+    # Step 1: Run Demucs using the 4-stem model
+    subprocess.run([
+        "demucs",
+        "-n", "htdemucs",  # 4-stem model
+        audio_path
+    ], check=True)
+
+    # Step 2: Locate separated stem files
+    filename = os.path.splitext(os.path.basename(audio_path))[0]
+    stem_dir = os.path.join("separated", "htdemucs", filename)
+
+    # Step 3: Load and merge background stems
+    drums = AudioSegment.from_wav(os.path.join(stem_dir, "drums.wav"))
+    bass = AudioSegment.from_wav(os.path.join(stem_dir, "bass.wav"))
+    other = AudioSegment.from_wav(os.path.join(stem_dir, "other.wav"))
+
+    background = drums.overlay(bass).overlay(other)
+
+    # Step 4: Export the merged background
+    background.export(background_audio_path, format="wav")
     return background_audio_path
 
+# def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
+#     pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_api_key)
+#     vad_result = pipeline(audio_path)
+
+#     full_audio = AudioSegment.from_wav(audio_path)
+#     full_duration_sec = len(full_audio) / 1000.0
+
+#     current_time = 0.0
+#     result_audio = AudioSegment.empty()
+
+#     for segment in vad_result.itersegments():
+#         # Background segment before the speech
+#         if current_time < segment.start:
+#             bg = full_audio[int(current_time * 1000):int(segment.start * 1000)]
+#             result_audio += bg
+#         # Add silence for the speech duration
+#         silence_duration = segment.end - segment.start
+#         result_audio += AudioSegment.silent(duration=int(silence_duration * 1000))
+#         current_time = segment.end
+
+#     # Handle any remaining background after the last speech
+#     if current_time < full_duration_sec:
+#         result_audio += full_audio[int(current_time * 1000):]
+
+#     result_audio.export(background_audio_path, format="wav")
+#     return background_audio_path
+
 def transcribe_video_with_speakers(video_path):
     # Extract audio from video
     video = VideoFileClip(video_path)
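
A minimal usage sketch of the new Demucs path (not part of the commit). It assumes `demucs` is installed and on the PATH, that `subprocess`, `os`, and pydub's `AudioSegment` are imported at the top of app.py, and that the htdemucs model writes its stems to `separated/htdemucs/<input-name>/` under the current working directory, which is the layout the function above reads from; the input filename here is hypothetical:

    from pydub import AudioSegment

    # Separate the background (drums + bass + other) from a previously extracted audio track
    background_path = segment_background_audio("uploaded_audio.wav")

    # Demucs stems match the input's length, so the merged background bed does too
    bed = AudioSegment.from_wav(background_path)
    print(f"Background bed: {len(bed) / 1000.0:.1f}s -> {background_path}")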
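
Design note: overlaying drums, bass, and other reconstructs "everything except vocals" from the 4-stem output; pydub's overlay() mixes one segment onto another in place, and since all stems share the input's duration, no padding is needed. If the installed Demucs version supports two-stem mode (e.g. `demucs --two-stems=vocals`), it can emit a single no_vocals.wav directly and skip the manual merge, though the 4-stem route used in this commit keeps the individual stems available. Also, `check=True` makes subprocess.run raise CalledProcessError when the demucs process fails, so a failed separation surfaces as an exception rather than a later missing-file error.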