qqwjq1981 committed
Commit 9666193 · verified · 1 Parent(s): 4c58229

Update app.py

Files changed (1): app.py +37 -0
app.py CHANGED
@@ -33,6 +33,8 @@ import traceback
 from TTS.api import TTS
 import torch
 from TTS.tts.configs.xtts_config import XttsConfig
+from pydub import AudioSegment
+from pyannote.audio import Pipeline
 
 # Accept license terms for Coqui XTTS
 os.environ["COQUI_TOS_AGREED"] = "1"
@@ -128,12 +130,47 @@ logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %
 logger = logging.getLogger(__name__)
 logger.info(f"MoviePy Version: {moviepy.__version__}")
 
+def segment_background_audio(audio_path, output_path="background_segments.wav"):
+    # Step 1: Initialize the pyannote voice activity detection pipeline
+    # (requires a Hugging Face access token)
+    pipeline = Pipeline.from_pretrained(
+        "pyannote/voice-activity-detection",
+        use_auth_token=HUGGINGFACE_TOKEN
+    )
+
+    # Step 2: Run VAD to get the speech segments
+    vad_result = pipeline(audio_path)
+    print(f"Detected speech segments: {vad_result}")
+
+    # Step 3: Load the full audio and mute every detected speech span,
+    # leaving only the non-speech (background) portions audible
+    full_audio = AudioSegment.from_wav(audio_path)
+    result_audio = full_audio
+    for segment in vad_result.itersegments():
+        start_ms = int(segment.start * 1000)
+        end_ms = int(segment.end * 1000)
+        # Splice silence of equal length over the speech span
+        result_audio = (
+            result_audio[:start_ms]
+            + AudioSegment.silent(duration=end_ms - start_ms)
+            + result_audio[end_ms:]
+        )
+
+    # Step 4: Export the background-only audio
+    result_audio.export(output_path, format="wav")
+    print(f"Saved non-speech (background) audio to: {output_path}")
+
+    return output_path
+
 def transcribe_video_with_speakers(video_path):
     # Extract audio from video
     video = VideoFileClip(video_path)
     audio_path = "audio.wav"
     video.audio.write_audiofile(audio_path)
    logger.info(f"Audio extracted from video: {audio_path}")
+
+    segment_background_audio(audio_path)
+    print("Saved non-speech (background) audio to a local file")
 
     # Set up device
     device = "cuda" if torch.cuda.is_available() else "cpu"
 
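Review note: the new helper can be exercised in isolation with a minimal sketch like the one below. It mirrors the pipeline name and use_auth_token handling from the diff; sourcing HUGGINGFACE_TOKEN from an environment variable and pointing at a local audio.wav (the path the transcribe function writes) are assumptions for illustration, and the duration accounting is illustrative only.

import os
from pydub import AudioSegment
from pyannote.audio import Pipeline

# Assumption: the token is exported in the environment rather than
# defined elsewhere in app.py as the diff presumes.
HUGGINGFACE_TOKEN = os.environ["HUGGINGFACE_TOKEN"]

# Same VAD pipeline the commit loads.
pipeline = Pipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HUGGINGFACE_TOKEN,
)

# Run VAD on the extracted track and total up the detected speech.
vad_result = pipeline("audio.wav")
speech_ms = sum(
    int(seg.end * 1000) - int(seg.start * 1000)
    for seg in vad_result.itersegments()
)

full_audio = AudioSegment.from_wav("audio.wav")
print(f"speech: {speech_ms} ms of {len(full_audio)} ms total")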
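The splice-based muting itself can be sanity-checked without downloading any model; here is a small sketch using a generated tone and a hypothetical hard-coded speech span in place of real VAD output.

from pydub import AudioSegment
from pydub.generators import Sine

# Three seconds of 440 Hz tone stands in for real audio.
audio = Sine(440).to_audio_segment(duration=3000)

# Hypothetical VAD output: one speech span from 1.0 s to 2.0 s.
for start_ms, end_ms in [(1000, 2000)]:
    # Same splice the new helper performs: silence replaces the speech span.
    audio = audio[:start_ms] + AudioSegment.silent(duration=end_ms - start_ms) + audio[end_ms:]

audio.export("background_segments.wav", format="wav")
print(len(audio))  # still 3000 ms: splicing preserves the overall duration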