Update app.py
app.py
CHANGED
@@ -33,6 +33,8 @@ import traceback
 from TTS.api import TTS
 import torch
 from TTS.tts.configs.xtts_config import XttsConfig
+from pydub import AudioSegment
+from pyannote.audio import Pipeline
 
 # Accept license terms for Coqui XTTS
 os.environ["COQUI_TOS_AGREED"] = "1"
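Both new imports carry setup worth noting before running this change locally: pydub shells out to ffmpeg for non-WAV decoding and encoding, and pyannote/voice-activity-detection is a gated model whose user conditions must be accepted on the Hugging Face Hub before Pipeline.from_pretrained can download it. A minimal preflight sketch, assuming an HF_TOKEN environment variable (app.py itself appears to use a HUGGINGFACE_TOKEN constant):

import os
import shutil

from pydub import AudioSegment      # ffmpeg must be on PATH for non-WAV formats
from pyannote.audio import Pipeline

# Fail fast if ffmpeg is missing, rather than deep inside a later export call
assert shutil.which("ffmpeg"), "ffmpeg must be installed and on PATH"

# Hypothetical token lookup; the Space reads its own HUGGINGFACE_TOKEN setting
token = os.environ.get("HF_TOKEN")
pipeline = Pipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=token,
)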
@@ -128,12 +130,47 @@ logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %
 logger = logging.getLogger(__name__)
 logger.info(f"MoviePy Version: {moviepy.__version__}")
 
+def segment_background_audio(audio_path, output_path="background_segments.wav"):
+    # Initialize the pyannote voice activity detection pipeline
+    # (the gated model requires a Hugging Face access token)
+    pipeline = Pipeline.from_pretrained(
+        "pyannote/voice-activity-detection",
+        use_auth_token=HUGGINGFACE_TOKEN
+    )
+
+    # Run VAD to find the speech regions of the audio
+    vad_result = pipeline(audio_path)
+    print(f"Detected speech segments: {vad_result}")
+
+    # Mute every detected speech region, leaving only the background audio
+    full_audio = AudioSegment.from_wav(audio_path)
+    result_audio = full_audio
+
+    for segment in vad_result.get_timeline().support():
+        start_ms = int(segment.start * 1000)
+        end_ms = int(segment.end * 1000)
+        # Overlaying silence only mixes; splice silence in to truly mute
+        result_audio = (
+            result_audio[:start_ms]
+            + AudioSegment.silent(duration=end_ms - start_ms)
+            + result_audio[end_ms:]
+        )
+
+    # Export the non-speech segments
+    result_audio.export(output_path, format="wav")
+    print(f"Saved non-speech (background) audio to: {output_path}")
+
+    return output_path
+
 def transcribe_video_with_speakers(video_path):
     # Extract audio from video
     video = VideoFileClip(video_path)
     audio_path = "audio.wav"
     video.audio.write_audiofile(audio_path)
     logger.info(f"Audio extracted from video: {audio_path}")
+
+    background_path = segment_background_audio(audio_path)
+    print(f"Saved non-speech (background) audio to: {background_path}")
 
     # Set up device
     device = "cuda" if torch.cuda.is_available() else "cpu"
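The evident intent of the new function is to preserve the original background track (music, ambience) so it can be remixed under newly synthesized speech. A hypothetical usage sketch, assuming the function is importable from app.py and that a dubbed_voice.wav from the TTS step exists (neither is shown in this diff):

from app import segment_background_audio  # assumed module/function location
from pydub import AudioSegment

# Extract the background track; writes background_segments.wav by default
background_path = segment_background_audio("audio.wav")

# Lay a synthesized voice track over the preserved background
background = AudioSegment.from_wav(background_path)
dubbed_voice = AudioSegment.from_wav("dubbed_voice.wav")  # assumed TTS output
mix = background.overlay(dubbed_voice)
mix.export("dubbed_with_background.wav", format="wav")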