Update app.py
app.py CHANGED
@@ -126,59 +126,57 @@ def handle_feedback(feedback):
     return "Thank you for your feedback!", None
 
 def segment_background_audio(audio_path, output_path="background_segments.wav", hf_token=None):
-    return 10
-    # Parameters:
-    # - audio_path (str): Path to input audio (.wav).
-    # - output_path (str): Path to save the output non-speech audio.
-    # - hf_token (str): Hugging Face auth token for pyannote.
-    # raise ValueError("Hugging Face token is required for pyannote pipeline.")
-    # print("✅ Speech segments detected.")
-    # current_time = segment.end
-    # non_speech_audio = AudioSegment.empty()
-    # for start, end in background_segments:
-    #     segment = full_audio[int(start * 1000):int(end * 1000)]
-    #     non_speech_audio += segment
+    """
+    Detects and extracts non-speech (background) segments from audio using pyannote VAD.
+
+    Parameters:
+    - audio_path (str): Path to input audio (.wav).
+    - output_path (str): Path to save the output non-speech audio.
+    - hf_token (str): Hugging Face auth token for pyannote.
+
+    Returns:
+    - List of non-speech timestamp tuples (start, end) in seconds.
+    """
+    if not hf_token:
+        raise ValueError("Hugging Face token is required for pyannote pipeline.")
+
+    # Step 1: Load pipeline
+    pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_token)
+
+    # Step 2: Apply VAD to get speech segments
+    vad_result = pipeline(audio_path)
+    print("✅ Speech segments detected.")
+
+    # Step 3: Get full duration of the audio
+    full_audio = AudioSegment.from_wav(audio_path)
+    full_duration_sec = len(full_audio) / 1000.0
+
+    # Step 4: Compute non-speech segments
+    background_segments = []
+    current_time = 0.0
+
+    for segment in vad_result.itersegments():
+        if current_time < segment.start:
+            background_segments.append((current_time, segment.start))
+        current_time = segment.end
+
+    if current_time < full_duration_sec:
+        background_segments.append((current_time, full_duration_sec))
+
+    print(f"🔍 Non-speech segments: {background_segments}")
+
+    # Step 5: Extract and combine non-speech segments
+    non_speech_audio = AudioSegment.empty()
+    for start, end in background_segments:
+        segment = full_audio[int(start * 1000):int(end * 1000)]
+        non_speech_audio += segment
+
+    # Step 6: Export the non-speech audio
+    non_speech_audio.export(output_path, format="wav")
+    print(f"🎵 Non-speech audio saved to: {output_path}")
+
+    return background_segments
 
 def transcribe_video_with_speakers(video_path):
     # Extract audio from video
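For context, a minimal usage sketch of the rewritten function. It assumes pyannote.audio and pydub are installed, that app.py imports Pipeline and AudioSegment (both are used above) and can be imported without side effects, and that the token grants access to the gated pyannote/voice-activity-detection model; input_audio.wav and the HF_TOKEN variable are illustrative stand-ins, not part of this commit:

import os

from app import segment_background_audio  # assumes app.py is importable

# Hypothetical token source, for illustration only.
hf_token = os.environ["HF_TOKEN"]

# Writes the concatenated non-speech audio to background_segments.wav
# and returns the (start, end) tuples in seconds.
segments = segment_background_audio("input_audio.wav", hf_token=hf_token)
print(f"Detected {len(segments)} non-speech spans")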
@@ -476,7 +474,7 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voice
     ## Need to implement backup option.
 
     with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, add_voiceover, target_language, font_path, speaker_sample_paths)
+        futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, add_voiceover, target_language, font_path, speaker_sample_paths, background_audio_path="background_segments.wav")
                    for i, entry in enumerate(translated_json)]
 
     results = []
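Note that executor.submit forwards the new background_audio_path keyword straight to process_entry, so that function (not shown in this diff) must accept it. A hypothetical matching signature; every parameter name other than background_audio_path is guessed from the call site above:

def process_entry(entry, i, tts_model, video_w, video_h, add_voiceover,
                  target_language, font_path, speaker_sample_paths,
                  background_audio_path=None):
    ...  # actual body lives elsewhere in app.py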
@@ -500,15 +498,27 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voice
     final_video = CompositeVideoClip([video] + text_clips)
 
     if add_voiceover and audio_segments:
+        try:
+            voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
+
+            if background_audio_path and os.path.exists(background_audio_path):
+                background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
+                final_audio = CompositeAudioClip([voice_audio, background_audio])
+                logger.info("✅ Background audio loaded and merged with voiceover.")
+            else:
+                final_audio = voice_audio
+                logger.info("⚠️ No background audio found. Using voiceover only.")
+
+            final_video = final_video.set_audio(final_audio)
+
+        except Exception as e:
+            logger.error(f"❌ Failed to set audio: {e}")
+
     logger.info(f"Saving the final video to: {output_path}")
     final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
 
     logger.info("Video processing completed successfully.")
 
-    # Optional: return errors
     if error_messages:
         logger.warning("⚠️ Errors encountered during processing:")
         for msg in error_messages:
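The merge logic above layers the voiceover over the extracted background track. A self-contained sketch of the same idea, using the moviepy 1.x API this file relies on (set_duration/set_audio; moviepy 2.x renames these to with_duration/with_audio); the file names are placeholders:

from moviepy.editor import AudioFileClip, CompositeAudioClip, VideoFileClip

video = VideoFileClip("input.mp4")                   # placeholder paths
voice = AudioFileClip("voiceover.wav")
background = AudioFileClip("background_segments.wav")

# CompositeAudioClip plays all layers at once; trimming each layer to
# the video duration keeps the mix aligned with the picture.
mix = CompositeAudioClip([voice.set_duration(video.duration),
                          background.set_duration(video.duration)])
video.set_audio(mix).write_videofile("out.mp4", codec="libx264", audio_codec="aac")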
@@ -516,7 +526,7 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voice
 
     return error_messages
 
-def generate_voiceover_clone(translated_json, tts_model, desired_duration, target_language, speaker_wav_path, output_audio_path):
+def generate_voiceover_clone(translated_json, tts_model, desired_duration, target_language, speaker_wav_path, output_audio_path, use_clone=False):
     try:
         full_text = " ".join(entry["translated"] for entry in translated_json if "translated" in entry and entry["translated"].strip())
         if not full_text.strip():
@@ -529,13 +539,6 @@ def generate_voiceover_clone(translated_json, tts_model, desired_duration, targe
             logger.error(msg)
             return None, msg, msg
 
-        # # Truncate text based on max token assumption (~60 tokens)
-        # MAX_TTS_TOKENS = 60
-        # tokens = full_text.split()  # crude token count
-        # if len(tokens) > MAX_TTS_TOKENS:
-        #     logger.warning(f"⚠️ Text too long for TTS model ({len(tokens)} tokens). Truncating to {MAX_TTS_TOKENS} tokens.")
-        #     full_text = " ".join(tokens[:MAX_TTS_TOKENS])
-
         speed_tts = calibrated_speed(full_text, desired_duration)
         tts_model.tts_to_file(
             text=full_text,
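calibrated_speed is defined elsewhere in app.py and is not part of this diff; a plausible stand-in, assuming it converts an estimated natural reading time into the speed multiplier passed to the TTS call (the constants below are illustrative, not the app's actual calibration):

def calibrated_speed(text, desired_duration):
    # Illustrative estimate: ~14 characters per second at normal pace.
    natural_duration = max(len(text) / 14.0, 0.1)   # seconds
    speed = natural_duration / desired_duration     # >1.0 means speak faster
    return min(max(speed, 0.8), 1.5)                # clamp to a safe range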