Update app.py
app.py
CHANGED
@@ -1,5 +1,4 @@
 import numpy as np
-import cvxpy as cp
 import re
 import concurrent.futures
 import gradio as gr
@@ -79,29 +78,24 @@ css = """
 .dataframe-container tr {
     height: 50px !important;
 }
-
 /* Ensure text wrapping and prevent overflow */
 .dataframe-container td {
     white-space: normal !important;
     word-break: break-word !important;
 }
-
 /* Set column widths */
 [data-testid="block-container"] .scrolling-dataframe th:nth-child(1),
 [data-testid="block-container"] .scrolling-dataframe td:nth-child(1) {
     width: 6%; /* Start column */
 }
-
 [data-testid="block-container"] .scrolling-dataframe th:nth-child(2),
 [data-testid="block-container"] .scrolling-dataframe td:nth-child(2) {
     width: 47%; /* Original text */
 }
-
 [data-testid="block-container"] .scrolling-dataframe th:nth-child(3),
 [data-testid="block-container"] .scrolling-dataframe td:nth-child(3) {
     width: 47%; /* Translated text */
 }
-
 [data-testid="block-container"] .scrolling-dataframe th:nth-child(4),
 [data-testid="block-container"] .scrolling-dataframe td:nth-child(4) {
     display: none !important;
@@ -173,7 +167,7 @@ def transcribe_video_with_speakers(video_path):
     logger.info("WhisperX model loaded")

     # Transcribe
-    result = model.transcribe(audio_path, chunk_size=
+    result = model.transcribe(audio_path, chunk_size=10, print_progress = True)
     logger.info("Audio transcription completed")

     # Get the detected language
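Note: the updated call above uses WhisperX's batched transcription API, where chunk_size is the audio chunk length in seconds and print_progress enables per-batch progress logging. A minimal standalone sketch of the same call, assuming WhisperX is installed (the model size, device, and file name below are placeholders, not values from this commit):

    import whisperx

    device = "cpu"  # placeholder; app.py chooses its own device
    model = whisperx.load_model("large-v2", device, compute_type="int8")

    # transcribe() accepts a file path or an array loaded via whisperx.load_audio()
    result = model.transcribe("input_audio.wav", chunk_size=10, print_progress=True)
    print(result["language"], len(result["segments"]))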
@@ -238,7 +232,6 @@ def transcribe_video_with_speakers(video_path):
 def get_translation_model(source_language, target_language):
     """
     Get the translation model based on the source and target language.
-
     Parameters:
     - target_language (str): The language to translate the content into (e.g., 'es', 'fr').
     - source_language (str): The language of the input content (default is 'en' for English).
@@ -383,44 +376,6 @@ def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_heig
         logger.error(f"\u274c Failed to create subtitle clip: {e}")
         return None

-def solve_optimal_alignment(original_segments, generated_durations, total_duration):
-    """
-    Robust version: Aligns generated speech segments, falls back to greedy allocation if solver fails.
-    Modifies and returns the translated_json with updated 'start' and 'end'.
-    """
-    N = len(original_segments)
-    d = np.array(generated_durations)
-    m = np.array([(seg['start'] + seg['end']) / 2 for seg in original_segments])
-
-    try:
-        s = cp.Variable(N)
-        objective = cp.Minimize(cp.sum_squares(s + d / 2 - m))
-
-        constraints = [s[0] >= 0]
-        for i in range(N - 1):
-            constraints.append(s[i] + d[i] <= s[i + 1])
-        constraints.append(s[N - 1] + d[N - 1] == total_duration)
-
-        problem = cp.Problem(objective, constraints)
-        problem.solve()
-
-        if s.value is None:
-            raise ValueError("Solver failed")
-
-        for i in range(N):
-            original_segments[i]['start'] = round(s.value[i], 3)
-            original_segments[i]['end'] = round(s.value[i] + d[i], 3)
-
-    except Exception as e:
-        print(f"⚠️ Optimization failed: {e}, falling back to greedy alignment.")
-
-        current_time = 0.0
-        for i in range(N):
-            original_segments[i]['start'] = round(current_time, 3)
-            original_segments[i]['end'] = round(current_time + generated_durations[i], 3)
-            current_time += generated_durations[i]
-
-    return original_segments
 def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
     logger.debug(f"Processing entry {i}: {entry}")
     error_message = None
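Note: the deleted solve_optimal_alignment helper (together with the cvxpy import removed in the first hunk) solved a small least-squares scheduling problem: pick start times s[i] that keep each generated clip centered on the original segment's midpoint, subject to clips not overlapping and the last clip ending exactly at total_duration, with a greedy back-to-back layout as fallback. For reference only, a usage sketch of the removed helper with invented timings (not from the repo):

    # Hypothetical inputs: two original segments and the durations of their generated voiceovers.
    original_segments = [
        {"start": 0.0, "end": 2.0},
        {"start": 2.5, "end": 5.0},
    ]
    generated_durations = [1.8, 2.4]

    aligned = solve_optimal_alignment(original_segments, generated_durations, total_duration=5.0)
    for seg in aligned:
        print(seg["start"], seg["end"])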
@@ -433,7 +388,6 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
     txt_clip = None

     audio_segment = None
-    actual_duration = 0.0
     if process_mode > 1:
         try:
             segment_audio_path = f"segment_{i}_voiceover.wav"
@@ -442,9 +396,10 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,

             speaker = entry.get("speaker", "default")
             speaker_wav_path = f"speaker_{speaker}_sample.wav"
-
+
+            # Assume this is the list of supported languages for the TTS model
             supported_languages = tts_model.synthesizer.tts_model.language_manager.name_to_id.keys()
-
+
             if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in supported_languages:
                 generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
             else:
@@ -454,9 +409,14 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
                 raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")

             audio_clip = AudioFileClip(segment_audio_path)
-
+            logger.debug(f"Audio clip duration: {audio_clip.duration}, Desired duration: {desired_duration}")
+
+            if audio_clip.duration < desired_duration:
+                silence_duration = desired_duration - audio_clip.duration
+                audio_clip = concatenate_audioclips([audio_clip, silence(duration=silence_duration)])
+                logger.info(f"Padded audio with {silence_duration} seconds of silence.")

-            audio_segment = audio_clip
+            audio_segment = audio_clip.set_start(entry["start"]).set_duration(desired_duration)

         except Exception as e:
             err = f"❌ Failed to generate audio segment for entry {i}: {e}"
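Note: the padding branch added above calls a silence(duration=...) helper that does not appear in this diff; with MoviePy such a helper is usually a zero-amplitude AudioClip. A minimal sketch under that assumption (the helper name and fps default are guesses):

    import numpy as np
    from moviepy.audio.AudioClip import AudioClip

    def silence(duration, fps=44100):
        # Stereo clip of zeros so it can be concatenated with AudioFileClip segments.
        return AudioClip(lambda t: np.zeros((np.size(t), 2)), duration=duration, fps=fps)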
@@ -464,31 +424,28 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
                 error_message = error_message + " | " + err if error_message else err
                 audio_segment = None

-    return i, txt_clip, audio_segment,
-
-
+    return i, txt_clip, audio_segment, error_message
+
 def add_transcript_voiceover(video_path, translated_json, output_path, process_mode, target_language="en", speaker_sample_paths=None, background_audio_path="background_segments.wav"):
-
     video = VideoFileClip(video_path)
     font_path = "./NotoSansSC-Regular.ttf"

     text_clips = []
     audio_segments = []
-    actual_durations = []
     error_messages = []
-
+
     if process_mode == 3:
         global tts_model
         if tts_model is None:
             try:
                 print("π Loading XTTS model...")
-                from TTS.api import TTS
                 tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
                 print("✅ XTTS model loaded successfully.")
             except Exception as e:
                 print("❌ Error loading XTTS model:")
                 traceback.print_exc()
                 return f"Error loading XTTS model: {e}"
+    ## Need to implmenet backup option.

     with concurrent.futures.ThreadPoolExecutor() as executor:
         futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
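Note: generate_voiceover_clone is not shown in this diff; with the Coqui TTS your_tts model loaded above, cloning usually reduces to a single tts_to_file call. A hedged sketch that matches the argument order used in process_entry above (speed handling is deliberately omitted):

    def generate_voiceover_clone(text, tts_model, desired_speed, language, speaker_wav_path, output_path):
        # Clone the reference speaker's voice for the translated text.
        # desired_speed is accepted for signature compatibility but not applied in this sketch.
        tts_model.tts_to_file(
            text=text,
            speaker_wav=speaker_wav_path,
            language=language,
            file_path=output_path,
        )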
@@ -497,48 +454,51 @@ def add_transcript_voiceover(video_path, translated_json, output_path, process_m
     results = []
     for future in concurrent.futures.as_completed(futures):
         try:
-            i, txt_clip, audio_segment,
-            results.append((i, txt_clip, audio_segment
+            i, txt_clip, audio_segment, error = future.result()
+            results.append((i, txt_clip, audio_segment))
             if error:
                 error_messages.append(f"[Entry {i}] {error}")
         except Exception as e:
             err = f"❌ Unexpected error in future result: {e}"
+            logger.error(err)
             error_messages.append(err)

+    # Sort by entry index to ensure order
     results.sort(key=lambda x: x[0])
-    text_clips = [clip for _, clip, _
-
-
-    # Align using optimization (modifies translated_json in-place)
-    translated_json = solve_optimal_alignment(translated_json, generated_durations, video.duration)
-
-    # Set aligned timings
-    audio_segments = []
-    for i, entry in enumerate(translated_json):
-        segment = results[i][2]  # AudioFileClip
-        if segment:
-            segment = segment.set_start(entry['start']).set_duration(entry['end'] - entry['start'])
-            audio_segments.append(segment)
+    text_clips = [clip for _, clip, _ in results if clip]
+    if process_mode>1:
+        audio_segments = [segment for _, _, segment in results if segment]

     final_video = CompositeVideoClip([video] + text_clips)

-    if process_mode
+    if process_mode>1 and audio_segments:
         try:
             voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)

             if background_audio_path and os.path.exists(background_audio_path):
                 background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
                 final_audio = CompositeAudioClip([voice_audio, background_audio])
+                # final_audio = voice_audio
+                logger.info("✅ Background audio loaded and merged with voiceover.")
             else:
                 final_audio = voice_audio
+                logger.info("⚠️ No background audio found. Using voiceover only.")

             final_video = final_video.set_audio(final_audio)

         except Exception as e:
-
-
+            logger.error(f"❌ Failed to set audio: {e}")
+
+    logger.info(f"Saving the final video to: {output_path}")
     final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")

+    logger.info("Video processing completed successfully.")
+
+    if error_messages:
+        logger.warning("⚠️ Errors encountered during processing:")
+        for msg in error_messages:
+            logger.warning(msg)
+
     return error_messages

 def generate_voiceover_OpenAI(full_text, language, desired_speed, output_audio_path):
@@ -737,5 +697,4 @@ def build_interface():

 tts_model = None
 # Launch the Gradio interface
-demo = build_interface()
-demo.launch()
+demo = build_interface()