qqwjq1981 committed
Commit 40e96a3 · verified · 1 Parent(s): bfa4d77

Update app.py

Files changed (1)
  1. app.py +37 -78
app.py CHANGED
@@ -1,5 +1,4 @@
 import numpy as np
-import cvxpy as cp
 import re
 import concurrent.futures
 import gradio as gr
@@ -79,29 +78,24 @@ css = """
 .dataframe-container tr {
     height: 50px !important;
 }
-
 /* Ensure text wrapping and prevent overflow */
 .dataframe-container td {
     white-space: normal !important;
     word-break: break-word !important;
 }
-
 /* Set column widths */
 [data-testid="block-container"] .scrolling-dataframe th:nth-child(1),
 [data-testid="block-container"] .scrolling-dataframe td:nth-child(1) {
     width: 6%; /* Start column */
 }
-
 [data-testid="block-container"] .scrolling-dataframe th:nth-child(2),
 [data-testid="block-container"] .scrolling-dataframe td:nth-child(2) {
     width: 47%; /* Original text */
 }
-
 [data-testid="block-container"] .scrolling-dataframe th:nth-child(3),
 [data-testid="block-container"] .scrolling-dataframe td:nth-child(3) {
     width: 47%; /* Translated text */
 }
-
 [data-testid="block-container"] .scrolling-dataframe th:nth-child(4),
 [data-testid="block-container"] .scrolling-dataframe td:nth-child(4) {
     display: none !important;
@@ -173,7 +167,7 @@ def transcribe_video_with_speakers(video_path):
     logger.info("WhisperX model loaded")

     # Transcribe
-    result = model.transcribe(audio_path, chunk_size=6, print_progress = True)
+    result = model.transcribe(audio_path, chunk_size=10, print_progress = True)
     logger.info("Audio transcription completed")

     # Get the detected language
@@ -238,7 +232,6 @@ def transcribe_video_with_speakers(video_path):
 def get_translation_model(source_language, target_language):
     """
     Get the translation model based on the source and target language.
-
     Parameters:
     - target_language (str): The language to translate the content into (e.g., 'es', 'fr').
     - source_language (str): The language of the input content (default is 'en' for English).
@@ -383,44 +376,6 @@ def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_heig
         logger.error(f"\u274c Failed to create subtitle clip: {e}")
         return None

-def solve_optimal_alignment(original_segments, generated_durations, total_duration):
-    """
-    Robust version: Aligns generated speech segments, falls back to greedy allocation if solver fails.
-    Modifies and returns the translated_json with updated 'start' and 'end'.
-    """
-    N = len(original_segments)
-    d = np.array(generated_durations)
-    m = np.array([(seg['start'] + seg['end']) / 2 for seg in original_segments])
-
-    try:
-        s = cp.Variable(N)
-        objective = cp.Minimize(cp.sum_squares(s + d / 2 - m))
-
-        constraints = [s[0] >= 0]
-        for i in range(N - 1):
-            constraints.append(s[i] + d[i] <= s[i + 1])
-        constraints.append(s[N - 1] + d[N - 1] == total_duration)
-
-        problem = cp.Problem(objective, constraints)
-        problem.solve()
-
-        if s.value is None:
-            raise ValueError("Solver failed")
-
-        for i in range(N):
-            original_segments[i]['start'] = round(s.value[i], 3)
-            original_segments[i]['end'] = round(s.value[i] + d[i], 3)
-
-    except Exception as e:
-        print(f"⚠️ Optimization failed: {e}, falling back to greedy alignment.")
-
-        current_time = 0.0
-        for i in range(N):
-            original_segments[i]['start'] = round(current_time, 3)
-            original_segments[i]['end'] = round(current_time + generated_durations[i], 3)
-            current_time += generated_durations[i]
-
-    return original_segments
 def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
     logger.debug(f"Processing entry {i}: {entry}")
     error_message = None
@@ -433,7 +388,6 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
     txt_clip = None

     audio_segment = None
-    actual_duration = 0.0
    if process_mode > 1:
        try:
            segment_audio_path = f"segment_{i}_voiceover.wav"
@@ -442,9 +396,10 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,

             speaker = entry.get("speaker", "default")
             speaker_wav_path = f"speaker_{speaker}_sample.wav"
-
+
+            # Assume this is the list of supported languages for the TTS model
             supported_languages = tts_model.synthesizer.tts_model.language_manager.name_to_id.keys()
-
+
             if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in supported_languages:
                 generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
             else:
@@ -454,9 +409,14 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
                 raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")

             audio_clip = AudioFileClip(segment_audio_path)
-            actual_duration = audio_clip.duration
+            logger.debug(f"Audio clip duration: {audio_clip.duration}, Desired duration: {desired_duration}")
+
+            if audio_clip.duration < desired_duration:
+                silence_duration = desired_duration - audio_clip.duration
+                audio_clip = concatenate_audioclips([audio_clip, silence(duration=silence_duration)])
+                logger.info(f"Padded audio with {silence_duration} seconds of silence.")

-            audio_segment = audio_clip  # Do not set start here, alignment happens later
+            audio_segment = audio_clip.set_start(entry["start"]).set_duration(desired_duration)

         except Exception as e:
             err = f"❌ Failed to generate audio segment for entry {i}: {e}"
@@ -464,31 +424,28 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
             error_message = error_message + " | " + err if error_message else err
             audio_segment = None

-    return i, txt_clip, audio_segment, actual_duration, error_message
-
-
+    return i, txt_clip, audio_segment, error_message
+
 def add_transcript_voiceover(video_path, translated_json, output_path, process_mode, target_language="en", speaker_sample_paths=None, background_audio_path="background_segments.wav"):
-
     video = VideoFileClip(video_path)
     font_path = "./NotoSansSC-Regular.ttf"

     text_clips = []
     audio_segments = []
-    actual_durations = []
     error_messages = []
-
+
     if process_mode == 3:
         global tts_model
         if tts_model is None:
             try:
                 print("🔄 Loading XTTS model...")
-                from TTS.api import TTS
                 tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
                 print("✅ XTTS model loaded successfully.")
             except Exception as e:
                 print("❌ Error loading XTTS model:")
                 traceback.print_exc()
                 return f"Error loading XTTS model: {e}"
+    ## Need to implmenet backup option.

     with concurrent.futures.ThreadPoolExecutor() as executor:
         futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
@@ -497,48 +454,51 @@ def add_transcript_voiceover(video_path, translated_json, output_path, process_m
     results = []
     for future in concurrent.futures.as_completed(futures):
         try:
-            i, txt_clip, audio_segment, actual_duration, error = future.result()
-            results.append((i, txt_clip, audio_segment, actual_duration))
+            i, txt_clip, audio_segment, error = future.result()
+            results.append((i, txt_clip, audio_segment))
             if error:
                 error_messages.append(f"[Entry {i}] {error}")
         except Exception as e:
             err = f"❌ Unexpected error in future result: {e}"
+            logger.error(err)
             error_messages.append(err)

+    # Sort by entry index to ensure order
     results.sort(key=lambda x: x[0])
-    text_clips = [clip for _, clip, _, _ in results if clip]
-    generated_durations = [dur for _, _, _, dur in results if dur > 0]
-
-    # Align using optimization (modifies translated_json in-place)
-    translated_json = solve_optimal_alignment(translated_json, generated_durations, video.duration)
-
-    # Set aligned timings
-    audio_segments = []
-    for i, entry in enumerate(translated_json):
-        segment = results[i][2]  # AudioFileClip
-        if segment:
-            segment = segment.set_start(entry['start']).set_duration(entry['end'] - entry['start'])
-            audio_segments.append(segment)
+    text_clips = [clip for _, clip, _ in results if clip]
+    if process_mode>1:
+        audio_segments = [segment for _, _, segment in results if segment]

     final_video = CompositeVideoClip([video] + text_clips)

-    if process_mode > 1 and audio_segments:
+    if process_mode>1 and audio_segments:
         try:
             voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)

             if background_audio_path and os.path.exists(background_audio_path):
                 background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
                 final_audio = CompositeAudioClip([voice_audio, background_audio])
+                # final_audio = voice_audio
+                logger.info("✅ Background audio loaded and merged with voiceover.")
             else:
                 final_audio = voice_audio
+                logger.info("⚠️ No background audio found. Using voiceover only.")

             final_video = final_video.set_audio(final_audio)

         except Exception as e:
-            print(f"❌ Failed to set audio: {e}")
-
+            logger.error(f"❌ Failed to set audio: {e}")
+
+    logger.info(f"Saving the final video to: {output_path}")
     final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")

+    logger.info("Video processing completed successfully.")
+
+    if error_messages:
+        logger.warning("⚠️ Errors encountered during processing:")
+        for msg in error_messages:
+            logger.warning(msg)
+
     return error_messages

 def generate_voiceover_OpenAI(full_text, language, desired_speed, output_audio_path):
@@ -737,5 +697,4 @@ def build_interface():

 tts_model = None
 # Launch the Gradio interface
-demo = build_interface()
-demo.launch()
+demo = build_interface()