Update app.py
Browse files
app.py
CHANGED
@@ -370,9 +370,12 @@ def process_entry(entry, i, video_width, video_height, add_voiceover, target_lan
|
|
370 |
speaker = entry.get("speaker", "default")
|
371 |
speaker_wav_path = f"speaker_{speaker}_sample.wav"
|
372 |
|
373 |
-
generate_voiceover_clone([entry], desired_duration, target_language, speaker_wav_path, segment_audio_path)
|
374 |
|
375 |
-
if
|
|
|
|
|
|
|
376 |
raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
|
377 |
|
378 |
audio_clip = AudioFileClip(segment_audio_path)
|
@@ -392,6 +395,7 @@ def process_entry(entry, i, video_width, video_height, add_voiceover, target_lan
|
|
392 |
audio_segment = None
|
393 |
|
394 |
return i, txt_clip, audio_segment, error_message
|
|
|
395 |
def add_transcript_voiceover(video_path, translated_json, output_path, add_voiceover=False, target_language="en", speaker_sample_paths=None):
|
396 |
video = VideoFileClip(video_path)
|
397 |
font_path = "./NotoSansSC-Regular.ttf"
|
@@ -459,19 +463,17 @@ tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
|
|
459 |
|
460 |
def generate_voiceover_clone(translated_json, desired_duration, target_language, speaker_wav_path, output_audio_path):
|
461 |
try:
|
462 |
-
# 1. Assemble full text
|
463 |
full_text = " ".join(entry["translated"] for entry in translated_json if "translated" in entry and entry["translated"].strip())
|
464 |
-
|
465 |
if not full_text.strip():
|
466 |
-
|
467 |
-
|
|
|
468 |
|
469 |
-
# 2. Check speaker file path
|
470 |
if not speaker_wav_path or not os.path.exists(speaker_wav_path):
|
471 |
-
|
472 |
-
|
|
|
473 |
|
474 |
-
# Optional: Print speaker audio duration
|
475 |
try:
|
476 |
with wave.open(speaker_wav_path, 'rb') as wav_file:
|
477 |
duration = wav_file.getnframes() / wav_file.getframerate()
|
@@ -479,36 +481,29 @@ def generate_voiceover_clone(translated_json, desired_duration, target_language,
|
|
479 |
except Exception as e:
|
480 |
logger.warning(f"⚠️ Could not read speaker WAV duration: {e}")
|
481 |
|
482 |
-
# 3. Log key inputs
|
483 |
-
logger.info(f"📥 Received Text: {full_text}")
|
484 |
-
logger.info(f"π Speaker WAV Path: {speaker_wav_path}")
|
485 |
-
logger.info(f"π Target Language: {target_language}")
|
486 |
-
logger.info(f"💾 Output Path: {output_audio_path}")
|
487 |
-
logger.info(f"⏱️ Target Duration: {desired_duration:.2f}s")
|
488 |
-
|
489 |
-
# 4. Call TTS to generate audio
|
490 |
speed_tts = calculate_speed(full_text, desired_duration)
|
491 |
tts.tts_to_file(
|
492 |
text=full_text,
|
493 |
speaker_wav=speaker_wav_path,
|
494 |
language=target_language,
|
495 |
file_path=output_audio_path,
|
496 |
-
# Uncomment if your model supports speed:
|
497 |
speed=speed_tts
|
498 |
)
|
499 |
|
500 |
-
# 5. Confirm file was written
|
501 |
if not os.path.exists(output_audio_path):
|
502 |
-
|
503 |
-
|
|
|
504 |
|
505 |
-
|
506 |
-
|
|
|
507 |
|
508 |
except Exception as e:
|
|
|
509 |
logger.error("❌ Error during voice cloning:")
|
510 |
logger.error(traceback.format_exc())
|
511 |
-
return None,
|
512 |
|
513 |
def truncated_linear(x):
|
514 |
if x < 15:
|
|
|
370 |
speaker = entry.get("speaker", "default")
|
371 |
speaker_wav_path = f"speaker_{speaker}_sample.wav"
|
372 |
|
373 |
+
output_path, status_msg, tts_error = generate_voiceover_clone([entry], desired_duration, target_language, speaker_wav_path, segment_audio_path)
|
374 |
|
375 |
+
if tts_error:
|
376 |
+
error_message = error_message + " | " + tts_error if error_message else tts_error
|
377 |
+
|
378 |
+
if not output_path or not os.path.exists(segment_audio_path):
|
379 |
raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
|
380 |
|
381 |
audio_clip = AudioFileClip(segment_audio_path)
|
|
|
395 |
audio_segment = None
|
396 |
|
397 |
return i, txt_clip, audio_segment, error_message
|
398 |
+
|
399 |
def add_transcript_voiceover(video_path, translated_json, output_path, add_voiceover=False, target_language="en", speaker_sample_paths=None):
|
400 |
video = VideoFileClip(video_path)
|
401 |
font_path = "./NotoSansSC-Regular.ttf"
|
|
|
463 |
|
464 |
def generate_voiceover_clone(translated_json, desired_duration, target_language, speaker_wav_path, output_audio_path):
|
465 |
try:
|
|
|
466 |
full_text = " ".join(entry["translated"] for entry in translated_json if "translated" in entry and entry["translated"].strip())
|
|
|
467 |
if not full_text.strip():
|
468 |
+
msg = "❌ Translated text is empty."
|
469 |
+
logger.error(msg)
|
470 |
+
return None, msg, msg
|
471 |
|
|
|
472 |
if not speaker_wav_path or not os.path.exists(speaker_wav_path):
|
473 |
+
msg = f"❌ Speaker audio not found: {speaker_wav_path}"
|
474 |
+
logger.error(msg)
|
475 |
+
return None, msg, msg
|
476 |
|
|
|
477 |
try:
|
478 |
with wave.open(speaker_wav_path, 'rb') as wav_file:
|
479 |
duration = wav_file.getnframes() / wav_file.getframerate()
|
|
|
481 |
except Exception as e:
|
482 |
logger.warning(f"⚠️ Could not read speaker WAV duration: {e}")
|
483 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
484 |
speed_tts = calculate_speed(full_text, desired_duration)
|
485 |
tts.tts_to_file(
|
486 |
text=full_text,
|
487 |
speaker_wav=speaker_wav_path,
|
488 |
language=target_language,
|
489 |
file_path=output_audio_path,
|
|
|
490 |
speed=speed_tts
|
491 |
)
|
492 |
|
|
|
493 |
if not os.path.exists(output_audio_path):
|
494 |
+
msg = f"❌ Voiceover file not generated at: {output_audio_path}"
|
495 |
+
logger.error(msg)
|
496 |
+
return None, msg, msg
|
497 |
|
498 |
+
msg = "✅ Voice cloning completed successfully."
|
499 |
+
logger.info(msg)
|
500 |
+
return output_audio_path, msg, None
|
501 |
|
502 |
except Exception as e:
|
503 |
+
err_msg = f"❌ An error occurred: {str(e)}"
|
504 |
logger.error("❌ Error during voice cloning:")
|
505 |
logger.error(traceback.format_exc())
|
506 |
+
return None, err_msg, err_msg
|
507 |
|
508 |
def truncated_linear(x):
|
509 |
if x < 15:
|