Update app.py
app.py CHANGED
@@ -126,59 +126,57 @@ def handle_feedback(feedback):
     return "Thank you for your feedback!", None
 
 def segment_background_audio(audio_path, output_path="background_segments.wav", hf_token=None):
-    return 10
-    # Parameters:
-    # - audio_path (str): Path to input audio (.wav).
-    # - output_path (str): Path to save the output non-speech audio.
-    # - hf_token (str): Hugging Face auth token for pyannote.
-    # raise ValueError("Hugging Face token is required for pyannote pipeline.")
-    # print("✅ Speech segments detected.")
-    # current_time = segment.end
-    # non_speech_audio = AudioSegment.empty()
-    # for start, end in background_segments:
-    #     segment = full_audio[int(start * 1000):int(end * 1000)]
-    #     non_speech_audio += segment
+    """
+    Detects and extracts non-speech (background) segments from audio using pyannote VAD.
+
+    Parameters:
+    - audio_path (str): Path to input audio (.wav).
+    - output_path (str): Path to save the output non-speech audio.
+    - hf_token (str): Hugging Face auth token for pyannote.
+
+    Returns:
+    - List of non-speech timestamp tuples (start, end) in seconds.
+    """
+    if not hf_token:
+        raise ValueError("Hugging Face token is required for pyannote pipeline.")
+
+    # Step 1: Load pipeline
+    pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_token)
+
+    # Step 2: Apply VAD to get speech segments
+    vad_result = pipeline(audio_path)
+    print("✅ Speech segments detected.")
+
+    # Step 3: Get full duration of the audio
+    full_audio = AudioSegment.from_wav(audio_path)
+    full_duration_sec = len(full_audio) / 1000.0
+
+    # Step 4: Compute non-speech segments
+    background_segments = []
+    current_time = 0.0
+
+    for segment in vad_result.itersegments():
+        if current_time < segment.start:
+            background_segments.append((current_time, segment.start))
+        current_time = segment.end
+
+    if current_time < full_duration_sec:
+        background_segments.append((current_time, full_duration_sec))
+
+    print(f"🔍 Non-speech segments: {background_segments}")
+
+    # Step 5: Extract and combine non-speech segments
+    non_speech_audio = AudioSegment.empty()
+    for start, end in background_segments:
+        segment = full_audio[int(start * 1000):int(end * 1000)]
+        non_speech_audio += segment
+
+    # Step 6: Export the non-speech audio
+    non_speech_audio.export(output_path, format="wav")
+    print(f"🎵 Non-speech audio saved to: {output_path}")
+
+    return background_segments
 
 def transcribe_video_with_speakers(video_path):
     # Extract audio from video
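For context, a minimal usage sketch of the rewritten function. It assumes pyannote.audio and pydub are installed, that app.py imports Pipeline and AudioSegment (both are used above) and can be imported without side effects, and that the token grants access to the gated pyannote/voice-activity-detection model; input_audio.wav and the HF_TOKEN variable are illustrative stand-ins, not part of this commit:

import os

from app import segment_background_audio  # assumes app.py is importable

# Hypothetical token source, for illustration only.
hf_token = os.environ["HF_TOKEN"]

# Writes the concatenated non-speech audio to background_segments.wav
# and returns the (start, end) tuples in seconds.
segments = segment_background_audio("input_audio.wav", hf_token=hf_token)
print(f"Detected {len(segments)} non-speech spans")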
@@ -476,7 +474,7 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voice
     ## Need to implement backup option.
 
     with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, add_voiceover, target_language, font_path, speaker_sample_paths)
+        futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, add_voiceover, target_language, font_path, speaker_sample_paths, background_audio_path="background_segments.wav")
                    for i, entry in enumerate(translated_json)]
 
     results = []
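Note that executor.submit forwards the new background_audio_path keyword straight to process_entry, so that function (not shown in this diff) must accept it. A hypothetical matching signature; every parameter name other than background_audio_path is guessed from the call site above:

def process_entry(entry, i, tts_model, video_w, video_h, add_voiceover,
                  target_language, font_path, speaker_sample_paths,
                  background_audio_path=None):
    ...  # actual body lives elsewhere in app.py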
@@ -500,15 +498,27 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voice
     final_video = CompositeVideoClip([video] + text_clips)
 
     if add_voiceover and audio_segments:
+        try:
+            voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
+
+            if background_audio_path and os.path.exists(background_audio_path):
+                background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
+                final_audio = CompositeAudioClip([voice_audio, background_audio])
+                logger.info("✅ Background audio loaded and merged with voiceover.")
+            else:
+                final_audio = voice_audio
+                logger.info("⚠️ No background audio found. Using voiceover only.")
+
+            final_video = final_video.set_audio(final_audio)
+
+        except Exception as e:
+            logger.error(f"❌ Failed to set audio: {e}")
+
     logger.info(f"Saving the final video to: {output_path}")
     final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
 
     logger.info("Video processing completed successfully.")
 
-    # Optional: return errors
     if error_messages:
         logger.warning("⚠️ Errors encountered during processing:")
         for msg in error_messages:
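The merge logic above layers the voiceover over the extracted background track. A self-contained sketch of the same idea, using the moviepy 1.x API this file relies on (set_duration/set_audio; moviepy 2.x renames these to with_duration/with_audio); the file names are placeholders:

from moviepy.editor import AudioFileClip, CompositeAudioClip, VideoFileClip

video = VideoFileClip("input.mp4")                   # placeholder paths
voice = AudioFileClip("voiceover.wav")
background = AudioFileClip("background_segments.wav")

# CompositeAudioClip plays all layers at once; trimming each layer to
# the video duration keeps the mix aligned with the picture.
mix = CompositeAudioClip([voice.set_duration(video.duration),
                          background.set_duration(video.duration)])
video.set_audio(mix).write_videofile("out.mp4", codec="libx264", audio_codec="aac")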
@@ -516,7 +526,7 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voice
 
     return error_messages
 
-def generate_voiceover_clone(translated_json, tts_model, desired_duration, target_language, speaker_wav_path, output_audio_path):
+def generate_voiceover_clone(translated_json, tts_model, desired_duration, target_language, speaker_wav_path, output_audio_path, use_clone=False):
     try:
         full_text = " ".join(entry["translated"] for entry in translated_json if "translated" in entry and entry["translated"].strip())
         if not full_text.strip():
@@ -529,13 +539,6 @@ def generate_voiceover_clone(translated_json, tts_model, desired_duration, targe
             logger.error(msg)
             return None, msg, msg
 
-        # # Truncate text based on max token assumption (~60 tokens)
-        # MAX_TTS_TOKENS = 60
-        # tokens = full_text.split()  # crude token count
-        # if len(tokens) > MAX_TTS_TOKENS:
-        #     logger.warning(f"⚠️ Text too long for TTS model ({len(tokens)} tokens). Truncating to {MAX_TTS_TOKENS} tokens.")
-        #     full_text = " ".join(tokens[:MAX_TTS_TOKENS])
-
         speed_tts = calibrated_speed(full_text, desired_duration)
         tts_model.tts_to_file(
             text=full_text,
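calibrated_speed is defined elsewhere in app.py and is not part of this diff; a plausible stand-in, assuming it converts an estimated natural reading time into the speed multiplier passed to the TTS call (the constants below are illustrative, not the app's actual calibration):

def calibrated_speed(text, desired_duration):
    # Illustrative estimate: ~14 characters per second at normal pace.
    natural_duration = max(len(text) / 14.0, 0.1)   # seconds
    speed = natural_duration / desired_duration     # >1.0 means speak faster
    return min(max(speed, 0.8), 1.5)                # clamp to a safe range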