qqwjq1981 committed on
Commit 68eec85 · verified · 1 Parent(s): 35650ec

Update app.py

Files changed (1):
  1. app.py +55 -52
app.py CHANGED
@@ -126,59 +126,57 @@ def handle_feedback(feedback):
     return "Thank you for your feedback!", None
 
 def segment_background_audio(audio_path, output_path="background_segments.wav", hf_token=None):
-    return 10
-
-    # """
-    # Detects and extracts non-speech (background) segments from audio using pyannote VAD.
-
-    # Parameters:
-    # - audio_path (str): Path to input audio (.wav).
-    # - output_path (str): Path to save the output non-speech audio.
-    # - hf_token (str): Hugging Face auth token for pyannote.
-
-    # Returns:
-    # - List of non-speech timestamp tuples (start, end) in seconds.
-    # """
-    # if not hf_token:
-    #     raise ValueError("Hugging Face token is required for pyannote pipeline.")
-
-    # # Step 1: Load pipeline
-    # pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_token)
-
-    # # Step 2: Apply VAD to get speech segments
-    # vad_result = pipeline(audio_path)
-    # print("✅ Speech segments detected.")
-
-    # # Step 3: Get full duration of the audio
-    # full_audio = AudioSegment.from_wav(audio_path)
-    # full_duration_sec = len(full_audio) / 1000.0
-
-    # # Step 4: Compute non-speech segments
-    # background_segments = []
-    # current_time = 0.0
-
-    # for segment in vad_result.itersegments():
-    #     if current_time < segment.start:
-    #         background_segments.append((current_time, segment.start))
-    #     current_time = segment.end
-
-    # if current_time < full_duration_sec:
-    #     background_segments.append((current_time, full_duration_sec))
-
-    # print(f"🕒 Non-speech segments: {background_segments}")
-
-    # # Step 5: Extract and combine non-speech segments
-    # non_speech_audio = AudioSegment.empty()
-    # for start, end in background_segments:
-    #     segment = full_audio[int(start * 1000):int(end * 1000)]
-    #     non_speech_audio += segment
-
-    # # Step 6: Export the non-speech audio
-    # non_speech_audio.export(output_path, format="wav")
-    # print(f"🎵 Non-speech audio saved to: {output_path}")
-
-    # return background_segments
+    """
+    Detects and extracts non-speech (background) segments from audio using pyannote VAD.
+
+    Parameters:
+    - audio_path (str): Path to input audio (.wav).
+    - output_path (str): Path to save the output non-speech audio.
+    - hf_token (str): Hugging Face auth token for pyannote.
+
+    Returns:
+    - List of non-speech timestamp tuples (start, end) in seconds.
+    """
+    if not hf_token:
+        raise ValueError("Hugging Face token is required for pyannote pipeline.")
+
+    # Step 1: Load pipeline
+    pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_token)
+
+    # Step 2: Apply VAD to get speech segments
+    vad_result = pipeline(audio_path)
+    print("✅ Speech segments detected.")
+
+    # Step 3: Get full duration of the audio
+    full_audio = AudioSegment.from_wav(audio_path)
+    full_duration_sec = len(full_audio) / 1000.0
+
+    # Step 4: Compute non-speech segments
+    background_segments = []
+    current_time = 0.0
+
+    for segment in vad_result.itersegments():
+        if current_time < segment.start:
+            background_segments.append((current_time, segment.start))
+        current_time = segment.end
+
+    if current_time < full_duration_sec:
+        background_segments.append((current_time, full_duration_sec))
+
+    print(f"🕒 Non-speech segments: {background_segments}")
+
+    # Step 5: Extract and combine non-speech segments
+    non_speech_audio = AudioSegment.empty()
+    for start, end in background_segments:
+        segment = full_audio[int(start * 1000):int(end * 1000)]
+        non_speech_audio += segment
+
+    # Step 6: Export the non-speech audio
+    non_speech_audio.export(output_path, format="wav")
+    print(f"🎵 Non-speech audio saved to: {output_path}")
+
+    return background_segments
 
 def transcribe_video_with_speakers(video_path):
     # Extract audio from video
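The re-enabled function depends on pyannote VAD and pydub. As a quick sanity check, it could be exercised on its own; a minimal sketch, assuming `HF_TOKEN` is a hypothetical environment variable holding a valid token and that `segment_background_audio` is importable from app.py:

```python
import os

# Illustrative driver only. Importing app.py inside a running Space may
# have side effects (e.g. launching the Gradio UI), so treat this as a
# local smoke test, not production code.
from app import segment_background_audio

hf_token = os.environ.get("HF_TOKEN")  # assumption: token supplied via env var

# Writes the concatenated non-speech audio to background_segments.wav,
# the same default path the executor call later in this commit points at.
segments = segment_background_audio(
    "input_audio.wav",  # assumed sample input
    output_path="background_segments.wav",
    hf_token=hf_token,
)
print(f"{len(segments)} non-speech spans: {segments}")
```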
@@ -476,7 +474,7 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voice
     ## Need to implmenet backup option.
 
     with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, add_voiceover, target_language, font_path, speaker_sample_paths)
+        futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, add_voiceover, target_language, font_path, speaker_sample_paths, background_audio_path="background_segments.wav")
                    for i, entry in enumerate(translated_json)]
 
     results = []
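A side note on the executor block: if `results` is later filled from `concurrent.futures.as_completed`, outputs arrive in completion order rather than transcript order. A small order-preserving sketch (the real `process_entry` signature lives in app.py; the helper name here is hypothetical):

```python
import concurrent.futures

def run_entries_in_order(entries, process_entry, **shared_kwargs):
    """Submit one task per entry and return results aligned with entry order."""
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(process_entry, entry, i, **shared_kwargs)
            for i, entry in enumerate(entries)
        ]
        # Iterating the submission list (instead of as_completed) blocks on
        # each future in turn, so results[i] matches entries[i].
        return [future.result() for future in futures]
```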
@@ -500,15 +498,27 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voice
     final_video = CompositeVideoClip([video] + text_clips)
 
     if add_voiceover and audio_segments:
-        final_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
-        final_video = final_video.set_audio(final_audio)
+        try:
+            voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
+
+            if background_audio_path and os.path.exists(background_audio_path):
+                background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
+                final_audio = CompositeAudioClip([voice_audio, background_audio])
+                logger.info("✅ Background audio loaded and merged with voiceover.")
+            else:
+                final_audio = voice_audio
+                logger.info("⚠️ No background audio found. Using voiceover only.")
+
+            final_video = final_video.set_audio(final_audio)
+
+        except Exception as e:
+            logger.error(f"❌ Failed to set audio: {e}")
+
     logger.info(f"Saving the final video to: {output_path}")
     final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
 
     logger.info("Video processing completed successfully.")
 
-    # Optional: return errors
     if error_messages:
         logger.warning("⚠️ Errors encountered during processing:")
         for msg in error_messages:
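Mixing the background track back in at full volume can compete with the voiceover. If ducking is wanted later, moviepy 1.x ships a `volumex` effect that could be applied before compositing; a sketch, with `background_gain=0.3` as an assumed starting point:

```python
from moviepy.editor import AudioFileClip, CompositeAudioClip

def mix_voice_over_background(voice_audio, background_path, duration,
                              background_gain=0.3):
    """Duck the background under the voiceover before compositing.

    background_gain is a guessed default; tune by ear.
    """
    background = (
        AudioFileClip(background_path)
        .volumex(background_gain)   # scale background loudness (moviepy 1.x fx)
        .set_duration(duration)
    )
    return CompositeAudioClip([voice_audio, background]).set_duration(duration)
```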
@@ -516,7 +526,7 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voice
 
     return error_messages
 
-def generate_voiceover_clone(translated_json, tts_model, desired_duration, target_language, speaker_wav_path, output_audio_path):
+def generate_voiceover_clone(translated_json, tts_model, desired_duration, target_language, speaker_wav_path, output_audio_path, use_clone=False):
     try:
         full_text = " ".join(entry["translated"] for entry in translated_json if "translated" in entry and entry["translated"].strip())
         if not full_text.strip():
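The new `use_clone` flag is added to the signature but not consulted within this hunk. If the intent is to toggle voice cloning, the synthesis call could branch on it roughly as below; a sketch assuming a Coqui TTS model, where passing `speaker_wav` enables cloning:

```python
def synthesize(tts_model, full_text, output_audio_path, target_language,
               speaker_wav_path=None, use_clone=False):
    """Sketch: route synthesis through cloning only when requested."""
    kwargs = dict(text=full_text,
                  file_path=output_audio_path,
                  language=target_language)
    if use_clone and speaker_wav_path:
        # Reference sample for XTTS-style cloning; ignored by models
        # that do not support it.
        kwargs["speaker_wav"] = speaker_wav_path
    tts_model.tts_to_file(**kwargs)
```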
@@ -529,13 +539,6 @@ def generate_voiceover_clone(translated_json, tts_model, desired_duration, targe
         logger.error(msg)
         return None, msg, msg
 
-        # # Truncate text based on max token assumption (~60 tokens)
-        # MAX_TTS_TOKENS = 60
-        # tokens = full_text.split()  # crude token count
-        # if len(tokens) > MAX_TTS_TOKENS:
-        #     logger.warning(f"⚠️ Text too long for TTS model ({len(tokens)} tokens). Truncating to {MAX_TTS_TOKENS} tokens.")
-        #     full_text = " ".join(tokens[:MAX_TTS_TOKENS])
-
         speed_tts = calibrated_speed(full_text, desired_duration)
         tts_model.tts_to_file(
             text=full_text,
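`calibrated_speed` itself is defined elsewhere in app.py and untouched by this commit. For readers of the diff, one plausible shape for such a helper (purely illustrative; the real constants and logic are the author's) is a natural-duration-to-target ratio clamped to an intelligible range:

```python
def calibrated_speed_sketch(text, desired_duration,
                            words_per_second=2.5,
                            min_speed=0.8, max_speed=1.5):
    """Illustrative stand-in for app.py's calibrated_speed.

    Estimates how long the text takes at a typical speaking rate, then
    returns the speed factor needed to land near desired_duration,
    clamped so the TTS output stays intelligible. All constants here
    are assumptions.
    """
    words = len(text.split())
    natural_duration = words / words_per_second  # rough estimate, seconds
    if desired_duration <= 0 or natural_duration == 0:
        return 1.0
    return max(min_speed, min(max_speed, natural_duration / desired_duration))
```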
 