qqwjq1981 committed on
Commit 54c4dc6 · verified
1 Parent(s): e00a203

Update app.py

Browse files
Files changed (1)
  1. app.py +49 -44
app.py CHANGED
@@ -33,10 +33,10 @@ import traceback
 from TTS.api import TTS
 import torch
 from TTS.tts.configs.xtts_config import XttsConfig
-# from pydub import AudioSegment
-# from pyannote.audio import Pipeline
-# import traceback
-# import wave
+from pydub import AudioSegment
+from pyannote.audio import Pipeline
+import traceback
+import wave
 
 logger = logging.getLogger(__name__)
 
@@ -126,34 +126,34 @@ def handle_feedback(feedback):
     conn.commit()
     return "Thank you for your feedback!", None
 
-# def segment_background_audio(audio_path, output_path="background_segments.wav"):
-#     # Step 2: Initialize pyannote voice activity detection pipeline (you need Hugging Face token)
-#     pipeline = Pipeline.from_pretrained(
-#         "pyannote/voice-activity-detection",
-#         use_auth_token=hf_api_key
-#     )
-#     # Step 3: Run VAD to get speech segments
-#     vad_result = pipeline(audio_path)
-#     print(f"Detected speech segments: {vad_result}")
+def segment_background_audio(audio_path, output_path="background_segments.wav"):
+    # Step 2: Initialize pyannote voice activity detection pipeline (you need Hugging Face token)
+    pipeline = Pipeline.from_pretrained(
+        "pyannote/voice-activity-detection",
+        use_auth_token=hf_api_key
+    )
+    # Step 3: Run VAD to get speech segments
+    vad_result = pipeline(audio_path)
+    print(f"Detected speech segments: {vad_result}")
 
-#     # Step 4: Load full audio and subtract speech segments
-#     full_audio = AudioSegment.from_wav(audio_path)
-#     background_audio = AudioSegment.silent(duration=len(full_audio))
+    # Step 4: Load full audio and subtract speech segments
+    full_audio = AudioSegment.from_wav(audio_path)
+    background_audio = AudioSegment.silent(duration=len(full_audio))
 
-#     for segment in vad_result.itersegments():
-#         start_ms = int(segment.start * 1000)
-#         end_ms = int(segment.end * 1000)
-#         # Remove speech by muting that portion
-#         background_audio = background_audio.overlay(AudioSegment.silent(duration=end_ms - start_ms), position=start_ms)
+    for segment in vad_result.itersegments():
+        start_ms = int(segment.start * 1000)
+        end_ms = int(segment.end * 1000)
+        # Remove speech by muting that portion
+        background_audio = background_audio.overlay(AudioSegment.silent(duration=end_ms - start_ms), position=start_ms)
 
-#     # Step 5: Subtract background_audio from full_audio
-#     result_audio = full_audio.overlay(background_audio)
+    # Step 5: Subtract background_audio from full_audio
+    result_audio = full_audio.overlay(background_audio)
 
-#     # Step 6: Export non-speech segments
-#     result_audio.export(output_path, format="wav")
-#     print(f"Saved non-speech (background) audio to: {output_path}")
+    # Step 6: Export non-speech segments
+    result_audio.export(output_path, format="wav")
+    print(f"Saved non-speech (background) audio to: {output_path}")
 
-#     return True
+    return True
 
 def transcribe_video_with_speakers(video_path):
     # Extract audio from video
@@ -162,8 +162,8 @@ def transcribe_video_with_speakers(video_path):
     video.audio.write_audiofile(audio_path)
     logger.info(f"Audio extracted from video: {audio_path}")
 
-    # segment_result = segment_background_audio(audio_path)
-    # print(f"Saved non-speech (background) audio to local")
+    segment_result = segment_background_audio(audio_path)
+    print(f"Saved non-speech (background) audio to local")
 
     # Set up device
     device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -385,7 +385,7 @@ def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_heig
         logger.error(f"\u274c Failed to create subtitle clip: {e}")
         return None
 
-def process_entry(entry, i, video_width, video_height, add_voiceover, target_language, font_path, speaker_sample_paths=None):
+def process_entry(entry, i, tts_model, video_width, video_height, add_voiceover, target_language, font_path, speaker_sample_paths=None):
     logger.debug(f"Processing entry {i}: {entry}")
     error_message = None
 
@@ -404,7 +404,7 @@ def process_entry(entry, i, video_width, video_height, add_voiceover, target_lan
         speaker = entry.get("speaker", "default")
         speaker_wav_path = f"speaker_{speaker}_sample.wav"
 
-        output_path, status_msg, tts_error = generate_voiceover_clone([entry], desired_duration, target_language, speaker_wav_path, segment_audio_path)
+        output_path, status_msg, tts_error = generate_voiceover_clone([entry], tts_model, desired_duration, target_language, speaker_wav_path, segment_audio_path)
 
         if tts_error:
             error_message = error_message + " | " + tts_error if error_message else tts_error
@@ -438,8 +438,22 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voice
    audio_segments = []
    error_messages = []
 
+    global tts_model
+    if tts_model is None:
+        try:
+            print("🔄 Loading XTTS model...")
+            tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
+            print("✅ XTTS model loaded successfully.")
+            return "XTTS model loaded successfully."
+        except Exception as e:
+            print("❌ Error loading XTTS model:")
+            traceback.print_exc()
+            return f"Error loading XTTS model: {e}"
+    else:
+        return "XTTS model is already loaded."
+
    with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures = [executor.submit(process_entry, entry, i, video.w, video.h, add_voiceover, target_language, font_path, speaker_sample_paths)
+        futures = [executor.submit(process_entry, tts_model, entry, i, video.w, video.h, add_voiceover, target_language, font_path, speaker_sample_paths)
                    for i, entry in enumerate(translated_json)]
 
    results = []
@@ -484,7 +498,7 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voice
 
    return error_messages
 
-def generate_voiceover_clone(translated_json, desired_duration, target_language, speaker_wav_path, output_audio_path):
+def generate_voiceover_clone(translated_json, tts_model, desired_duration, target_language, speaker_wav_path, output_audio_path):
    try:
        full_text = " ".join(entry["translated"] for entry in translated_json if "translated" in entry and entry["translated"].strip())
        if not full_text.strip():
@@ -505,7 +519,7 @@ def generate_voiceover_clone(translated_json, desired_duration, target_language,
        # full_text = " ".join(tokens[:MAX_TTS_TOKENS])
 
        speed_tts = calibrated_speed(full_text, desired_duration)
-        tts.tts_to_file(
+        tts_model.tts_to_file(
            text=full_text,
            speaker_wav=speaker_wav_path,
            language=target_language,
@@ -667,16 +681,7 @@ def build_interface():
 
    return demo
 
-# Load XTTS model
-try:
-    print("🔄 Loading XTTS model...")
-    tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
-    print("✅ XTTS model loaded successfully.")
-except Exception as e:
-    print("❌ Error loading XTTS model:")
-    traceback.print_exc()
-    raise e
-
+tts_model = None
 # Launch the Gradio interface
 demo = build_interface()
 demo.launch()
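
A note on the un-commented segment_background_audio above: pydub's AudioSegment.overlay mixes the overlaid clip into the base audio, so overlaying silence onto a silent track and then onto the full recording leaves the original signal essentially intact rather than isolating the background. Below is a minimal sketch of an alternative that keeps only the audio between the detected speech segments; the function name extract_background_sketch, the hf_token argument, and the default output path are illustrative assumptions, not code from this commit.

from pydub import AudioSegment
from pyannote.audio import Pipeline

def extract_background_sketch(audio_path, output_path="background_sketch.wav", hf_token=None):
    # Run voice activity detection to locate speech regions (same pipeline the commit uses)
    vad = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_token)
    speech = vad(audio_path)

    full_audio = AudioSegment.from_wav(audio_path)
    background = AudioSegment.empty()
    cursor_ms = 0
    # Concatenate everything that falls between the detected speech segments
    for segment in speech.get_timeline().support():
        start_ms, end_ms = int(segment.start * 1000), int(segment.end * 1000)
        background += full_audio[cursor_ms:start_ms]
        cursor_ms = end_ms
    background += full_audio[cursor_ms:]

    background.export(output_path, format="wav")
    return output_path

Slicing and concatenating avoids any mixing, at the cost of producing a clip shorter than the source; padding the gaps with silence instead would preserve the original timeline.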
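
On the model-loading side: the block added to add_transcript_voiceover returns a status string in every branch once the global tts_model has been handled, before the ThreadPoolExecutor section runs, and executor.submit now passes tts_model ahead of entry and i while the updated process_entry signature is (entry, i, tts_model, ...). The sketch below shows one way to keep the lazy load idempotent under concurrent callers and to make the submit call order explicit; the module-level lock, the get_tts_model helper, and the keyword-argument call are assumptions for illustration, not part of this Space's code.

import threading
from TTS.api import TTS

_tts_lock = threading.Lock()
tts_model = None

def get_tts_model():
    # Load the shared XTTS model at most once; the lock keeps concurrent callers from double-loading
    global tts_model
    with _tts_lock:
        if tts_model is None:
            tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
    return tts_model

# Illustrative use inside add_transcript_voiceover:
# model = get_tts_model()
# futures = [executor.submit(process_entry, entry=entry, i=i, tts_model=model,
#                            video_width=video.w, video_height=video.h,
#                            add_voiceover=add_voiceover, target_language=target_language,
#                            font_path=font_path, speaker_sample_paths=speaker_sample_paths)
#            for i, entry in enumerate(translated_json)]

Keyword arguments keep the executor.submit call independent of positional order, which is easy to let drift as a signature grows.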