Update app.py
app.py
CHANGED
@@ -33,10 +33,10 @@ import traceback
 from TTS.api import TTS
 import torch
 from TTS.tts.configs.xtts_config import XttsConfig
-
-
-
-
+from pydub import AudioSegment
+from pyannote.audio import Pipeline
+import traceback
+import wave
 
 logger = logging.getLogger(__name__)
 
@@ -126,34 +126,34 @@ def handle_feedback(feedback):
     conn.commit()
     return "Thank you for your feedback!", None
 
-
-#
-
-
-
-
-#
-
-
+def segment_background_audio(audio_path, output_path="background_segments.wav"):
+    # Initialize the pyannote voice activity detection pipeline (requires a Hugging Face token)
+    pipeline = Pipeline.from_pretrained(
+        "pyannote/voice-activity-detection",
+        use_auth_token=hf_api_key
+    )
+    # Run VAD to find the speech segments
+    vad_result = pipeline(audio_path)
+    print(f"Detected speech segments: {vad_result}")
 
-#
-
-
+    # Load the full audio; the speech spans are muted below
+    full_audio = AudioSegment.from_wav(audio_path)
+    background_audio = full_audio
 
-
-
-
-#
-
+    for segment in vad_result.itersegments():
+        start_ms = int(segment.start * 1000)
+        end_ms = int(segment.end * 1000)
+        # Splice silence over the speech span (overlaying silence would leave the speech audible)
+        background_audio = background_audio[:start_ms] + AudioSegment.silent(duration=end_ms - start_ms, frame_rate=full_audio.frame_rate) + background_audio[end_ms:]
 
-#
-
+    # background_audio now holds the original track with all
+    # detected speech silenced, i.e. only the background remains
 
-#
-
-
+    # Export the non-speech segments
+    background_audio.export(output_path, format="wav")
+    print(f"Saved non-speech (background) audio to: {output_path}")
 
-
+    return True
 
 def transcribe_video_with_speakers(video_path):
     # Extract audio from video
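The muting logic above comes down to splicing silence over each detected speech span; pydub slices are millisecond-indexed, which is why the segment bounds are scaled by 1000. A minimal standalone sketch of the same idea, assuming spans arrive as (start, end) pairs in seconds (the mute_spans helper is hypothetical, not part of app.py):

    from pydub import AudioSegment

    def mute_spans(full_audio, spans_s):
        # Return a copy of full_audio with every (start, end) span, in seconds, silenced
        out = full_audio
        for start, end in spans_s:
            start_ms, end_ms = int(start * 1000), int(end * 1000)
            # Match the source frame rate so the spliced silence concatenates cleanly
            silence = AudioSegment.silent(duration=end_ms - start_ms, frame_rate=full_audio.frame_rate)
            out = out[:start_ms] + silence + out[end_ms:]
        return out

    # Example: background = mute_spans(AudioSegment.from_wav("audio.wav"), [(1.2, 3.4), (5.0, 7.5)])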
@@ -162,8 +162,8 @@ def transcribe_video_with_speakers(video_path):
     video.audio.write_audiofile(audio_path)
     logger.info(f"Audio extracted from video: {audio_path}")
 
-
-
+    segment_result = segment_background_audio(audio_path)
+    print("Saved non-speech (background) audio locally")
 
     # Set up device
     device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -385,7 +385,7 @@ def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_heig
         logger.error(f"\u274c Failed to create subtitle clip: {e}")
         return None
 
-def process_entry(entry, i, video_width, video_height, add_voiceover, target_language, font_path, speaker_sample_paths=None):
+def process_entry(entry, i, tts_model, video_width, video_height, add_voiceover, target_language, font_path, speaker_sample_paths=None):
     logger.debug(f"Processing entry {i}: {entry}")
     error_message = None
 
@@ -404,7 +404,7 @@ def process_entry(entry, i, video_width, video_height, add_voiceover, target_lan
        speaker = entry.get("speaker", "default")
        speaker_wav_path = f"speaker_{speaker}_sample.wav"
 
-        output_path, status_msg, tts_error = generate_voiceover_clone([entry], desired_duration, target_language, speaker_wav_path, segment_audio_path)
+        output_path, status_msg, tts_error = generate_voiceover_clone([entry], tts_model, desired_duration, target_language, speaker_wav_path, segment_audio_path)
 
        if tts_error:
            error_message = error_message + " | " + tts_error if error_message else tts_error
@@ -438,8 +438,20 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voice
    audio_segments = []
    error_messages = []
 
+    # Lazily load the shared XTTS model on first use
+    global tts_model
+    if tts_model is None:
+        try:
+            print("🚀 Loading XTTS model...")
+            tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
+            print("✅ XTTS model loaded successfully.")
+        except Exception as e:
+            print("❌ Error loading XTTS model:")
+            traceback.print_exc()
+            return [f"Error loading XTTS model: {e}"]
+
    with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures = [executor.submit(process_entry, entry, i, video.w, video.h, add_voiceover, target_language, font_path, speaker_sample_paths)
+        futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, add_voiceover, target_language, font_path, speaker_sample_paths)
                   for i, entry in enumerate(translated_json)]
 
        results = []
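One design note on the lazy load above: process_entry calls run on a ThreadPoolExecutor, so if model loading ever moves into the workers, two threads could race to initialize the global. A lock-guarded accessor is the usual defensive variant; a sketch under that assumption (get_tts_model is a hypothetical helper, not part of this commit):

    import threading

    from TTS.api import TTS

    _tts_lock = threading.Lock()
    tts_model = None

    def get_tts_model():
        # Load XTTS exactly once, even if called from several threads at once
        global tts_model
        with _tts_lock:
            if tts_model is None:
                tts_model = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
        return tts_model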
@@ -484,7 +496,7 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voice
 
    return error_messages
 
-def generate_voiceover_clone(translated_json, desired_duration, target_language, speaker_wav_path, output_audio_path):
+def generate_voiceover_clone(translated_json, tts_model, desired_duration, target_language, speaker_wav_path, output_audio_path):
    try:
        full_text = " ".join(entry["translated"] for entry in translated_json if "translated" in entry and entry["translated"].strip())
        if not full_text.strip():
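With the model passed explicitly, generate_voiceover_clone no longer depends on module state and can be exercised on its own. An illustrative call (the entry text, duration, and paths are made-up values):

    model = get_tts_model()  # hypothetical accessor from the sketch above
    output_path, status, err = generate_voiceover_clone(
        translated_json=[{"translated": "Hola, mundo"}],
        tts_model=model,
        desired_duration=2.5,
        target_language="es",
        speaker_wav_path="speaker_default_sample.wav",
        output_audio_path="segment_0.wav",
    )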
@@ -505,7 +517,7 @@ def generate_voiceover_clone(translated_json, desired_duration, target_language,
        # full_text = " ".join(tokens[:MAX_TTS_TOKENS])
 
        speed_tts = calibrated_speed(full_text, desired_duration)
-        tts.tts_to_file(
+        tts_model.tts_to_file(
            text=full_text,
            speaker_wav=speaker_wav_path,
            language=target_language,
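calibrated_speed is defined outside this hunk; the usual idea is to estimate how long the text would take at normal pace and scale the TTS speed so it fits the desired slot. A hypothetical sketch (the characters-per-second rate and clamping range are assumptions, not values from app.py):

    def calibrated_speed(text, desired_duration, chars_per_sec=14.0):
        # Naive estimate of spoken duration at speed 1.0
        estimated = len(text) / chars_per_sec
        # speed > 1 speaks faster; clamp so the voice stays natural
        speed = estimated / max(desired_duration, 0.1)
        return min(max(speed, 0.7), 1.5)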
@@ -667,16 +679,7 @@ def build_interface():
 
    return demo
 
-
-try:
-    print("🚀 Loading XTTS model...")
-    tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
-    print("✅ XTTS model loaded successfully.")
-except Exception as e:
-    print("❌ Error loading XTTS model:")
-    traceback.print_exc()
-    raise e
-
+tts_model = None
 # Launch the Gradio interface
 demo = build_interface()
 demo.launch()
|