qqwjq1981 committed on
Commit a4f3333 · verified · 1 Parent(s): 5ffc8c6

Update app.py

Files changed (1)
  1. app.py +122 -99
app.py CHANGED
@@ -30,6 +30,24 @@ import time
  import os
  import openai
  from openai import OpenAI

  client = OpenAI(
      api_key= os.environ.get("openAI_api_key"), # This is the default and can be omitted
@@ -110,52 +128,80 @@ logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %
  logger = logging.getLogger(__name__)
  logger.info(f"MoviePy Version: {moviepy.__version__}")

- def transcribe_video(video_path):
-     # Load the video file and extract audio
      video = VideoFileClip(video_path)
      audio_path = "audio.wav"
      video.audio.write_audiofile(audio_path)

-     # Load Whisper model
-     model = whisper.load_model("base") # Options: tiny, base, small, medium, large
-
-     # Transcribe with Whisper
-     result = model.transcribe(audio_path, word_timestamps=True)
-
-     # Extract timestamps, text, and compute word count
-     total_words = 0
-     total_duration = 0
-     transcript_with_timestamps = []
-
      for segment in result["segments"]:
-         start = segment["start"]
-         end = segment["end"]
-         text = segment["text"]
-
-         transcript_with_timestamps.append({
-             "start": start,
-             "end": end,
-             "text": text
-         })
-
-         word_count = count_words_or_characters(text)
-         total_words += word_count
-         total_duration += (end - start)
-
-     # Compute average words per second
-     avg_words_per_second = total_words / total_duration if total_duration > 0 else 0
-
-     # Add total statistics to the result
-     transcript_stats = {
-         "total_words": total_words,
-         "total_duration": total_duration,
-         "avg_words_per_second": avg_words_per_second
-     }
-     logger.debug(f"Transcription stats:\n{transcript_stats}")
      # Get the detected language
      detected_language = result["language"]
-     logger.debug(f"Detected language:\n{detected_language}")
-     return transcript_with_timestamps, detected_language

  # Function to get the appropriate translation model based on target language
  def get_translation_model(source_language, target_language):
@@ -259,7 +305,7 @@ def update_translations(file, edited_table, mode):
      except Exception as e:
          raise ValueError(f"Error updating translations: {e}")

- def process_entry(entry, i, video_width, video_height, add_voiceover, target_language):
      logger.debug(f"Processing entry {i}: {entry}")

      # Create text clip for subtitles
@@ -278,7 +324,9 @@ def process_entry(entry, i, video_width, video_height, add_voiceover, target_language):
      if add_voiceover:
          segment_audio_path = f"segment_{i}_voiceover.wav"
          desired_duration = entry["end"] - entry["start"]
-         generate_voiceover_OpenAI([entry], target_language, desired_duration, segment_audio_path)
          audio_clip = AudioFileClip(segment_audio_path)
          # Get and log all methods in AudioFileClip
          logger.info("Methods in AudioFileClip:")
@@ -301,7 +349,7 @@ def process_entry(entry, i, video_width, video_height, add_voiceover, target_language):

      return i, txt_clip, audio_segment

- def add_transcript_voiceover(video_path, translated_json, output_path, add_voiceover=False, target_language="en"):
      """
      Add transcript and voiceover to a video, segment by segment.
      """
@@ -312,7 +360,7 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voiceover=False, target_language="en"):
      audio_segments = []

      with concurrent.futures.ThreadPoolExecutor() as executor:
-         futures = [executor.submit(process_entry, entry, i, video.w, video.h, add_voiceover, target_language)
                     for i, entry in enumerate(translated_json)]

          # Collect results with original index i
@@ -348,19 +396,35 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voiceover=False, target_language="en"):

      logger.info("Video processing completed successfully.")

- def generate_voiceover(translated_json, language, output_audio_path):
-     """
-     Generate voiceover from translated text for a given language.
-     """
-     # Concatenate translated text into a single string
-     full_text = " ".join(entry["translated"] for entry in translated_json)
-
      try:
-         tts = gTTS(text=full_text, lang=language)
-         time.sleep(10) # Add a delay of 10 seconds between requests
-         tts.save(output_audio_path)
      except Exception as e:
-         raise ValueError(f"Error generating voiceover: {e}")

  def truncated_linear(x):
      if x < 15:
@@ -381,47 +445,6 @@ def calculate_speed(text, desired_duration):

      return speed

- def generate_voiceover_OpenAI(translated_json, language, desired_duration, output_audio_path):
-     """
-     Generate voiceover from translated text for a given language using OpenAI TTS API.
-     """
-     # Concatenate translated text into a single string
-     full_text = " ".join(entry["translated"] for entry in translated_json)
-
-     # Define the voice based on the language (for now, use 'alloy' as default)
-     voice = "alloy" # Adjust based on language if needed
-
-     # Define the model (use tts-1 for real-time applications)
-     model = "tts-1"
-
-     max_retries = 3
-     retry_count = 0
-
-     while retry_count < max_retries:
-         try:
-             speed_tts = calculate_speed(full_text, desired_duration)
-             # Create the speech using OpenAI TTS API
-             response = client.audio.speech.create(
-                 model=model,
-                 voice=voice,
-                 input=full_text,
-                 speed=speed_tts
-             )
-             # Save the audio to the specified path
-             with open(output_audio_path, 'wb') as f:
-                 for chunk in response.iter_bytes():
-                     f.write(chunk)
-             logging.info(f"Voiceover generated successfully for {output_audio_path}")
-             break
-
-         except Exception as e:
-             retry_count += 1
-             logging.error(f"Error generating voiceover (retry {retry_count}/{max_retries}): {e}")
-             time.sleep(5) # Wait 5 seconds before retrying
-
-     if retry_count == max_retries:
-         raise ValueError(f"Failed to generate voiceover after {max_retries} retries.")
-
  def upload_and_manage(file, target_language, mode="transcription"):
      if file is None:
          logger.info("No file uploaded. Please upload a video/audio file.")
@@ -439,7 +462,7 @@ def upload_and_manage(file, target_language, mode="transcription"):

      # Step 1: Transcribe audio from uploaded media file and get timestamps
      logger.info("Transcribing audio...")
-     transcription_json, source_language = transcribe_video(file.name)
      logger.info(f"Transcription completed. Detected source language: {source_language}")

      # Step 2: Translate the transcription
@@ -449,13 +472,13 @@ def upload_and_manage(file, target_language, mode="transcription"):

      # Step 3: Add transcript to video based on timestamps
      logger.info("Adding translated transcript to video...")
-     add_transcript_voiceover(file.name, translated_json, output_video_path, mode == "Transcription with Voiceover", target_language)
      logger.info(f"Transcript added to video. Output video saved at {output_video_path}")

      # Convert translated JSON into a format for the editable table
      logger.info("Converting translated JSON into editable table format...")
      editable_table = [
-         [float(entry["start"]), entry["original"], entry["translated"], float(entry["end"])]
          for entry in translated_json
      ]
@@ -519,7 +542,7 @@ def build_interface():

      save_changes_button.click(
          update_translations,
-         inputs=[file_input, editable_table, process_mode],
          outputs=[processed_video_output, elapsed_time_display]
      )
  import os
  import openai
  from openai import OpenAI
+ import traceback
+ from TTS.api import TTS
+ import torch
+ from TTS.tts.configs.xtts_config import XttsConfig
+
+ # Accept license terms for Coqui XTTS
+ os.environ["COQUI_TOS_AGREED"] = "1"
+ torch.serialization.add_safe_globals([XttsConfig])
+
+ # Load XTTS model
+ try:
+     print("🔄 Loading XTTS model...")
+     tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
+     print("✅ XTTS model loaded successfully.")
+ except Exception as e:
+     print("❌ Error loading XTTS model:")
+     traceback.print_exc()
+     raise e

  client = OpenAI(
      api_key= os.environ.get("openAI_api_key"), # This is the default and can be omitted
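Note: `torch.serialization.add_safe_globals([XttsConfig])` is there because recent PyTorch releases default `torch.load` to `weights_only=True`, and the XTTS checkpoint contains an `XttsConfig` object that must be allow-listed before it can be deserialized. The Coqui API also allows moving the model to a GPU after construction; a minimal sketch of that (the device selection mirrors the transcription code below, and this placement step is an assumption, not part of this commit):

    import torch
    from TTS.api import TTS

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Load once at startup, then optionally move to GPU for faster cloning
    tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2").to(device)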
 
  logger = logging.getLogger(__name__)
  logger.info(f"MoviePy Version: {moviepy.__version__}")

+ def transcribe_video_with_speakers(video_path):
+     # Extract audio from video
      video = VideoFileClip(video_path)
      audio_path = "audio.wav"
      video.audio.write_audiofile(audio_path)
+     logger.info(f"Audio extracted from video: {audio_path}")

+     # Set up device
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     logger.info(f"Using device: {device}")
+
+     # Load WhisperX model
+     model = whisperx.load_model("large-v2", device)
+     logger.info("WhisperX model loaded")
+
+     # Transcribe with WhisperX
+     result = model.transcribe(audio_path)
+     detected_language = result["language"]  # capture now; align() below returns a dict without this key
+     logger.info("Audio transcription completed")
+
+     # Align transcription
+     model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+     result = whisperx.align(result["segments"], model_a, metadata, audio_path, device)
+     logger.info("Transcription alignment completed")
+
+     # Perform speaker diarization
+     diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
+     diarize_segments = diarize_model(audio_path)
+     logger.info("Speaker diarization completed")
+
+     # Assign speakers to transcribed segments
+     result = whisperx.assign_word_speakers(diarize_segments, result)
+     result["language"] = detected_language  # restore the key dropped by align/assign_word_speakers
+     logger.info("Speakers assigned to transcribed segments")
+
+     # Extract timestamps, text, and speaker IDs
+     transcript_with_speakers = [
+         {
+             "start": segment["start"],
+             "end": segment["end"],
+             "text": segment["text"],
+             "speaker": segment["speaker"]
+         }
+         for segment in result["segments"]
+     ]
+
+     # Collect audio for each speaker
+     speaker_audio = {}
      for segment in result["segments"]:
+         speaker = segment["speaker"]
+         if speaker not in speaker_audio:
+             speaker_audio[speaker] = []
+         speaker_audio[speaker].append((segment["start"], segment["end"]))
+
+     # Collapse and truncate speaker audio
+     speaker_sample_paths = {}
+     audio_clip = AudioFileClip(audio_path)
+     for speaker, segments in speaker_audio.items():
+         speaker_clips = [audio_clip.subclip(start, end) for start, end in segments]
+         combined_clip = concatenate_audioclips(speaker_clips)
+         truncated_clip = combined_clip.subclip(0, min(30, combined_clip.duration))
+         sample_path = f"speaker_{speaker}_sample.wav"
+         truncated_clip.write_audiofile(sample_path)
+         speaker_sample_paths[speaker] = sample_path
+         logger.info(f"Created sample for {speaker}: {sample_path}")
+
      # Get the detected language
      detected_language = result["language"]
+     logger.debug(f"Detected language: {detected_language}")
+
+     # Clean up
+     video.close()
+     audio_clip.close()
+     os.remove(audio_path)
+
+     return transcript_with_speakers, detected_language, speaker_sample_paths  # also hand back per-speaker samples for voice cloning

  # Function to get the appropriate translation model based on target language
  def get_translation_model(source_language, target_language):
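Each element of `transcript_with_speakers` pairs aligned timings with a diarization label, which is what lets the voiceover code key a cloning sample on `entry["speaker"]`. A representative entry (values illustrative; WhisperX/pyannote produce `SPEAKER_00`-style labels) looks roughly like:

    {
        "start": 3.42,             # seconds, from WhisperX alignment
        "end": 6.10,
        "text": " Welcome back to the channel.",
        "speaker": "SPEAKER_00",   # matches speaker_SPEAKER_00_sample.wav written above
    }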
      except Exception as e:
          raise ValueError(f"Error updating translations: {e}")

+ def process_entry(entry, i, video_width, video_height, add_voiceover, target_language, speaker_sample_paths=None):
      logger.debug(f"Processing entry {i}: {entry}")

      # Create text clip for subtitles
      if add_voiceover:
          segment_audio_path = f"segment_{i}_voiceover.wav"
          desired_duration = entry["end"] - entry["start"]
+         speaker_wav_path = f"speaker_{entry['speaker']}_sample.wav"
+         generate_voiceover_clone([entry], desired_duration, target_language, speaker_wav_path, segment_audio_path)
+
          audio_clip = AudioFileClip(segment_audio_path)
          # Get and log all methods in AudioFileClip
          logger.info("Methods in AudioFileClip:")
      return i, txt_clip, audio_segment

+ def add_transcript_voiceover(video_path, translated_json, output_path, add_voiceover=False, target_language="en", speaker_sample_paths=None):
      """
      Add transcript and voiceover to a video, segment by segment.
      """

      audio_segments = []

      with concurrent.futures.ThreadPoolExecutor() as executor:
+         futures = [executor.submit(process_entry, entry, i, video.w, video.h, add_voiceover, target_language, speaker_sample_paths)
                     for i, entry in enumerate(translated_json)]

          # Collect results with original index i
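Because threads may finish out of order, each worker returns its index `i` along with its clips. The collection step is not shown in this hunk; a minimal sketch of restoring segment order before concatenation, assuming each future yields the `(i, txt_clip, audio_segment)` tuple returned above:

    results = [f.result() for f in concurrent.futures.as_completed(futures)]
    results.sort(key=lambda r: r[0])   # put segments back in original order by index i
    text_clips = [clip for _, clip, _ in results]
    audio_segments = [seg for _, _, seg in results if seg is not None]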
 

      logger.info("Video processing completed successfully.")

+ # Voice cloning function with debug and error handling
+ def generate_voiceover_clone(translated_json, desired_duration, target_language, speaker_wav_path, output_audio_path):
      try:
+         full_text = " ".join(entry["translated"] for entry in translated_json)
+         speed_tts = calculate_speed(full_text, desired_duration)
+         if not speaker_wav_path or not os.path.exists(speaker_wav_path):
+             return None, "❌ Please upload a valid speaker audio file."
+
+         print(f"📥 Received text: {full_text}")
+         print(f"📁 Speaker audio path: {speaker_wav_path}")
+         print(f"🌐 Selected language: {target_language}")
+         print(f"⏱️ Target speed: {speed_tts}")
+
+         # Run TTS with speed control (if supported by model)
+         tts.tts_to_file(
+             text=full_text,
+             speaker_wav=speaker_wav_path,
+             language=target_language,
+             file_path=output_audio_path,
+             speed=speed_tts  # <- add speed control
+         )
+         print("✅ Voice cloning completed.")
+         return output_audio_path, "✅ Voice cloning completed successfully."
+
      except Exception as e:
+         print("❌ Error during voice cloning:")
+         traceback.print_exc()
+         error_msg = f"❌ An error occurred: {str(e)}"
+         return None, error_msg

  def truncated_linear(x):
      if x < 15:
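Unlike the removed `generate_voiceover_OpenAI`, this function signals failure through its `(audio_path, message)` return value instead of raising. `process_entry` currently ignores that value and opens `segment_audio_path` regardless; a stricter call site (a sketch, not part of this commit) could look like:

    audio_out, status = generate_voiceover_clone([entry], desired_duration, target_language,
                                                 speaker_wav_path, segment_audio_path)
    if audio_out is None:
        raise RuntimeError(f"Voiceover failed for segment {i}: {status}")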
 
      return speed

  def upload_and_manage(file, target_language, mode="transcription"):
      if file is None:
          logger.info("No file uploaded. Please upload a video/audio file.")

      # Step 1: Transcribe audio from uploaded media file and get timestamps
      logger.info("Transcribing audio...")
+     transcription_json, source_language, speaker_sample_paths = transcribe_video_with_speakers(file.name)
      logger.info(f"Transcription completed. Detected source language: {source_language}")

      # Step 2: Translate the transcription

      # Step 3: Add transcript to video based on timestamps
      logger.info("Adding translated transcript to video...")
+     add_transcript_voiceover(file.name, translated_json, output_video_path, mode == "Transcription with Voiceover", target_language, speaker_sample_paths)
      logger.info(f"Transcript added to video. Output video saved at {output_video_path}")

      # Convert translated JSON into a format for the editable table
      logger.info("Converting translated JSON into editable table format...")
      editable_table = [
+         [float(entry["start"]), entry["original"], entry["translated"], float(entry["end"]), entry["speaker"]]
          for entry in translated_json
      ]
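With the speaker label appended, each editable-table row now carries five fields; an illustrative row (values made up) would be `[3.42, " Welcome back.", " Bienvenido.", 6.1, "SPEAKER_00"]`, so the table's header/column definition presumably needs a matching fifth column.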

      save_changes_button.click(
          update_translations,
+         inputs=[file_input, editable_table, process_mode],
          outputs=[processed_video_output, elapsed_time_display]
      )