qqwjq1981 committed on
Commit 0f69689 · verified
1 Parent(s): 39837b9

Update app.py

Files changed (1)
  1. app.py +602 -615
app.py CHANGED
@@ -62,634 +62,621 @@ client = OpenAI(
62
  hf_api_key = os.environ.get("hf_token")
63
 
64
 
65
- # def silence(duration, fps=44100):
66
- # """
67
- # Returns a silent AudioClip of the specified duration.
68
- # """
69
- # return AudioArrayClip(np.zeros((int(fps*duration), 2)), fps=fps)
70
-
71
- # def count_words_or_characters(text):
72
- # # Count non-Chinese words
73
- # non_chinese_words = len(re.findall(r'\b[a-zA-Z0-9]+\b', text))
74
 
75
- # # Count Chinese characters
76
- # chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
77
 
78
- # return non_chinese_words + chinese_chars
79
 
80
- # # Define the passcode
81
- # PASSCODE = "show_feedback_db"
82
-
83
- # css = """
84
- # /* Adjust row height */
85
- # .dataframe-container tr {
86
- # height: 50px !important;
87
- # }
88
-
89
- # /* Ensure text wrapping and prevent overflow */
90
- # .dataframe-container td {
91
- # white-space: normal !important;
92
- # word-break: break-word !important;
93
- # }
94
-
95
- # /* Set column widths */
96
- # [data-testid="block-container"] .scrolling-dataframe th:nth-child(1),
97
- # [data-testid="block-container"] .scrolling-dataframe td:nth-child(1) {
98
- # width: 6%; /* Start column */
99
- # }
100
-
101
- # [data-testid="block-container"] .scrolling-dataframe th:nth-child(2),
102
- # [data-testid="block-container"] .scrolling-dataframe td:nth-child(2) {
103
- # width: 47%; /* Original text */
104
- # }
105
-
106
- # [data-testid="block-container"] .scrolling-dataframe th:nth-child(3),
107
- # [data-testid="block-container"] .scrolling-dataframe td:nth-child(3) {
108
- # width: 47%; /* Translated text */
109
- # }
110
-
111
- # [data-testid="block-container"] .scrolling-dataframe th:nth-child(4),
112
- # [data-testid="block-container"] .scrolling-dataframe td:nth-child(4) {
113
- # display: none !important;
114
- # }
115
- # """
116
-
117
- # # Function to save feedback or provide access to the database file
118
- # def handle_feedback(feedback):
119
- # feedback = feedback.strip() # Clean up leading/trailing whitespace
120
- # if not feedback:
121
- # return "Feedback cannot be empty.", None
122
-
123
- # if feedback == PASSCODE:
124
- # # Provide access to the feedback.db file
125
- # return "Access granted! Download the database file below.", "feedback.db"
126
- # else:
127
- # # Save feedback to the database
128
- # with sqlite3.connect("feedback.db") as conn:
129
- # cursor = conn.cursor()
130
- # cursor.execute("CREATE TABLE IF NOT EXISTS studio_feedback (id INTEGER PRIMARY KEY, comment TEXT)")
131
- # cursor.execute("INSERT INTO studio_feedback (comment) VALUES (?)", (feedback,))
132
- # conn.commit()
133
- # return "Thank you for your feedback!", None
134
-
135
- # # Configure logging
136
- # logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
137
- # logger = logging.getLogger(__name__)
138
- # logger.info(f"MoviePy Version: {moviepy.__version__}")
139
-
140
- # # def segment_background_audio(audio_path, output_path="background_segments.wav"):
141
- # # # Step 2: Initialize pyannote voice activity detection pipeline (you need Hugging Face token)
142
- # # pipeline = Pipeline.from_pretrained(
143
- # # "pyannote/voice-activity-detection",
144
- # # use_auth_token=hf_api_key
145
- # # )
146
- # # # Step 3: Run VAD to get speech segments
147
- # # vad_result = pipeline(audio_path)
148
- # # print(f"Detected speech segments: {vad_result}")
149
-
150
- # # # Step 4: Load full audio and subtract speech segments
151
- # # full_audio = AudioSegment.from_wav(audio_path)
152
- # # background_audio = AudioSegment.silent(duration=len(full_audio))
153
-
154
- # # for segment in vad_result.itersegments():
155
- # # start_ms = int(segment.start * 1000)
156
- # # end_ms = int(segment.end * 1000)
157
- # # # Remove speech by muting that portion
158
- # # background_audio = background_audio.overlay(AudioSegment.silent(duration=end_ms - start_ms), position=start_ms)
159
-
160
- # # # Step 5: Subtract background_audio from full_audio
161
- # # result_audio = full_audio.overlay(background_audio)
162
-
163
- # # # Step 6: Export non-speech segments
164
- # # result_audio.export(output_path, format="wav")
165
- # # print(f"Saved non-speech (background) audio to: {output_path}")
166
-
167
- # # return True
168
-
169
- # def transcribe_video_with_speakers(video_path):
170
- # # Extract audio from video
171
- # video = VideoFileClip(video_path)
172
- # audio_path = "audio.wav"
173
- # video.audio.write_audiofile(audio_path)
174
- # logger.info(f"Audio extracted from video: {audio_path}")
175
-
176
- # # segment_result = segment_background_audio(audio_path)
177
- # # print(f"Saved non-speech (background) audio to local")
178
 
179
- # # Set up device
180
- # device = "cuda" if torch.cuda.is_available() else "cpu"
181
- # logger.info(f"Using device: {device}")
182
 
183
- # try:
184
- # # Load a medium model with float32 for broader compatibility
185
- # model = whisperx.load_model("medium", device=device, compute_type="float32")
186
- # logger.info("WhisperX model loaded")
187
 
188
- # # Transcribe
189
- # result = model.transcribe(audio_path, chunk_size=5, print_progress = True)
190
- # logger.info("Audio transcription completed")
191
-
192
- # # Get the detected language
193
- # detected_language = result["language"]
194
- # logger.debug(f"Detected language: {detected_language}")
195
- # # Alignment
196
- # model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
197
- # result = whisperx.align(result["segments"], model_a, metadata, audio_path, device)
198
- # logger.info("Transcription alignment completed")
199
 
200
- # # Diarization (works independently of Whisper model size)
201
- # diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
202
- # diarize_segments = diarize_model(audio_path)
203
- # logger.info("Speaker diarization completed")
204
 
205
- # # Assign speakers
206
- # result = whisperx.assign_word_speakers(diarize_segments, result)
207
- # logger.info("Speakers assigned to transcribed segments")
208
 
209
- # except Exception as e:
210
- # logger.error(f"❌ WhisperX pipeline failed: {e}")
211
-
212
- # # Extract timestamps, text, and speaker IDs
213
- # transcript_with_speakers = [
214
- # {
215
- # "start": segment["start"],
216
- # "end": segment["end"],
217
- # "text": segment["text"],
218
- # "speaker": segment["speaker"]
219
- # }
220
- # for segment in result["segments"]
221
- # ]
222
-
223
- # # Collect audio for each speaker
224
- # speaker_audio = {}
225
- # for segment in result["segments"]:
226
- # speaker = segment["speaker"]
227
- # if speaker not in speaker_audio:
228
- # speaker_audio[speaker] = []
229
- # speaker_audio[speaker].append((segment["start"], segment["end"]))
230
-
231
- # # Collapse and truncate speaker audio
232
- # speaker_sample_paths = {}
233
- # audio_clip = AudioFileClip(audio_path)
234
- # for speaker, segments in speaker_audio.items():
235
- # speaker_clips = [audio_clip.subclip(start, end) for start, end in segments]
236
- # combined_clip = concatenate_audioclips(speaker_clips)
237
- # truncated_clip = combined_clip.subclip(0, min(30, combined_clip.duration))
238
- # sample_path = f"speaker_{speaker}_sample.wav"
239
- # truncated_clip.write_audiofile(sample_path)
240
- # speaker_sample_paths[speaker] = sample_path
241
- # logger.info(f"Created sample for {speaker}: {sample_path}")
242
-
243
- # # Clean up
244
- # video.close()
245
- # audio_clip.close()
246
- # os.remove(audio_path)
247
-
248
- # return transcript_with_speakers, detected_language
249
-
250
- # # Function to get the appropriate translation model based on target language
251
- # def get_translation_model(source_language, target_language):
252
- # """
253
- # Get the translation model based on the source and target language.
254
-
255
- # Parameters:
256
- # - target_language (str): The language to translate the content into (e.g., 'es', 'fr').
257
- # - source_language (str): The language of the input content (default is 'en' for English).
258
 
259
- # Returns:
260
- # - str: The translation model identifier.
261
- # """
262
- # # List of allowable languages
263
- # allowable_languages = ["en", "es", "fr", "zh", "de", "it", "pt", "ja", "ko", "ru"]
264
-
265
- # # Validate source and target languages
266
- # if source_language not in allowable_languages:
267
- # logger.debug(f"Invalid source language '{source_language}'. Supported languages are: {', '.join(allowable_languages)}")
268
- # # Return a default model if source language is invalid
269
- # source_language = "en" # Default to 'en'
270
-
271
- # if target_language not in allowable_languages:
272
- # logger.debug(f"Invalid target language '{target_language}'. Supported languages are: {', '.join(allowable_languages)}")
273
- # # Return a default model if target language is invalid
274
- # target_language = "zh" # Default to 'zh'
275
-
276
- # if source_language == target_language:
277
- # source_language = "en" # Default to 'en'
278
- # target_language = "zh" # Default to 'zh'
279
-
280
- # # Return the model using string concatenation
281
- # return f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"
282
-
283
- # def translate_single_entry(entry, translator):
284
- # original_text = entry["text"]
285
- # translated_text = translator(original_text)[0]['translation_text']
286
- # return {
287
- # "start": entry["start"],
288
- # "original": original_text,
289
- # "translated": translated_text,
290
- # "end": entry["end"],
291
- # "speaker": entry["speaker"]
292
- # }
293
-
294
- # def translate_text(transcription_json, source_language, target_language):
295
- # # Load the translation model for the specified target language
296
- # translation_model_id = get_translation_model(source_language, target_language)
297
- # logger.debug(f"Translation model: {translation_model_id}")
298
- # translator = pipeline("translation", model=translation_model_id)
299
-
300
- # # Use ThreadPoolExecutor to parallelize translations
301
- # with concurrent.futures.ThreadPoolExecutor() as executor:
302
- # # Submit all translation tasks and collect results
303
- # translate_func = lambda entry: translate_single_entry(entry, translator)
304
- # translated_json = list(executor.map(translate_func, transcription_json))
305
-
306
- # # Sort the translated_json by start time
307
- # translated_json.sort(key=lambda x: x["start"])
308
-
309
- # # Log the components being added to translated_json
310
- # for entry in translated_json:
311
- # logger.debug("Added to translated_json: start=%s, original=%s, translated=%s, end=%s, speaker=%s",
312
- # entry["start"], entry["original"], entry["translated"], entry["end"], entry["speaker"])
313
-
314
- # return translated_json
315
-
316
- # def update_translations(file, edited_table, mode):
317
- # """
318
- # Update the translations based on user edits in the Gradio Dataframe.
319
- # """
320
- # output_video_path = "output_video.mp4"
321
- # logger.debug(f"Editable Table: {edited_table}")
322
-
323
- # if file is None:
324
- # logger.info("No file uploaded. Please upload a video/audio file.")
325
- # return None, [], None, "No file uploaded. Please upload a video/audio file."
326
 
327
- # try:
328
- # start_time = time.time() # Start the timer
329
-
330
- # # Convert the edited_table (list of lists) back to list of dictionaries
331
- # updated_translations = [
332
- # {
333
- # "start": row["start"], # Access by column name
334
- # "original": row["original"],
335
- # "translated": row["translated"],
336
- # "end": row["end"]
337
- # }
338
- # for _, row in edited_table.iterrows()
339
- # ]
340
-
341
- # # Call the function to process the video with updated translations
342
- # add_transcript_voiceover(file.name, updated_translations, output_video_path, mode=="Transcription with Voiceover")
343
-
344
- # # Calculate elapsed time
345
- # elapsed_time = time.time() - start_time
346
- # elapsed_time_display = f"Updates applied successfully in {elapsed_time:.2f} seconds."
347
-
348
- # return output_video_path, elapsed_time_display
349
-
350
- # except Exception as e:
351
- # raise ValueError(f"Error updating translations: {e}")
352
-
353
- # def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_height, font_path):
354
- # try:
355
- # subtitle_width = int(video_width * 0.8)
356
- # subtitle_font_size = int(video_height // 20)
357
- # font = ImageFont.truetype(font_path, subtitle_font_size)
358
-
359
- # dummy_img = Image.new("RGBA", (subtitle_width, 1), (0, 0, 0, 0))
360
- # draw = ImageDraw.Draw(dummy_img)
361
-
362
- # lines = []
363
- # line = ""
364
- # for word in text.split():
365
- # test_line = f"{line} {word}".strip()
366
- # bbox = draw.textbbox((0, 0), test_line, font=font)
367
- # w = bbox[2] - bbox[0]
368
- # if w <= subtitle_width - 10:
369
- # line = test_line
370
- # else:
371
- # lines.append(line)
372
- # line = word
373
- # lines.append(line)
374
-
375
- # line_heights = [draw.textbbox((0, 0), l, font=font)[3] - draw.textbbox((0, 0), l, font=font)[1] for l in lines]
376
- # total_height = sum(line_heights) + (len(lines) - 1) * 5
377
- # img = Image.new("RGBA", (subtitle_width, total_height), (0, 0, 0, 0))
378
- # draw = ImageDraw.Draw(img)
379
-
380
- # y = 0
381
- # for idx, line in enumerate(lines):
382
- # bbox = draw.textbbox((0, 0), line, font=font)
383
- # w = bbox[2] - bbox[0]
384
- # draw.text(((subtitle_width - w) // 2, y), line, font=font, fill="yellow")
385
- # y += line_heights[idx] + 5
386
 
387
- # img_np = np.array(img) # <- ✅ Fix: convert to NumPy
388
- # txt_clip = ImageClip(img_np).set_start(start_time).set_duration(end_time - start_time).set_position("bottom").set_opacity(0.8)
389
- # return txt_clip
390
- # except Exception as e:
391
- # logger.error(f"\u274c Failed to create subtitle clip: {e}")
392
- # return None
393
-
394
- # def process_entry(entry, i, video_width, video_height, add_voiceover, target_language, font_path, speaker_sample_paths=None):
395
- # logger.debug(f"Processing entry {i}: {entry}")
396
- # error_message = None
397
-
398
- # try:
399
- # txt_clip = create_subtitle_clip_pil(entry["translated"], entry["start"], entry["end"], video_width, video_height, font_path)
400
- # except Exception as e:
401
- # error_message = f"❌ Failed to create subtitle clip for entry {i}: {e}"
402
- # logger.error(error_message)
403
- # txt_clip = None
404
-
405
- # audio_segment = None
406
- # if add_voiceover:
407
- # try:
408
- # segment_audio_path = f"segment_{i}_voiceover.wav"
409
- # desired_duration = entry["end"] - entry["start"]
410
- # speaker = entry.get("speaker", "default")
411
- # speaker_wav_path = f"speaker_{speaker}_sample.wav"
412
-
413
- # output_path, status_msg, tts_error = generate_voiceover_clone([entry], desired_duration, target_language, speaker_wav_path, segment_audio_path)
414
-
415
- # if tts_error:
416
- # error_message = error_message + " | " + tts_error if error_message else tts_error
417
-
418
- # if not output_path or not os.path.exists(segment_audio_path):
419
- # raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
420
-
421
- # audio_clip = AudioFileClip(segment_audio_path)
422
- # logger.debug(f"Audio clip duration: {audio_clip.duration}, Desired duration: {desired_duration}")
423
-
424
- # if audio_clip.duration < desired_duration:
425
- # silence_duration = desired_duration - audio_clip.duration
426
- # audio_clip = concatenate_audioclips([audio_clip, silence(duration=silence_duration)])
427
- # logger.info(f"Padded audio with {silence_duration} seconds of silence.")
428
-
429
- # audio_segment = audio_clip.set_start(entry["start"]).set_duration(desired_duration)
430
-
431
- # except Exception as e:
432
- # err = f"❌ Failed to generate audio segment for entry {i}: {e}"
433
- # logger.error(err)
434
- # error_message = error_message + " | " + err if error_message else err
435
- # audio_segment = None
436
-
437
- # return i, txt_clip, audio_segment, error_message
438
 
439
- # def add_transcript_voiceover(video_path, translated_json, output_path, add_voiceover=False, target_language="en", speaker_sample_paths=None):
440
- # video = VideoFileClip(video_path)
441
- # font_path = "./NotoSansSC-Regular.ttf"
442
-
443
- # text_clips = []
444
- # audio_segments = []
445
- # error_messages = []
446
-
447
- # with concurrent.futures.ThreadPoolExecutor() as executor:
448
- # futures = [executor.submit(process_entry, entry, i, video.w, video.h, add_voiceover, target_language, font_path, speaker_sample_paths)
449
- # for i, entry in enumerate(translated_json)]
450
-
451
- # results = []
452
- # for future in concurrent.futures.as_completed(futures):
453
- # try:
454
- # i, txt_clip, audio_segment, error = future.result()
455
- # results.append((i, txt_clip, audio_segment))
456
- # if error:
457
- # error_messages.append(f"[Entry {i}] {error}")
458
- # except Exception as e:
459
- # err = f"❌ Unexpected error in future result: {e}"
460
- # logger.error(err)
461
- # error_messages.append(err)
462
-
463
- # # Sort by entry index to ensure order
464
- # results.sort(key=lambda x: x[0])
465
- # text_clips = [clip for _, clip, _ in results if clip]
466
- # if add_voiceover:
467
- # audio_segments = [segment for _, _, segment in results if segment]
468
-
469
- # final_video = CompositeVideoClip([video] + text_clips)
470
-
471
- # if add_voiceover:
472
- # if audio_segments:
473
- # final_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
474
- # final_video = final_video.set_audio(final_audio)
475
- # else:
476
- # logger.warning("⚠️ No audio segments available. Adding silent fallback.")
477
- # silent_audio = AudioClip(lambda t: 0, duration=video.duration)
478
- # final_video = final_video.set_audio(silent_audio)
479
-
480
- # logger.info(f"Saving the final video to: {output_path}")
481
- # final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
482
-
483
- # logger.info("Video processing completed successfully.")
484
-
485
- # # Optional: return errors
486
- # if error_messages:
487
- # logger.warning("⚠️ Errors encountered during processing:")
488
- # for msg in error_messages:
489
- # logger.warning(msg)
490
-
491
- # return error_messages
492
-
493
- # # Initialize TTS model only once (outside the function)
494
- # tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
495
-
496
- # def generate_voiceover_clone(translated_json, desired_duration, target_language, speaker_wav_path, output_audio_path):
497
- # try:
498
- # full_text = " ".join(entry["translated"] for entry in translated_json if "translated" in entry and entry["translated"].strip())
499
- # if not full_text.strip():
500
- # msg = "❌ Translated text is empty."
501
- # logger.error(msg)
502
- # return None, msg, msg
503
-
504
- # if not speaker_wav_path or not os.path.exists(speaker_wav_path):
505
- # msg = f"❌ Speaker audio not found: {speaker_wav_path}"
506
- # logger.error(msg)
507
- # return None, msg, msg
508
-
509
- # # # Truncate text based on max token assumption (~60 tokens)
510
- # # MAX_TTS_TOKENS = 60
511
- # # tokens = full_text.split() # crude token count
512
- # # if len(tokens) > MAX_TTS_TOKENS:
513
- # # logger.warning(f"⚠️ Text too long for TTS model ({len(tokens)} tokens). Truncating to {MAX_TTS_TOKENS} tokens.")
514
- # # full_text = " ".join(tokens[:MAX_TTS_TOKENS])
515
-
516
- # speed_tts = calibrated_speed(full_text, desired_duration)
517
- # tts.tts_to_file(
518
- # text=full_text,
519
- # speaker_wav=speaker_wav_path,
520
- # language=target_language,
521
- # file_path=output_audio_path,
522
- # speed=speed_tts,
523
- # split_sentences=True
524
- # )
525
-
526
- # if not os.path.exists(output_audio_path):
527
- # msg = f"❌ Voiceover file not generated at: {output_audio_path}"
528
- # logger.error(msg)
529
- # return None, msg, msg
530
-
531
- # msg = "✅ Voice cloning completed successfully."
532
- # logger.info(msg)
533
- # return output_audio_path, msg, None
534
-
535
- # except Exception as e:
536
- # err_msg = f"❌ An error occurred: {str(e)}"
537
- # logger.error("❌ Error during voice cloning:")
538
- # logger.error(traceback.format_exc())
539
- # return None, err_msg, err_msg
540
-
541
- # def calibrated_speed(text, desired_duration):
542
- # """
543
- # Compute a speed factor to help TTS fit audio into desired duration,
544
- # using a simple truncated linear function of characters per second.
545
- # """
546
- # char_count = len(text.strip())
547
- # if char_count == 0 or desired_duration <= 0:
548
- # return 1.0 # fallback
549
-
550
- # cps = char_count / desired_duration # characters per second
551
-
552
- # # Truncated linear mapping
553
- # if cps < 10:
554
- # return 1.0
555
- # elif cps > 25:
556
- # return 1.4
557
- # else:
558
- # # Linearly scale between cps 10 -> 25 and speed 1.0 -> 1.4
559
- # slope = (1.4 - 1.0) / (25 - 10)
560
- # return 1.0 + slope * (cps - 10)
561
-
562
-
563
- # def upload_and_manage(file, target_language, mode="transcription"):
564
- # if file is None:
565
- # logger.info("No file uploaded. Please upload a video/audio file.")
566
- # return None, [], None, "No file uploaded. Please upload a video/audio file."
567
-
568
- # try:
569
- # start_time = time.time() # Start the timer
570
- # logger.info(f"Started processing file: {file.name}")
571
-
572
- # # Define paths for audio and output files
573
- # audio_path = "audio.wav"
574
- # output_video_path = "output_video.mp4"
575
- # voiceover_path = "voiceover.wav"
576
- # logger.info(f"Using audio path: {audio_path}, output video path: {output_video_path}, voiceover path: {voiceover_path}")
577
-
578
- # # Step 1: Transcribe audio from uploaded media file and get timestamps
579
- # logger.info("Transcribing audio...")
580
- # transcription_json, source_language = transcribe_video_with_speakers(file.name)
581
- # logger.info(f"Transcription completed. Detected source language: {source_language}")
582
-
583
- # # Step 2: Translate the transcription
584
- # logger.info(f"Translating transcription from {source_language} to {target_language}...")
585
- # translated_json = translate_text(transcription_json, source_language, target_language)
586
- # logger.info(f"Translation completed. Number of translated segments: {len(translated_json)}")
587
-
588
- # # Step 3: Add transcript to video based on timestamps
589
- # logger.info("Adding translated transcript to video...")
590
- # add_transcript_voiceover(file.name, translated_json, output_video_path, mode == "Transcription with Voiceover", target_language)
591
- # logger.info(f"Transcript added to video. Output video saved at {output_video_path}")
592
-
593
- # # Convert translated JSON into a format for the editable table
594
- # logger.info("Converting translated JSON into editable table format...")
595
- # editable_table = [
596
- # [float(entry["start"]), entry["original"], entry["translated"], float(entry["end"]), entry["speaker"]]
597
- # for entry in translated_json
598
- # ]
599
-
600
- # # Calculate elapsed time
601
- # elapsed_time = time.time() - start_time
602
- # elapsed_time_display = f"Processing completed in {elapsed_time:.2f} seconds."
603
- # logger.info(f"Processing completed in {elapsed_time:.2f} seconds.")
604
-
605
- # return translated_json, editable_table, output_video_path, elapsed_time_display
606
-
607
- # except Exception as e:
608
- # logger.error(f"An error occurred: {str(e)}")
609
- # return None, [], None, f"An error occurred: {str(e)}"
610
- # # Gradio Interface with Tabs
611
- # def build_interface():
612
- # with gr.Blocks(css=css) as demo:
613
- # gr.Markdown("## Video Localization")
614
- # with gr.Row():
615
- # with gr.Column(scale=4):
616
- # file_input = gr.File(label="Upload Video/Audio File")
617
- # language_input = gr.Dropdown(["en", "es", "fr", "zh"], label="Select Language") # Language codes
618
- # process_mode = gr.Radio(choices=["Transcription", "Transcription with Voiceover"], label="Choose Processing Type", value="Transcription")
619
- # submit_button = gr.Button("Post and Process")
620
- # editable_translations = gr.State(value=[])
621
-
622
- # with gr.Column(scale=8):
623
- # gr.Markdown("## Edit Translations")
 
624
 
625
- # # Editable JSON Data
626
- # editable_table = gr.Dataframe(
627
- # value=[], # Default to an empty list to avoid undefined values
628
- # headers=["start", "original", "translated", "end", "speaker"],
629
- # datatype=["number", "str", "str", "number", "str"],
630
- # row_count=1, # Initially empty
631
- # col_count=5,
632
- # interactive=[False, True, True, False, False], # Control editability
633
- # label="Edit Translations",
634
- # wrap=True # Enables text wrapping if supported
635
- # )
636
- # save_changes_button = gr.Button("Save Changes")
637
- # processed_video_output = gr.File(label="Download Processed Video", interactive=True) # Download button
638
- # elapsed_time_display = gr.Textbox(label="Elapsed Time", lines=1, interactive=False)
639
-
640
- # with gr.Column(scale=1):
641
- # gr.Markdown("**Feedback**")
642
- # feedback_input = gr.Textbox(
643
- # placeholder="Leave your feedback here...",
644
- # label=None,
645
- # lines=3,
646
- # )
647
- # feedback_btn = gr.Button("Submit Feedback")
648
- # response_message = gr.Textbox(label=None, lines=1, interactive=False)
649
- # db_download = gr.File(label="Download Database File", visible=False)
650
 
651
- # # Link the feedback handling
652
- # def feedback_submission(feedback):
653
- # message, file_path = handle_feedback(feedback)
654
- # if file_path:
655
- # return message, gr.update(value=file_path, visible=True)
656
- # return message, gr.update(visible=False)
657
-
658
- # save_changes_button.click(
659
- # update_translations,
660
- # inputs=[file_input, editable_table, process_mode],
661
- # outputs=[processed_video_output, elapsed_time_display]
662
- # )
663
-
664
- # submit_button.click(
665
- # upload_and_manage,
666
- # inputs=[file_input, language_input, process_mode],
667
- # outputs=[editable_translations, editable_table, processed_video_output, elapsed_time_display]
668
- # )
669
-
670
- # # Connect submit button to save_feedback_db function
671
- # feedback_btn.click(
672
- # feedback_submission,
673
- # inputs=[feedback_input],
674
- # outputs=[response_message, db_download]
675
- # )
676
-
677
- # return demo
678
-
679
- # # Launch the Gradio interface
680
- # demo = build_interface()
681
- # demo.launch()
682
-
683
- import gradio as gr
684
-
685
- def dummy_func(x):
686
- return x, "Success"
687
-
688
- with gr.Blocks() as demo:
689
- inp = gr.Textbox()
690
- out1 = gr.Textbox()
691
- out2 = gr.Textbox()
692
- btn = gr.Button("Run")
693
- btn.click(dummy_func, inputs=inp, outputs=[out1, out2])
694
-
695
  demo.launch()
 
62
  hf_api_key = os.environ.get("hf_token")
63
 
64
 
65
+ def silence(duration, fps=44100):
66
+ """
67
+ Returns a silent AudioClip of the specified duration.
68
+ """
69
+ return AudioArrayClip(np.zeros((int(fps*duration), 2)), fps=fps)
70
+
71
+ def count_words_or_characters(text):
72
+ # Count non-Chinese words
73
+ non_chinese_words = len(re.findall(r'\b[a-zA-Z0-9]+\b', text))
74
 
75
+ # Count Chinese characters
76
+ chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
77
 
78
+ return non_chinese_words + chinese_chars
79
 
80
+ # Define the passcode
81
+ PASSCODE = "show_feedback_db"
82
+
83
+ css = """
84
+ /* Adjust row height */
85
+ .dataframe-container tr {
86
+ height: 50px !important;
87
+ }
88
+
89
+ /* Ensure text wrapping and prevent overflow */
90
+ .dataframe-container td {
91
+ white-space: normal !important;
92
+ word-break: break-word !important;
93
+ }
94
+
95
+ /* Set column widths */
96
+ [data-testid="block-container"] .scrolling-dataframe th:nth-child(1),
97
+ [data-testid="block-container"] .scrolling-dataframe td:nth-child(1) {
98
+ width: 6%; /* Start column */
99
+ }
100
+
101
+ [data-testid="block-container"] .scrolling-dataframe th:nth-child(2),
102
+ [data-testid="block-container"] .scrolling-dataframe td:nth-child(2) {
103
+ width: 47%; /* Original text */
104
+ }
105
+
106
+ [data-testid="block-container"] .scrolling-dataframe th:nth-child(3),
107
+ [data-testid="block-container"] .scrolling-dataframe td:nth-child(3) {
108
+ width: 47%; /* Translated text */
109
+ }
110
+
111
+ [data-testid="block-container"] .scrolling-dataframe th:nth-child(4),
112
+ [data-testid="block-container"] .scrolling-dataframe td:nth-child(4) {
113
+ display: none !important;
114
+ }
115
+ """
116
+
117
+ # Function to save feedback or provide access to the database file
118
+ def handle_feedback(feedback):
119
+ feedback = feedback.strip() # Clean up leading/trailing whitespace
120
+ if not feedback:
121
+ return "Feedback cannot be empty.", None
122
+
123
+ if feedback == PASSCODE:
124
+ # Provide access to the feedback.db file
125
+ return "Access granted! Download the database file below.", "feedback.db"
126
+ else:
127
+ # Save feedback to the database
128
+ with sqlite3.connect("feedback.db") as conn:
129
+ cursor = conn.cursor()
130
+ cursor.execute("CREATE TABLE IF NOT EXISTS studio_feedback (id INTEGER PRIMARY KEY, comment TEXT)")
131
+ cursor.execute("INSERT INTO studio_feedback (comment) VALUES (?)", (feedback,))
132
+ conn.commit()
133
+ return "Thank you for your feedback!", None
134
+
135
+ # Configure logging
136
+ logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
137
+ logger = logging.getLogger(__name__)
138
+ logger.info(f"MoviePy Version: {moviepy.__version__}")
139
+
140
+ # def segment_background_audio(audio_path, output_path="background_segments.wav"):
141
+ # # Step 2: Initialize pyannote voice activity detection pipeline (you need Hugging Face token)
142
+ # pipeline = Pipeline.from_pretrained(
143
+ # "pyannote/voice-activity-detection",
144
+ # use_auth_token=hf_api_key
145
+ # )
146
+ # # Step 3: Run VAD to get speech segments
147
+ # vad_result = pipeline(audio_path)
148
+ # print(f"Detected speech segments: {vad_result}")
149
+
150
+ # # Step 4: Load full audio and subtract speech segments
151
+ # full_audio = AudioSegment.from_wav(audio_path)
152
+ # background_audio = AudioSegment.silent(duration=len(full_audio))
153
+
154
+ # for segment in vad_result.itersegments():
155
+ # start_ms = int(segment.start * 1000)
156
+ # end_ms = int(segment.end * 1000)
157
+ # # Remove speech by muting that portion
158
+ # background_audio = background_audio.overlay(AudioSegment.silent(duration=end_ms - start_ms), position=start_ms)
159
+
160
+ # # Step 5: Subtract background_audio from full_audio
161
+ # result_audio = full_audio.overlay(background_audio)
162
+
163
+ # # Step 6: Export non-speech segments
164
+ # result_audio.export(output_path, format="wav")
165
+ # print(f"Saved non-speech (background) audio to: {output_path}")
166
+
167
+ # return True
168
+
169
+ def transcribe_video_with_speakers(video_path):
170
+ # Extract audio from video
171
+ video = VideoFileClip(video_path)
172
+ audio_path = "audio.wav"
173
+ video.audio.write_audiofile(audio_path)
174
+ logger.info(f"Audio extracted from video: {audio_path}")
175
+
176
+ # segment_result = segment_background_audio(audio_path)
177
+ # print(f"Saved non-speech (background) audio to local")
178
 
179
+ # Set up device
180
+ device = "cuda" if torch.cuda.is_available() else "cpu"
181
+ logger.info(f"Using device: {device}")
182
 
183
+ try:
184
+ # Load a medium model with float32 for broader compatibility
185
+ model = whisperx.load_model("medium", device=device, compute_type="float32")
186
+ logger.info("WhisperX model loaded")
187
 
188
+ # Transcribe
189
+ result = model.transcribe(audio_path, chunk_size=5, print_progress = True)
190
+ logger.info("Audio transcription completed")
191
+
192
+ # Get the detected language
193
+ detected_language = result["language"]
194
+ logger.debug(f"Detected language: {detected_language}")
195
+ # Alignment
196
+ model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
197
+ result = whisperx.align(result["segments"], model_a, metadata, audio_path, device)
198
+ logger.info("Transcription alignment completed")
199
 
200
+ # Diarization (works independently of Whisper model size)
201
+ diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
202
+ diarize_segments = diarize_model(audio_path)
203
+ logger.info("Speaker diarization completed")
204
 
205
+ # Assign speakers
206
+ result = whisperx.assign_word_speakers(diarize_segments, result)
207
+ logger.info("Speakers assigned to transcribed segments")
208
 
209
+ except Exception as e:
210
+ logger.error(f"❌ WhisperX pipeline failed: {e}")
211
+
212
+ # Extract timestamps, text, and speaker IDs
213
+ transcript_with_speakers = [
214
+ {
215
+ "start": segment["start"],
216
+ "end": segment["end"],
217
+ "text": segment["text"],
218
+ "speaker": segment["speaker"]
219
+ }
220
+ for segment in result["segments"]
221
+ ]
222
+
223
+ # Collect audio for each speaker
224
+ speaker_audio = {}
225
+ for segment in result["segments"]:
226
+ speaker = segment["speaker"]
227
+ if speaker not in speaker_audio:
228
+ speaker_audio[speaker] = []
229
+ speaker_audio[speaker].append((segment["start"], segment["end"]))
230
+
231
+ # Collapse and truncate speaker audio
232
+ speaker_sample_paths = {}
233
+ audio_clip = AudioFileClip(audio_path)
234
+ for speaker, segments in speaker_audio.items():
235
+ speaker_clips = [audio_clip.subclip(start, end) for start, end in segments]
236
+ combined_clip = concatenate_audioclips(speaker_clips)
237
+ truncated_clip = combined_clip.subclip(0, min(30, combined_clip.duration))
238
+ sample_path = f"speaker_{speaker}_sample.wav"
239
+ truncated_clip.write_audiofile(sample_path)
240
+ speaker_sample_paths[speaker] = sample_path
241
+ logger.info(f"Created sample for {speaker}: {sample_path}")
242
+
243
+ # Clean up
244
+ video.close()
245
+ audio_clip.close()
246
+ os.remove(audio_path)
247
+
248
+ return transcript_with_speakers, detected_language
249
+
250
+ # Function to get the appropriate translation model based on target language
251
+ def get_translation_model(source_language, target_language):
252
+ """
253
+ Get the translation model based on the source and target language.
254
+
255
+ Parameters:
256
+ - target_language (str): The language to translate the content into (e.g., 'es', 'fr').
257
+ - source_language (str): The language of the input content (default is 'en' for English).
258
 
259
+ Returns:
260
+ - str: The translation model identifier.
261
+ """
262
+ # List of allowable languages
263
+ allowable_languages = ["en", "es", "fr", "zh", "de", "it", "pt", "ja", "ko", "ru"]
264
+
265
+ # Validate source and target languages
266
+ if source_language not in allowable_languages:
267
+ logger.debug(f"Invalid source language '{source_language}'. Supported languages are: {', '.join(allowable_languages)}")
268
+ # Return a default model if source language is invalid
269
+ source_language = "en" # Default to 'en'
270
+
271
+ if target_language not in allowable_languages:
272
+ logger.debug(f"Invalid target language '{target_language}'. Supported languages are: {', '.join(allowable_languages)}")
273
+ # Return a default model if target language is invalid
274
+ target_language = "zh" # Default to 'zh'
275
+
276
+ if source_language == target_language:
277
+ source_language = "en" # Default to 'en'
278
+ target_language = "zh" # Default to 'zh'
279
+
280
+ # Return the model using string concatenation
281
+ return f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"
282
+
283
+ def translate_single_entry(entry, translator):
284
+ original_text = entry["text"]
285
+ translated_text = translator(original_text)[0]['translation_text']
286
+ return {
287
+ "start": entry["start"],
288
+ "original": original_text,
289
+ "translated": translated_text,
290
+ "end": entry["end"],
291
+ "speaker": entry["speaker"]
292
+ }
293
+
294
+ def translate_text(transcription_json, source_language, target_language):
295
+ # Load the translation model for the specified target language
296
+ translation_model_id = get_translation_model(source_language, target_language)
297
+ logger.debug(f"Translation model: {translation_model_id}")
298
+ translator = pipeline("translation", model=translation_model_id)
299
+
300
+ # Use ThreadPoolExecutor to parallelize translations
301
+ with concurrent.futures.ThreadPoolExecutor() as executor:
302
+ # Submit all translation tasks and collect results
303
+ translate_func = lambda entry: translate_single_entry(entry, translator)
304
+ translated_json = list(executor.map(translate_func, transcription_json))
305
+
306
+ # Sort the translated_json by start time
307
+ translated_json.sort(key=lambda x: x["start"])
308
+
309
+ # Log the components being added to translated_json
310
+ for entry in translated_json:
311
+ logger.debug("Added to translated_json: start=%s, original=%s, translated=%s, end=%s, speaker=%s",
312
+ entry["start"], entry["original"], entry["translated"], entry["end"], entry["speaker"])
313
+
314
+ return translated_json
315
+
316
+ def update_translations(file, edited_table, mode):
317
+ """
318
+ Update the translations based on user edits in the Gradio Dataframe.
319
+ """
320
+ output_video_path = "output_video.mp4"
321
+ logger.debug(f"Editable Table: {edited_table}")
322
+
323
+ if file is None:
324
+ logger.info("No file uploaded. Please upload a video/audio file.")
325
+ return None, [], None, "No file uploaded. Please upload a video/audio file."
326
 
327
+ try:
328
+ start_time = time.time() # Start the timer
329
+
330
+ # Convert the edited_table (list of lists) back to list of dictionaries
331
+ updated_translations = [
332
+ {
333
+ "start": row["start"], # Access by column name
334
+ "original": row["original"],
335
+ "translated": row["translated"],
336
+ "end": row["end"]
337
+ }
338
+ for _, row in edited_table.iterrows()
339
+ ]
340
+
341
+ # Call the function to process the video with updated translations
342
+ add_transcript_voiceover(file.name, updated_translations, output_video_path, mode=="Transcription with Voiceover")
343
+
344
+ # Calculate elapsed time
345
+ elapsed_time = time.time() - start_time
346
+ elapsed_time_display = f"Updates applied successfully in {elapsed_time:.2f} seconds."
347
+
348
+ return output_video_path, elapsed_time_display
349
+
350
+ except Exception as e:
351
+ raise ValueError(f"Error updating translations: {e}")
352
+
353
+ def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_height, font_path):
354
+ try:
355
+ subtitle_width = int(video_width * 0.8)
356
+ subtitle_font_size = int(video_height // 20)
357
+ font = ImageFont.truetype(font_path, subtitle_font_size)
358
+
359
+ dummy_img = Image.new("RGBA", (subtitle_width, 1), (0, 0, 0, 0))
360
+ draw = ImageDraw.Draw(dummy_img)
361
+
362
+ lines = []
363
+ line = ""
364
+ for word in text.split():
365
+ test_line = f"{line} {word}".strip()
366
+ bbox = draw.textbbox((0, 0), test_line, font=font)
367
+ w = bbox[2] - bbox[0]
368
+ if w <= subtitle_width - 10:
369
+ line = test_line
370
+ else:
371
+ lines.append(line)
372
+ line = word
373
+ lines.append(line)
374
+
375
+ line_heights = [draw.textbbox((0, 0), l, font=font)[3] - draw.textbbox((0, 0), l, font=font)[1] for l in lines]
376
+ total_height = sum(line_heights) + (len(lines) - 1) * 5
377
+ img = Image.new("RGBA", (subtitle_width, total_height), (0, 0, 0, 0))
378
+ draw = ImageDraw.Draw(img)
379
+
380
+ y = 0
381
+ for idx, line in enumerate(lines):
382
+ bbox = draw.textbbox((0, 0), line, font=font)
383
+ w = bbox[2] - bbox[0]
384
+ draw.text(((subtitle_width - w) // 2, y), line, font=font, fill="yellow")
385
+ y += line_heights[idx] + 5
386
 
387
+ img_np = np.array(img) # <- ✅ Fix: convert to NumPy
388
+ txt_clip = ImageClip(img_np).set_start(start_time).set_duration(end_time - start_time).set_position("bottom").set_opacity(0.8)
389
+ return txt_clip
390
+ except Exception as e:
391
+ logger.error(f"\u274c Failed to create subtitle clip: {e}")
392
+ return None
393
+
394
+ def process_entry(entry, i, video_width, video_height, add_voiceover, target_language, font_path, speaker_sample_paths=None):
395
+ logger.debug(f"Processing entry {i}: {entry}")
396
+ error_message = None
397
+
398
+ try:
399
+ txt_clip = create_subtitle_clip_pil(entry["translated"], entry["start"], entry["end"], video_width, video_height, font_path)
400
+ except Exception as e:
401
+ error_message = f"❌ Failed to create subtitle clip for entry {i}: {e}"
402
+ logger.error(error_message)
403
+ txt_clip = None
404
+
405
+ audio_segment = None
406
+ if add_voiceover:
407
+ try:
408
+ segment_audio_path = f"segment_{i}_voiceover.wav"
409
+ desired_duration = entry["end"] - entry["start"]
410
+ speaker = entry.get("speaker", "default")
411
+ speaker_wav_path = f"speaker_{speaker}_sample.wav"
412
+
413
+ output_path, status_msg, tts_error = generate_voiceover_clone([entry], desired_duration, target_language, speaker_wav_path, segment_audio_path)
414
+
415
+ if tts_error:
416
+ error_message = error_message + " | " + tts_error if error_message else tts_error
417
+
418
+ if not output_path or not os.path.exists(segment_audio_path):
419
+ raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
420
+
421
+ audio_clip = AudioFileClip(segment_audio_path)
422
+ logger.debug(f"Audio clip duration: {audio_clip.duration}, Desired duration: {desired_duration}")
423
+
424
+ if audio_clip.duration < desired_duration:
425
+ silence_duration = desired_duration - audio_clip.duration
426
+ audio_clip = concatenate_audioclips([audio_clip, silence(duration=silence_duration)])
427
+ logger.info(f"Padded audio with {silence_duration} seconds of silence.")
428
+
429
+ audio_segment = audio_clip.set_start(entry["start"]).set_duration(desired_duration)
430
+
431
+ except Exception as e:
432
+ err = f"❌ Failed to generate audio segment for entry {i}: {e}"
433
+ logger.error(err)
434
+ error_message = error_message + " | " + err if error_message else err
435
+ audio_segment = None
436
+
437
+ return i, txt_clip, audio_segment, error_message
438
 
439
+ def add_transcript_voiceover(video_path, translated_json, output_path, add_voiceover=False, target_language="en", speaker_sample_paths=None):
440
+ video = VideoFileClip(video_path)
441
+ font_path = "./NotoSansSC-Regular.ttf"
442
+
443
+ text_clips = []
444
+ audio_segments = []
445
+ error_messages = []
446
+
447
+ with concurrent.futures.ThreadPoolExecutor() as executor:
448
+ futures = [executor.submit(process_entry, entry, i, video.w, video.h, add_voiceover, target_language, font_path, speaker_sample_paths)
449
+ for i, entry in enumerate(translated_json)]
450
+
451
+ results = []
452
+ for future in concurrent.futures.as_completed(futures):
453
+ try:
454
+ i, txt_clip, audio_segment, error = future.result()
455
+ results.append((i, txt_clip, audio_segment))
456
+ if error:
457
+ error_messages.append(f"[Entry {i}] {error}")
458
+ except Exception as e:
459
+ err = f"❌ Unexpected error in future result: {e}"
460
+ logger.error(err)
461
+ error_messages.append(err)
462
+
463
+ # Sort by entry index to ensure order
464
+ results.sort(key=lambda x: x[0])
465
+ text_clips = [clip for _, clip, _ in results if clip]
466
+ if add_voiceover:
467
+ audio_segments = [segment for _, _, segment in results if segment]
468
+
469
+ final_video = CompositeVideoClip([video] + text_clips)
470
+
471
+ if add_voiceover:
472
+ if audio_segments:
473
+ final_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
474
+ final_video = final_video.set_audio(final_audio)
475
+ else:
476
+ logger.warning("⚠️ No audio segments available. Adding silent fallback.")
477
+ silent_audio = AudioClip(lambda t: 0, duration=video.duration)
478
+ final_video = final_video.set_audio(silent_audio)
479
+
480
+ logger.info(f"Saving the final video to: {output_path}")
481
+ final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
482
+
483
+ logger.info("Video processing completed successfully.")
484
+
485
+ # Optional: return errors
486
+ if error_messages:
487
+ logger.warning("⚠️ Errors encountered during processing:")
488
+ for msg in error_messages:
489
+ logger.warning(msg)
490
+
491
+ return error_messages
492
+
493
+ # Initialize TTS model only once (outside the function)
494
+ tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
495
+
496
+ def generate_voiceover_clone(translated_json, desired_duration, target_language, speaker_wav_path, output_audio_path):
497
+ try:
498
+ full_text = " ".join(entry["translated"] for entry in translated_json if "translated" in entry and entry["translated"].strip())
499
+ if not full_text.strip():
500
+ msg = "❌ Translated text is empty."
501
+ logger.error(msg)
502
+ return None, msg, msg
503
+
504
+ if not speaker_wav_path or not os.path.exists(speaker_wav_path):
505
+ msg = f"❌ Speaker audio not found: {speaker_wav_path}"
506
+ logger.error(msg)
507
+ return None, msg, msg
508
+
509
+ # # Truncate text based on max token assumption (~60 tokens)
510
+ # MAX_TTS_TOKENS = 60
511
+ # tokens = full_text.split() # crude token count
512
+ # if len(tokens) > MAX_TTS_TOKENS:
513
+ # logger.warning(f"⚠️ Text too long for TTS model ({len(tokens)} tokens). Truncating to {MAX_TTS_TOKENS} tokens.")
514
+ # full_text = " ".join(tokens[:MAX_TTS_TOKENS])
515
+
516
+ speed_tts = calibrated_speed(full_text, desired_duration)
517
+ tts.tts_to_file(
518
+ text=full_text,
519
+ speaker_wav=speaker_wav_path,
520
+ language=target_language,
521
+ file_path=output_audio_path,
522
+ speed=speed_tts,
523
+ split_sentences=True
524
+ )
525
+
526
+ if not os.path.exists(output_audio_path):
527
+ msg = f"❌ Voiceover file not generated at: {output_audio_path}"
528
+ logger.error(msg)
529
+ return None, msg, msg
530
+
531
+ msg = "✅ Voice cloning completed successfully."
532
+ logger.info(msg)
533
+ return output_audio_path, msg, None
534
+
535
+ except Exception as e:
536
+ err_msg = f"❌ An error occurred: {str(e)}"
537
+ logger.error("❌ Error during voice cloning:")
538
+ logger.error(traceback.format_exc())
539
+ return None, err_msg, err_msg
540
+
541
+ def calibrated_speed(text, desired_duration):
542
+ """
543
+ Compute a speed factor to help TTS fit audio into desired duration,
544
+ using a simple truncated linear function of characters per second.
545
+ """
546
+ char_count = len(text.strip())
547
+ if char_count == 0 or desired_duration <= 0:
548
+ return 1.0 # fallback
549
+
550
+ cps = char_count / desired_duration # characters per second
551
+
552
+ # Truncated linear mapping
553
+ if cps < 10:
554
+ return 1.0
555
+ elif cps > 25:
556
+ return 1.4
557
+ else:
558
+ # Linearly scale between cps 10 -> 25 and speed 1.0 -> 1.4
559
+ slope = (1.4 - 1.0) / (25 - 10)
560
+ return 1.0 + slope * (cps - 10)
561
+
562
+
563
+ def upload_and_manage(file, target_language, mode="transcription"):
564
+ if file is None:
565
+ logger.info("No file uploaded. Please upload a video/audio file.")
566
+ return None, [], None, "No file uploaded. Please upload a video/audio file."
567
+
568
+ try:
569
+ start_time = time.time() # Start the timer
570
+ logger.info(f"Started processing file: {file.name}")
571
+
572
+ # Define paths for audio and output files
573
+ audio_path = "audio.wav"
574
+ output_video_path = "output_video.mp4"
575
+ voiceover_path = "voiceover.wav"
576
+ logger.info(f"Using audio path: {audio_path}, output video path: {output_video_path}, voiceover path: {voiceover_path}")
577
+
578
+ # Step 1: Transcribe audio from uploaded media file and get timestamps
579
+ logger.info("Transcribing audio...")
580
+ transcription_json, source_language = transcribe_video_with_speakers(file.name)
581
+ logger.info(f"Transcription completed. Detected source language: {source_language}")
582
+
583
+ # Step 2: Translate the transcription
584
+ logger.info(f"Translating transcription from {source_language} to {target_language}...")
585
+ translated_json = translate_text(transcription_json, source_language, target_language)
586
+ logger.info(f"Translation completed. Number of translated segments: {len(translated_json)}")
587
+
588
+ # Step 3: Add transcript to video based on timestamps
589
+ logger.info("Adding translated transcript to video...")
590
+ add_transcript_voiceover(file.name, translated_json, output_video_path, mode == "Transcription with Voiceover", target_language)
591
+ logger.info(f"Transcript added to video. Output video saved at {output_video_path}")
592
+
593
+ # Convert translated JSON into a format for the editable table
594
+ logger.info("Converting translated JSON into editable table format...")
595
+ editable_table = [
596
+ [float(entry["start"]), entry["original"], entry["translated"], float(entry["end"]), entry["speaker"]]
597
+ for entry in translated_json
598
+ ]
599
+
600
+ # Calculate elapsed time
601
+ elapsed_time = time.time() - start_time
602
+ elapsed_time_display = f"Processing completed in {elapsed_time:.2f} seconds."
603
+ logger.info(f"Processing completed in {elapsed_time:.2f} seconds.")
604
+
605
+ return translated_json, editable_table, output_video_path, elapsed_time_display
606
+
607
+ except Exception as e:
608
+ logger.error(f"An error occurred: {str(e)}")
609
+ return None, [], None, f"An error occurred: {str(e)}"
610
+
611
+ # Gradio Interface with Tabs
612
+ def build_interface():
613
+ with gr.Blocks(css=css) as demo:
614
+ gr.Markdown("## Video Localization")
615
+ with gr.Row():
616
+ with gr.Column(scale=4):
617
+ file_input = gr.File(label="Upload Video/Audio File")
618
+ language_input = gr.Dropdown(["en", "es", "fr", "zh"], label="Select Language") # Language codes
619
+ process_mode = gr.Radio(choices=["Transcription", "Transcription with Voiceover"], label="Choose Processing Type", value="Transcription")
620
+ submit_button = gr.Button("Post and Process")
621
+ editable_translations = gr.State(value=[])
622
+
623
+ with gr.Column(scale=8):
624
+ gr.Markdown("## Edit Translations")
625
 
626
+ # Editable JSON Data
627
+ editable_table = gr.Dataframe(
628
+ value=[], # Default to an empty list to avoid undefined values
629
+ headers=["start", "original", "translated", "end", "speaker"],
630
+ datatype=["number", "str", "str", "number", "str"],
631
+ row_count=1, # Initially empty
632
+ col_count=5,
633
+ interactive=[False, True, True, False, False], # Control editability
634
+ label="Edit Translations",
635
+ wrap=True # Enables text wrapping if supported
636
+ )
637
+ save_changes_button = gr.Button("Save Changes")
638
+ processed_video_output = gr.File(label="Download Processed Video", interactive=True) # Download button
639
+ elapsed_time_display = gr.Textbox(label="Elapsed Time", lines=1, interactive=False)
640
+
641
+ with gr.Column(scale=1):
642
+ gr.Markdown("**Feedback**")
643
+ feedback_input = gr.Textbox(
644
+ placeholder="Leave your feedback here...",
645
+ label=None,
646
+ lines=3,
647
+ )
648
+ feedback_btn = gr.Button("Submit Feedback")
649
+ response_message = gr.Textbox(label=None, lines=1, interactive=False)
650
+ db_download = gr.File(label="Download Database File", visible=False)
651
 
652
+ # Link the feedback handling
653
+ def feedback_submission(feedback):
654
+ message, file_path = handle_feedback(feedback)
655
+ if file_path:
656
+ return message, gr.update(value=file_path, visible=True)
657
+ return message, gr.update(visible=False)
658
+
659
+ save_changes_button.click(
660
+ update_translations,
661
+ inputs=[file_input, editable_table, process_mode],
662
+ outputs=[processed_video_output, elapsed_time_display]
663
+ )
664
+
665
+ submit_button.click(
666
+ upload_and_manage,
667
+ inputs=[file_input, language_input, process_mode],
668
+ outputs=[editable_translations, editable_table, processed_video_output, elapsed_time_display]
669
+ )
670
+
671
+ # Connect submit button to save_feedback_db function
672
+ feedback_btn.click(
673
+ feedback_submission,
674
+ inputs=[feedback_input],
675
+ outputs=[response_message, db_download]
676
+ )
677
+
678
+ return demo
679
+
680
+ # Launch the Gradio interface
681
+ demo = build_interface()
 
682
  demo.launch()