qqwjq1981 committed
Commit 39837b9 · verified · 1 parent: 9eecb1f

Update app.py

Files changed (1): app.py +617 -602
app.py CHANGED
@@ -61,620 +61,635 @@ client = OpenAI(
  )
  hf_api_key = os.environ.get("hf_token")

- def silence(duration, fps=44100):
65
- """
66
- Returns a silent AudioClip of the specified duration.
67
- """
68
- return AudioArrayClip(np.zeros((int(fps*duration), 2)), fps=fps)
69
-
70
- def count_words_or_characters(text):
71
- # Count non-Chinese words
72
- non_chinese_words = len(re.findall(r'\b[a-zA-Z0-9]+\b', text))
 
73
 
74
- # Count Chinese characters
75
- chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
76
 
77
- return non_chinese_words + chinese_chars
78
 
79
- # Define the passcode
80
- PASSCODE = "show_feedback_db"
81
-
82
- css = """
83
- /* Adjust row height */
84
- .dataframe-container tr {
85
- height: 50px !important;
86
- }
87
-
88
- /* Ensure text wrapping and prevent overflow */
89
- .dataframe-container td {
90
- white-space: normal !important;
91
- word-break: break-word !important;
92
- }
93
-
94
- /* Set column widths */
95
- [data-testid="block-container"] .scrolling-dataframe th:nth-child(1),
96
- [data-testid="block-container"] .scrolling-dataframe td:nth-child(1) {
97
- width: 6%; /* Start column */
98
- }
99
-
100
- [data-testid="block-container"] .scrolling-dataframe th:nth-child(2),
101
- [data-testid="block-container"] .scrolling-dataframe td:nth-child(2) {
102
- width: 47%; /* Original text */
103
- }
104
-
105
- [data-testid="block-container"] .scrolling-dataframe th:nth-child(3),
106
- [data-testid="block-container"] .scrolling-dataframe td:nth-child(3) {
107
- width: 47%; /* Translated text */
108
- }
109
-
110
- [data-testid="block-container"] .scrolling-dataframe th:nth-child(4),
111
- [data-testid="block-container"] .scrolling-dataframe td:nth-child(4) {
112
- display: none !important;
113
- }
114
- """
115
-
116
- # Function to save feedback or provide access to the database file
117
- def handle_feedback(feedback):
118
- feedback = feedback.strip() # Clean up leading/trailing whitespace
119
- if not feedback:
120
- return "Feedback cannot be empty.", None
121
-
122
- if feedback == PASSCODE:
123
- # Provide access to the feedback.db file
124
- return "Access granted! Download the database file below.", "feedback.db"
125
- else:
126
- # Save feedback to the database
127
- with sqlite3.connect("feedback.db") as conn:
128
- cursor = conn.cursor()
129
- cursor.execute("CREATE TABLE IF NOT EXISTS studio_feedback (id INTEGER PRIMARY KEY, comment TEXT)")
130
- cursor.execute("INSERT INTO studio_feedback (comment) VALUES (?)", (feedback,))
131
- conn.commit()
132
- return "Thank you for your feedback!", None
133
-
134
- # Configure logging
135
- logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
136
- logger = logging.getLogger(__name__)
137
- logger.info(f"MoviePy Version: {moviepy.__version__}")
138
-
139
- # def segment_background_audio(audio_path, output_path="background_segments.wav"):
140
- # # Step 2: Initialize pyannote voice activity detection pipeline (you need Hugging Face token)
141
- # pipeline = Pipeline.from_pretrained(
142
- # "pyannote/voice-activity-detection",
143
- # use_auth_token=hf_api_key
144
- # )
145
- # # Step 3: Run VAD to get speech segments
146
- # vad_result = pipeline(audio_path)
147
- # print(f"Detected speech segments: {vad_result}")
148
-
149
- # # Step 4: Load full audio and subtract speech segments
150
- # full_audio = AudioSegment.from_wav(audio_path)
151
- # background_audio = AudioSegment.silent(duration=len(full_audio))
152
-
153
- # for segment in vad_result.itersegments():
154
- # start_ms = int(segment.start * 1000)
155
- # end_ms = int(segment.end * 1000)
156
- # # Remove speech by muting that portion
157
- # background_audio = background_audio.overlay(AudioSegment.silent(duration=end_ms - start_ms), position=start_ms)
158
-
159
- # # Step 5: Subtract background_audio from full_audio
160
- # result_audio = full_audio.overlay(background_audio)
161
-
162
- # # Step 6: Export non-speech segments
163
- # result_audio.export(output_path, format="wav")
164
- # print(f"Saved non-speech (background) audio to: {output_path}")
165
-
166
- # return True
167
-
168
- def transcribe_video_with_speakers(video_path):
169
- # Extract audio from video
170
- video = VideoFileClip(video_path)
171
- audio_path = "audio.wav"
172
- video.audio.write_audiofile(audio_path)
173
- logger.info(f"Audio extracted from video: {audio_path}")
174
-
175
- # segment_result = segment_background_audio(audio_path)
176
- # print(f"Saved non-speech (background) audio to local")
177
 
178
- # Set up device
179
- device = "cuda" if torch.cuda.is_available() else "cpu"
180
- logger.info(f"Using device: {device}")
181
 
182
- try:
183
- # Load a medium model with float32 for broader compatibility
184
- model = whisperx.load_model("medium", device=device, compute_type="float32")
185
- logger.info("WhisperX model loaded")
186
 
187
- # Transcribe
188
- result = model.transcribe(audio_path, chunk_size=5, print_progress = True)
189
- logger.info("Audio transcription completed")
190
-
191
- # Get the detected language
192
- detected_language = result["language"]
193
- logger.debug(f"Detected language: {detected_language}")
194
- # Alignment
195
- model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
196
- result = whisperx.align(result["segments"], model_a, metadata, audio_path, device)
197
- logger.info("Transcription alignment completed")
198
 
199
- # Diarization (works independently of Whisper model size)
200
- diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
201
- diarize_segments = diarize_model(audio_path)
202
- logger.info("Speaker diarization completed")
203
 
204
- # Assign speakers
205
- result = whisperx.assign_word_speakers(diarize_segments, result)
206
- logger.info("Speakers assigned to transcribed segments")
207
 
208
- except Exception as e:
209
- logger.error(f"❌ WhisperX pipeline failed: {e}")
210
-
211
- # Extract timestamps, text, and speaker IDs
212
- transcript_with_speakers = [
213
- {
214
- "start": segment["start"],
215
- "end": segment["end"],
216
- "text": segment["text"],
217
- "speaker": segment["speaker"]
218
- }
219
- for segment in result["segments"]
220
- ]
221
-
222
- # Collect audio for each speaker
223
- speaker_audio = {}
224
- for segment in result["segments"]:
225
- speaker = segment["speaker"]
226
- if speaker not in speaker_audio:
227
- speaker_audio[speaker] = []
228
- speaker_audio[speaker].append((segment["start"], segment["end"]))
229
-
230
- # Collapse and truncate speaker audio
231
- speaker_sample_paths = {}
232
- audio_clip = AudioFileClip(audio_path)
233
- for speaker, segments in speaker_audio.items():
234
- speaker_clips = [audio_clip.subclip(start, end) for start, end in segments]
235
- combined_clip = concatenate_audioclips(speaker_clips)
236
- truncated_clip = combined_clip.subclip(0, min(30, combined_clip.duration))
237
- sample_path = f"speaker_{speaker}_sample.wav"
238
- truncated_clip.write_audiofile(sample_path)
239
- speaker_sample_paths[speaker] = sample_path
240
- logger.info(f"Created sample for {speaker}: {sample_path}")
241
-
242
- # Clean up
243
- video.close()
244
- audio_clip.close()
245
- os.remove(audio_path)
246
-
247
- return transcript_with_speakers, detected_language
248
-
249
- # Function to get the appropriate translation model based on target language
250
- def get_translation_model(source_language, target_language):
251
- """
252
- Get the translation model based on the source and target language.
253
-
254
- Parameters:
255
- - target_language (str): The language to translate the content into (e.g., 'es', 'fr').
256
- - source_language (str): The language of the input content (default is 'en' for English).
257
 
258
- Returns:
259
- - str: The translation model identifier.
260
- """
261
- # List of allowable languages
262
- allowable_languages = ["en", "es", "fr", "zh", "de", "it", "pt", "ja", "ko", "ru"]
263
-
264
- # Validate source and target languages
265
- if source_language not in allowable_languages:
266
- logger.debug(f"Invalid source language '{source_language}'. Supported languages are: {', '.join(allowable_languages)}")
267
- # Return a default model if source language is invalid
268
- source_language = "en" # Default to 'en'
269
-
270
- if target_language not in allowable_languages:
271
- logger.debug(f"Invalid target language '{target_language}'. Supported languages are: {', '.join(allowable_languages)}")
272
- # Return a default model if target language is invalid
273
- target_language = "zh" # Default to 'zh'
274
-
275
- if source_language == target_language:
276
- source_language = "en" # Default to 'en'
277
- target_language = "zh" # Default to 'zh'
278
-
279
- # Return the model using string concatenation
280
- return f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"
281
-
282
- def translate_single_entry(entry, translator):
283
- original_text = entry["text"]
284
- translated_text = translator(original_text)[0]['translation_text']
285
- return {
286
- "start": entry["start"],
287
- "original": original_text,
288
- "translated": translated_text,
289
- "end": entry["end"],
290
- "speaker": entry["speaker"]
291
- }
292
-
293
- def translate_text(transcription_json, source_language, target_language):
294
- # Load the translation model for the specified target language
295
- translation_model_id = get_translation_model(source_language, target_language)
296
- logger.debug(f"Translation model: {translation_model_id}")
297
- translator = pipeline("translation", model=translation_model_id)
298
-
299
- # Use ThreadPoolExecutor to parallelize translations
300
- with concurrent.futures.ThreadPoolExecutor() as executor:
301
- # Submit all translation tasks and collect results
302
- translate_func = lambda entry: translate_single_entry(entry, translator)
303
- translated_json = list(executor.map(translate_func, transcription_json))
304
-
305
- # Sort the translated_json by start time
306
- translated_json.sort(key=lambda x: x["start"])
307
-
308
- # Log the components being added to translated_json
309
- for entry in translated_json:
310
- logger.debug("Added to translated_json: start=%s, original=%s, translated=%s, end=%s, speaker=%s",
311
- entry["start"], entry["original"], entry["translated"], entry["end"], entry["speaker"])
312
-
313
- return translated_json
314
-
315
- def update_translations(file, edited_table, mode):
316
- """
317
- Update the translations based on user edits in the Gradio Dataframe.
318
- """
319
- output_video_path = "output_video.mp4"
320
- logger.debug(f"Editable Table: {edited_table}")
321
-
322
- if file is None:
323
- logger.info("No file uploaded. Please upload a video/audio file.")
324
- return None, [], None, "No file uploaded. Please upload a video/audio file."
325
 
326
- try:
327
- start_time = time.time() # Start the timer
328
-
329
- # Convert the edited_table (list of lists) back to list of dictionaries
330
- updated_translations = [
331
- {
332
- "start": row["start"], # Access by column name
333
- "original": row["original"],
334
- "translated": row["translated"],
335
- "end": row["end"]
336
- }
337
- for _, row in edited_table.iterrows()
338
- ]
339
-
340
- # Call the function to process the video with updated translations
341
- add_transcript_voiceover(file.name, updated_translations, output_video_path, mode=="Transcription with Voiceover")
342
-
343
- # Calculate elapsed time
344
- elapsed_time = time.time() - start_time
345
- elapsed_time_display = f"Updates applied successfully in {elapsed_time:.2f} seconds."
346
-
347
- return output_video_path, elapsed_time_display
348
-
349
- except Exception as e:
350
- raise ValueError(f"Error updating translations: {e}")
351
-
352
- def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_height, font_path):
353
- try:
354
- subtitle_width = int(video_width * 0.8)
355
- subtitle_font_size = int(video_height // 20)
356
- font = ImageFont.truetype(font_path, subtitle_font_size)
357
-
358
- dummy_img = Image.new("RGBA", (subtitle_width, 1), (0, 0, 0, 0))
359
- draw = ImageDraw.Draw(dummy_img)
360
-
361
- lines = []
362
- line = ""
363
- for word in text.split():
364
- test_line = f"{line} {word}".strip()
365
- bbox = draw.textbbox((0, 0), test_line, font=font)
366
- w = bbox[2] - bbox[0]
367
- if w <= subtitle_width - 10:
368
- line = test_line
369
- else:
370
- lines.append(line)
371
- line = word
372
- lines.append(line)
373
-
374
- line_heights = [draw.textbbox((0, 0), l, font=font)[3] - draw.textbbox((0, 0), l, font=font)[1] for l in lines]
375
- total_height = sum(line_heights) + (len(lines) - 1) * 5
376
- img = Image.new("RGBA", (subtitle_width, total_height), (0, 0, 0, 0))
377
- draw = ImageDraw.Draw(img)
378
-
379
- y = 0
380
- for idx, line in enumerate(lines):
381
- bbox = draw.textbbox((0, 0), line, font=font)
382
- w = bbox[2] - bbox[0]
383
- draw.text(((subtitle_width - w) // 2, y), line, font=font, fill="yellow")
384
- y += line_heights[idx] + 5
385
 
386
- img_np = np.array(img) # <- ✅ Fix: convert to NumPy
387
- txt_clip = ImageClip(img_np).set_start(start_time).set_duration(end_time - start_time).set_position("bottom").set_opacity(0.8)
388
- return txt_clip
389
- except Exception as e:
390
- logger.error(f"\u274c Failed to create subtitle clip: {e}")
391
- return None
392
-
393
- def process_entry(entry, i, video_width, video_height, add_voiceover, target_language, font_path, speaker_sample_paths=None):
394
- logger.debug(f"Processing entry {i}: {entry}")
395
- error_message = None
396
-
397
- try:
398
- txt_clip = create_subtitle_clip_pil(entry["translated"], entry["start"], entry["end"], video_width, video_height, font_path)
399
- except Exception as e:
400
- error_message = f"❌ Failed to create subtitle clip for entry {i}: {e}"
401
- logger.error(error_message)
402
- txt_clip = None
403
-
404
- audio_segment = None
405
- if add_voiceover:
406
- try:
407
- segment_audio_path = f"segment_{i}_voiceover.wav"
408
- desired_duration = entry["end"] - entry["start"]
409
- speaker = entry.get("speaker", "default")
410
- speaker_wav_path = f"speaker_{speaker}_sample.wav"
411
-
412
- output_path, status_msg, tts_error = generate_voiceover_clone([entry], desired_duration, target_language, speaker_wav_path, segment_audio_path)
413
-
414
- if tts_error:
415
- error_message = error_message + " | " + tts_error if error_message else tts_error
416
-
417
- if not output_path or not os.path.exists(segment_audio_path):
418
- raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
419
-
420
- audio_clip = AudioFileClip(segment_audio_path)
421
- logger.debug(f"Audio clip duration: {audio_clip.duration}, Desired duration: {desired_duration}")
422
-
423
- if audio_clip.duration < desired_duration:
424
- silence_duration = desired_duration - audio_clip.duration
425
- audio_clip = concatenate_audioclips([audio_clip, silence(duration=silence_duration)])
426
- logger.info(f"Padded audio with {silence_duration} seconds of silence.")
427
-
428
- audio_segment = audio_clip.set_start(entry["start"]).set_duration(desired_duration)
429
-
430
- except Exception as e:
431
- err = f"❌ Failed to generate audio segment for entry {i}: {e}"
432
- logger.error(err)
433
- error_message = error_message + " | " + err if error_message else err
434
- audio_segment = None
435
-
436
- return i, txt_clip, audio_segment, error_message
437
 
438
- def add_transcript_voiceover(video_path, translated_json, output_path, add_voiceover=False, target_language="en", speaker_sample_paths=None):
439
- video = VideoFileClip(video_path)
440
- font_path = "./NotoSansSC-Regular.ttf"
441
-
442
- text_clips = []
443
- audio_segments = []
444
- error_messages = []
445
-
446
- with concurrent.futures.ThreadPoolExecutor() as executor:
447
- futures = [executor.submit(process_entry, entry, i, video.w, video.h, add_voiceover, target_language, font_path, speaker_sample_paths)
448
- for i, entry in enumerate(translated_json)]
449
-
450
- results = []
451
- for future in concurrent.futures.as_completed(futures):
452
- try:
453
- i, txt_clip, audio_segment, error = future.result()
454
- results.append((i, txt_clip, audio_segment))
455
- if error:
456
- error_messages.append(f"[Entry {i}] {error}")
457
- except Exception as e:
458
- err = f"❌ Unexpected error in future result: {e}"
459
- logger.error(err)
460
- error_messages.append(err)
461
-
462
- # Sort by entry index to ensure order
463
- results.sort(key=lambda x: x[0])
464
- text_clips = [clip for _, clip, _ in results if clip]
465
- if add_voiceover:
466
- audio_segments = [segment for _, _, segment in results if segment]
467
-
468
- final_video = CompositeVideoClip([video] + text_clips)
469
-
470
- if add_voiceover:
471
- if audio_segments:
472
- final_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
473
- final_video = final_video.set_audio(final_audio)
474
- else:
475
- logger.warning("⚠️ No audio segments available. Adding silent fallback.")
476
- silent_audio = AudioClip(lambda t: 0, duration=video.duration)
477
- final_video = final_video.set_audio(silent_audio)
478
-
479
- logger.info(f"Saving the final video to: {output_path}")
480
- final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
481
-
482
- logger.info("Video processing completed successfully.")
483
-
484
- # Optional: return errors
485
- if error_messages:
486
- logger.warning("⚠️ Errors encountered during processing:")
487
- for msg in error_messages:
488
- logger.warning(msg)
489
-
490
- return error_messages
491
-
492
- # Initialize TTS model only once (outside the function)
493
- tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
494
-
495
- def generate_voiceover_clone(translated_json, desired_duration, target_language, speaker_wav_path, output_audio_path):
496
- try:
497
- full_text = " ".join(entry["translated"] for entry in translated_json if "translated" in entry and entry["translated"].strip())
498
- if not full_text.strip():
499
- msg = "❌ Translated text is empty."
500
- logger.error(msg)
501
- return None, msg, msg
502
-
503
- if not speaker_wav_path or not os.path.exists(speaker_wav_path):
504
- msg = f"❌ Speaker audio not found: {speaker_wav_path}"
505
- logger.error(msg)
506
- return None, msg, msg
507
-
508
- # # Truncate text based on max token assumption (~60 tokens)
509
- # MAX_TTS_TOKENS = 60
510
- # tokens = full_text.split() # crude token count
511
- # if len(tokens) > MAX_TTS_TOKENS:
512
- # logger.warning(f"⚠️ Text too long for TTS model ({len(tokens)} tokens). Truncating to {MAX_TTS_TOKENS} tokens.")
513
- # full_text = " ".join(tokens[:MAX_TTS_TOKENS])
514
-
515
- speed_tts = calibrated_speed(full_text, desired_duration)
516
- tts.tts_to_file(
517
- text=full_text,
518
- speaker_wav=speaker_wav_path,
519
- language=target_language,
520
- file_path=output_audio_path,
521
- speed=speed_tts,
522
- split_sentences=True
523
- )
524
-
525
- if not os.path.exists(output_audio_path):
526
- msg = f"❌ Voiceover file not generated at: {output_audio_path}"
527
- logger.error(msg)
528
- return None, msg, msg
529
-
530
- msg = "✅ Voice cloning completed successfully."
531
- logger.info(msg)
532
- return output_audio_path, msg, None
533
-
534
- except Exception as e:
535
- err_msg = f"❌ An error occurred: {str(e)}"
536
- logger.error("❌ Error during voice cloning:")
537
- logger.error(traceback.format_exc())
538
- return None, err_msg, err_msg
539
-
540
- def calibrated_speed(text, desired_duration):
541
- """
542
- Compute a speed factor to help TTS fit audio into desired duration,
543
- using a simple truncated linear function of characters per second.
544
- """
545
- char_count = len(text.strip())
546
- if char_count == 0 or desired_duration <= 0:
547
- return 1.0 # fallback
548
-
549
- cps = char_count / desired_duration # characters per second
550
-
551
- # Truncated linear mapping
552
- if cps < 10:
553
- return 1.0
554
- elif cps > 25:
555
- return 1.4
556
- else:
557
- # Linearly scale between cps 10 -> 25 and speed 1.0 -> 1.4
558
- slope = (1.4 - 1.0) / (25 - 10)
559
- return 1.0 + slope * (cps - 10)
560
-
561
-
562
- def upload_and_manage(file, target_language, mode="transcription"):
563
- if file is None:
564
- logger.info("No file uploaded. Please upload a video/audio file.")
565
- return None, [], None, "No file uploaded. Please upload a video/audio file."
566
-
567
- try:
568
- start_time = time.time() # Start the timer
569
- logger.info(f"Started processing file: {file.name}")
570
-
571
- # Define paths for audio and output files
572
- audio_path = "audio.wav"
573
- output_video_path = "output_video.mp4"
574
- voiceover_path = "voiceover.wav"
575
- logger.info(f"Using audio path: {audio_path}, output video path: {output_video_path}, voiceover path: {voiceover_path}")
576
-
577
- # Step 1: Transcribe audio from uploaded media file and get timestamps
578
- logger.info("Transcribing audio...")
579
- transcription_json, source_language = transcribe_video_with_speakers(file.name)
580
- logger.info(f"Transcription completed. Detected source language: {source_language}")
581
-
582
- # Step 2: Translate the transcription
583
- logger.info(f"Translating transcription from {source_language} to {target_language}...")
584
- translated_json = translate_text(transcription_json, source_language, target_language)
585
- logger.info(f"Translation completed. Number of translated segments: {len(translated_json)}")
586
-
587
- # Step 3: Add transcript to video based on timestamps
588
- logger.info("Adding translated transcript to video...")
589
- add_transcript_voiceover(file.name, translated_json, output_video_path, mode == "Transcription with Voiceover", target_language)
590
- logger.info(f"Transcript added to video. Output video saved at {output_video_path}")
591
-
592
- # Convert translated JSON into a format for the editable table
593
- logger.info("Converting translated JSON into editable table format...")
594
- editable_table = [
595
- [float(entry["start"]), entry["original"], entry["translated"], float(entry["end"]), entry["speaker"]]
596
- for entry in translated_json
597
- ]
598
-
599
- # Calculate elapsed time
600
- elapsed_time = time.time() - start_time
601
- elapsed_time_display = f"Processing completed in {elapsed_time:.2f} seconds."
602
- logger.info(f"Processing completed in {elapsed_time:.2f} seconds.")
603
-
604
- return translated_json, editable_table, output_video_path, elapsed_time_display
605
-
606
- except Exception as e:
607
- logger.error(f"An error occurred: {str(e)}")
608
- return None, [], None, f"An error occurred: {str(e)}"
609
- # Gradio Interface with Tabs
610
- def build_interface():
611
- with gr.Blocks(css=css) as demo:
612
- gr.Markdown("## Video Localization")
613
- with gr.Row():
614
- with gr.Column(scale=4):
615
- file_input = gr.File(label="Upload Video/Audio File")
616
- language_input = gr.Dropdown(["en", "es", "fr", "zh"], label="Select Language") # Language codes
617
- process_mode = gr.Radio(choices=["Transcription", "Transcription with Voiceover"], label="Choose Processing Type", value="Transcription")
618
- submit_button = gr.Button("Post and Process")
619
- editable_translations = gr.State(value=[])
620
-
621
- with gr.Column(scale=8):
622
- gr.Markdown("## Edit Translations")
623
 
624
- # Editable JSON Data
625
- editable_table = gr.Dataframe(
626
- value=[], # Default to an empty list to avoid undefined values
627
- headers=["start", "original", "translated", "end", "speaker"],
628
- datatype=["number", "str", "str", "number", "str"],
629
- row_count=1, # Initially empty
630
- col_count=5,
631
- interactive=[False, True, True, False, False], # Control editability
632
- label="Edit Translations",
633
- wrap=True # Enables text wrapping if supported
634
- )
635
- save_changes_button = gr.Button("Save Changes")
636
- processed_video_output = gr.File(label="Download Processed Video", interactive=True) # Download button
637
- elapsed_time_display = gr.Textbox(label="Elapsed Time", lines=1, interactive=False)
638
-
639
- with gr.Column(scale=1):
640
- gr.Markdown("**Feedback**")
641
- feedback_input = gr.Textbox(
642
- placeholder="Leave your feedback here...",
643
- label=None,
644
- lines=3,
645
- )
646
- feedback_btn = gr.Button("Submit Feedback")
647
- response_message = gr.Textbox(label=None, lines=1, interactive=False)
648
- db_download = gr.File(label="Download Database File", visible=False)
649
 
650
- # Link the feedback handling
651
- def feedback_submission(feedback):
652
- message, file_path = handle_feedback(feedback)
653
- if file_path:
654
- return message, gr.update(value=file_path, visible=True)
655
- return message, gr.update(visible=False)
656
-
657
- save_changes_button.click(
658
- update_translations,
659
- inputs=[file_input, editable_table, process_mode],
660
- outputs=[processed_video_output, elapsed_time_display]
661
- )
662
-
663
- submit_button.click(
664
- upload_and_manage,
665
- inputs=[file_input, language_input, process_mode],
666
- outputs=[editable_translations, editable_table, processed_video_output, elapsed_time_display]
667
- )
668
-
669
- # Connect submit button to save_feedback_db function
670
- feedback_btn.click(
671
- feedback_submission,
672
- inputs=[feedback_input],
673
- outputs=[response_message, db_download]
674
- )
675
-
676
- return demo
677
-
678
- # Launch the Gradio interface
679
- demo = build_interface()
680
- demo.launch()
+
65
+ # def silence(duration, fps=44100):
66
+ # """
67
+ # Returns a silent AudioClip of the specified duration.
68
+ # """
69
+ # return AudioArrayClip(np.zeros((int(fps*duration), 2)), fps=fps)
70
+
71
+ # def count_words_or_characters(text):
72
+ # # Count non-Chinese words
73
+ # non_chinese_words = len(re.findall(r'\b[a-zA-Z0-9]+\b', text))
74
 
75
+ # # Count Chinese characters
76
+ # chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
77
 
78
+ # return non_chinese_words + chinese_chars
79
 
80
+ # # Define the passcode
81
+ # PASSCODE = "show_feedback_db"
82
+
83
+ # css = """
84
+ # /* Adjust row height */
85
+ # .dataframe-container tr {
86
+ # height: 50px !important;
87
+ # }
88
+
89
+ # /* Ensure text wrapping and prevent overflow */
90
+ # .dataframe-container td {
91
+ # white-space: normal !important;
92
+ # word-break: break-word !important;
93
+ # }
94
+
95
+ # /* Set column widths */
96
+ # [data-testid="block-container"] .scrolling-dataframe th:nth-child(1),
97
+ # [data-testid="block-container"] .scrolling-dataframe td:nth-child(1) {
98
+ # width: 6%; /* Start column */
99
+ # }
100
+
101
+ # [data-testid="block-container"] .scrolling-dataframe th:nth-child(2),
102
+ # [data-testid="block-container"] .scrolling-dataframe td:nth-child(2) {
103
+ # width: 47%; /* Original text */
104
+ # }
105
+
106
+ # [data-testid="block-container"] .scrolling-dataframe th:nth-child(3),
107
+ # [data-testid="block-container"] .scrolling-dataframe td:nth-child(3) {
108
+ # width: 47%; /* Translated text */
109
+ # }
110
+
111
+ # [data-testid="block-container"] .scrolling-dataframe th:nth-child(4),
112
+ # [data-testid="block-container"] .scrolling-dataframe td:nth-child(4) {
113
+ # display: none !important;
114
+ # }
115
+ # """
116
+
117
+ # # Function to save feedback or provide access to the database file
118
+ # def handle_feedback(feedback):
119
+ # feedback = feedback.strip() # Clean up leading/trailing whitespace
120
+ # if not feedback:
121
+ # return "Feedback cannot be empty.", None
122
+
123
+ # if feedback == PASSCODE:
124
+ # # Provide access to the feedback.db file
125
+ # return "Access granted! Download the database file below.", "feedback.db"
126
+ # else:
127
+ # # Save feedback to the database
128
+ # with sqlite3.connect("feedback.db") as conn:
129
+ # cursor = conn.cursor()
130
+ # cursor.execute("CREATE TABLE IF NOT EXISTS studio_feedback (id INTEGER PRIMARY KEY, comment TEXT)")
131
+ # cursor.execute("INSERT INTO studio_feedback (comment) VALUES (?)", (feedback,))
132
+ # conn.commit()
133
+ # return "Thank you for your feedback!", None
134
+
135
+ # # Configure logging
136
+ # logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
137
+ # logger = logging.getLogger(__name__)
138
+ # logger.info(f"MoviePy Version: {moviepy.__version__}")
139
+
140
+ # # def segment_background_audio(audio_path, output_path="background_segments.wav"):
141
+ # # # Step 2: Initialize pyannote voice activity detection pipeline (you need Hugging Face token)
142
+ # # pipeline = Pipeline.from_pretrained(
143
+ # # "pyannote/voice-activity-detection",
144
+ # # use_auth_token=hf_api_key
145
+ # # )
146
+ # # # Step 3: Run VAD to get speech segments
147
+ # # vad_result = pipeline(audio_path)
148
+ # # print(f"Detected speech segments: {vad_result}")
149
+
150
+ # # # Step 4: Load full audio and subtract speech segments
151
+ # # full_audio = AudioSegment.from_wav(audio_path)
152
+ # # background_audio = AudioSegment.silent(duration=len(full_audio))
153
+
154
+ # # for segment in vad_result.itersegments():
155
+ # # start_ms = int(segment.start * 1000)
156
+ # # end_ms = int(segment.end * 1000)
157
+ # # # Remove speech by muting that portion
158
+ # # background_audio = background_audio.overlay(AudioSegment.silent(duration=end_ms - start_ms), position=start_ms)
159
+
160
+ # # # Step 5: Subtract background_audio from full_audio
161
+ # # result_audio = full_audio.overlay(background_audio)
162
+
163
+ # # # Step 6: Export non-speech segments
164
+ # # result_audio.export(output_path, format="wav")
165
+ # # print(f"Saved non-speech (background) audio to: {output_path}")
166
+
167
+ # # return True
168
+
169
+ # def transcribe_video_with_speakers(video_path):
170
+ # # Extract audio from video
171
+ # video = VideoFileClip(video_path)
172
+ # audio_path = "audio.wav"
173
+ # video.audio.write_audiofile(audio_path)
174
+ # logger.info(f"Audio extracted from video: {audio_path}")
175
+
176
+ # # segment_result = segment_background_audio(audio_path)
177
+ # # print(f"Saved non-speech (background) audio to local")
178
 
179
+ # # Set up device
180
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
181
+ # logger.info(f"Using device: {device}")
182
 
183
+ # try:
184
+ # # Load a medium model with float32 for broader compatibility
185
+ # model = whisperx.load_model("medium", device=device, compute_type="float32")
186
+ # logger.info("WhisperX model loaded")
187
 
188
+ # # Transcribe
189
+ # result = model.transcribe(audio_path, chunk_size=5, print_progress = True)
190
+ # logger.info("Audio transcription completed")
191
+
192
+ # # Get the detected language
193
+ # detected_language = result["language"]
194
+ # logger.debug(f"Detected language: {detected_language}")
195
+ # # Alignment
196
+ # model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
197
+ # result = whisperx.align(result["segments"], model_a, metadata, audio_path, device)
198
+ # logger.info("Transcription alignment completed")
199
 
200
+ # # Diarization (works independently of Whisper model size)
201
+ # diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
202
+ # diarize_segments = diarize_model(audio_path)
203
+ # logger.info("Speaker diarization completed")
204
 
205
+ # # Assign speakers
206
+ # result = whisperx.assign_word_speakers(diarize_segments, result)
207
+ # logger.info("Speakers assigned to transcribed segments")
208
 
209
+ # except Exception as e:
210
+ # logger.error(f"❌ WhisperX pipeline failed: {e}")
211
+
212
+ # # Extract timestamps, text, and speaker IDs
213
+ # transcript_with_speakers = [
214
+ # {
215
+ # "start": segment["start"],
216
+ # "end": segment["end"],
217
+ # "text": segment["text"],
218
+ # "speaker": segment["speaker"]
219
+ # }
220
+ # for segment in result["segments"]
221
+ # ]
222
+
223
+ # # Collect audio for each speaker
224
+ # speaker_audio = {}
225
+ # for segment in result["segments"]:
226
+ # speaker = segment["speaker"]
227
+ # if speaker not in speaker_audio:
228
+ # speaker_audio[speaker] = []
229
+ # speaker_audio[speaker].append((segment["start"], segment["end"]))
230
+
231
+ # # Collapse and truncate speaker audio
232
+ # speaker_sample_paths = {}
233
+ # audio_clip = AudioFileClip(audio_path)
234
+ # for speaker, segments in speaker_audio.items():
235
+ # speaker_clips = [audio_clip.subclip(start, end) for start, end in segments]
236
+ # combined_clip = concatenate_audioclips(speaker_clips)
237
+ # truncated_clip = combined_clip.subclip(0, min(30, combined_clip.duration))
238
+ # sample_path = f"speaker_{speaker}_sample.wav"
239
+ # truncated_clip.write_audiofile(sample_path)
240
+ # speaker_sample_paths[speaker] = sample_path
241
+ # logger.info(f"Created sample for {speaker}: {sample_path}")
242
+
243
+ # # Clean up
244
+ # video.close()
245
+ # audio_clip.close()
246
+ # os.remove(audio_path)
247
+
248
+ # return transcript_with_speakers, detected_language
249
+
250
+ # # Function to get the appropriate translation model based on target language
251
+ # def get_translation_model(source_language, target_language):
252
+ # """
253
+ # Get the translation model based on the source and target language.
254
+
255
+ # Parameters:
256
+ # - target_language (str): The language to translate the content into (e.g., 'es', 'fr').
257
+ # - source_language (str): The language of the input content (default is 'en' for English).
258
 
259
+ # Returns:
260
+ # - str: The translation model identifier.
261
+ # """
262
+ # # List of allowable languages
263
+ # allowable_languages = ["en", "es", "fr", "zh", "de", "it", "pt", "ja", "ko", "ru"]
264
+
265
+ # # Validate source and target languages
266
+ # if source_language not in allowable_languages:
267
+ # logger.debug(f"Invalid source language '{source_language}'. Supported languages are: {', '.join(allowable_languages)}")
268
+ # # Return a default model if source language is invalid
269
+ # source_language = "en" # Default to 'en'
270
+
271
+ # if target_language not in allowable_languages:
272
+ # logger.debug(f"Invalid target language '{target_language}'. Supported languages are: {', '.join(allowable_languages)}")
273
+ # # Return a default model if target language is invalid
274
+ # target_language = "zh" # Default to 'zh'
275
+
276
+ # if source_language == target_language:
277
+ # source_language = "en" # Default to 'en'
278
+ # target_language = "zh" # Default to 'zh'
279
+
280
+ # # Return the model using string concatenation
281
+ # return f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"
282
+
283
+ # def translate_single_entry(entry, translator):
284
+ # original_text = entry["text"]
285
+ # translated_text = translator(original_text)[0]['translation_text']
286
+ # return {
287
+ # "start": entry["start"],
288
+ # "original": original_text,
289
+ # "translated": translated_text,
290
+ # "end": entry["end"],
291
+ # "speaker": entry["speaker"]
292
+ # }
293
+
294
+ # def translate_text(transcription_json, source_language, target_language):
295
+ # # Load the translation model for the specified target language
296
+ # translation_model_id = get_translation_model(source_language, target_language)
297
+ # logger.debug(f"Translation model: {translation_model_id}")
298
+ # translator = pipeline("translation", model=translation_model_id)
299
+
300
+ # # Use ThreadPoolExecutor to parallelize translations
301
+ # with concurrent.futures.ThreadPoolExecutor() as executor:
302
+ # # Submit all translation tasks and collect results
303
+ # translate_func = lambda entry: translate_single_entry(entry, translator)
304
+ # translated_json = list(executor.map(translate_func, transcription_json))
305
+
306
+ # # Sort the translated_json by start time
307
+ # translated_json.sort(key=lambda x: x["start"])
308
+
309
+ # # Log the components being added to translated_json
310
+ # for entry in translated_json:
311
+ # logger.debug("Added to translated_json: start=%s, original=%s, translated=%s, end=%s, speaker=%s",
312
+ # entry["start"], entry["original"], entry["translated"], entry["end"], entry["speaker"])
313
+
314
+ # return translated_json
315
+
316
+ # def update_translations(file, edited_table, mode):
317
+ # """
318
+ # Update the translations based on user edits in the Gradio Dataframe.
319
+ # """
320
+ # output_video_path = "output_video.mp4"
321
+ # logger.debug(f"Editable Table: {edited_table}")
322
+
323
+ # if file is None:
324
+ # logger.info("No file uploaded. Please upload a video/audio file.")
325
+ # return None, [], None, "No file uploaded. Please upload a video/audio file."
326
 
327
+ # try:
328
+ # start_time = time.time() # Start the timer
329
+
330
+ # # Convert the edited_table (list of lists) back to list of dictionaries
331
+ # updated_translations = [
332
+ # {
333
+ # "start": row["start"], # Access by column name
334
+ # "original": row["original"],
335
+ # "translated": row["translated"],
336
+ # "end": row["end"]
337
+ # }
338
+ # for _, row in edited_table.iterrows()
339
+ # ]
340
+
341
+ # # Call the function to process the video with updated translations
342
+ # add_transcript_voiceover(file.name, updated_translations, output_video_path, mode=="Transcription with Voiceover")
343
+
344
+ # # Calculate elapsed time
345
+ # elapsed_time = time.time() - start_time
346
+ # elapsed_time_display = f"Updates applied successfully in {elapsed_time:.2f} seconds."
347
+
348
+ # return output_video_path, elapsed_time_display
349
+
350
+ # except Exception as e:
351
+ # raise ValueError(f"Error updating translations: {e}")
352
+
353
+ # def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_height, font_path):
354
+ # try:
355
+ # subtitle_width = int(video_width * 0.8)
356
+ # subtitle_font_size = int(video_height // 20)
357
+ # font = ImageFont.truetype(font_path, subtitle_font_size)
358
+
359
+ # dummy_img = Image.new("RGBA", (subtitle_width, 1), (0, 0, 0, 0))
360
+ # draw = ImageDraw.Draw(dummy_img)
361
+
362
+ # lines = []
363
+ # line = ""
364
+ # for word in text.split():
365
+ # test_line = f"{line} {word}".strip()
366
+ # bbox = draw.textbbox((0, 0), test_line, font=font)
367
+ # w = bbox[2] - bbox[0]
368
+ # if w <= subtitle_width - 10:
369
+ # line = test_line
370
+ # else:
371
+ # lines.append(line)
372
+ # line = word
373
+ # lines.append(line)
374
+
375
+ # line_heights = [draw.textbbox((0, 0), l, font=font)[3] - draw.textbbox((0, 0), l, font=font)[1] for l in lines]
376
+ # total_height = sum(line_heights) + (len(lines) - 1) * 5
377
+ # img = Image.new("RGBA", (subtitle_width, total_height), (0, 0, 0, 0))
378
+ # draw = ImageDraw.Draw(img)
379
+
380
+ # y = 0
381
+ # for idx, line in enumerate(lines):
382
+ # bbox = draw.textbbox((0, 0), line, font=font)
383
+ # w = bbox[2] - bbox[0]
384
+ # draw.text(((subtitle_width - w) // 2, y), line, font=font, fill="yellow")
385
+ # y += line_heights[idx] + 5
386
 
387
+ # img_np = np.array(img) # <- ✅ Fix: convert to NumPy
388
+ # txt_clip = ImageClip(img_np).set_start(start_time).set_duration(end_time - start_time).set_position("bottom").set_opacity(0.8)
389
+ # return txt_clip
390
+ # except Exception as e:
391
+ # logger.error(f"\u274c Failed to create subtitle clip: {e}")
392
+ # return None
393
+
394
+ # def process_entry(entry, i, video_width, video_height, add_voiceover, target_language, font_path, speaker_sample_paths=None):
395
+ # logger.debug(f"Processing entry {i}: {entry}")
396
+ # error_message = None
397
+
398
+ # try:
399
+ # txt_clip = create_subtitle_clip_pil(entry["translated"], entry["start"], entry["end"], video_width, video_height, font_path)
400
+ # except Exception as e:
401
+ # error_message = f"❌ Failed to create subtitle clip for entry {i}: {e}"
402
+ # logger.error(error_message)
403
+ # txt_clip = None
404
+
405
+ # audio_segment = None
406
+ # if add_voiceover:
407
+ # try:
408
+ # segment_audio_path = f"segment_{i}_voiceover.wav"
409
+ # desired_duration = entry["end"] - entry["start"]
410
+ # speaker = entry.get("speaker", "default")
411
+ # speaker_wav_path = f"speaker_{speaker}_sample.wav"
412
+
413
+ # output_path, status_msg, tts_error = generate_voiceover_clone([entry], desired_duration, target_language, speaker_wav_path, segment_audio_path)
414
+
415
+ # if tts_error:
416
+ # error_message = error_message + " | " + tts_error if error_message else tts_error
417
+
418
+ # if not output_path or not os.path.exists(segment_audio_path):
419
+ # raise FileNotFoundError(f"Voiceover file not generated at: {segment_audio_path}")
420
+
421
+ # audio_clip = AudioFileClip(segment_audio_path)
422
+ # logger.debug(f"Audio clip duration: {audio_clip.duration}, Desired duration: {desired_duration}")
423
+
424
+ # if audio_clip.duration < desired_duration:
425
+ # silence_duration = desired_duration - audio_clip.duration
426
+ # audio_clip = concatenate_audioclips([audio_clip, silence(duration=silence_duration)])
427
+ # logger.info(f"Padded audio with {silence_duration} seconds of silence.")
428
+
429
+ # audio_segment = audio_clip.set_start(entry["start"]).set_duration(desired_duration)
430
+
431
+ # except Exception as e:
432
+ # err = f"❌ Failed to generate audio segment for entry {i}: {e}"
433
+ # logger.error(err)
434
+ # error_message = error_message + " | " + err if error_message else err
435
+ # audio_segment = None
436
+
437
+ # return i, txt_clip, audio_segment, error_message
438
 
439
+ # def add_transcript_voiceover(video_path, translated_json, output_path, add_voiceover=False, target_language="en", speaker_sample_paths=None):
440
+ # video = VideoFileClip(video_path)
441
+ # font_path = "./NotoSansSC-Regular.ttf"
442
+
443
+ # text_clips = []
444
+ # audio_segments = []
445
+ # error_messages = []
446
+
447
+ # with concurrent.futures.ThreadPoolExecutor() as executor:
448
+ # futures = [executor.submit(process_entry, entry, i, video.w, video.h, add_voiceover, target_language, font_path, speaker_sample_paths)
449
+ # for i, entry in enumerate(translated_json)]
450
+
451
+ # results = []
452
+ # for future in concurrent.futures.as_completed(futures):
453
+ # try:
454
+ # i, txt_clip, audio_segment, error = future.result()
455
+ # results.append((i, txt_clip, audio_segment))
456
+ # if error:
457
+ # error_messages.append(f"[Entry {i}] {error}")
458
+ # except Exception as e:
459
+ # err = f"❌ Unexpected error in future result: {e}"
460
+ # logger.error(err)
461
+ # error_messages.append(err)
462
+
463
+ # # Sort by entry index to ensure order
464
+ # results.sort(key=lambda x: x[0])
465
+ # text_clips = [clip for _, clip, _ in results if clip]
466
+ # if add_voiceover:
467
+ # audio_segments = [segment for _, _, segment in results if segment]
468
+
469
+ # final_video = CompositeVideoClip([video] + text_clips)
470
+
471
+ # if add_voiceover:
472
+ # if audio_segments:
473
+ # final_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
474
+ # final_video = final_video.set_audio(final_audio)
475
+ # else:
476
+ # logger.warning("⚠️ No audio segments available. Adding silent fallback.")
477
+ # silent_audio = AudioClip(lambda t: 0, duration=video.duration)
478
+ # final_video = final_video.set_audio(silent_audio)
479
+
480
+ # logger.info(f"Saving the final video to: {output_path}")
481
+ # final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
482
+
483
+ # logger.info("Video processing completed successfully.")
484
+
485
+ # # Optional: return errors
486
+ # if error_messages:
487
+ # logger.warning("⚠️ Errors encountered during processing:")
488
+ # for msg in error_messages:
489
+ # logger.warning(msg)
490
+
491
+ # return error_messages
492
+
493
+ # # Initialize TTS model only once (outside the function)
494
+ # tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
495
+
496
+ # def generate_voiceover_clone(translated_json, desired_duration, target_language, speaker_wav_path, output_audio_path):
497
+ # try:
498
+ # full_text = " ".join(entry["translated"] for entry in translated_json if "translated" in entry and entry["translated"].strip())
499
+ # if not full_text.strip():
500
+ # msg = "❌ Translated text is empty."
501
+ # logger.error(msg)
502
+ # return None, msg, msg
503
+
504
+ # if not speaker_wav_path or not os.path.exists(speaker_wav_path):
505
+ # msg = f"❌ Speaker audio not found: {speaker_wav_path}"
506
+ # logger.error(msg)
507
+ # return None, msg, msg
508
+
509
+ # # # Truncate text based on max token assumption (~60 tokens)
510
+ # # MAX_TTS_TOKENS = 60
511
+ # # tokens = full_text.split() # crude token count
512
+ # # if len(tokens) > MAX_TTS_TOKENS:
513
+ # # logger.warning(f"⚠️ Text too long for TTS model ({len(tokens)} tokens). Truncating to {MAX_TTS_TOKENS} tokens.")
514
+ # # full_text = " ".join(tokens[:MAX_TTS_TOKENS])
515
+
516
+ # speed_tts = calibrated_speed(full_text, desired_duration)
517
+ # tts.tts_to_file(
518
+ # text=full_text,
519
+ # speaker_wav=speaker_wav_path,
520
+ # language=target_language,
521
+ # file_path=output_audio_path,
522
+ # speed=speed_tts,
523
+ # split_sentences=True
524
+ # )
525
+
526
+ # if not os.path.exists(output_audio_path):
527
+ # msg = f"❌ Voiceover file not generated at: {output_audio_path}"
528
+ # logger.error(msg)
529
+ # return None, msg, msg
530
+
531
+ # msg = "✅ Voice cloning completed successfully."
532
+ # logger.info(msg)
533
+ # return output_audio_path, msg, None
534
+
535
+ # except Exception as e:
536
+ # err_msg = f"❌ An error occurred: {str(e)}"
537
+ # logger.error("❌ Error during voice cloning:")
538
+ # logger.error(traceback.format_exc())
539
+ # return None, err_msg, err_msg
540
+
541
+ # def calibrated_speed(text, desired_duration):
542
+ # """
543
+ # Compute a speed factor to help TTS fit audio into desired duration,
544
+ # using a simple truncated linear function of characters per second.
545
+ # """
546
+ # char_count = len(text.strip())
547
+ # if char_count == 0 or desired_duration <= 0:
548
+ # return 1.0 # fallback
549
+
550
+ # cps = char_count / desired_duration # characters per second
551
+
552
+ # # Truncated linear mapping
553
+ # if cps < 10:
554
+ # return 1.0
555
+ # elif cps > 25:
556
+ # return 1.4
557
+ # else:
558
+ # # Linearly scale between cps 10 -> 25 and speed 1.0 -> 1.4
559
+ # slope = (1.4 - 1.0) / (25 - 10)
560
+ # return 1.0 + slope * (cps - 10)
561
+
562
+
563
+ # def upload_and_manage(file, target_language, mode="transcription"):
564
+ # if file is None:
565
+ # logger.info("No file uploaded. Please upload a video/audio file.")
566
+ # return None, [], None, "No file uploaded. Please upload a video/audio file."
567
+
568
+ # try:
569
+ # start_time = time.time() # Start the timer
570
+ # logger.info(f"Started processing file: {file.name}")
571
+
572
+ # # Define paths for audio and output files
573
+ # audio_path = "audio.wav"
574
+ # output_video_path = "output_video.mp4"
575
+ # voiceover_path = "voiceover.wav"
576
+ # logger.info(f"Using audio path: {audio_path}, output video path: {output_video_path}, voiceover path: {voiceover_path}")
577
+
578
+ # # Step 1: Transcribe audio from uploaded media file and get timestamps
579
+ # logger.info("Transcribing audio...")
580
+ # transcription_json, source_language = transcribe_video_with_speakers(file.name)
581
+ # logger.info(f"Transcription completed. Detected source language: {source_language}")
582
+
583
+ # # Step 2: Translate the transcription
584
+ # logger.info(f"Translating transcription from {source_language} to {target_language}...")
585
+ # translated_json = translate_text(transcription_json, source_language, target_language)
586
+ # logger.info(f"Translation completed. Number of translated segments: {len(translated_json)}")
587
+
588
+ # # Step 3: Add transcript to video based on timestamps
589
+ # logger.info("Adding translated transcript to video...")
590
+ # add_transcript_voiceover(file.name, translated_json, output_video_path, mode == "Transcription with Voiceover", target_language)
591
+ # logger.info(f"Transcript added to video. Output video saved at {output_video_path}")
592
+
593
+ # # Convert translated JSON into a format for the editable table
594
+ # logger.info("Converting translated JSON into editable table format...")
595
+ # editable_table = [
596
+ # [float(entry["start"]), entry["original"], entry["translated"], float(entry["end"]), entry["speaker"]]
597
+ # for entry in translated_json
598
+ # ]
599
+
600
+ # # Calculate elapsed time
601
+ # elapsed_time = time.time() - start_time
602
+ # elapsed_time_display = f"Processing completed in {elapsed_time:.2f} seconds."
603
+ # logger.info(f"Processing completed in {elapsed_time:.2f} seconds.")
604
+
605
+ # return translated_json, editable_table, output_video_path, elapsed_time_display
606
+
607
+ # except Exception as e:
608
+ # logger.error(f"An error occurred: {str(e)}")
609
+ # return None, [], None, f"An error occurred: {str(e)}"
610
+ # # Gradio Interface with Tabs
611
+ # def build_interface():
612
+ # with gr.Blocks(css=css) as demo:
613
+ # gr.Markdown("## Video Localization")
614
+ # with gr.Row():
615
+ # with gr.Column(scale=4):
616
+ # file_input = gr.File(label="Upload Video/Audio File")
617
+ # language_input = gr.Dropdown(["en", "es", "fr", "zh"], label="Select Language") # Language codes
618
+ # process_mode = gr.Radio(choices=["Transcription", "Transcription with Voiceover"], label="Choose Processing Type", value="Transcription")
619
+ # submit_button = gr.Button("Post and Process")
620
+ # editable_translations = gr.State(value=[])
621
+
622
+ # with gr.Column(scale=8):
623
+ # gr.Markdown("## Edit Translations")
624
 
625
+ # # Editable JSON Data
626
+ # editable_table = gr.Dataframe(
627
+ # value=[], # Default to an empty list to avoid undefined values
628
+ # headers=["start", "original", "translated", "end", "speaker"],
629
+ # datatype=["number", "str", "str", "number", "str"],
630
+ # row_count=1, # Initially empty
631
+ # col_count=5,
632
+ # interactive=[False, True, True, False, False], # Control editability
633
+ # label="Edit Translations",
634
+ # wrap=True # Enables text wrapping if supported
635
+ # )
636
+ # save_changes_button = gr.Button("Save Changes")
637
+ # processed_video_output = gr.File(label="Download Processed Video", interactive=True) # Download button
638
+ # elapsed_time_display = gr.Textbox(label="Elapsed Time", lines=1, interactive=False)
639
+
640
+ # with gr.Column(scale=1):
641
+ # gr.Markdown("**Feedback**")
642
+ # feedback_input = gr.Textbox(
643
+ # placeholder="Leave your feedback here...",
644
+ # label=None,
645
+ # lines=3,
646
+ # )
647
+ # feedback_btn = gr.Button("Submit Feedback")
648
+ # response_message = gr.Textbox(label=None, lines=1, interactive=False)
649
+ # db_download = gr.File(label="Download Database File", visible=False)
650
 
651
+ # # Link the feedback handling
652
+ # def feedback_submission(feedback):
653
+ # message, file_path = handle_feedback(feedback)
654
+ # if file_path:
655
+ # return message, gr.update(value=file_path, visible=True)
656
+ # return message, gr.update(visible=False)
657
+
658
+ # save_changes_button.click(
659
+ # update_translations,
660
+ # inputs=[file_input, editable_table, process_mode],
661
+ # outputs=[processed_video_output, elapsed_time_display]
662
+ # )
663
+
664
+ # submit_button.click(
665
+ # upload_and_manage,
666
+ # inputs=[file_input, language_input, process_mode],
667
+ # outputs=[editable_translations, editable_table, processed_video_output, elapsed_time_display]
668
+ # )
669
+
670
+ # # Connect submit button to save_feedback_db function
671
+ # feedback_btn.click(
672
+ # feedback_submission,
673
+ # inputs=[feedback_input],
674
+ # outputs=[response_message, db_download]
675
+ # )
676
+
677
+ # return demo
678
+
679
+ # # Launch the Gradio interface
680
+ # demo = build_interface()
681
+ # demo.launch()
+
+ import gradio as gr
+
+ def dummy_func(x):
+     return x, "Success"
+
+ with gr.Blocks() as demo:
+     inp = gr.Textbox()
+     out1 = gr.Textbox()
+     out2 = gr.Textbox()
+     btn = gr.Button("Run")
+     btn.click(dummy_func, inputs=inp, outputs=[out1, out2])
+
+ demo.launch()
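
For reference, the calibrated_speed helper that this commit comments out maps characters-per-second onto a TTS speed factor with a truncated linear ramp (1.0 below 10 cps, rising to 1.4 at 25 cps and above). Below is a minimal standalone sketch of that mapping, useful only for sanity-checking segment pacing outside the app; the sample text length and duration in the usage line are hypothetical.

def calibrated_speed(text, desired_duration):
    """Truncated linear mapping from characters per second to a TTS speed factor."""
    char_count = len(text.strip())
    if char_count == 0 or desired_duration <= 0:
        return 1.0  # nothing to pace, keep normal speed
    cps = char_count / desired_duration  # characters per second
    if cps < 10:
        return 1.0
    if cps > 25:
        return 1.4
    slope = (1.4 - 1.0) / (25 - 10)  # ramp from 1.0 at 10 cps to 1.4 at 25 cps
    return 1.0 + slope * (cps - 10)

# Hypothetical segment: 72 characters squeezed into a 4-second slot -> 18 cps,
# which falls on the ramp and gives a speed factor of roughly 1.21.
print(calibrated_speed("x" * 72, 4.0))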