qqwjq1981 committed on
Commit
940ca8e
·
verified ·
1 Parent(s): 40b3f9e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -6
app.py CHANGED
@@ -383,10 +383,11 @@ def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_heig
383
  logger.error(f"\u274c Failed to create subtitle clip: {e}")
384
  return None
385
 
 
386
  def solve_optimal_alignment(original_segments, generated_durations, total_duration):
387
  """
388
- Robust version: Aligns generated speech segments, falls back to greedy allocation if solver fails.
389
- Modifies and returns the translated_json with updated 'start' and 'end'.
390
  """
391
  N = len(original_segments)
392
  d = np.array(generated_durations)
@@ -414,13 +415,105 @@ def solve_optimal_alignment(original_segments, generated_durations, total_durati
414
  except Exception as e:
415
  print(f"⚠️ Optimization failed: {e}, falling back to greedy alignment.")
416
 
417
- current_time = 0.0
418
  for i in range(N):
419
- original_segments[i]['start'] = round(current_time, 3)
420
- original_segments[i]['end'] = round(current_time + generated_durations[i], 3)
421
- current_time += generated_durations[i]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
422
 
423
  return original_segments
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424
  def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
425
  logger.debug(f"Processing entry {i}: {entry}")
426
  error_message = None
@@ -644,6 +737,8 @@ def upload_and_manage(file, target_language, process_mode):
644
  translated_json = translate_text(transcription_json, source_language, target_language)
645
  logger.info(f"Translation completed. Number of translated segments: {len(translated_json)}")
646
 
 
 
647
  # Step 3: Add transcript to video based on timestamps
648
  logger.info("Adding translated transcript to video...")
649
  add_transcript_voiceover(file.name, translated_json, output_video_path, process_mode, target_language)
 
383
  logger.error(f"\u274c Failed to create subtitle clip: {e}")
384
  return None
385
 
386
+
387
  def solve_optimal_alignment(original_segments, generated_durations, total_duration):
388
  """
389
+ Aligns speech segments using quadratic programming. If optimization fails,
390
+ applies greedy fallback: center shorter segments, stretch longer ones.
391
  """
392
  N = len(original_segments)
393
  d = np.array(generated_durations)
 
415
  except Exception as e:
416
  print(f"⚠️ Optimization failed: {e}, falling back to greedy alignment.")
417
 
 
418
  for i in range(N):
419
+ orig_start = original_segments[i]['start']
420
+ orig_end = original_segments[i]['end']
421
+ orig_mid = (orig_start + orig_end) / 2
422
+ gen_duration = generated_durations[i]
423
+ orig_duration = orig_end - orig_start
424
+
425
+ if gen_duration <= orig_duration:
426
+ new_start = orig_mid - gen_duration / 2
427
+ new_end = orig_mid + gen_duration / 2
428
+ else:
429
+ extra = (gen_duration - orig_duration) / 2
430
+ new_start = orig_start - extra
431
+ new_end = orig_end + extra
432
+
433
+ # Prevent overlap with previous
434
+ if i > 0:
435
+ prev_end = original_segments[i - 1]['end']
436
+ new_start = max(new_start, prev_end + 0.01)
437
+
438
+ # Prevent overlap with next
439
+ if i < N - 1:
440
+ next_start = original_segments[i + 1]['start']
441
+ new_end = min(new_end, next_start - 0.01)
442
+
443
+ if new_end <= new_start:
444
+ new_start = orig_start
445
+ new_end = orig_start + gen_duration
446
+
447
+ original_segments[i]['start'] = round(new_start, 3)
448
+ original_segments[i]['end'] = round(new_end, 3)
449
 
450
  return original_segments
451
+
452
def get_frame_image_bytes(video, t):
    """Return the frame of *video* at time *t* (seconds) encoded as JPEG bytes."""
    raw_frame = video.get_frame(t)
    out = io.BytesIO()
    # Round-trip through PIL purely to get JPEG encoding of the ndarray frame.
    Image.fromarray(raw_frame).save(out, format='JPEG')
    return out.getvalue()
458
+
459
def post_edit_segment(entry, image_bytes):
    """Polish one translated subtitle segment with GPT-4o, using a video frame for context.

    Sends the segment's original/translated text plus a JPEG frame to the model
    and, on success, overwrites ``entry['original']`` and ``entry['translated']``
    with the model's edited versions. Best-effort: any failure is logged and the
    entry is returned unchanged.

    Args:
        entry: segment dict with keys 'original', 'translated', 'speaker',
            'start', 'end' (missing text keys default to "").
        image_bytes: JPEG-encoded frame taken near the segment midpoint.

    Returns:
        The (possibly edited) entry dict.
    """
    import base64  # local import so the file's top-level import block is untouched

    try:
        system_prompt = """You are a multilingual assistant helping polish subtitles and voiceover content.
Your job is to fix punctuation, validate meaning, improve tone, and ensure the translation matches the speaker's intended message."""

        user_prompt = f"""
Original (source) transcript: {entry.get("original", "")}
Translated version: {entry.get("translated", "")}
Speaker ID: {entry.get("speaker", "")}
Time: {entry.get("start")} - {entry.get("end")}

Please:
1. Add correct punctuation and sentence boundaries.
2. Improve fluency and tone of the translated text.
3. Ensure the meaning is preserved from the original.
4. Use the attached image frame to infer emotion or setting.

Return the revised original and translated texts in the following format:
Original: <edited original>
Translated: <edited translation>
"""
        # BUG FIX: the Chat Completions API has no "image" key on a message —
        # raw bytes placed there are ignored/rejected. Vision input must be a
        # base64 data-URL inside a content-part list alongside the text.
        image_url = "data:image/jpeg;base64," + base64.b64encode(image_bytes).decode("ascii")
        response = ChatCompletion.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": user_prompt},
                        {"type": "image_url", "image_url": {"url": image_url}},
                    ],
                },
            ]
        )

        output = response.choices[0].message.content.strip()
        # Parse the "Original: ... / Translated: ..." lines back into the entry.
        for line in output.splitlines():
            if line.startswith("Original:"):
                entry['original'] = line[len("Original:"):].strip()
            elif line.startswith("Translated:"):
                entry['translated'] = line[len("Translated:"):].strip()

        return entry
    except Exception as e:
        # Best-effort post-edit: never let a model/API failure break the pipeline.
        # Use the module logger (as the rest of this file does) instead of print.
        logger.error(f"Post-editing failed for segment: {e}")
        return entry
500
+
501
+
502
def post_edit_translated_segments(translated_json, video_path):
    """Post-edit every translated segment, giving the model a mid-segment video frame.

    For each segment, grabs the frame at the segment's temporal midpoint,
    then calls ``post_edit_segment`` concurrently across segments.

    Args:
        translated_json: list of segment dicts with 'start' and 'end' times.
        video_path: path to the source video file.

    Returns:
        List of edited segment dicts, in the original order.
    """
    video = VideoFileClip(video_path)
    # BUG FIX: close the clip even if frame extraction or post-editing raises;
    # otherwise the underlying ffmpeg reader / file handles leak.
    try:
        def process(entry):
            # Sample the frame halfway through the segment for visual context.
            mid_time = (entry['start'] + entry['end']) / 2
            image_bytes = get_frame_image_bytes(video, mid_time)
            return post_edit_segment(entry, image_bytes)

        # I/O-bound (API calls), so threads are appropriate here.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            edited = list(executor.map(process, translated_json))
    finally:
        video.close()
    return edited
516
+
517
  def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
518
  logger.debug(f"Processing entry {i}: {entry}")
519
  error_message = None
 
737
  translated_json = translate_text(transcription_json, source_language, target_language)
738
  logger.info(f"Translation completed. Number of translated segments: {len(translated_json)}")
739
 
740
+ translated_json = post_edit_translated_segments(translated_json, file.name)
741
+
742
  # Step 3: Add transcript to video based on timestamps
743
  logger.info("Adding translated transcript to video...")
744
  add_transcript_voiceover(file.name, translated_json, output_video_path, process_mode, target_language)