Update app.py
app.py CHANGED
@@ -383,10 +383,11 @@ def create_subtitle_clip_pil(text, start_time, end_time, video_width, video_height
         logger.error(f"❌ Failed to create subtitle clip: {e}")
         return None
 
+
 def solve_optimal_alignment(original_segments, generated_durations, total_duration):
     """
-
-
+    Aligns speech segments using quadratic programming. If optimization fails,
+    applies greedy fallback: center shorter segments, stretch longer ones.
     """
     N = len(original_segments)
     d = np.array(generated_durations)
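
Note: the docstring refers to a quadratic-programming alignment, but the QP itself lies outside this hunk. Purely as an illustration of what such a formulation can look like (a sketch under assumed objective and constraints, not the app's actual ones; requires numpy and cvxpy), one option keeps each generated duration fixed and pulls segment midpoints toward their original midpoints:

# Hypothetical illustration only; the real objective and constraints live in the
# unchanged part of solve_optimal_alignment and may differ.
import numpy as np
import cvxpy as cp

def align_midpoints_qp(original_segments, generated_durations, total_duration):
    N = len(original_segments)
    d = np.array(generated_durations)
    mid = np.array([(s['start'] + s['end']) / 2 for s in original_segments])

    s = cp.Variable(N)  # new start time of each segment
    objective = cp.Minimize(cp.sum_squares(s + d / 2 - mid))
    constraints = [s[0] >= 0, s[N - 1] + d[N - 1] <= total_duration]
    constraints += [s[i + 1] >= s[i] + d[i] for i in range(N - 1)]  # keep order, no overlap

    cp.Problem(objective, constraints).solve()
    return s.value

segments = [{'start': 0.0, 'end': 2.0}, {'start': 2.0, 'end': 4.0}]
print(align_midpoints_qp(segments, [1.0, 2.5], 5.0))  # approximately [0.5, 1.75]
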
@@ -414,13 +415,105 @@ def solve_optimal_alignment(original_segments, generated_durations, total_duration):
     except Exception as e:
         print(f"⚠️ Optimization failed: {e}, falling back to greedy alignment.")
 
-        current_time = 0.0
         for i in range(N):
-            original_segments[i]['start']
-            original_segments[i]['end']
-
+            orig_start = original_segments[i]['start']
+            orig_end = original_segments[i]['end']
+            orig_mid = (orig_start + orig_end) / 2
+            gen_duration = generated_durations[i]
+            orig_duration = orig_end - orig_start
+
+            if gen_duration <= orig_duration:
+                new_start = orig_mid - gen_duration / 2
+                new_end = orig_mid + gen_duration / 2
+            else:
+                extra = (gen_duration - orig_duration) / 2
+                new_start = orig_start - extra
+                new_end = orig_end + extra
+
+            # Prevent overlap with previous
+            if i > 0:
+                prev_end = original_segments[i - 1]['end']
+                new_start = max(new_start, prev_end + 0.01)
+
+            # Prevent overlap with next
+            if i < N - 1:
+                next_start = original_segments[i + 1]['start']
+                new_end = min(new_end, next_start - 0.01)
+
+            if new_end <= new_start:
+                new_start = orig_start
+                new_end = orig_start + gen_duration
+
+            original_segments[i]['start'] = round(new_start, 3)
+            original_segments[i]['end'] = round(new_end, 3)
 
     return original_segments
+
+
+def get_frame_image_bytes(video, t):
+    frame = video.get_frame(t)
+    img = Image.fromarray(frame)
+    buf = io.BytesIO()
+    img.save(buf, format='JPEG')
+    return buf.getvalue()
+
+
+def post_edit_segment(entry, image_bytes):
+    try:
+        system_prompt = """You are a multilingual assistant helping polish subtitles and voiceover content.
+Your job is to fix punctuation, validate meaning, improve tone, and ensure the translation matches the speaker's intended message."""
+
+        user_prompt = f"""
+Original (source) transcript: {entry.get("original", "")}
+Translated version: {entry.get("translated", "")}
+Speaker ID: {entry.get("speaker", "")}
+Time: {entry.get("start")} - {entry.get("end")}
+
+Please:
+1. Add correct punctuation and sentence boundaries.
+2. Improve fluency and tone of the translated text.
+3. Ensure the meaning is preserved from the original.
+4. Use the attached image frame to infer emotion or setting.
+
+Return the revised original and translated texts in the following format:
+Original: <edited original>
+Translated: <edited translation>
+"""
+        response = ChatCompletion.create(
+            model="gpt-4o",
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt, "image": image_bytes}
+            ]
+        )
+
+        output = response.choices[0].message.content.strip()
+        lines = output.splitlines()
+        for line in lines:
+            if line.startswith("Original:"):
+                entry['original'] = line[len("Original:"):].strip()
+            elif line.startswith("Translated:"):
+                entry['translated'] = line[len("Translated:"):].strip()
+
+        return entry
+    except Exception as e:
+        print(f"Post-editing failed for segment: {e}")
+        return entry
+
+
+def post_edit_translated_segments(translated_json, video_path):
+    video = VideoFileClip(video_path)
+
+    def process(entry):
+        mid_time = (entry['start'] + entry['end']) / 2
+        image_bytes = get_frame_image_bytes(video, mid_time)
+        entry = post_edit_segment(entry, image_bytes)
+        return entry
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        edited = list(executor.map(process, translated_json))
+
+    video.close()
+    return edited
+
 def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
     logger.debug(f"Processing entry {i}: {entry}")
     error_message = None
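
Note: the greedy fallback added above is easiest to see on toy numbers. The snippet below is a self-contained restatement of the same placement rule (a clip shorter than its original slot is centered in it, a longer one is stretched symmetrically, and neighbours are kept at least 10 ms apart; the degenerate-interval reset is omitted for brevity):

# Toy illustration of the greedy placement rule used in the fallback branch.
segments = [
    {'start': 0.0, 'end': 2.0},   # generated audio is shorter -> centered
    {'start': 2.0, 'end': 4.0},   # generated audio is longer  -> stretched
]
durations = [1.0, 2.5]

for i, (seg, dur) in enumerate(zip(segments, durations)):
    mid = (seg['start'] + seg['end']) / 2
    slot = seg['end'] - seg['start']
    if dur <= slot:
        start, end = mid - dur / 2, mid + dur / 2
    else:
        extra = (dur - slot) / 2
        start, end = seg['start'] - extra, seg['end'] + extra
    if i > 0:
        start = max(start, segments[i - 1]['end'] + 0.01)  # keep a 10 ms gap
    if i < len(segments) - 1:
        end = min(end, segments[i + 1]['start'] - 0.01)
    seg['start'], seg['end'] = round(start, 3), round(end, 3)

print(segments)
# First segment becomes {'start': 0.5, 'end': 1.5};
# the second becomes {'start': 1.75, 'end': 4.25}.
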
@@ -644,6 +737,8 @@ def upload_and_manage(file, target_language, process_mode):
     translated_json = translate_text(transcription_json, source_language, target_language)
     logger.info(f"Translation completed. Number of translated segments: {len(translated_json)}")
 
+    translated_json = post_edit_translated_segments(translated_json, file.name)
+
     # Step 3: Add transcript to video based on timestamps
     logger.info("Adding translated transcript to video...")
     add_transcript_voiceover(file.name, translated_json, output_video_path, process_mode, target_language)
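
Note: the new post-editing step sits between translation and rendering and follows a simple pattern: sample one frame at each segment's midpoint, pass it along with the segment text to the post-editor, and run segments concurrently since the work is API-bound. A minimal standalone sketch of that pattern is shown below; polish_segment is a hypothetical stand-in for the GPT-4o call, whose exact request shape depends on the OpenAI client version the Space pins (the commit's code passes raw JPEG bytes alongside the user message).

import io
import concurrent.futures

from PIL import Image
from moviepy.editor import VideoFileClip


def frame_jpeg_bytes(video, t):
    # Same idea as get_frame_image_bytes: one RGB frame -> JPEG bytes.
    img = Image.fromarray(video.get_frame(t))
    buf = io.BytesIO()
    img.save(buf, format="JPEG")
    return buf.getvalue()


def polish_segment(entry, image_bytes):
    # Hypothetical placeholder for the LLM post-edit; a real version would send
    # entry["original"], entry["translated"] and the frame to the model.
    return entry


def post_edit(segments, video_path):
    video = VideoFileClip(video_path)

    def process(entry):
        mid = (entry["start"] + entry["end"]) / 2
        return polish_segment(entry, frame_jpeg_bytes(video, mid))

    # Threads are a reasonable fit here: per-segment work is I/O-bound.
    with concurrent.futures.ThreadPoolExecutor() as pool:
        edited = list(pool.map(process, segments))

    video.close()
    return edited

In upload_and_manage itself only the two added lines are needed: translated_json is passed through post_edit_translated_segments before add_transcript_voiceover runs, so the edited text is what drives the subtitles and voiceover.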