Commit
·
abb41a8
1
Parent(s):
700a61a
get latest NFA which should ensure subtitles show until end of video
Browse files- utils/make_ass_files.py +68 -8
utils/make_ass_files.py
CHANGED
|
@@ -23,7 +23,9 @@ For the word-level ASS files, the text will be highlighted word-by-word, with th
|
|
| 23 |
by the NFA alignments.
|
| 24 |
"""
|
| 25 |
|
|
|
|
| 26 |
import os
|
|
|
|
| 27 |
|
| 28 |
from utils.constants import BLANK_TOKEN, SPACE_TOKEN
|
| 29 |
from utils.data_prep import Segment, Token, Word
|
|
@@ -74,8 +76,13 @@ def make_ass_files(
|
|
| 74 |
if ass_file_config.resegment_text_to_fill_space:
|
| 75 |
utt_obj = resegment_utt_obj(utt_obj, ass_file_config)
|
| 76 |
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
return utt_obj
|
| 81 |
|
|
@@ -166,9 +173,7 @@ def resegment_utt_obj(utt_obj, ass_file_config):
|
|
| 166 |
return utt_obj
|
| 167 |
|
| 168 |
|
| 169 |
-
def make_word_level_ass_file(
|
| 170 |
-
utt_obj, output_dir_root, ass_file_config,
|
| 171 |
-
):
|
| 172 |
|
| 173 |
default_style_dict = {
|
| 174 |
"Name": "Default",
|
|
@@ -298,14 +303,33 @@ def make_word_level_ass_file(
|
|
| 298 |
)
|
| 299 |
f.write(subtitle_text + '\n')
|
| 300 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
utt_obj.saved_output_files[f"words_level_ass_filepath"] = output_file
|
| 302 |
|
| 303 |
return utt_obj
|
| 304 |
|
| 305 |
|
| 306 |
-
def make_token_level_ass_file(
|
| 307 |
-
utt_obj, output_dir_root, ass_file_config,
|
| 308 |
-
):
|
| 309 |
|
| 310 |
default_style_dict = {
|
| 311 |
"Name": "Default",
|
|
@@ -457,6 +481,42 @@ def make_token_level_ass_file(
|
|
| 457 |
)
|
| 458 |
f.write(subtitle_text + '\n')
|
| 459 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 460 |
utt_obj.saved_output_files[f"tokens_level_ass_filepath"] = output_file
|
| 461 |
|
| 462 |
return utt_obj
|
|
|
|
| 23 |
by the NFA alignments.
|
| 24 |
"""
|
| 25 |
|
| 26 |
+
import math
|
| 27 |
import os
|
| 28 |
+
import soundfile as sf
|
| 29 |
|
| 30 |
from utils.constants import BLANK_TOKEN, SPACE_TOKEN
|
| 31 |
from utils.data_prep import Segment, Token, Word
|
|
|
|
| 76 |
if ass_file_config.resegment_text_to_fill_space:
|
| 77 |
utt_obj = resegment_utt_obj(utt_obj, ass_file_config)
|
| 78 |
|
| 79 |
+
# get duration of the utterance, so we know the final timestamp of the final set of subtitles,
|
| 80 |
+
# which we will keep showing until the end
|
| 81 |
+
with sf.SoundFile(utt_obj.audio_filepath) as f:
|
| 82 |
+
audio_dur = f.frames / f.samplerate
|
| 83 |
+
|
| 84 |
+
utt_obj = make_word_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur)
|
| 85 |
+
utt_obj = make_token_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur)
|
| 86 |
|
| 87 |
return utt_obj
|
| 88 |
|
|
|
|
| 173 |
return utt_obj
|
| 174 |
|
| 175 |
|
| 176 |
+
def make_word_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur):
|
|
|
|
|
|
|
| 177 |
|
| 178 |
default_style_dict = {
|
| 179 |
"Name": "Default",
|
|
|
|
| 303 |
)
|
| 304 |
f.write(subtitle_text + '\n')
|
| 305 |
|
| 306 |
+
# write final set of subtitles for text after speech has been spoken
|
| 307 |
+
words_in_final_segment = []
|
| 308 |
+
for segment_or_token in utt_obj.segments_and_tokens[::-1]:
|
| 309 |
+
if type(segment_or_token) is Segment:
|
| 310 |
+
final_segment = segment_or_token
|
| 311 |
+
|
| 312 |
+
for word_or_token in final_segment.words_and_tokens:
|
| 313 |
+
if type(word_or_token) is Word:
|
| 314 |
+
words_in_final_segment.append(word_or_token)
|
| 315 |
+
break
|
| 316 |
+
|
| 317 |
+
text_after_speech = already_spoken_color_code + " ".join([x.text for x in words_in_final_segment]) + r"{\r}"
|
| 318 |
+
# note: for now doing some extra padding with math.ceil(audio_dur) + 1 to account for the fact that the video with subtitles can become
|
| 319 |
+
# longer than the original audio during the MP4 creation stage.
|
| 320 |
+
subtitle_text = (
|
| 321 |
+
f"Dialogue: 0,{seconds_to_ass_format(words_in_final_segment[-1].t_end)},{seconds_to_ass_format(math.ceil(audio_dur)+1)},Default,,0,0,0,,"
|
| 322 |
+
+ text_after_speech.rstrip()
|
| 323 |
+
)
|
| 324 |
+
|
| 325 |
+
f.write(subtitle_text + '\n')
|
| 326 |
+
|
| 327 |
utt_obj.saved_output_files[f"words_level_ass_filepath"] = output_file
|
| 328 |
|
| 329 |
return utt_obj
|
| 330 |
|
| 331 |
|
| 332 |
+
def make_token_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur):
|
|
|
|
|
|
|
| 333 |
|
| 334 |
default_style_dict = {
|
| 335 |
"Name": "Default",
|
|
|
|
| 481 |
)
|
| 482 |
f.write(subtitle_text + '\n')
|
| 483 |
|
| 484 |
+
# Write final set of subtitles for text after speech has been spoken.
|
| 485 |
+
# To do this, we need to collect 'tokens_in_final_segment' so that we know what the final line is.
|
| 486 |
+
tokens_in_final_segment = []
|
| 487 |
+
for segment_or_token in utt_obj.segments_and_tokens[::-1]:
|
| 488 |
+
# Collect tokens from final segment - will 'break' so we only look at the final one.
|
| 489 |
+
if type(segment_or_token) is Segment:
|
| 490 |
+
# 'segment_or_token' is known to be Segment, which has attribute 'words_and_tokens'
|
| 491 |
+
for word_or_token in segment_or_token.words_and_tokens:
|
| 492 |
+
if type(word_or_token) is Token:
|
| 493 |
+
if word_or_token.text != BLANK_TOKEN:
|
| 494 |
+
tokens_in_final_segment.append(word_or_token)
|
| 495 |
+
else:
|
| 496 |
+
# 'word_or_token' is known to be a Word, which has attribute 'tokens'
|
| 497 |
+
for token in word_or_token.tokens:
|
| 498 |
+
if token.text != BLANK_TOKEN:
|
| 499 |
+
tokens_in_final_segment.append(token)
|
| 500 |
+
break
|
| 501 |
+
|
| 502 |
+
for token in tokens_in_final_segment:
|
| 503 |
+
token.text_cased = token.text_cased.replace(
|
| 504 |
+
"▁", " "
|
| 505 |
+
) # replace underscores used in subword tokens with spaces
|
| 506 |
+
token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ") # space token with actual space
|
| 507 |
+
|
| 508 |
+
text_after_speech = (
|
| 509 |
+
already_spoken_color_code + "".join([x.text_cased for x in tokens_in_final_segment]) + r"{\r}"
|
| 510 |
+
)
|
| 511 |
+
# note: for now doing some extra padding with math.ceil(audio_dur) + 1 to account for the fact that the video with subtitles can become
|
| 512 |
+
# longer than the original audio during the MP4 creation stage.
|
| 513 |
+
subtitle_text = (
|
| 514 |
+
f"Dialogue: 0,{seconds_to_ass_format(tokens_in_final_segment[-1].t_end)},{seconds_to_ass_format(math.ceil(audio_dur)+1)},Default,,0,0,0,,"
|
| 515 |
+
+ text_after_speech.rstrip()
|
| 516 |
+
)
|
| 517 |
+
|
| 518 |
+
f.write(subtitle_text + '\n')
|
| 519 |
+
|
| 520 |
utt_obj.saved_output_files[f"tokens_level_ass_filepath"] = output_file
|
| 521 |
|
| 522 |
return utt_obj
|