File size: 5,308 Bytes
710db5f 9a005ca 1ca8bfc 9a005ca 710db5f 5034a0a 6f8989f 939d453 cabbcc5 6f8989f 8e1fa57 fc0b265 5034a0a fc0b265 2399e19 8dc4fc2 9287297 710db5f 9a005ca 710db5f ddfea13 710db5f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
import re
def timeformat_srt(time):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        time: Offset in seconds (int or float).

    Returns:
        The timestamp string, with a comma before the milliseconds.
    """
    # Round to whole milliseconds first and then split with integer
    # arithmetic. Truncating the float fraction instead loses a millisecond
    # to representation error (e.g. 1.001 s would render as ",000").
    total_milliseconds = int(round(time * 1000))
    total_seconds, milliseconds = divmod(total_milliseconds, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
def timeformat_txt(time):
    """Format a time offset in seconds as ``MM:SS``, or ``HH:MM:SS`` past one hour.

    Fractions of a second are dropped (truncated), and the hours field is
    only emitted when it is greater than zero.
    """
    whole_hours = time // 3600
    after_hours = time - whole_hours * 3600
    whole_minutes = after_hours // 60
    leftover_seconds = after_hours - whole_minutes * 60

    fields = [whole_minutes, leftover_seconds]
    if whole_hours > 0:
        fields.insert(0, whole_hours)
    return ":".join(f"{int(value):02d}" for value in fields)
def timeformat_vtt(time):
    """Format a time offset in seconds as a WebVTT timestamp ``HH:MM:SS.mmm``.

    Args:
        time: Offset in seconds (int or float).

    Returns:
        The timestamp string, with a dot before the milliseconds.
    """
    # Round to whole milliseconds first and then split with integer
    # arithmetic. Truncating the float fraction instead loses a millisecond
    # to representation error (e.g. 1.001 s would render as ".000").
    total_milliseconds = int(round(time * 1000))
    total_seconds, milliseconds = divmod(total_milliseconds, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
def write_file(subtitle, output_file):
    """Write the ``subtitle`` string to ``output_file`` as UTF-8 text.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(output_file, mode='w', encoding='utf-8') as destination:
        destination.write(subtitle)
def get_srt(segments):
    """Render segments as SRT text: one numbered cue per segment.

    Each segment is read through its 'start', 'end' and 'text' keys. Cues are
    numbered from 1 and each one is followed by a blank line.

    Side effect: if a segment's text begins with a space, that one leading
    space is removed and the trimmed text is stored back into the segment,
    so the caller's data is modified in place.
    """
    cues = []
    for number, segment in enumerate(segments, start=1):
        begin = timeformat_srt(segment['start'])
        finish = timeformat_srt(segment['end'])
        text = segment['text']
        if text.startswith(' '):
            text = text[1:]
            segment['text'] = text
        cues.append(f"{number}\n{begin} --> {finish}\n{text}\n\n")
    return "".join(cues)
def get_csv(segments):
    """Render segments as semicolon-separated text with a header row.

    If any segment's text contains a ``SPEAKER NN: `` label, every row gets a
    separate Speaker column: the text is split at its first colon into
    speaker and spoken text, both stripped of surrounding whitespace.
    Otherwise each row carries the text unchanged.

    Args:
        segments: Sequence of mappings with 'start', 'end' and 'text' keys.

    Returns:
        The header plus one row per segment (numbered from 1), joined with
        newlines and without a trailing newline.
    """
    # NOTE(review): fields are not quoted or escaped, so a ';' or a newline
    # inside the text will shift columns for whoever parses the result.
    speaker_label = re.compile(r'SPEAKER [0-9][0-9]: ')
    has_speakers = any(
        speaker_label.search(segment['text']) is not None
        for segment in segments
    )

    if has_speakers:
        rows = ["Line;Start time;End time;Speaker;Text;"]
    else:
        rows = ["Line;Start time;End time;Text;"]

    for number, segment in enumerate(segments, start=1):
        begin = timeformat_srt(segment['start'])
        finish = timeformat_srt(segment['end'])
        text = segment['text']
        if has_speakers:
            speaker_id, separator, speaker_text = text.partition(":")
            if not separator:
                # A segment without any colon has no speaker prefix: keep the
                # row aligned with an empty Speaker column instead of failing.
                speaker_id, speaker_text = "", text
            rows.append(
                f"{number};{begin};{finish};{speaker_id.strip()};{speaker_text.strip()};"
            )
        else:
            rows.append(f"{number};{begin};{finish};{text};")
    return "\n".join(rows)
def get_vtt(segments):
    """Render segments as WebVTT-style text: a header, then one numbered cue per segment.

    Each segment is read through its 'start', 'end' and 'text' keys. Cues are
    numbered from 1 and each one is followed by a blank line.

    Side effect: if a segment's text begins with a space, that one leading
    space is removed and the trimmed text is stored back into the segment,
    so the caller's data is modified in place.
    """
    # NOTE(review): the WebVTT specification spells the file signature
    # "WEBVTT" (upper case). The header is kept as-is here because parse_vtt
    # in this file recognises the "WebVTT" spelling - confirm before changing.
    pieces = ["WebVTT\n\n"]
    for number, segment in enumerate(segments, start=1):
        begin = timeformat_vtt(segment['start'])
        finish = timeformat_vtt(segment['end'])
        text = segment['text']
        if text.startswith(' '):
            text = text[1:]
            segment['text'] = text
        pieces.append(f"{number}\n{begin} --> {finish}\n{text}\n\n")
    return "".join(pieces)
def get_txt(segments):
    """Render segments as plain text, one ``<timestamp><TAB><text>`` line each.

    The timestamp is the segment's 'start' value formatted by timeformat_txt.

    Side effect: if a segment's text begins with a space, that one leading
    space is removed and the trimmed text is stored back into the segment,
    so the caller's data is modified in place.
    """
    lines = []
    for segment in segments:
        text = segment['text']
        if text.startswith(' '):
            text = text[1:]
            segment['text'] = text
        stamp = timeformat_txt(segment['start'])
        lines.append(f"{stamp}\t{text}\n")
    return "".join(lines)
def parse_srt(file_path):
    """Read an SRT file and return its cues as a list of dicts.

    Blocks are separated by blank (or whitespace-only) lines. In each block
    the first line becomes "index", the second "timestamp", and any remaining
    lines are joined with single spaces into "sentence".

    Args:
        file_path: Path of the UTF-8 encoded SRT file (a leading BOM is
            accepted and discarded).

    Returns:
        A list of ``{"index", "timestamp", "sentence"}`` dicts in file order.
        Blocks with fewer than two lines are skipped.
    """
    # 'utf-8-sig' decodes plain UTF-8 as well, but strips a byte-order mark
    # so it does not end up glued to the first cue's index.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        srt_data = file.read()

    data = []
    for block in re.split(r'\n\s*\n', srt_data):
        lines = block.strip().split('\n')
        if len(lines) < 2:
            # Empty or malformed block: there is no timestamp line to read.
            continue
        data.append({
            "index": lines[0],
            "timestamp": lines[1],
            "sentence": ' '.join(lines[2:]),
        })
    return data
def parse_vtt(file_path):
    """Read a WebVTT file and return its cues as a list of dicts.

    Blocks are separated by blank (or whitespace-only) lines. The header
    block is skipped; in every other block the first line becomes "index",
    the second "timestamp", and any remaining lines are joined with single
    spaces into "sentence".

    Args:
        file_path: Path of the UTF-8 encoded WebVTT file (a leading BOM is
            accepted and discarded).

    Returns:
        A list of ``{"index", "timestamp", "sentence"}`` dicts in file order.
        Blocks with fewer than two lines are skipped.
    """
    # 'utf-8-sig' decodes plain UTF-8 as well, but strips a byte-order mark
    # so the header check below sees the signature at the start of the text.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        webvtt_data = file.read()

    data = []
    for block in re.split(r'\n\s*\n', webvtt_data):
        stripped = block.strip()
        # Match the signature case-insensitively: standard files use
        # "WEBVTT", while the writers in this module emit "WebVTT".
        if stripped.upper().startswith("WEBVTT"):
            continue
        lines = stripped.split('\n')
        if len(lines) < 2:
            # Empty or malformed block: there is no timestamp line to read.
            continue
        data.append({
            "index": lines[0],
            "timestamp": lines[1],
            "sentence": ' '.join(lines[2:]),
        })
    return data
def get_serialized_srt(dicts):
    """Turn parsed cue dicts back into SRT text.

    Each dict contributes its "index", "timestamp" and "sentence" values on
    three consecutive lines, followed by a blank line.
    """
    return "".join(
        f'{entry["index"]}\n{entry["timestamp"]}\n{entry["sentence"]}\n\n'
        for entry in dicts
    )
def get_serialized_vtt(dicts):
    """Turn parsed cue dicts back into WebVTT-style text.

    The output starts with the header line and a blank line; each dict then
    contributes its "index", "timestamp" and "sentence" values on three
    consecutive lines, followed by a blank line.
    """
    # NOTE(review): the WebVTT specification spells the file signature
    # "WEBVTT" (upper case); the header is reproduced unchanged here.
    cues = "".join(
        f'{entry["index"]}\n{entry["timestamp"]}\n{entry["sentence"]}\n\n'
        for entry in dicts
    )
    return "WebVTT\n\n" + cues
def safe_filename(name, max_length=20):
    """Replace characters that are invalid in file names and cap the length.

    Every character in ``<>:"/\\|?*`` and every control character
    (0x00-0x1f) is replaced with an underscore. If the result is longer than
    ``max_length``, it is shortened while keeping the text after the last
    dot as the extension, provided that extension (plus its dot) is shorter
    than ``max_length``; otherwise the name is simply cut to ``max_length``.

    Args:
        name: Proposed file name.
        max_length: Maximum number of characters in the result (default 20).

    Returns:
        The sanitised name, at most ``max_length`` characters long.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    invalid_filename_chars = r'[<>:"/\\|?*\x00-\x1f]'
    safe_name = re.sub(invalid_filename_chars, '_', name)
    if len(safe_name) <= max_length:
        return safe_name

    # Without any dot, rpartition yields the whole name as "extension",
    # which is then too long to keep and falls through to the plain cut.
    file_extension = safe_name.rpartition('.')[2]
    if len(file_extension) + 1 < max_length:
        stem_length = max_length - len(file_extension) - 1
        return safe_name[:stem_length] + '.' + file_extension
    return safe_name[:max_length]
|