Spaces:
Running
Running
File size: 4,104 Bytes
63ab978 ddbe0b6 6f0e822 63ab978 6f0e822 63ab978 6f0e822 63ab978 6f0e822 63ab978 6f0e822 40f2b57 63ab978 6f0e822 63ab978 ddbe0b6 63ab978 6f0e822 63ab978 89f36c9 6f0e822 63ab978 ddbe0b6 dbfaffc 63ab978 89f36c9 6f0e822 7e8138f ddbe0b6 7e8138f 6f0e822 dbfaffc 6f0e822 dbfaffc 6f0e822 8ac99d5 6f0e822 63ab978 6f0e822 dbfaffc 6f0e822 63ab978 3048545 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
import re
from modules.whisper.data_classes import Segment
def timeformat_srt(time):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        time: Offset in seconds (int or float).

    Returns:
        The timestamp string, using a comma before the milliseconds.
        Negative offsets are clamped to ``00:00:00,000``.
    """
    # Convert to whole milliseconds once, with rounding. Truncating the
    # fractional part of a float (e.g. 1.001 - 1 == 0.000999...) would
    # otherwise drop a millisecond.
    total_ms = max(0, int(round(time * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, milliseconds = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
def timeformat_vtt(time):
    """Format a time offset in seconds as a WebVTT timestamp ``HH:MM:SS.mmm``.

    Args:
        time: Offset in seconds (int or float).

    Returns:
        The timestamp string, using a dot before the milliseconds.
        Negative offsets are clamped to ``00:00:00.000``.
    """
    # Convert to whole milliseconds once, with rounding. Truncating the
    # fractional part of a float (e.g. 1.001 - 1 == 0.000999...) would
    # otherwise drop a millisecond.
    total_ms = max(0, int(round(time * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, milliseconds = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
def write_file(subtitle, output_file):
    """Write the subtitle text to ``output_file`` as UTF-8.

    The file is opened in ``'w'`` mode, so any existing content is replaced.
    """
    with open(output_file, mode='w', encoding='utf-8') as handle:
        handle.write(subtitle)
def get_srt(segments):
    """Serialize transcription segments to SRT text.

    Args:
        segments: A sequence of ``Segment`` objects or of dicts with
            ``'start'``, ``'end'`` and ``'text'`` keys. When the first item is
            a ``Segment``, every item is converted with ``.dict()``.

    Returns:
        The SRT document as a string: one numbered cue per segment, each
        followed by a blank line. An empty input yields an empty string.
    """
    if segments and isinstance(segments[0], Segment):
        segments = [seg.dict() for seg in segments]

    cues = []
    for number, segment in enumerate(segments, start=1):
        # Drop a single leading space without writing back into the caller's
        # dict; the input segments are left untouched.
        text = segment['text']
        if text.startswith(' '):
            text = text[1:]
        start = timeformat_srt(segment['start'])
        end = timeformat_srt(segment['end'])
        cues.append(f"{number}\n{start} --> {end}\n{text}\n\n")
    return "".join(cues)
def get_vtt(segments):
    """Serialize transcription segments to WebVTT text.

    Args:
        segments: A sequence of ``Segment`` objects or of dicts with
            ``'start'``, ``'end'`` and ``'text'`` keys. When the first item is
            a ``Segment``, every item is converted with ``.dict()``.

    Returns:
        The WebVTT document as a string: the ``WEBVTT`` header followed by
        one cue per segment, each followed by a blank line.
    """
    if segments and isinstance(segments[0], Segment):
        segments = [seg.dict() for seg in segments]

    parts = ["WEBVTT\n\n"]
    for segment in segments:
        # Drop a single leading space without writing back into the caller's
        # dict; the input segments are left untouched.
        text = segment['text']
        if text.startswith(' '):
            text = text[1:]
        start = timeformat_vtt(segment['start'])
        end = timeformat_vtt(segment['end'])
        parts.append(f"{start} --> {end}\n{text}\n\n")
    return "".join(parts)
def get_txt(segments):
if segments and isinstance(segments[0], Segment):
segments = [seg.dict() for seg in segments]
output = ""
for i, segment in enumerate(segments):
if segment['text'].startswith(' '):
segment['text'] = segment['text'][1:]
output += f"{segment['text']}\n"
return output
def parse_srt(file_path):
    """Read an SRT file and return its cues as a list of dicts.

    Cues are the blocks separated by a blank line. Within each block the
    first line is taken as the index, the second as the timestamp line, and
    the remaining lines are joined with single spaces into the sentence.

    Args:
        file_path: Path of the SRT file to read.

    Returns:
        A list of ``{"index", "timestamp", "sentence"}`` dicts (all string
        values), in file order.

    Raises:
        IndexError: If a non-empty block has fewer than two lines.
    """
    # 'utf-8-sig' strips a leading byte-order mark if present, so the first
    # cue's index is not polluted with '\ufeff'. Files without a BOM are
    # read exactly as with plain 'utf-8'.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        srt_data = file.read()

    data = []
    for block in srt_data.split('\n\n'):
        block = block.strip()
        if not block:
            continue
        lines = block.split('\n')
        data.append({
            "index": lines[0],
            "timestamp": lines[1],
            "sentence": ' '.join(lines[2:]),
        })
    return data
def parse_vtt(file_path):
    """Read a WebVTT file and return its cues as a list of dicts.

    Cues are the blocks separated by a blank line; any block that starts
    with ``WEBVTT`` (the file header) is skipped. Within each remaining block
    the first line is taken as the timestamp line and the remaining lines
    are joined with single spaces into the sentence.

    Args:
        file_path: Path of the WebVTT file to read.

    Returns:
        A list of ``{"timestamp", "sentence"}`` dicts (string values), in
        file order.
    """
    # 'utf-8-sig' strips a leading byte-order mark if present. Without this
    # the header block would start with '\ufeffWEBVTT', fail the header
    # check below, and be returned as a bogus cue.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        webvtt_data = file.read()

    data = []
    for block in webvtt_data.split('\n\n'):
        block = block.strip()
        if not block or block.startswith("WEBVTT"):
            continue
        lines = block.split('\n')
        data.append({
            "timestamp": lines[0],
            "sentence": ' '.join(lines[1:]),
        })
    return data
def get_serialized_srt(dicts):
    """Serialize parsed SRT cues back into SRT text.

    Args:
        dicts: An iterable of dicts with ``"index"``, ``"timestamp"`` and
            ``"sentence"`` keys.

    Returns:
        The SRT document as a string: for each cue, the index, timestamp and
        sentence on their own lines followed by a blank line. An empty input
        yields an empty string.
    """
    # Joining once avoids repeated string concatenation in the loop.
    return "".join(
        f'{dic["index"]}\n{dic["timestamp"]}\n{dic["sentence"]}\n\n'
        for dic in dicts
    )
def get_serialized_vtt(dicts):
    """Serialize parsed WebVTT cues back into WebVTT text.

    Args:
        dicts: An iterable of dicts with ``"timestamp"`` and ``"sentence"``
            keys.

    Returns:
        The WebVTT document as a string: the ``WEBVTT`` header, then for each
        cue the timestamp and sentence on their own lines followed by a
        blank line.
    """
    # Joining once avoids repeated string concatenation in the loop.
    body = "".join(
        f'{dic["timestamp"]}\n{dic["sentence"]}\n\n'
        for dic in dicts
    )
    return "WEBVTT\n\n" + body
def safe_filename(name, max_length=20):
    """Return ``name`` with invalid filename characters replaced and length capped.

    Each of ``< > : " / \\ | ? *`` and the control characters ``\\x00``-``\\x1f``
    is replaced with an underscore. If the result is longer than
    ``max_length``, it is shortened; the text after the last ``'.'`` is kept
    as the extension when it fits.

    Args:
        name: The proposed file name.
        max_length: Maximum length of the returned name. Defaults to 20.

    Returns:
        The sanitized name, at most ``max_length`` characters long.
    """
    invalid_chars = r'[<>:"/\\|?*\x00-\x1f]'
    safe_name = re.sub(invalid_chars, '_', name)

    if len(safe_name) <= max_length:
        return safe_name

    # Everything after the last dot is treated as the extension; a name
    # without any dot yields the whole name here and falls through to the
    # plain truncation below.
    file_extension = safe_name.split('.')[-1]
    if len(file_extension) + 1 < max_length:
        # Reserve room for the dot and the extension, truncate the rest.
        stem_length = max_length - len(file_extension) - 1
        return safe_name[:stem_length] + '.' + file_extension

    return safe_name[:max_length]
|