Spaces:
Running
Running
| import re | |
| def timeformat_srt(time): | |
| hours = time // 3600 | |
| minutes = (time - hours * 3600) // 60 | |
| seconds = time - hours * 3600 - minutes * 60 | |
| milliseconds = (time - int(time)) * 1000 | |
| return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d},{int(milliseconds):03d}" | |
| def timeformat_vtt(time): | |
| hours = time // 3600 | |
| minutes = (time - hours * 3600) // 60 | |
| seconds = time - hours * 3600 - minutes * 60 | |
| milliseconds = (time - int(time)) * 1000 | |
| return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}.{int(milliseconds):03d}" | |
| def write_file(subtitle, output_file): | |
| with open(output_file, 'w', encoding='utf-8') as f: | |
| f.write(subtitle) | |
| def get_srt(segments): | |
| output = "" | |
| for i, segment in enumerate(segments): | |
| output += f"{i + 1}\n" | |
| output += f"{timeformat_srt(segment['start'])} --> {timeformat_srt(segment['end'])}\n" | |
| if segment['text'].startswith(' '): | |
| segment['text'] = segment['text'][1:] | |
| output += f"{segment['text']}\n\n" | |
| return output | |
| def get_vtt(segments): | |
| output = "WebVTT\n\n" | |
| for i, segment in enumerate(segments): | |
| output += f"{i + 1}\n" | |
| output += f"{timeformat_vtt(segment['start'])} --> {timeformat_vtt(segment['end'])}\n" | |
| if segment['text'].startswith(' '): | |
| segment['text'] = segment['text'][1:] | |
| output += f"{segment['text']}\n\n" | |
| return output | |
| def get_txt(segments): | |
| output = "" | |
| for i, segment in enumerate(segments): | |
| if segment['text'].startswith(' '): | |
| segment['text'] = segment['text'][1:] | |
| output += f"{segment['text']}\n" | |
| return output | |
| def parse_srt(file_path): | |
| """Reads SRT file and returns as dict""" | |
| with open(file_path, 'r', encoding='utf-8') as file: | |
| srt_data = file.read() | |
| data = [] | |
| blocks = srt_data.split('\n\n') | |
| for block in blocks: | |
| if block.strip() != '': | |
| lines = block.strip().split('\n') | |
| index = lines[0] | |
| timestamp = lines[1] | |
| sentence = ' '.join(lines[2:]) | |
| data.append({ | |
| "index": index, | |
| "timestamp": timestamp, | |
| "sentence": sentence | |
| }) | |
| return data | |
| def parse_vtt(file_path): | |
| """Reads WebVTT file and returns as dict""" | |
| with open(file_path, 'r', encoding='utf-8') as file: | |
| webvtt_data = file.read() | |
| data = [] | |
| blocks = webvtt_data.split('\n\n') | |
| for block in blocks: | |
| if block.strip() != '' and not block.strip().startswith("WebVTT"): | |
| lines = block.strip().split('\n') | |
| index = lines[0] | |
| timestamp = lines[1] | |
| sentence = ' '.join(lines[2:]) | |
| data.append({ | |
| "index": index, | |
| "timestamp": timestamp, | |
| "sentence": sentence | |
| }) | |
| return data | |
| def get_serialized_srt(dicts): | |
| output = "" | |
| for dic in dicts: | |
| output += f'{dic["index"]}\n' | |
| output += f'{dic["timestamp"]}\n' | |
| output += f'{dic["sentence"]}\n\n' | |
| return output | |
| def get_serialized_vtt(dicts): | |
| output = "WebVTT\n\n" | |
| for dic in dicts: | |
| output += f'{dic["index"]}\n' | |
| output += f'{dic["timestamp"]}\n' | |
| output += f'{dic["sentence"]}\n\n' | |
| return output | |
| def safe_filename(name): | |
| from app import _args | |
| INVALID_FILENAME_CHARS = r'[<>:"/\\|?*\x00-\x1f]' | |
| safe_name = re.sub(INVALID_FILENAME_CHARS, '_', name) | |
| if not _args.colab: | |
| return safe_name | |
| # Truncate the filename if it exceeds the max_length (20) | |
| if len(safe_name) > 20: | |
| file_extension = safe_name.split('.')[-1] | |
| if len(file_extension) + 1 < 20: | |
| truncated_name = safe_name[:20 - len(file_extension) - 1] | |
| safe_name = truncated_name + '.' + file_extension | |
| else: | |
| safe_name = safe_name[:20] | |
| return safe_name | |