refinamento

Running

File size: 4,104 Bytes

63ab978
 
ddbe0b6
 
6f0e822
63ab978
6f0e822
 
 
 
63ab978
 
6f0e822
63ab978
6f0e822
 
 
 
63ab978
 
6f0e822
40f2b57
63ab978
6f0e822
 
63ab978
 
ddbe0b6
 
 
63ab978
 
6f0e822
63ab978
89f36c9
 
6f0e822
 
 
63ab978
 
ddbe0b6
 
 
dbfaffc
63ab978
 
89f36c9
 
6f0e822
 
 
 
7e8138f
ddbe0b6
 
 
7e8138f
 
 
 
 
 
 
 
6f0e822
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbfaffc
6f0e822
 
 
 
 
 
 
dbfaffc
6f0e822
8ac99d5
 
6f0e822
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63ab978
 
6f0e822
 
dbfaffc
6f0e822
 
 
 
 
 
63ab978
 
3048545

import re

from modules.whisper.data_classes import Segment


def timeformat_srt(time: float) -> str:
    """Format a time offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``).

    The offset is converted to whole milliseconds once, with rounding, and
    then split using integer arithmetic. Truncating the fractional part
    instead loses a millisecond to floating-point error (``2.3`` would be
    rendered with ``,299``).

    Args:
        time: Offset in seconds. Negative values are clamped to zero.

    Returns:
        The timestamp string, with a comma before the milliseconds.
    """
    # Round once up front so every field is derived from the same integer.
    total_ms = max(0, int(round(float(time) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, milliseconds = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"


def timeformat_vtt(time: float) -> str:
    """Format a time offset in seconds as a WebVTT timestamp (``HH:MM:SS.mmm``).

    The offset is converted to whole milliseconds once, with rounding, and
    then split using integer arithmetic. Truncating the fractional part
    instead loses a millisecond to floating-point error (``2.3`` would be
    rendered with ``.299``).

    Args:
        time: Offset in seconds. Negative values are clamped to zero.

    Returns:
        The timestamp string, with a dot before the milliseconds.
    """
    # Round once up front so every field is derived from the same integer.
    total_ms = max(0, int(round(float(time) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, milliseconds = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"


def write_file(subtitle, output_file):
    """Write the subtitle text to ``output_file`` as UTF-8.

    The file is opened in write mode, so existing content is replaced.
    """
    handle = open(output_file, mode='w', encoding='utf-8')
    with handle:
        handle.write(subtitle)


def get_srt(segments):
    """Render transcription segments as SRT subtitle text.

    Args:
        segments: Sequence of ``Segment`` objects or of dicts with ``start``,
            ``end`` and ``text`` keys. If the first element is a ``Segment``,
            every element is converted with ``.dict()``.

    Returns:
        The SRT text: for each segment a 1-based index line, a
        ``start --> end`` line formatted with ``timeformat_srt``, the text,
        and a blank line. Empty input yields an empty string.
    """
    if segments and isinstance(segments[0], Segment):
        segments = [seg.dict() for seg in segments]

    cues = []
    for number, segment in enumerate(segments, start=1):
        text = segment['text']
        # Drop one leading space on a local copy so that dicts passed in by
        # the caller are not modified as a side effect of formatting.
        if text.startswith(' '):
            text = text[1:]
        cues.append(
            f"{number}\n"
            f"{timeformat_srt(segment['start'])} --> {timeformat_srt(segment['end'])}\n"
            f"{text}\n\n"
        )
    return "".join(cues)


def get_vtt(segments):
    """Render transcription segments as WebVTT subtitle text.

    Args:
        segments: Sequence of ``Segment`` objects or of dicts with ``start``,
            ``end`` and ``text`` keys. If the first element is a ``Segment``,
            every element is converted with ``.dict()``.

    Returns:
        The WebVTT text: a ``WEBVTT`` header followed, for each segment, by a
        ``start --> end`` line formatted with ``timeformat_vtt``, the text,
        and a blank line. Empty input yields just the header.
    """
    if segments and isinstance(segments[0], Segment):
        segments = [seg.dict() for seg in segments]

    parts = ["WEBVTT\n\n"]
    for segment in segments:
        text = segment['text']
        # Drop one leading space on a local copy so that dicts passed in by
        # the caller are not modified as a side effect of formatting.
        if text.startswith(' '):
            text = text[1:]
        parts.append(
            f"{timeformat_vtt(segment['start'])} --> {timeformat_vtt(segment['end'])}\n"
            f"{text}\n\n"
        )
    return "".join(parts)


def get_txt(segments):
    """Render transcription segments as plain text, one segment per line.

    Args:
        segments: Sequence of ``Segment`` objects or of dicts with a ``text``
            key. If the first element is a ``Segment``, every element is
            converted with ``.dict()``.

    Returns:
        The segment texts, each followed by a newline. Empty input yields an
        empty string.
    """
    if segments and isinstance(segments[0], Segment):
        segments = [seg.dict() for seg in segments]

    lines = []
    for segment in segments:
        text = segment['text']
        # Drop one leading space on a local copy so that dicts passed in by
        # the caller are not modified as a side effect of formatting.
        if text.startswith(' '):
            text = text[1:]
        lines.append(f"{text}\n")
    return "".join(lines)


def parse_srt(file_path):
    """Read an SRT file and return its cues as a list of dicts.

    Args:
        file_path: Path of the SRT file to read.

    Returns:
        A list with one dict per cue, holding the keys ``"index"`` (first
        line of the block), ``"timestamp"`` (second line) and ``"sentence"``
        (remaining lines joined with single spaces). All values are strings.

    Raises:
        IndexError: If a non-empty block has fewer than two lines.
    """
    # 'utf-8-sig' drops a leading byte-order mark if present; otherwise the
    # BOM would end up glued to the first cue's index.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        srt_data = file.read()

    data = []
    # Treat lines that contain only whitespace as block separators too, so a
    # stray space on a "blank" line does not merge two cues into one.
    for block in re.split(r'\n\s*\n', srt_data):
        block = block.strip()
        if not block:
            continue
        lines = block.split('\n')
        data.append({
            "index": lines[0],
            "timestamp": lines[1],
            "sentence": ' '.join(lines[2:])
        })
    return data


def parse_vtt(file_path):
    """Read a WebVTT file and return its cues as a list of dicts.

    Blocks that start with ``WEBVTT`` (the file header) are skipped.

    Args:
        file_path: Path of the WebVTT file to read.

    Returns:
        A list with one dict per remaining block, holding the keys
        ``"timestamp"`` (first line of the block) and ``"sentence"``
        (remaining lines joined with single spaces). All values are strings.
    """
    # 'utf-8-sig' drops a leading byte-order mark if present; otherwise the
    # header would not start with "WEBVTT" and would be returned as a cue.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        webvtt_data = file.read()

    data = []
    # Treat lines that contain only whitespace as block separators too, so a
    # stray space on a "blank" line does not merge two cues into one.
    for block in re.split(r'\n\s*\n', webvtt_data):
        block = block.strip()
        if not block or block.startswith("WEBVTT"):
            continue
        lines = block.split('\n')
        data.append({
            "timestamp": lines[0],
            "sentence": ' '.join(lines[1:])
        })

    return data


def get_serialized_srt(dicts):
    """Serialize cue dicts back into SRT text.

    Each dict must provide ``"index"``, ``"timestamp"`` and ``"sentence"``;
    they are written on consecutive lines, followed by a blank line.
    """
    entries = [
        f'{entry["index"]}\n{entry["timestamp"]}\n{entry["sentence"]}\n\n'
        for entry in dicts
    ]
    return "".join(entries)


def get_serialized_vtt(dicts):
    """Serialize cue dicts back into WebVTT text.

    The output starts with the ``WEBVTT`` header and a blank line. Each dict
    must provide ``"timestamp"`` and ``"sentence"``; they are written on
    consecutive lines, followed by a blank line.
    """
    header = "WEBVTT\n\n"
    body = "".join(
        f'{entry["timestamp"]}\n{entry["sentence"]}\n\n'
        for entry in dicts
    )
    return header + body


def safe_filename(name, max_length=20):
    """Replace characters that are invalid in filenames and cap the length.

    Each of ``< > : " / \\ | ? *`` and every control character in the range
    ``\\x00``-``\\x1f`` is replaced with an underscore. If the result is longer
    than ``max_length``, it is shortened; the text after the last dot is kept
    as the extension when it fits, otherwise the name is simply cut.

    Args:
        name: The filename to sanitize.
        max_length: Maximum length of the returned name. Defaults to 20.

    Returns:
        The sanitized name, at most ``max_length`` characters long.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    invalid_filename_chars = r'[<>:"/\\|?*\x00-\x1f]'
    safe_name = re.sub(invalid_filename_chars, '_', name)
    if len(safe_name) <= max_length:
        return safe_name

    # Without a dot, split() returns the whole name here; it is then too long
    # to count as an extension and the plain cut below applies.
    file_extension = safe_name.split('.')[-1]
    if len(file_extension) + 1 < max_length:
        # Reserve room for the dot and the extension, shorten only the stem.
        stem_length = max_length - len(file_extension) - 1
        return safe_name[:stem_length] + '.' + file_extension
    return safe_name[:max_length]