File size: 2,169 Bytes
e2d8d82
e698260
931df81
 
bef4887
e698260
931df81
 
 
 
e698260
8f3ea36
931df81
 
581b947
e698260
 
 
 
 
 
cb85517
32e4ded
e698260
 
bef4887
931df81
 
 
3130060
e698260
3130060
32e4ded
e698260
 
9002374
 
73cf1fe
f8597f4
9002374
 
e698260
 
 
931df81
e698260
 
931df81
e698260
9002374
 
e698260
 
581b947
e698260
 
9002374
e698260
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from faster_whisper import WhisperModel
from utils import load_groups_json
import torch
import gc
from ui import *
import utils

# Release any leftover Python objects and cached CUDA allocations from a
# previous run before a new Whisper model is loaded onto the GPU.
gc.collect()
torch.cuda.empty_cache()

# UI option tables; the *_choice indices in start_transcribe() index into
# these. None is passed straight to model.transcribe() as the language
# (per faster-whisper, that means auto-detect — confirm against the UI labels).
model_lang_list = ['en', 'id', None]
model_size = ["tiny", "base", "small", "medium", "large-v2"]


def start_transcribe(input_file, lang_choice: int, model_size_choice: int, progress):
    """Transcribe each speaker's temporary wav clip and write the results.

    lang_choice and model_size_choice index into the module-level
    model_lang_list / model_size tables. progress is a UI progress object
    exposing a tqdm-style wrapper. The combined transcript and subtitle
    text are handed to utils.write_transcribe_subtitle_file at the end.
    """
    print(
        f"Starting transcribing with model size {model_size[model_size_choice]} for language {model_lang_list[lang_choice]}")

    model = WhisperModel(
        model_size[model_size_choice], device="cuda", compute_type="int8_float16")
    _, speaker_groups = load_groups_json()

    transcribe_txt_list = []
    subtitle_txt_list = []
    # zip the dict with the progress-wrapped dict so iteration both yields
    # speaker keys and advances the UI progress bar in lockstep.
    tracked = progress.tqdm(
        speaker_groups, desc=ui_lang["progress_transcribing_audio"])
    for speaker, _ in zip(speaker_groups, tracked):
        # Each speaker group was previously exported as "<speaker>.wav".
        wav_name = f"{speaker}.wav"
        print(f"Loading {wav_name}")
        segments, _ = model.transcribe(
            audio=wav_name, language=model_lang_list[lang_choice], word_timestamps=True)
        # Materialize the generator now so transcription happens here.
        segment_items = list(segments)

        # Segment timestamps are relative to the clip; shift them back onto
        # the original timeline (group start + 1s, matching the exporter).
        shift = speaker_groups[speaker][0] + 1
        print(
            f"Current starting point: {shift}s or {time_str_subtitle(shift)}")
        name = str(speaker)[:10]

        pieces = []
        for seg in segment_items:
            start = time_str_subtitle(seg.start + shift)
            end = time_str_subtitle(seg.end + shift)

            pieces.append(seg.text)

            # Sequential cue number, time range, then the speaker-tagged text.
            subtitle_txt_list.append(
                f"{len(subtitle_txt_list) + 1}\n{start} --> {end}\n[{name}] {seg.text}\n\n")

        joined = " ".join(pieces)
        transcribe_txt_list.append(
            f"[{time_str(shift)}]\n[{name}] {joined}\n")

    utils.write_transcribe_subtitle_file(
        input_file, transcribe_txt_list, subtitle_txt_list, False)


def time_str_subtitle(t):
    """Render a time offset *t* (seconds) as an 'HH:MM:SS.sss' timestamp.

    NOTE(review): uses '.' before the milliseconds; classic SRT expects a
    comma ('HH:MM:SS,mmm') — confirm against the subtitle file consumer.
    """
    hours, remainder = divmod(t, 3600)
    minutes, _ = divmod(remainder, 60)
    seconds = t % 60
    return f"{round(hours):02d}:{round(minutes):02d}:{seconds:06.3f}"