from faster_whisper import WhisperModel
from utils import load_groups_json
import torch
import gc
import utils

# Free any leftover GPU memory from previous runs before loading the model.
gc.collect()
torch.cuda.empty_cache()

# Index-aligned option lists; a language choice of 2 (None) lets the model
# auto-detect the language.
model_lang_list = ['en', 'id', None]
model_size = ["tiny", "base", "small", "medium", "large"]


def start_transcribe(input_file, lang_choice: int, model_size_choice: int, progress):
    print(f"Starting transcribing with model size {model_size[model_size_choice]} "
          f"for language {model_lang_list[lang_choice]}")
    model = WhisperModel(
        model_size[model_size_choice], device="cuda", compute_type="int8_float16")

    _, speaker_groups = load_groups_json()

    subtitle_txt_list = []
    transcribe_txt_list = []
    for speaker in progress.tqdm(speaker_groups, desc="Transcribing"):
        # Each diarized speaker group was saved earlier as "<speaker>.wav".
        audiof = f"{speaker}.wav"
        print(f"Loading {audiof}")
        segments, _ = model.transcribe(
            audio=audiof,
            language=model_lang_list[lang_choice],
            word_timestamps=True)
        segments_list = list(segments)

        # Timestamps within a chunk are relative to the chunk, so shift them
        # back to the chunk's starting point in the original audio.
        shift = speaker_groups[speaker][0] + 1
        print(f"Current starting point: {shift}s or {time_str(shift)}")
        name = str(speaker)[:10]

        speaker_txt_list = []
        for segment in segments_list:
            start = time_str(segment.start + shift)
            end = time_str(segment.end + shift)
            segment_txt = segment.text
            speaker_txt_list.append(segment_txt)
            # Subtitle block: index, time range, then "[speaker] text".
            subtitle = (f"{len(subtitle_txt_list) + 1}\n"
                        f"{start} --> {end}\n"
                        f"[{name}] {segment_txt}\n\n")
            subtitle_txt_list.append(subtitle)

        speaker_txt = " ".join(speaker_txt_list)
        transcribe_txt_list.append(f"[{name}] {speaker_txt}\n")

    utils.write_transcribe_subtitle_file(
        input_file, transcribe_txt_list, subtitle_txt_list, False)


def time_str(t):
    # Format seconds as HH:MM:SS.mmm (e.g. 3723.5 -> "01:02:03.500").
    return '{0:02d}:{1:02d}:{2:06.3f}'.format(
        int(t // 3600), int(t % 3600 // 60), t % 60)
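

# --- Usage sketch (not part of the original module) ---
# A minimal, hedged example of driving start_transcribe from the command
# line. It assumes the diarization step has already produced the
# "<speaker>.wav" chunks and the groups JSON that load_groups_json() reads,
# and that the progress object only needs a .tqdm(iterable, desc=...) method
# (the only way it is used above). _ConsoleProgress and "meeting.wav" are
# illustrative stand-ins, not names from the original code.
if __name__ == "__main__":
    from tqdm import tqdm

    class _ConsoleProgress:
        """Console stand-in for a gradio.Progress-style object."""

        def tqdm(self, iterable, desc=None):
            return tqdm(iterable, desc=desc)

    # lang_choice=0 -> 'en'; model_size_choice=1 -> "base".
    start_transcribe(
        input_file="meeting.wav",
        lang_choice=0,
        model_size_choice=1,
        progress=_ConsoleProgress(),
    )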