# whisper_transcribe / transcribe.py
from faster_whisper import WhisperModel
import torch
import gc
import json

# Free any leftover GPU memory before loading the model.
gc.collect()
torch.cuda.empty_cache()

# Load the model once at import time so every call to start_transcribe()
# reuses it.
model = WhisperModel("medium", device="cuda", compute_type="int8_float16")
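
# Sketch (assumption, not part of the original script): faster-whisper also
# runs on CPU, so a machine without CUDA could load the model like this:
#
#   model = WhisperModel(
#       "medium",
#       device="cuda" if torch.cuda.is_available() else "cpu",
#       compute_type="int8_float16" if torch.cuda.is_available() else "int8",
#   )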

def start_transcribe(progress):
    _, speaker_groups = load_groups_json()
    subtitle_txt = []
    for speaker in progress.tqdm(speaker_groups, desc="Transcribing"):
        # Transcribe the per-speaker audio file saved by an earlier step
        audio_file = f"{speaker}.wav"
        print(f"Loading {audio_file}")
        segments, _ = model.transcribe(
            audio=audio_file, language="id", word_timestamps=True)
        segments_list = list(segments)

        # Label used for both the transcript and the subtitles,
        # computed once per speaker turn
        name = str(speaker)[:10]
        text_list_to_print = []
        for segment in segments_list:
            start = timeStr(segment.start)
            end = timeStr(segment.end)
            text = segment.text
            # One numbered SRT cue per segment
            subtitle_txt.append(
                f"{len(subtitle_txt) + 1}\n{start} --> {end}\n[{name}] {text}\n\n")
            text_list_to_print.append(text)

        # Print the full text for this speaker turn
        text = "\n".join(text_list_to_print)
        print(text)

        # Append this turn to the complete transcript file
        with open("transcribe.txt", "a") as file:
            file.write(f"[{name}] {text}\n")

    # Write all subtitle cues in one pass
    with open("subtitle.srt", "w") as file:
        file.writelines(subtitle_txt)
    return ["transcribe.txt", "subtitle.srt"]
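
# Usage sketch (assumption): `progress` is expected to behave like
# gradio.Progress, whose .tqdm() wraps an iterable with a UI progress bar.
# In a Gradio app the function might be wired up roughly like:
#
#   import gradio as gr
#
#   def run(progress=gr.Progress()):
#       return start_transcribe(progress)
#
#   gr.Interface(fn=run, inputs=None,
#                outputs=[gr.File(), gr.File()]).launch()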

def timeStr(t):
    # Format seconds as an SRT timestamp: HH:MM:SS,mmm (comma before the ms).
    hours = int(t // 3600)
    minutes = int(t % 3600 // 60)
    seconds = t % 60
    return f"{hours:02d}:{minutes:02d}:{seconds:06.3f}".replace(".", ",")
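
# Example: timeStr(3725.5) returns "01:02:05,500".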

def load_groups_json():
    # Load the sample and speaker groupings saved by an earlier step.
    with open("sample_groups.json", "r") as json_file_sample:
        sample_groups_list: list = json.load(json_file_sample)
    with open("speaker_groups.json", "r") as json_file_speaker:
        speaker_groups_dict: dict = json.load(json_file_speaker)
    return sample_groups_list, speaker_groups_dict
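
# Shape sketch (assumption, inferred from how the values are used above):
#   sample_groups.json  -> a list, e.g. ["SPEAKER_00", "SPEAKER_01"]
#   speaker_groups.json -> a dict whose keys each name a matching <key>.wav
#                          audio file on disk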