from pyannote.audio import Pipeline
from pydub import AudioSegment
import os
import torch
import json

# Authenticate with Hugging Face and load the pretrained diarization pipeline.
hugging_face_token = os.environ["HUGGING_FACE_TOKEN"]
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization", use_auth_token=hugging_face_token)

# Run inference on the CPU.
device = torch.device("cpu")
pipeline.to(device)


def startDiarization(input_file):
    """Diarize input_file, save the speaker metadata as JSON, and split the
    audio into one WAV file per speaker turn."""
    print("Starting diarization")
    diarization = pipeline(input_file)

    sample_groups = []   # unique speaker labels, e.g. ["SPEAKER_00", ...]
    speaker_groups = {}  # "<speaker>-<n>" -> [start_seconds, end_seconds]
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        if speaker not in sample_groups:
            sample_groups.append(str(speaker))

        # Find the next free suffix so repeated turns by the same speaker
        # get distinct keys: SPEAKER_00-1, SPEAKER_00-2, ...
        suffix = 1
        file_name = f"{speaker}-{suffix}"
        while file_name in speaker_groups:
            suffix += 1
            file_name = f"{speaker}-{suffix}"
        speaker_groups[file_name] = [turn.start, turn.end]

        print(f"speaker_groups {file_name}: {speaker_groups[file_name]}")
        print(f"start={turn.start:.3f}s stop={turn.end:.3f}s speaker_{speaker}")

    saveGroupsJson(sample_groups, speaker_groups)
    audioSegmentation(input_file, speaker_groups)
    print(str(speaker_groups))
    return str(speaker_groups)


def audioSegmentation(input_file, speaker_groups_dict):
    """Export each speaker turn as its own WAV file, named after its key."""
    audioSegment = AudioSegment.from_wav(input_file)
    for speaker in speaker_groups_dict:
        time = speaker_groups_dict[speaker]
        # pydub slices audio in milliseconds; the turn boundaries are seconds.
        audioSegment[time[0] * 1000: time[1] * 1000].export(
            f"{speaker}.wav", format="wav")
        print(f"group {speaker}: {time[0]*1000}--{time[1]*1000}")


def saveGroupsJson(sample_groups_list: list, speaker_groups_dict: dict):
    """Persist the speaker labels and turn boundaries for later steps."""
    with open("sample_groups.json", "w") as json_file_sample:
        json.dump(sample_groups_list, json_file_sample)
    with open("speaker_groups.json", "w") as json_file_speaker:
        json.dump(speaker_groups_dict, json_file_speaker)
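

# --- Usage sketch (not part of the original script; added for illustration) ---
# A minimal entry point showing how the pieces above fit together.
# "input.wav" is a placeholder path: any WAV file readable by both pyannote
# and pydub should work. Requires HUGGING_FACE_TOKEN to be set in the
# environment before the module is imported, since the pipeline loads at
# import time.
if __name__ == "__main__":
    startDiarization("input.wav")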