from pyannote.audio import Pipeline
from pydub import AudioSegment
import gradio as gr
import os
import torch
import json

# Read the Hugging Face access token from the environment instead of hard-coding it.
hugging_face_token = os.environ["HUGGING_FACE_TOKEN"]

pipeline = Pipeline.from_pretrained(
    'pyannote/speaker-diarization', use_auth_token=hugging_face_token)

# Run the pipeline on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline.to(device)


def start_diarization(input_file, progress=gr.Progress()):
    print("Starting diarization")
    progress(0, desc="Starting diarization")
    diarization = pipeline(input_file)

    sample_groups = []
    speaker_groups = {}
    print(str(diarization))

    for turn, _, speaker in diarization.itertracks(yield_label=True):
        # Record each distinct speaker label once.
        if speaker not in sample_groups:
            sample_groups.append(str(speaker))

        # Give every turn a unique key such as "SPEAKER_00-1", "SPEAKER_00-2", ...
        suffix = 1
        file_name = f"{speaker}-{suffix}"
        while file_name in speaker_groups:
            suffix += 1
            file_name = f"{speaker}-{suffix}"
        speaker_groups[file_name] = [turn.start, turn.end]

        print(f"speaker_groups {file_name}: {speaker_groups[file_name]}")
        print(f"start={turn.start:.3f}s stop={turn.end:.3f}s speaker_{speaker}")

    save_groups_json(sample_groups, speaker_groups)
    audio_segmentation(input_file, speaker_groups)
    print(str(speaker_groups))
    return str(speaker_groups)


def audio_segmentation(input_file, speaker_groups_dict):
    # The input is expected to be a WAV file; pydub slices audio in milliseconds.
    audio_segment = AudioSegment.from_wav(input_file)
    for speaker, time in speaker_groups_dict.items():
        audio_segment[time[0] * 1000: time[1] * 1000].export(
            f"{speaker}.wav", format='wav')
        print(f"group {speaker}: {time[0] * 1000}--{time[1] * 1000}")


def save_groups_json(sample_groups_list: list, speaker_groups_dict: dict):
    # Persist the speaker labels and their per-turn time ranges for downstream steps.
    with open("sample_groups.json", "w") as json_file_sample:
        json.dump(sample_groups_list, json_file_sample)
    with open("speaker_groups.json", "w") as json_file_speaker:
        json.dump(speaker_groups_dict, json_file_speaker)
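

# The functions above are only definitions; nothing in this file launches a UI.
# Below is a minimal sketch of how they might be exposed through Gradio (assumed
# usage, not part of the original app wiring): pass the uploaded audio's file
# path to start_diarization and display the returned summary string.
if __name__ == "__main__":
    demo = gr.Interface(
        fn=start_diarization,
        inputs=gr.Audio(type="filepath"),  # hand the upload to the pipeline as a path on disk
        outputs="text",
    )
    demo.launch()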