Spaces:

chompionsawelo
/

whisper_transcribe

Runtime error

File size: 2,112 Bytes

3e533d7
 
ed6e5d8
8caee60
3e533d7
 
 
ed6e5d8
 
8caee60
 
18e34db
3e533d7
 
 
ed6e5d8
bdec318
ed6e5d8
3e533d7
 
 
 
ed6e5d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdec318
3e533d7
 
 
ed6e5d8
3e533d7
 
 
 
 
 
 
 
ed6e5d8
3e533d7

from pyannote.audio import Pipeline
from pydub import AudioSegment
import gradio as gr
import os
import torch
import json

# Read the access token from the environment so the secret never lives in the
# source file; a missing variable fails fast with a KeyError that names it.
# NOTE: any token that was previously committed here should be revoked.
hugging_face_token = os.environ["HUGGING_FACE_TOKEN"]
pipeline = Pipeline.from_pretrained(
    'pyannote/speaker-diarization', use_auth_token=hugging_face_token)
# Use the GPU when one is available, otherwise fall back to the CPU instead of
# failing on hosts without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline.to(device)


def start_diarization(input_file, progress: gr.Progress = gr.Progress()):
    """Diarize ``input_file`` and cut it into one WAV clip per speaker turn.

    Runs the module-level ``pipeline`` on the file, records every speaker
    turn as ``"<speaker>-<n>" -> [start, end]`` (``n`` counts that speaker's
    turns from 1), saves the results with ``save_groups_json`` and exports
    the clips with ``audio_segmentation``.

    Args:
        input_file: Path of the audio file. It is passed to the diarization
            pipeline and to ``audio_segmentation``, which loads it with
            ``AudioSegment.from_wav``.
        progress: Gradio progress tracker. The ``gr.Progress()`` default
            follows the form shown in the Gradio docs, so callers need not
            pass it.

    Returns:
        ``str()`` of the dict that maps each clip name to its
        ``[start, end]`` pair (seconds, as reported by the pipeline).
    """
    print("Starting diarization")
    progress(0, desc="Starting diarization")
    diarization = pipeline(input_file)

    sample_groups = []    # distinct speaker labels, in order of appearance
    speaker_groups = {}   # "<speaker>-<n>" -> [turn start, turn end]
    turns_per_speaker = {}

    for turn, _, speaker in diarization.itertracks(yield_label=True):
        speaker = str(speaker)
        if speaker not in sample_groups:
            sample_groups.append(speaker)

        # Number each speaker's turns so every clip gets a unique name.
        turns_per_speaker[speaker] = turns_per_speaker.get(speaker, 0) + 1
        file_name = f"{speaker}-{turns_per_speaker[speaker]}"
        speaker_groups[file_name] = [turn.start, turn.end]

        print(f"speaker_groups {file_name}: {speaker_groups[file_name]}")

    save_groups_json(sample_groups, speaker_groups)
    audio_segmentation(input_file, speaker_groups)
    print(str(speaker_groups))
    return str(speaker_groups)


def audio_segmentation(input_file, speaker_groups_dict):
    """Export one WAV clip for every entry of ``speaker_groups_dict``.

    Args:
        input_file: WAV file to cut, loaded with ``AudioSegment.from_wav``.
        speaker_groups_dict: Maps a clip name to a ``[start, end]`` pair.
            Both bounds are multiplied by 1000 before slicing, so they are
            presumably seconds (the slice is taken in milliseconds).

    Each clip is written to ``<clip name>.wav`` in the current directory and
    its millisecond range is printed.
    """
    source_audio = AudioSegment.from_wav(input_file)
    for clip_name, bounds in speaker_groups_dict.items():
        start_ms = bounds[0] * 1000
        end_ms = bounds[1] * 1000
        clip = source_audio[start_ms:end_ms]
        clip.export(f"{clip_name}.wav", format='wav')
        print(f"group {clip_name}: {start_ms}--{end_ms}")


def save_groups_json(sample_groups_list: list, speaker_groups_dict: dict):
    """Write both grouping structures to JSON files in the current directory.

    Args:
        sample_groups_list: Written to ``sample_groups.json``.
        speaker_groups_dict: Written to ``speaker_groups.json``.

    Existing files with those names are overwritten.
    """
    outputs = (
        ("sample_groups.json", sample_groups_list),
        ("speaker_groups.json", speaker_groups_dict),
    )
    for file_name, payload in outputs:
        with open(file_name, "w") as handle:
            json.dump(payload, handle)