Space status: Runtime error
Commit 3e533d7
Parent: 0d5492e
Implement diarization

Files changed:
- app.py +22 -4
- diarization.py +44 -0
- requirements.txt +6 -0
app.py
CHANGED
@@ -1,7 +1,25 @@
+from huggingface_hub import login
+from diarization import startDiarization
+import ffmpeg
 import gradio as gr
+import os
 
-
-
+hugging_face_token = os.environ["HUGGING_FACE_TOKEN"]
+login(token=hugging_face_token)
 
-
-
+# Get video & convert it
+inputs_interface = gr.inputs.Video(label="Insert video")
+in_file = ffmpeg.input(inputs_interface).audio
+out_file = ffmpeg.output(in_file, "input.wav", f="wav").run()
+
+
+def prepareInput():
+    return startDiarization(out_file)
+
+
+gr.Interface(
+    prepareInput,
+    inputs=inputs_interface,
+    outputs="text",
+    title="Get Diarization"
+).launch()
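Note on the committed app.py: the ffmpeg conversion runs at import time and is handed the Gradio component object itself rather than an uploaded file path, and out_file is the tuple returned by .run(), not a filename, which fits the Space's "Runtime error" status. Below is a minimal sketch, not part of the commit, of running the conversion inside the handler instead; the video_path parameter, the "input.wav" target, and the assumption that Gradio passes the uploaded video as a file path are all illustrative.

# Sketch, not part of the commit: convert the uploaded video inside the handler,
# on the file path Gradio is assumed to pass at call time.
import ffmpeg
import gradio as gr
from diarization import startDiarization

inputs_interface = gr.inputs.Video(label="Insert video")

def prepareInput(video_path):
    # Extract the audio track of the uploaded video into input.wav.
    audio = ffmpeg.input(video_path).audio
    ffmpeg.output(audio, "input.wav", f="wav").run(overwrite_output=True)
    # Diarize the converted WAV by path, not the .run() return value.
    return startDiarization("input.wav")

gr.Interface(
    prepareInput,
    inputs=inputs_interface,
    outputs="text",
    title="Get Diarization"
).launch()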
diarization.py
ADDED
@@ -0,0 +1,44 @@
+from pyannote.audio import Pipeline
+from pydub import AudioSegment
+import torch
+import json
+
+pipeline = Pipeline.from_pretrained('pyannote/speaker-diarization')
+device = torch.device("cpu")
+pipeline.to(device)
+
+
+def startDiarization(input_file):
+    diarization = pipeline(input_file)
+
+    sample_groups = []
+    speaker_groups = {}
+    for turn, _, speaker in diarization.itertracks(yield_label=True):
+        if (speaker not in sample_groups):
+            sample_groups.append(str(speaker))
+
+        suffix = 1
+        file_name = f"{speaker}-{suffix}"
+        while file_name in speaker_groups:
+            suffix += 1
+            file_name = f"{speaker}-{suffix}"
+        speaker_groups[file_name] = [turn.start, turn.end]
+    saveGroupsJson(sample_groups, speaker_groups)
+    audioSegmentation(input_file, speaker_groups)
+    return str(speaker_groups)
+
+
+def audioSegmentation(input_file, speaker_groups_dict):
+    audioSegment = AudioSegment.from_wav(input_file)
+    for speaker in speaker_groups_dict:
+        time = speaker_groups_dict[speaker]
+        audioSegment[time[0]*1000: time[1] *
+                     1000].export(f"{speaker}.wav", format='wav')
+        print(f"group {speaker}: {time[0]*1000}--{time[1]*1000}")
+
+
+def saveGroupsJson(sample_groups_list: list, speaker_groups_dict: dict):
+    with open("sample_groups.json", "w") as json_file_sample:
+        json.dump(sample_groups_list, json_file_sample)
+    with open("speaker_groups.json", "w") as json_file_speaker:
+        json.dump(speaker_groups_dict, json_file_speaker)
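For reference, a minimal sketch (not part of the commit) of driving startDiarization outside the Space; the "input.wav" path is an assumption, and because the gated pyannote/speaker-diarization pipeline is loaded at import time, a Hugging Face token with access to that model must already be configured (the Space relies on the login() call in app.py for this).

# Sketch: exercise the committed helper on an existing WAV file.
# Importing diarization triggers the pipeline download, so the Hugging Face
# token must be set up beforehand (e.g. via huggingface_hub.login()).
from diarization import startDiarization

groups = startDiarization("input.wav")  # "input.wav" is an assumed path
# Side effects: one "<speaker>-<n>.wav" clip per detected turn, plus
# sample_groups.json and speaker_groups.json in the working directory.
print(groups)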
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+ffmpeg-python
+pyannote @ git+https://github.com/pyannote/pyannote-audio.git@develop
+pydub
+transformers
+torch
+whisper
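Two of these pins may not install the intended libraries: on PyPI, OpenAI's speech-recognition package is published as openai-whisper (the name whisper belongs to an unrelated project), and the pyannote distribution is named pyannote.audio, so pip may reject the "pyannote @ git+..." spelling for a name mismatch. A possible alternative, keeping the same develop-branch pin, is sketched below.

# Sketch of an adjusted requirements.txt (assumes the Space intends to
# install OpenAI Whisper and the pyannote.audio develop branch)
ffmpeg-python
pyannote.audio @ git+https://github.com/pyannote/pyannote-audio.git@develop
pydub
transformers
torch
openai-whisper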