Spaces: Runtime error

Commit 3e533d7
Parent(s): 0d5492e

Implement diarization

Browse files:
- app.py +22 -4
- diarization.py +44 -0
- requirements.txt +6 -0
app.py CHANGED
@@ -1,7 +1,25 @@
(The four lines removed from the previous version are not captured in this view; the lines below are the additions and the surviving context.)
+from huggingface_hub import login
+from diarization import startDiarization
+import ffmpeg
 import gradio as gr
+import os

+hugging_face_token = os.environ["HUGGING_FACE_TOKEN"]
+login(token=hugging_face_token)

+# Get video & convert it
+inputs_interface = gr.inputs.Video(label="Insert video")
+in_file = ffmpeg.input(inputs_interface).audio
+out_file = ffmpeg.output(in_file, "input.wav", f="wav").run()
+
+
+def prepareInput():
+    return startDiarization(out_file)
+
+
+gr.Interface(
+    prepareInput,
+    inputs=inputs_interface,
+    outputs="text",
+    title="Get Diarization"
+).launch()
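As committed, app.py likely explains the Space's "Runtime error" badge: the ffmpeg conversion runs at import time on the gr.inputs.Video component object rather than on an uploaded file's path, and ffmpeg-python's run() returns a (stdout, stderr) tuple, so out_file is not a filename by the time it reaches startDiarization. A minimal corrected sketch, assuming the same Gradio 3.x-era API shown here (where the Video input hands the handler the uploaded file's path):

from huggingface_hub import login
from diarization import startDiarization
import ffmpeg
import gradio as gr
import os

login(token=os.environ["HUGGING_FACE_TOKEN"])

# Gradio passes the uploaded video's file path to the handler
inputs_interface = gr.inputs.Video(label="Insert video")


def prepareInput(video_path):
    # Convert only after a video has actually been uploaded
    audio = ffmpeg.input(video_path).audio
    ffmpeg.output(audio, "input.wav", f="wav").run(overwrite_output=True)
    return startDiarization("input.wav")


gr.Interface(
    prepareInput,
    inputs=inputs_interface,
    outputs="text",
    title="Get Diarization"
).launch()

Moving the conversion inside the handler also means each request overwrites input.wav, instead of the app attempting one conversion at startup before any video exists.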
diarization.py ADDED
@@ -0,0 +1,44 @@
+from pyannote.audio import Pipeline
+from pydub import AudioSegment
+import torch
+import json
+
+pipeline = Pipeline.from_pretrained('pyannote/speaker-diarization')
+device = torch.device("cpu")
+pipeline.to(device)
+
+
+def startDiarization(input_file):
+    diarization = pipeline(input_file)
+
+    sample_groups = []
+    speaker_groups = {}
+    for turn, _, speaker in diarization.itertracks(yield_label=True):
+        if (speaker not in sample_groups):
+            sample_groups.append(str(speaker))
+
+        suffix = 1
+        file_name = f"{speaker}-{suffix}"
+        while file_name in speaker_groups:
+            suffix += 1
+            file_name = f"{speaker}-{suffix}"
+        speaker_groups[file_name] = [turn.start, turn.end]
+    saveGroupsJson(sample_groups, speaker_groups)
+    audioSegmentation(input_file, speaker_groups)
+    return str(speaker_groups)
+
+
+def audioSegmentation(input_file, speaker_groups_dict):
+    audioSegment = AudioSegment.from_wav(input_file)
+    for speaker in speaker_groups_dict:
+        time = speaker_groups_dict[speaker]
+        audioSegment[time[0]*1000: time[1] *
+                     1000].export(f"{speaker}.wav", format='wav')
+        print(f"group {speaker}: {time[0]*1000}--{time[1]*1000}")
+
+
+def saveGroupsJson(sample_groups_list: list, speaker_groups_dict: dict):
+    with open("sample_groups.json", "w") as json_file_sample:
+        json.dump(sample_groups_list, json_file_sample)
+    with open("speaker_groups.json", "w") as json_file_speaker:
+        json.dump(speaker_groups_dict, json_file_speaker)
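The while loop gives every diarized turn its own numbered key, so a speaker with several turns produces SPEAKER_00-1, SPEAKER_00-2, and so on, each mapped to [start, end] in seconds; audioSegmentation then slices the source WAV with pydub, which indexes in milliseconds, hence the *1000. A usage sketch (speaker labels and timestamps below are illustrative, and "input.wav" is assumed to already exist):

from diarization import startDiarization

result = startDiarization("input.wav")
print(result)
# e.g. "{'SPEAKER_00-1': [0.5, 4.2], 'SPEAKER_01-1': [4.2, 9.8], 'SPEAKER_00-2': [9.8, 12.0]}"
# Side effects in the working directory: sample_groups.json,
# speaker_groups.json, and one <speaker>-<n>.wav clip per turn.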
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ffmpeg-python
+pyannote @ git+https://github.com/pyannote/pyannote-audio.git@develop
+pydub
+transformers
+torch
+whisper
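Two of these pins look likely to break the build: pip will typically reject the pyannote line because the project in that repository is named pyannote.audio, not pyannote, and the package published on PyPI as whisper is not OpenAI's speech model (that one ships as openai-whisper). A corrected sketch, assuming OpenAI's Whisper is the intended dependency:

ffmpeg-python
pyannote.audio @ git+https://github.com/pyannote/pyannote-audio.git@develop
pydub
transformers
torch
openai-whisper

Nothing in this commit imports whisper yet, so the last pin presumably anticipates a later transcription step.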