chompionsawelo committed on
Commit 3e533d7 · 1 Parent(s): 0d5492e

Implement diarization

Files changed (3)
  1. app.py +22 -4
  2. diarization.py +44 -0
  3. requirements.txt +6 -0
app.py CHANGED
@@ -1,7 +1,25 @@
+from huggingface_hub import login
+from diarization import startDiarization
+import ffmpeg
 import gradio as gr
+import os

-def greet(name):
-    return "Hello " + name + "!!"
+hugging_face_token = os.environ["HUGGING_FACE_TOKEN"]
+login(token=hugging_face_token)

-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()
+# Get video & convert it
+inputs_interface = gr.inputs.Video(label="Insert video")
+in_file = ffmpeg.input(inputs_interface).audio
+out_file = ffmpeg.output(in_file, "input.wav", f="wav").run()
+
+
+def prepareInput():
+    return startDiarization(out_file)
+
+
+gr.Interface(
+    prepareInput,
+    inputs=inputs_interface,
+    outputs="text",
+    title="Get Diarization"
+).launch()
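Not part of this commit: a minimal sketch, assuming the uploaded video's file path is what Gradio passes to the callback, of how the same ffmpeg audio extraction and diarization call could run inside the handler (the convert_and_diarize name and video_path argument are illustrative):

import ffmpeg
import gradio as gr
from diarization import startDiarization


def convert_and_diarize(video_path):
    # Extract the audio track from the uploaded video and write it as input.wav.
    ffmpeg.input(video_path).audio.output("input.wav", f="wav").run(overwrite_output=True)
    # Hand the converted WAV file to the diarization pipeline.
    return startDiarization("input.wav")


gr.Interface(convert_and_diarize, inputs="video", outputs="text", title="Get Diarization").launch()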
diarization.py ADDED
@@ -0,0 +1,44 @@
+from pyannote.audio import Pipeline
+from pydub import AudioSegment
+import torch
+import json
+
+pipeline = Pipeline.from_pretrained('pyannote/speaker-diarization')
+device = torch.device("cpu")
+pipeline.to(device)
+
+
+def startDiarization(input_file):
+    diarization = pipeline(input_file)
+
+    sample_groups = []
+    speaker_groups = {}
+    for turn, _, speaker in diarization.itertracks(yield_label=True):
+        if (speaker not in sample_groups):
+            sample_groups.append(str(speaker))
+
+        suffix = 1
+        file_name = f"{speaker}-{suffix}"
+        while file_name in speaker_groups:
+            suffix += 1
+            file_name = f"{speaker}-{suffix}"
+        speaker_groups[file_name] = [turn.start, turn.end]
+    saveGroupsJson(sample_groups, speaker_groups)
+    audioSegmentation(input_file, speaker_groups)
+    return str(speaker_groups)
+
+
+def audioSegmentation(input_file, speaker_groups_dict):
+    audioSegment = AudioSegment.from_wav(input_file)
+    for speaker in speaker_groups_dict:
+        time = speaker_groups_dict[speaker]
+        audioSegment[time[0]*1000: time[1] *
+                     1000].export(f"{speaker}.wav", format='wav')
+        print(f"group {speaker}: {time[0]*1000}--{time[1]*1000}")
+
+
+def saveGroupsJson(sample_groups_list: list, speaker_groups_dict: dict):
+    with open("sample_groups.json", "w") as json_file_sample:
+        json.dump(sample_groups_list, json_file_sample)
+    with open("speaker_groups.json", "w") as json_file_speaker:
+        json.dump(speaker_groups_dict, json_file_speaker)
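A minimal usage sketch (the input.wav file name and the printed values are illustrative) of how startDiarization is called and what it produces:

from diarization import startDiarization

# Runs the pyannote pipeline on the WAV file, writes sample_groups.json and
# speaker_groups.json, exports one <SPEAKER>-<n>.wav clip per turn, and
# returns the speaker/turn dictionary as a string.
result = startDiarization("input.wav")
print(result)  # e.g. "{'SPEAKER_00-1': [0.2, 3.4], 'SPEAKER_01-1': [3.6, 7.9]}"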
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ffmpeg-python
+pyannote @ git+https://github.com/pyannote/pyannote-audio.git@develop
+pydub
+transformers
+torch
+whisper