chompionsawelo committed
Commit cb85517 · 1 parent: e2d8d82
Files changed (3):
  1. app.py +6 -6
  2. diarization.py +16 -21
  3. transcribe.py +2 -2
app.py CHANGED
@@ -3,24 +3,24 @@ from diarization import start_diarization
 from transcribe import start_transcribe
 import ffmpeg
 import gradio as gr
-import os
 
 
-def prepare_input(input_file):
+def prepare_input(input_file, progress=gr.Progress()):
     output_file = "input.wav"
+    progress(0.2, desc="Preparing video")
     ffmpeg.input(input_file).audio.output(
         output_file, format="wav").run()
-
-    progress = gr.Progress()
+    progress(0.4, desc="Acquiring diarization")
     start_diarization(output_file, progress)
-    # return start_transcribe(progress)
+    progress(0.6, desc="Transcribing audio")
+    return start_transcribe(progress)
 
 
 video_interface = gr.Interface(
     fn=prepare_input,
     inputs=gr.Video(type="file"),
     outputs="text",
-    title="Test 1"
+    title="Test 2"
 )
 
 if __name__ == "__main__":
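
For context: when a Gradio handler declares a parameter whose default is gr.Progress(), Gradio swaps in a live progress tracker at call time, which is why prepare_input no longer constructs one inside the body. A minimal self-contained sketch of the same pattern; the process function, its inputs, and the sleep are illustrative stand-ins, not part of this repo:

import time

import gradio as gr


def process(file, progress=gr.Progress()):
    # Gradio replaces the default with a tracker bound to this request
    progress(0.1, desc="Starting")
    for _ in progress.tqdm(range(3), desc="Working"):
        time.sleep(0.1)  # stand-in for real work
    return "done"


demo = gr.Interface(fn=process, inputs=gr.File(), outputs="text")

if __name__ == "__main__":
    demo.launch()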
diarization.py CHANGED
@@ -5,8 +5,7 @@ import os
 import torch
 import json
 
-# hugging_face_token = os.environ["HUGGING_FACE_TOKEN"]
-hugging_face_token = "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"
+hugging_face_token = os.environ["HUGGING_FACE_TOKEN"]
 pipeline = Pipeline.from_pretrained(
     'pyannote/speaker-diarization', use_auth_token=hugging_face_token)
 device = torch.device("cuda")
@@ -14,29 +13,25 @@ pipeline.to(device)
 
 
 def start_diarization(input_file, progress: gr.Progress):
-    print("Starting diarization")
-    progress(0, desc="Starting diarization")
     diarization = pipeline(input_file)
 
     sample_groups = []
     speaker_groups = {}
-    print(str(diarization))
-    # for turn, _, speaker in diarization.itertracks(yield_label=True):
-    #     print(diarization)
-    #     # for step in progress.tqdm(diarization.)
-
-    #     if (speaker not in sample_groups):
-    #         sample_groups.append(str(speaker))
-
-    #     suffix = 1
-    #     file_name = f"{speaker}-{suffix}"
-    #     while file_name in speaker_groups:
-    #         suffix += 1
-    #         file_name = f"{speaker}-{suffix}"
-    #     speaker_groups[file_name] = [turn.start, turn.end]
-
-    #     print(f"speaker_groups {file_name}: {speaker_groups[file_name]}")
-    #     print(f"start={turn.start:.3f}s stop={turn.end:.3f}s speaker_{speaker}")
+    iterables = list(diarization.itertracks(yield_label=True))
+    for turn, _, speaker in progress.tqdm(iterables, desc="Processing diarization"):
+        if (speaker not in sample_groups):
+            sample_groups.append(str(speaker))
+
+        suffix = 1
+        file_name = f"{speaker}-{suffix}"
+        while file_name in speaker_groups:
+            suffix += 1
+            file_name = f"{speaker}-{suffix}"
+        speaker_groups[file_name] = [turn.start, turn.end]
+
+        print(f"speaker_groups {file_name}: {speaker_groups[file_name]}")
+        print(
+            f"start={turn.start:.3f}s stop={turn.end:.3f}s speaker_{speaker}")
 
     save_groups_json(sample_groups, speaker_groups)
     audio_segmentation(input_file, speaker_groups)
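
For reference, pyannote's itertracks(yield_label=True) yields (segment, track_id, label) triples, and the loop above keys each turn as label-n so repeated turns by the same speaker get distinct file names. A minimal sketch of that grouping logic on a hand-built annotation (hypothetical timings, no model or token required):

from pyannote.core import Annotation, Segment

# Hypothetical annotation standing in for real pipeline output
diarization = Annotation()
diarization[Segment(0.0, 1.5)] = "SPEAKER_00"
diarization[Segment(1.5, 3.0)] = "SPEAKER_01"
diarization[Segment(3.0, 4.2)] = "SPEAKER_00"

speaker_groups = {}
for turn, _, speaker in diarization.itertracks(yield_label=True):
    suffix = 1
    file_name = f"{speaker}-{suffix}"
    while file_name in speaker_groups:
        suffix += 1
        file_name = f"{speaker}-{suffix}"
    speaker_groups[file_name] = [turn.start, turn.end]

print(speaker_groups)
# {'SPEAKER_00-1': [0.0, 1.5], 'SPEAKER_01-1': [1.5, 3.0], 'SPEAKER_00-2': [3.0, 4.2]}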
transcribe.py CHANGED
@@ -10,8 +10,8 @@ model = WhisperModel("medium", device="cuda", compute_type="int8_float16")
 
 
 def start_transcribe(progress):
-    sample_groups, speaker_groups = load_groups_json()
-    for speaker in speaker_groups:
+    _, speaker_groups = load_groups_json()
+    for speaker in progress.tqdm(speaker_groups, desc="Processing transcription"):
         # Transcribe and save temp file
         audiof = f"{speaker}.wav"
         print(f"Loading {audiof}")