reab5555 committed on
Commit
ad16427
·
verified ·
1 Parent(s): 0377381

Update voice_analysis.py

Browse files
Files changed (1) hide show
  1. voice_analysis.py +28 -2
voice_analysis.py CHANGED
@@ -22,9 +22,20 @@ def diarize_speakers(audio_path):
22
 
23
  pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=hf_token)
24
  diarization = pipeline(audio_path)
25
- return diarization
26
 
27
- def get_speaker_embeddings(audio_path, diarization, model_name="pyannote/embedding"):
 
 
 
 
 
 
 
 
 
 
 
 
28
  model = Model.from_pretrained(model_name, use_auth_token=os.environ.get("py_annote_hf_token"))
29
  waveform, sample_rate = torchaudio.load(audio_path)
30
  duration = waveform.shape[1] / sample_rate
@@ -39,6 +50,9 @@ def get_speaker_embeddings(audio_path, diarization, model_name="pyannote/embeddi
39
 
40
  embeddings = []
41
  for turn, _, speaker in diarization.itertracks(yield_label=True):
 
 
 
42
  start_frame = int(turn.start * sample_rate)
43
  end_frame = int(turn.end * sample_rate)
44
  segment = waveform[:, start_frame:end_frame]
@@ -79,6 +93,18 @@ def get_speaker_embeddings(audio_path, diarization, model_name="pyannote/embeddi
79
 
80
  return embeddings, duration
81
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  def align_voice_embeddings(voice_embeddings, frame_count, fps, audio_duration):
83
  aligned_embeddings = []
84
  current_embedding_index = 0
 
22
 
23
  pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=hf_token)
24
  diarization = pipeline(audio_path)
 
25
 
26
+ # Identify the most frequent speaker
27
+ speaker_segments = {}
28
+ for turn, _, speaker in diarization.itertracks(yield_label=True):
29
+ if speaker not in speaker_segments:
30
+ speaker_segments[speaker] = 0
31
+ speaker_segments[speaker] += turn.end - turn.start
32
+
33
+ most_frequent_speaker = max(speaker_segments, key=speaker_segments.get)
34
+
35
+ return diarization, most_frequent_speaker
36
+
37
+
38
+ def get_speaker_embeddings(audio_path, diarization, most_frequent_speaker, model_name="pyannote/embedding"):
39
  model = Model.from_pretrained(model_name, use_auth_token=os.environ.get("py_annote_hf_token"))
40
  waveform, sample_rate = torchaudio.load(audio_path)
41
  duration = waveform.shape[1] / sample_rate
 
50
 
51
  embeddings = []
52
  for turn, _, speaker in diarization.itertracks(yield_label=True):
53
+ if speaker != most_frequent_speaker:
54
+ continue
55
+
56
  start_frame = int(turn.start * sample_rate)
57
  end_frame = int(turn.end * sample_rate)
58
  segment = waveform[:, start_frame:end_frame]
 
93
 
94
  return embeddings, duration
95
 
96
+
97
+ # Ensure embeddings cover the entire duration
98
+ if embeddings and embeddings[-1]['time'] + embeddings[-1]['duration'] < duration:
99
+ embeddings.append({
100
+ "time": duration,
101
+ "duration": 0,
102
+ "embedding": np.zeros_like(embeddings[0]['embedding']),
103
+ "speaker": "silence"
104
+ })
105
+
106
+ return embeddings, duration
107
+
108
  def align_voice_embeddings(voice_embeddings, frame_count, fps, audio_duration):
109
  aligned_embeddings = []
110
  current_embedding_index = 0