reab5555 committed
Commit 793c2d4 · verified · 1 Parent(s): f4e99d6

Update voice_analysis.py

Files changed (1): voice_analysis.py +11 -21
voice_analysis.py CHANGED
@@ -91,26 +91,8 @@ def get_speaker_embeddings(audio_path, diarization, most_frequent_speaker, model
             "speaker": speaker
         })
 
-    # Ensure embeddings cover the entire duration
-    if embeddings and embeddings[-1]['time'] + embeddings[-1]['duration'] < duration:
-        embeddings.append({
-            "time": duration,
-            "duration": 0,
-            "embedding": np.zeros_like(embeddings[0]['embedding']),
-            "speaker": "silence"
-        })
-
-    return embeddings, duration
-
-
-    # Ensure embeddings cover the entire duration
-    if embeddings and embeddings[-1]['time'] + embeddings[-1]['duration'] < duration:
-        embeddings.append({
-            "time": duration,
-            "duration": 0,
-            "embedding": np.zeros_like(embeddings[0]['embedding']),
-            "speaker": "silence"
-        })
+    # Sort embeddings by time
+    embeddings.sort(key=lambda x: x['time'])
 
     return embeddings, duration
 
@@ -121,10 +103,18 @@ def align_voice_embeddings(voice_embeddings, frame_count, fps, audio_duration):
     for frame in range(frame_count):
         frame_time = frame / fps
 
+        # Find the correct embedding for the current frame time
         while (current_embedding_index < len(voice_embeddings) - 1 and
                voice_embeddings[current_embedding_index + 1]["time"] <= frame_time):
             current_embedding_index += 1
 
-        aligned_embeddings.append(voice_embeddings[current_embedding_index]["embedding"].flatten())
+        current_embedding = voice_embeddings[current_embedding_index]
+
+        # Check if the current frame is within the most frequent speaker's time range
+        if current_embedding["time"] <= frame_time < (current_embedding["time"] + current_embedding["duration"]):
+            aligned_embeddings.append(current_embedding["embedding"].flatten())
+        else:
+            # If not in the speaker's range, append a zero vector
+            aligned_embeddings.append(np.zeros_like(voice_embeddings[0]["embedding"].flatten()))
 
     return aligned_embeddings
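
For reference, a minimal, self-contained sketch of the behaviour this commit gives the alignment step: speaker segments are sorted by start time, and each video frame receives either the embedding of the segment covering its timestamp or a zero vector when the frame falls outside every segment. The helper name demo_align, the dummy segments, and the fps/frame_count values below are illustrative only and are not part of the repository.

import numpy as np

def demo_align(voice_embeddings, frame_count, fps):
    """Illustrative re-implementation of the alignment logic added in this commit."""
    # Sort segments by start time, mirroring the change in get_speaker_embeddings
    voice_embeddings = sorted(voice_embeddings, key=lambda x: x["time"])

    aligned = []
    idx = 0
    for frame in range(frame_count):
        frame_time = frame / fps

        # Advance to the last segment that starts at or before this frame
        while (idx < len(voice_embeddings) - 1 and
               voice_embeddings[idx + 1]["time"] <= frame_time):
            idx += 1

        seg = voice_embeddings[idx]
        if seg["time"] <= frame_time < seg["time"] + seg["duration"]:
            aligned.append(seg["embedding"].flatten())
        else:
            # Frame falls outside any speaker segment: pad with a zero vector
            aligned.append(np.zeros_like(voice_embeddings[0]["embedding"].flatten()))
    return aligned

# Dummy data: two 1-second speaker segments with 4-dim embeddings, 10 fps video
segments = [
    {"time": 0.0, "duration": 1.0, "embedding": np.ones(4), "speaker": "SPEAKER_00"},
    {"time": 2.0, "duration": 1.0, "embedding": np.full(4, 2.0), "speaker": "SPEAKER_00"},
]
frames = demo_align(segments, frame_count=30, fps=10)
print(len(frames))            # 30 vectors, one per frame
print(frames[5], frames[15])  # embedding inside a segment vs. zero vector in the gap

With these dummy values, frames 0-9 and 20-29 fall inside the two segments and keep their embeddings, while frames 10-19 in the silent gap receive zero vectors, matching the else branch introduced by the diff above.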