reab5555 committed
Commit 793c2d4 · verified · 1 Parent(s): f4e99d6

Update voice_analysis.py

Files changed (1): voice_analysis.py +11 -21
voice_analysis.py CHANGED
@@ -91,26 +91,8 @@ def get_speaker_embeddings(audio_path, diarization, most_frequent_speaker, model
             "speaker": speaker
         })
 
-    # Ensure embeddings cover the entire duration
-    if embeddings and embeddings[-1]['time'] + embeddings[-1]['duration'] < duration:
-        embeddings.append({
-            "time": duration,
-            "duration": 0,
-            "embedding": np.zeros_like(embeddings[0]['embedding']),
-            "speaker": "silence"
-        })
-
-    return embeddings, duration
-
-
-    # Ensure embeddings cover the entire duration
-    if embeddings and embeddings[-1]['time'] + embeddings[-1]['duration'] < duration:
-        embeddings.append({
-            "time": duration,
-            "duration": 0,
-            "embedding": np.zeros_like(embeddings[0]['embedding']),
-            "speaker": "silence"
-        })
+    # Sort embeddings by time
+    embeddings.sort(key=lambda x: x['time'])
 
     return embeddings, duration
 
@@ -121,10 +103,18 @@ def align_voice_embeddings(voice_embeddings, frame_count, fps, audio_duration):
     for frame in range(frame_count):
         frame_time = frame / fps
 
+        # Find the correct embedding for the current frame time
         while (current_embedding_index < len(voice_embeddings) - 1 and
                voice_embeddings[current_embedding_index + 1]["time"] <= frame_time):
             current_embedding_index += 1
 
-        aligned_embeddings.append(voice_embeddings[current_embedding_index]["embedding"].flatten())
+        current_embedding = voice_embeddings[current_embedding_index]
+
+        # Check if the current frame is within the most frequent speaker's time range
+        if current_embedding["time"] <= frame_time < (current_embedding["time"] + current_embedding["duration"]):
+            aligned_embeddings.append(current_embedding["embedding"].flatten())
+        else:
+            # If not in the speaker's range, append a zero vector
+            aligned_embeddings.append(np.zeros_like(voice_embeddings[0]["embedding"].flatten()))
 
     return aligned_embeddings
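
For reference, a minimal, self-contained sketch of the behaviour this commit gives the alignment step: speaker segments are sorted by start time, and each video frame receives either the embedding of the segment covering its timestamp or a zero vector when the frame falls outside every segment. The helper name demo_align, the dummy segments, and the fps/frame_count values below are illustrative only and are not part of the repository.

import numpy as np

def demo_align(voice_embeddings, frame_count, fps):
    """Illustrative re-implementation of the alignment logic added in this commit."""
    # Sort segments by start time, mirroring the change in get_speaker_embeddings
    voice_embeddings = sorted(voice_embeddings, key=lambda x: x["time"])

    aligned = []
    idx = 0
    for frame in range(frame_count):
        frame_time = frame / fps

        # Advance to the last segment that starts at or before this frame
        while (idx < len(voice_embeddings) - 1 and
               voice_embeddings[idx + 1]["time"] <= frame_time):
            idx += 1

        seg = voice_embeddings[idx]
        if seg["time"] <= frame_time < seg["time"] + seg["duration"]:
            aligned.append(seg["embedding"].flatten())
        else:
            # Frame falls outside any speaker segment: pad with a zero vector
            aligned.append(np.zeros_like(voice_embeddings[0]["embedding"].flatten()))
    return aligned

# Dummy data: two 1-second speaker segments with 4-dim embeddings, 10 fps video
segments = [
    {"time": 0.0, "duration": 1.0, "embedding": np.ones(4), "speaker": "SPEAKER_00"},
    {"time": 2.0, "duration": 1.0, "embedding": np.full(4, 2.0), "speaker": "SPEAKER_00"},
]
frames = demo_align(segments, frame_count=30, fps=10)
print(len(frames))            # 30 vectors, one per frame
print(frames[5], frames[15])  # embedding inside a segment vs. zero vector in the gap

With these dummy values, frames 0-9 and 20-29 fall inside the two segments and keep their embeddings, while frames 10-19 in the silent gap receive zero vectors, matching the else branch introduced by the diff above.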