Spaces:

reab5555
/

Multimodal-Behavioral-Anomalies-Detection

Running

App Files Files Community

reab5555 committed on Jul 28, 2024

Commit

08d515b

verified ·

1 Parent(s): 931d60e

Update voice_analysis.py

Browse files

Files changed (1) hide show

voice_analysis.py +32 -6

voice_analysis.py CHANGED Viewed

@@ -32,6 +32,10 @@ def get_speaker_embeddings(audio_path, diarization, model_name="pyannote/embeddi
     if waveform.shape[0] == 2:
         waveform = torch.mean(waveform, dim=0, keepdim=True)
     embeddings = []
     for turn, _, speaker in diarization.itertracks(yield_label=True):
         start_frame = int(turn.start * sample_rate)
@@ -39,16 +43,38 @@ def get_speaker_embeddings(audio_path, diarization, model_name="pyannote/embeddi
         segment = waveform[:, start_frame:end_frame]
         if segment.shape[1] > 0:
-            # Ensure the segment is on the correct device
-            segment = segment.to(model.device)
-            with torch.no_grad():
-                embedding = model(segment)
-            embeddings.append({"time": turn.start, "duration": turn.duration, "embedding": embedding.cpu().numpy(), "speaker": speaker})
     # Ensure embeddings cover the entire duration
     if embeddings and embeddings[-1]['time'] + embeddings[-1]['duration'] < duration:
-        embeddings.append({"time": duration, "duration": 0, "embedding": np.zeros_like(embeddings[0]['embedding']), "speaker": "silence"})
     return embeddings, duration

     if waveform.shape[0] == 2:
         waveform = torch.mean(waveform, dim=0, keepdim=True)
+    # Minimum segment duration (in seconds)
+    min_segment_duration = 0.5
+    min_segment_length = int(min_segment_duration * sample_rate)
     embeddings = []
     for turn, _, speaker in diarization.itertracks(yield_label=True):
         start_frame = int(turn.start * sample_rate)
         segment = waveform[:, start_frame:end_frame]
         if segment.shape[1] > 0:
+            # Pad short segments
+            if segment.shape[1] < min_segment_length:
+                padding = torch.zeros(1, min_segment_length - segment.shape[1])
+                segment = torch.cat([segment, padding], dim=1)
+            # Split long segments
+            for i in range(0, segment.shape[1], min_segment_length):
+                sub_segment = segment[:, i:i+min_segment_length]
+                if sub_segment.shape[1] < min_segment_length:
+                    padding = torch.zeros(1, min_segment_length - sub_segment.shape[1])
+                    sub_segment = torch.cat([sub_segment, padding], dim=1)
+                # Ensure the segment is on the correct device
+                sub_segment = sub_segment.to(model.device)
+                with torch.no_grad():
+                    embedding = model(sub_segment)
+                embeddings.append({
+                    "time": turn.start + i / sample_rate,
+                    "duration": min_segment_duration,
+                    "embedding": embedding.cpu().numpy(),
+                    "speaker": speaker
+                })
     # Ensure embeddings cover the entire duration
     if embeddings and embeddings[-1]['time'] + embeddings[-1]['duration'] < duration:
+        embeddings.append({
+            "time": duration,
+            "duration": 0,
+            "embedding": np.zeros_like(embeddings[0]['embedding']),
+            "speaker": "silence"
+        })
     return embeddings, duration