Spaces:

reab5555
/

Multimodal-Behavioral-Anomalies-Detection

Running

reab5555 committed on Jul 28, 2024

Commit

7c1ee96

verified ·

1 Parent(s): 7bbb7f4

Update voice_analysis.py

Files changed (1) hide show

voice_analysis.py CHANGED Viewed

@@ -30,20 +30,26 @@ def get_speaker_embeddings(audio_path, diarization, model_name="pyannote/embeddi
         raise ValueError("py_annote_hf_token environment variable is not set. Please check your Hugging Face Space's Variables and secrets section.")
     model = Model.from_pretrained(model_name, use_auth_token=hf_token)
     waveform, sample_rate = torchaudio.load(audio_path)
     embeddings = []
     for turn, _, speaker in diarization.itertracks(yield_label=True):
-        start = int(turn.start * sample_rate)
-        end = int(turn.end * sample_rate)
-        segment = waveform[:, start:end]
         if segment.shape[1] == 0:
             continue
         with torch.no_grad():
-            embedding = model({"waveform": segment, "sample_rate": sample_rate})
         embeddings.append({"time": turn.start, "embedding": embedding.squeeze().cpu().numpy(), "speaker": speaker})

         raise ValueError("py_annote_hf_token environment variable is not set. Please check your Hugging Face Space's Variables and secrets section.")
     model = Model.from_pretrained(model_name, use_auth_token=hf_token)
+    model.eval()  # Set the model to evaluation mode
     waveform, sample_rate = torchaudio.load(audio_path)
     embeddings = []
     for turn, _, speaker in diarization.itertracks(yield_label=True):
+        start_frame = int(turn.start * sample_rate)
+        end_frame = int(turn.end * sample_rate)
+        segment = waveform[:, start_frame:end_frame]
         if segment.shape[1] == 0:
             continue
+        # Ensure the segment is long enough (at least 1 second)
+        if segment.shape[1] < sample_rate:
+            padding = torch.zeros(1, sample_rate - segment.shape[1])
+            segment = torch.cat([segment, padding], dim=1)
         with torch.no_grad():
+            embedding = model(segment)  # Pass the tensor directly, not a dictionary
         embeddings.append({"time": turn.start, "embedding": embedding.squeeze().cpu().numpy(), "speaker": speaker})