reab5555 committed on
Commit a4db1a6 · verified · 1 Parent(s): 9998b9d

Update voice_analysis.py

Files changed (1):
  1. voice_analysis.py +43 -45
voice_analysis.py CHANGED
@@ -1,52 +1,50 @@
-import numpy as np
-import librosa
-from sklearn.cluster import DBSCAN
-
-def extract_voice_features(audio_path, fps, video_duration):
-    # Load the audio file
-    y, sr = librosa.load(audio_path)
-
-    # Calculate the number of samples per frame
-    samples_per_frame = int(sr / fps)
-
-    # Calculate the total number of frames
-    total_frames = int(fps * video_duration)
-
-    # Extract MFCC features
-    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
-
-    # Segment the MFCCs to align with video frames
-    segments = []
-    for i in range(total_frames):
-        start = i * samples_per_frame
-        end = start + samples_per_frame
-        if end > mfccs.shape[1]:
-            break
-        segment = mfccs[:, start:end]
-        segments.append(np.mean(segment, axis=1))
-
-    return np.array(segments)
-
-def cluster_voices(features):
-    if len(features) < 2:
-        print("Not enough voice segments for clustering. Assigning all to one cluster.")
-        return np.zeros(len(features), dtype=int)
-
-    dbscan = DBSCAN(eps=0.5, min_samples=5, metric='euclidean')
-    clusters = dbscan.fit_predict(features)
-
-    if np.all(clusters == -1):
-        print("DBSCAN assigned all to noise. Considering as one cluster.")
-        return np.zeros(len(features), dtype=int)
-
-    return clusters
-
-def get_most_frequent_voice(features, clusters):
-    largest_cluster = max(set(clusters), key=list(clusters).count)
-    return features[clusters == largest_cluster]
-
-def process_audio(audio_path, fps, video_duration):
-    features = extract_voice_features(audio_path, fps, video_duration)
-    clusters = cluster_voices(features)
-    most_frequent_voice = get_most_frequent_voice(features, clusters)
-    return most_frequent_voice, features, clusters
+import numpy as np  # needed for np.array in align_voice_embeddings
+import moviepy.editor as mp
+from pyannote.audio import Pipeline
+import torch
+import torchaudio
+from pyannote.core import Segment
+
+def extract_audio_from_video(video_path):
+    video = mp.VideoFileClip(video_path)
+    audio_path = video_path.rsplit('.', 1)[0] + '.wav'
+    video.audio.write_audiofile(audio_path)
+    return audio_path
+
+def diarize_speakers(audio_path):
+    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1", use_auth_token="YOUR_HF_TOKEN")
+    diarization = pipeline(audio_path)
+    return diarization
+
+def get_speaker_embeddings(audio_path, diarization, model):
+    waveform, sample_rate = torchaudio.load(audio_path)
+    embeddings = []
+
+    for turn, _, speaker in diarization.itertracks(yield_label=True):
+        # Convert the turn boundaries from seconds to sample indices
+        start = int(turn.start * sample_rate)
+        end = int(turn.end * sample_rate)
+
+        segment = waveform[:, start:end]
+        if segment.shape[1] == 0:
+            continue
+
+        # Compute a speaker embedding for this diarized turn
+        with torch.no_grad():
+            embedding = model({"waveform": segment, "sample_rate": sample_rate})
+
+        embeddings.append({"time": turn.start, "embedding": embedding.squeeze().cpu().numpy(), "speaker": speaker})
+
+    return embeddings
+
+def align_voice_embeddings(voice_embeddings, frame_count, fps):
+    aligned_embeddings = []
+    current_embedding_index = 0
+
+    for frame in range(frame_count):
+        frame_time = frame / fps
+
+        # Advance to the most recent turn that starts at or before this frame's timestamp
+        while (current_embedding_index < len(voice_embeddings) - 1 and
+               voice_embeddings[current_embedding_index + 1]["time"] <= frame_time):
+            current_embedding_index += 1
+
+        aligned_embeddings.append(voice_embeddings[current_embedding_index]["embedding"])
+
+    return np.array(aligned_embeddings)
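
The updated module no longer exposes a process_audio entry point, so a caller has to chain the new helpers itself. The following is a minimal, hypothetical sketch of that chaining and is not part of this commit: the input path, the "YOUR_HF_TOKEN" placeholder, and the choice of "pyannote/embedding" wrapped in pyannote's Inference are assumptions; get_speaker_embeddings only requires that `model` accept a {"waveform", "sample_rate"} dict and return a torch tensor.

# Hypothetical usage sketch for the new voice_analysis.py (not part of the commit).
import moviepy.editor as mp
import torch
from pyannote.audio import Inference, Model

from voice_analysis import (
    extract_audio_from_video,
    diarize_speakers,
    get_speaker_embeddings,
    align_voice_embeddings,
)

# Assumed embedding backend: pyannote's Inference accepts a {"waveform", "sample_rate"}
# dict and, with window="whole", returns a numpy vector, so a thin wrapper converts it
# to the torch tensor that get_speaker_embeddings expects to call .cpu() on.
_inference = Inference(
    Model.from_pretrained("pyannote/embedding", use_auth_token="YOUR_HF_TOKEN"),
    window="whole",
)

def embedding_model(file):
    return torch.tensor(_inference(file))

video_path = "example.mp4"                          # hypothetical input video
audio_path = extract_audio_from_video(video_path)   # writes example.wav next to it

diarization = diarize_speakers(audio_path)
voice_embeddings = get_speaker_embeddings(audio_path, diarization, embedding_model)

# Take frame count and fps from the same clip so one embedding lines up with each frame
clip = mp.VideoFileClip(video_path)
fps = clip.fps
frame_count = int(clip.fps * clip.duration)

aligned = align_voice_embeddings(voice_embeddings, frame_count, fps)
print(aligned.shape)  # (frame_count, embedding_dimension)

Because align_voice_embeddings carries the most recent turn's embedding forward, frames before the first diarized turn reuse the first embedding and frames after the last turn reuse the last one.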