Update video_processing.py
Browse files - video_processing.py +6 -2
video_processing.py
CHANGED
@@ -154,9 +154,11 @@ def process_video(video_path, anomaly_threshold, desired_fps, progress=None):
|
|
154 |
face_samples = get_all_face_samples(organized_faces_folder, output_folder, largest_cluster)
|
155 |
|
156 |
progress(0.8, "Extracting audio and performing voice analysis")
|
|
|
157 |
audio_path = extract_audio_from_video(video_path)
|
158 |
diarization = diarize_speakers(audio_path)
|
159 |
-
voice_embeddings = get_speaker_embeddings(audio_path, diarization, "pyannote/embedding")
|
|
|
160 |
|
161 |
progress(0.85, "Performing anomaly detection")
|
162 |
embedding_columns = [col for col in df.columns if col.startswith('Raw_Embedding_')]
|
@@ -170,7 +172,9 @@ def process_video(video_path, anomaly_threshold, desired_fps, progress=None):
|
|
170 |
if len(X_posture) == 0:
|
171 |
raise ValueError("No valid posture data found")
|
172 |
|
173 |
-
|
|
|
|
|
174 |
|
175 |
mse_embeddings, mse_posture, mse_voice = anomaly_detection(X_embeddings, X_posture, X_voice)
|
176 |
|
|
|
154 |
face_samples = get_all_face_samples(organized_faces_folder, output_folder, largest_cluster)
|
155 |
|
156 |
progress(0.8, "Extracting audio and performing voice analysis")
|
157 |
+
|
158 |
audio_path = extract_audio_from_video(video_path)
|
159 |
diarization = diarize_speakers(audio_path)
|
160 |
+
voice_embeddings, audio_duration = get_speaker_embeddings(audio_path, diarization, "pyannote/embedding")
|
161 |
+
|
162 |
|
163 |
progress(0.85, "Performing anomaly detection")
|
164 |
embedding_columns = [col for col in df.columns if col.startswith('Raw_Embedding_')]
|
|
|
172 |
if len(X_posture) == 0:
|
173 |
raise ValueError("No valid posture data found")
|
174 |
|
175 |
+
aligned_voice_embeddings = align_voice_embeddings(voice_embeddings, frame_count, original_fps, audio_duration)
|
176 |
+
|
177 |
+
X_voice = np.array([emb for emb in aligned_voice_embeddings])
|
178 |
|
179 |
mse_embeddings, mse_posture, mse_voice = anomaly_detection(X_embeddings, X_posture, X_voice)
|
180 |
|