Spaces:
Runtime error
Runtime error
Update video_processing.py
Browse files- video_processing.py +36 -74
video_processing.py
CHANGED
@@ -8,6 +8,7 @@ from PIL import Image, ImageDraw, ImageFont
|
|
8 |
import math
|
9 |
from face_analysis import get_face_embedding, cluster_faces, organize_faces_by_person, draw_facial_landmarks
|
10 |
from pose_analysis import pose, calculate_posture_score, draw_pose_landmarks
|
|
|
11 |
from anomaly_detection import anomaly_detection
|
12 |
from visualization import plot_mse, plot_mse_histogram, plot_mse_heatmap, create_video_with_heatmap
|
13 |
from utils import frame_to_timecode
|
@@ -15,6 +16,7 @@ import pandas as pd
|
|
15 |
from facenet_pytorch import MTCNN
|
16 |
import torch
|
17 |
import mediapipe as mp
|
|
|
18 |
|
19 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
20 |
mtcnn = MTCNN(keep_all=False, device=device, thresholds=[0.9, 0.9, 0.9], min_face_size=50)
|
@@ -22,71 +24,6 @@ mtcnn = MTCNN(keep_all=False, device=device, thresholds=[0.9, 0.9, 0.9], min_fac
|
|
22 |
mp_face_mesh = mp.solutions.face_mesh
|
23 |
face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1, min_detection_confidence=0.8)
|
24 |
|
25 |
-
def extract_frames(video_path, output_folder, desired_fps, progress_callback=None):
    """Sample frames from a video at roughly ``desired_fps`` and save them as JPEGs.

    Frames are written to ``output_folder`` as ``frame_0000.jpg``,
    ``frame_0001.jpg``, ... in sampling order.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory to write frames into; created if missing.
        desired_fps: Target sampling rate. A rate above the clip's native
            frame rate is capped at the native rate (the step never drops
            below one source frame).
        progress_callback: Optional callable invoked after every saved frame
            as ``progress_callback(percent, message)``, with ``percent``
            capped at 100.

    Returns:
        A ``(frame_count, original_fps)`` tuple: the number of frames written
        and the clip's native frame rate.
    """
    os.makedirs(output_folder, exist_ok=True)
    clip = VideoFileClip(video_path)
    try:
        original_fps = clip.fps
        duration = clip.duration
        total_frames = int(duration * original_fps)
        # Step is measured in source frames; never sample faster than the source.
        step = max(1, original_fps / desired_fps)
        total_frames_to_extract = int(total_frames / step)
        # Very short clips can round down to zero expected frames; keep the
        # progress denominator at 1 so the percentage cannot divide by zero.
        progress_denominator = max(1, total_frames_to_extract)

        frame_count = 0
        for t in np.arange(0, duration, step / original_fps):
            frame = clip.get_frame(t)
            frame_path = os.path.join(output_folder, f"frame_{frame_count:04d}.jpg")
            # Frames are converted RGB -> BGR, the channel order cv2.imwrite expects.
            cv2.imwrite(frame_path, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
            frame_count += 1
            if progress_callback:
                percent = min(100, (frame_count / progress_denominator) * 100)
                progress_callback(percent, "Extracting frame")
            # Floating-point time steps may yield one sample too many; stop at
            # the expected count.
            if frame_count >= total_frames_to_extract:
                break
    finally:
        # Release the reader even when reading or writing a frame fails.
        clip.close()
    return frame_count, original_fps
|
46 |
-
|
47 |
-
def process_frames(frames_folder, aligned_faces_folder, frame_count, progress):
    """Run posture and face analysis over every extracted frame image.

    For each ``frame_<n>.jpg`` in ``frames_folder`` the posture score and
    landmarks are recorded. When a confident, frontal face is found, a
    160x160 crop is saved to ``aligned_faces_folder`` and its embedding is
    stored.

    Args:
        frames_folder: Directory containing ``frame_<n>.jpg`` files.
        aligned_faces_folder: Directory that receives the saved face crops.
        frame_count: Unused here; kept so existing callers keep working.
        progress: Callable invoked once per frame file as
            ``progress(fraction_done, message)``.

    Returns:
        A tuple ``(embeddings_by_frame, posture_scores_by_frame,
        posture_landmarks_by_frame, aligned_face_paths,
        facial_landmarks_by_frame)``. The dicts are keyed by the frame
        number parsed from the file name.
    """
    embeddings_by_frame = {}
    posture_scores_by_frame = {}
    posture_landmarks_by_frame = {}
    facial_landmarks_by_frame = {}
    aligned_face_paths = []
    frame_files = sorted(f for f in os.listdir(frames_folder) if f.endswith('.jpg'))
    total = len(frame_files)

    for i, frame_file in enumerate(frame_files):
        # File names look like "frame_0012.jpg"; the numeric part is the key.
        frame_num = int(frame_file.split('_')[1].split('.')[0])
        frame = cv2.imread(os.path.join(frames_folder, frame_file))

        # cv2.imread returns None for unreadable files; such frames are skipped.
        if frame is not None:
            posture_score, posture_landmarks = calculate_posture_score(frame)
            posture_scores_by_frame[frame_num] = posture_score
            posture_landmarks_by_frame[frame_num] = posture_landmarks

            face = _crop_primary_face(frame)
            if face is not None:
                results = face_mesh.process(cv2.cvtColor(face, cv2.COLOR_BGR2RGB))
                if results.multi_face_landmarks:
                    face_landmarks = results.multi_face_landmarks[0]
                    facial_landmarks_by_frame[frame_num] = face_landmarks
                    # Only frontal faces are saved and embedded.
                    if is_frontal_face(face_landmarks.landmark):
                        face_resized = cv2.resize(face, (160, 160))
                        output_path = os.path.join(
                            aligned_faces_folder, f"frame_{frame_num}_face.jpg")
                        cv2.imwrite(output_path, face_resized)
                        aligned_face_paths.append(output_path)
                        embeddings_by_frame[frame_num] = get_face_embedding(face_resized)

        progress((i + 1) / total, f"Processing frame {i + 1} of {total}")

    return (embeddings_by_frame, posture_scores_by_frame, posture_landmarks_by_frame,
            aligned_face_paths, facial_landmarks_by_frame)


def _crop_primary_face(frame):
    """Return the crop of the first detected face in ``frame``, or None.

    None is returned when no box is detected, when the first box's
    probability is below 0.99, or when the clamped box is empty.
    """
    # NOTE(review): frame comes from cv2.imread (BGR); facenet-pytorch's MTCNN
    # is documented for RGB input -- confirm whether a conversion is intended.
    boxes, probs = mtcnn.detect(frame)
    if not (boxes is not None and len(boxes) > 0 and probs[0] >= 0.99):
        return None

    height, width = frame.shape[:2]
    x1, y1, x2, y2 = (int(b) for b in boxes[0])
    # Clamp to the image: a negative coordinate would otherwise be treated as
    # a from-the-end index and produce a wrong or empty crop.
    x1, y1 = max(0, x1), max(0, y1)
    x2, y2 = min(width, x2), min(height, y2)

    face = frame[y1:y2, x1:x2]
    return face if face.size > 0 else None
|
89 |
-
|
90 |
def process_video(video_path, anomaly_threshold, desired_fps, progress=None):
|
91 |
start_time = time.time()
|
92 |
output_folder = "output"
|
@@ -94,7 +31,8 @@ def process_video(video_path, anomaly_threshold, desired_fps, progress=None):
|
|
94 |
|
95 |
GRAPH_COLORS = {
|
96 |
'facial_embeddings': 'navy',
|
97 |
-
'body_posture': 'purple'
|
|
|
98 |
}
|
99 |
|
100 |
with tempfile.TemporaryDirectory() as temp_dir:
|
@@ -147,7 +85,13 @@ def process_video(video_path, anomaly_threshold, desired_fps, progress=None):
|
|
147 |
progress(0.75, "Getting face samples")
|
148 |
face_samples = get_all_face_samples(organized_faces_folder, output_folder, largest_cluster)
|
149 |
|
150 |
-
progress(0.8, "
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
embedding_columns = [col for col in df.columns if col.startswith('Raw_Embedding_')]
|
152 |
|
153 |
X_embeddings = df[embedding_columns].values
|
@@ -159,9 +103,11 @@ def process_video(video_path, anomaly_threshold, desired_fps, progress=None):
|
|
159 |
if len(X_posture) == 0:
|
160 |
raise ValueError("No valid posture data found")
|
161 |
|
162 |
-
|
|
|
|
|
163 |
|
164 |
-
progress(0.
|
165 |
mse_plot_embeddings, anomaly_frames_embeddings = plot_mse(df, mse_embeddings, "Facial Features",
|
166 |
color=GRAPH_COLORS['facial_embeddings'],
|
167 |
anomaly_threshold=anomaly_threshold)
|
@@ -176,21 +122,28 @@ def process_video(video_path, anomaly_threshold, desired_fps, progress=None):
|
|
176 |
mse_histogram_posture = plot_mse_histogram(mse_posture, "MSE Distribution: Body Posture",
|
177 |
anomaly_threshold, color=GRAPH_COLORS['body_posture'])
|
178 |
|
179 |
-
|
|
|
|
|
|
|
|
|
|
|
180 |
|
181 |
mse_heatmap_embeddings = plot_mse_heatmap(mse_embeddings, "Facial Features MSE Heatmap", df)
|
|
|
|
|
182 |
|
183 |
-
progress(0.
|
184 |
|
185 |
# Create video with heatmap
|
186 |
heatmap_video_path = os.path.join(output_folder, "heatmap_video.mp4")
|
187 |
-
heatmap_video_path = create_video_with_heatmap(video_path, df, mse_embeddings, mse_posture, heatmap_video_path, original_fps, largest_cluster)
|
188 |
|
189 |
except Exception as e:
|
190 |
print(f"Error details: {str(e)}")
|
191 |
import traceback
|
192 |
traceback.print_exc()
|
193 |
-
return (f"Error in video processing: {str(e)}",) + (None,) *
|
194 |
|
195 |
progress(1.0, "Preparing results")
|
196 |
results = f"Number of persons detected: {num_clusters}\n\n"
|
@@ -242,12 +195,16 @@ def process_video(video_path, anomaly_threshold, desired_fps, progress=None):
|
|
242 |
df,
|
243 |
mse_embeddings,
|
244 |
mse_posture,
|
|
|
245 |
mse_plot_embeddings,
|
246 |
-
mse_histogram_embeddings,
|
247 |
mse_plot_posture,
|
|
|
|
|
248 |
mse_histogram_posture,
|
|
|
249 |
mse_heatmap_embeddings,
|
250 |
mse_heatmap_posture,
|
|
|
251 |
face_samples["most_frequent"],
|
252 |
anomaly_faces_embeddings,
|
253 |
anomaly_frames_posture_images,
|
@@ -256,6 +213,11 @@ def process_video(video_path, anomaly_threshold, desired_fps, progress=None):
|
|
256 |
heatmap_video_path
|
257 |
)
|
258 |
|
|
|
|
|
|
|
|
|
|
|
259 |
def is_frontal_face(landmarks, threshold=60):
|
260 |
nose_tip = landmarks[4]
|
261 |
left_chin = landmarks[234]
|
|
|
8 |
import math
|
9 |
from face_analysis import get_face_embedding, cluster_faces, organize_faces_by_person, draw_facial_landmarks
|
10 |
from pose_analysis import pose, calculate_posture_score, draw_pose_landmarks
|
11 |
+
from voice_analysis import extract_audio_from_video, diarize_speakers, get_speaker_embeddings
|
12 |
from anomaly_detection import anomaly_detection
|
13 |
from visualization import plot_mse, plot_mse_histogram, plot_mse_heatmap, create_video_with_heatmap
|
14 |
from utils import frame_to_timecode
|
|
|
16 |
from facenet_pytorch import MTCNN
|
17 |
import torch
|
18 |
import mediapipe as mp
|
19 |
+
from pyannote.audio import Model
|
20 |
|
21 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
22 |
mtcnn = MTCNN(keep_all=False, device=device, thresholds=[0.9, 0.9, 0.9], min_face_size=50)
|
|
|
24 |
mp_face_mesh = mp.solutions.face_mesh
|
25 |
face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1, min_detection_confidence=0.8)
|
26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
def process_video(video_path, anomaly_threshold, desired_fps, progress=None):
|
28 |
start_time = time.time()
|
29 |
output_folder = "output"
|
|
|
31 |
|
32 |
GRAPH_COLORS = {
|
33 |
'facial_embeddings': 'navy',
|
34 |
+
'body_posture': 'purple',
|
35 |
+
'voice': 'green'
|
36 |
}
|
37 |
|
38 |
with tempfile.TemporaryDirectory() as temp_dir:
|
|
|
85 |
progress(0.75, "Getting face samples")
|
86 |
face_samples = get_all_face_samples(organized_faces_folder, output_folder, largest_cluster)
|
87 |
|
88 |
+
progress(0.8, "Extracting audio and performing voice analysis")
|
89 |
+
audio_path = extract_audio_from_video(video_path)
|
90 |
+
diarization = diarize_speakers(audio_path)
|
91 |
+
voice_model = Model.from_pretrained("pyannote/embedding")
|
92 |
+
voice_embeddings = get_speaker_embeddings(audio_path, diarization, voice_model)
|
93 |
+
|
94 |
+
progress(0.85, "Performing anomaly detection")
|
95 |
embedding_columns = [col for col in df.columns if col.startswith('Raw_Embedding_')]
|
96 |
|
97 |
X_embeddings = df[embedding_columns].values
|
|
|
103 |
if len(X_posture) == 0:
|
104 |
raise ValueError("No valid posture data found")
|
105 |
|
106 |
+
X_voice = np.array([emb['embedding'] for emb in voice_embeddings])
|
107 |
+
|
108 |
+
mse_embeddings, mse_posture, mse_voice = anomaly_detection(X_embeddings, X_posture, X_voice)
|
109 |
|
110 |
+
progress(0.9, "Generating graphs")
|
111 |
mse_plot_embeddings, anomaly_frames_embeddings = plot_mse(df, mse_embeddings, "Facial Features",
|
112 |
color=GRAPH_COLORS['facial_embeddings'],
|
113 |
anomaly_threshold=anomaly_threshold)
|
|
|
122 |
mse_histogram_posture = plot_mse_histogram(mse_posture, "MSE Distribution: Body Posture",
|
123 |
anomaly_threshold, color=GRAPH_COLORS['body_posture'])
|
124 |
|
125 |
+
mse_plot_voice, anomaly_frames_voice = plot_mse(df, mse_voice, "Voice",
|
126 |
+
color=GRAPH_COLORS['voice'],
|
127 |
+
anomaly_threshold=anomaly_threshold)
|
128 |
+
|
129 |
+
mse_histogram_voice = plot_mse_histogram(mse_voice, "MSE Distribution: Voice",
|
130 |
+
anomaly_threshold, color=GRAPH_COLORS['voice'])
|
131 |
|
132 |
mse_heatmap_embeddings = plot_mse_heatmap(mse_embeddings, "Facial Features MSE Heatmap", df)
|
133 |
+
mse_heatmap_posture = plot_mse_heatmap(mse_posture, "Body Posture MSE Heatmap", df)
|
134 |
+
mse_heatmap_voice = plot_mse_heatmap(mse_voice, "Voice MSE Heatmap", df)
|
135 |
|
136 |
+
progress(0.95, "Generating video with heatmap")
|
137 |
|
138 |
# Create video with heatmap
|
139 |
heatmap_video_path = os.path.join(output_folder, "heatmap_video.mp4")
|
140 |
+
heatmap_video_path = create_video_with_heatmap(video_path, df, mse_embeddings, mse_posture, mse_voice, heatmap_video_path, original_fps, largest_cluster)
|
141 |
|
142 |
except Exception as e:
|
143 |
print(f"Error details: {str(e)}")
|
144 |
import traceback
|
145 |
traceback.print_exc()
|
146 |
+
return (f"Error in video processing: {str(e)}",) + (None,) * 21
|
147 |
|
148 |
progress(1.0, "Preparing results")
|
149 |
results = f"Number of persons detected: {num_clusters}\n\n"
|
|
|
195 |
df,
|
196 |
mse_embeddings,
|
197 |
mse_posture,
|
198 |
+
mse_voice,
|
199 |
mse_plot_embeddings,
|
|
|
200 |
mse_plot_posture,
|
201 |
+
mse_plot_voice,
|
202 |
+
mse_histogram_embeddings,
|
203 |
mse_histogram_posture,
|
204 |
+
mse_histogram_voice,
|
205 |
mse_heatmap_embeddings,
|
206 |
mse_heatmap_posture,
|
207 |
+
mse_heatmap_voice,
|
208 |
face_samples["most_frequent"],
|
209 |
anomaly_faces_embeddings,
|
210 |
anomaly_frames_posture_images,
|
|
|
213 |
heatmap_video_path
|
214 |
)
|
215 |
|
216 |
+
|
217 |
+
|
218 |
+
|
219 |
+
|
220 |
+
|
221 |
def is_frontal_face(landmarks, threshold=60):
|
222 |
nose_tip = landmarks[4]
|
223 |
left_chin = landmarks[234]
|