Update app.py
app.py CHANGED
@@ -12,6 +12,7 @@ from scipy import interpolate
 from sklearn.cluster import DBSCAN, KMeans
 from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from sklearn.metrics import silhouette_score
+from sklearn.decomposition import PCA
 import umap
 import pandas as pd
 import matplotlib
@@ -41,6 +42,7 @@ mp_face_mesh = mp.solutions.face_mesh
 face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1, min_detection_confidence=0.5)
 emotion_detector = FER(mtcnn=False)
 
+
 def frame_to_timecode(frame_num, total_frames, duration):
     total_seconds = (frame_num / total_frames) * duration
     hours = int(total_seconds // 3600)
@@ -49,6 +51,7 @@ def frame_to_timecode(frame_num, total_frames, duration):
     milliseconds = int((total_seconds - int(total_seconds)) * 1000)
     return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
 
+
 def get_face_embedding_and_emotion(face_img):
     face_tensor = torch.tensor(face_img).permute(2, 0, 1).unsqueeze(0).float() / 255
     face_tensor = (face_tensor - 0.5) / 0.5
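Note (reviewer addition, not part of the commit): a quick sanity check of frame_to_timecode with arbitrary values. The minutes/seconds arithmetic sits in lines elided between these two hunks, but the visible return format pins down the expected output.

# Assumes app.py's frame_to_timecode is in scope; the values are made up.
print(frame_to_timecode(frame_num=100, total_frames=200, duration=131.0))
# frame 100 of 200 falls 65.5 s into a 131 s clip -> "00:01:05.500"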
@@ -64,6 +67,7 @@ def get_face_embedding_and_emotion(face_img):
 
     return embedding.cpu().numpy().flatten(), emotion_dict
 
+
 def alignFace(img):
     img_raw = img.copy()
     results = face_mesh.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
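Note (reviewer addition, not part of the commit): the two-step scaling above maps uint8 pixel values from [0, 255] to [-1, 1], the input range FaceNet-style encoders typically expect (the encoder itself is defined outside these hunks, so that expectation is an assumption). A minimal check:

import torch

px = torch.tensor([0.0, 127.5, 255.0])  # darkest, middle, brightest pixel
x = px / 255          # tensor([0.0000, 0.5000, 1.0000])
x = (x - 0.5) / 0.5   # tensor([-1., 0., 1.])
print(x)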
@@ -89,6 +93,7 @@ def alignFace(img):
     new_img = cv2.warpAffine(img_raw, rotation_matrix, (width, height))
     return new_img
 
+
 def extract_frames(video_path, output_folder, desired_fps, progress_callback=None):
     os.makedirs(output_folder, exist_ok=True)
     clip = VideoFileClip(video_path)
@@ -112,6 +117,7 @@ def extract_frames(video_path, output_folder, desired_fps, progress_callback=None):
     clip.close()
     return frame_count, original_fps
 
+
 def process_frames(frames_folder, aligned_faces_folder, frame_count, progress, batch_size):
     embeddings_by_frame = {}
     emotions_by_frame = {}
@@ -155,6 +161,7 @@ def process_frames(frames_folder, aligned_faces_folder, frame_count, progress, batch_size):
 
     return embeddings_by_frame, emotions_by_frame, aligned_face_paths
 
+
 def cluster_faces(embeddings):
     if len(embeddings) < 2:
         print("Not enough faces for clustering. Assigning all to one cluster.")
@@ -171,6 +178,7 @@ def cluster_faces(embeddings):
 
     return clusters
 
+
 def organize_faces_by_person(embeddings_by_frame, clusters, aligned_faces_folder, organized_faces_folder):
     for (frame_num, embedding), cluster in zip(embeddings_by_frame.items(), clusters):
         person_folder = os.path.join(organized_faces_folder, f"person_{cluster}")
@@ -179,8 +187,34 @@ def organize_faces_by_person(embeddings_by_frame, clusters, aligned_faces_folder, organized_faces_folder):
         dst = os.path.join(person_folder, f"frame_{frame_num}_face.jpg")
         shutil.copy(src, dst)
 
+
+def find_optimal_components(embeddings, max_components=10):
+    pca = PCA(n_components=max_components)
+    pca.fit(embeddings)
+
+    explained_variance_ratio = pca.explained_variance_ratio_
+    cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
+
+    # Plot explained variance ratio
+    plt.figure(figsize=(10, 6))
+    plt.plot(range(1, max_components + 1), cumulative_variance_ratio, 'bo-')
+    plt.xlabel('Number of Components')
+    plt.ylabel('Cumulative Explained Variance Ratio')
+    plt.title('Explained Variance Ratio vs. Number of Components')
+    plt.grid(True)
+
+    # Find elbow point
+    differences = np.diff(cumulative_variance_ratio)
+    elbow_point = np.argmin(differences) + 1
+
+    plt.axvline(x=elbow_point, color='r', linestyle='--', label=f'Elbow point: {elbow_point}')
+    plt.legend()
+
+    return elbow_point, plt
+
+
 def save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, desired_fps, original_fps, output_folder,
-
+                            video_duration):
     emotions = ['angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral']
     person_data = {}
 
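Note (reviewer addition, not part of the commit): find_optimal_components fits one PCA, plots the cumulative explained-variance curve, and returns an elbow index that the next hunk feeds into umap.UMAP(n_components=...). A hypothetical smoke test on random data (the 512-dim embedding size is an assumption; only the function's own signature is taken from the diff):

import numpy as np

rng = np.random.default_rng(0)
fake_embeddings = rng.normal(size=(200, 512))  # 200 faces x 512-dim embeddings (assumed shape)
n, _ = find_optimal_components(fake_embeddings, max_components=10)  # returns (elbow_point, plt)
print(f"elbow at {n} components")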
@@ -199,7 +233,10 @@ def save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, desired_fps, original_fps, output_folder,
     embeddings_array = np.array(embeddings)
     np.save(os.path.join(output_folder, 'face_embeddings.npy'), embeddings_array)
 
-
+    # Find optimal number of components
+    optimal_components, _ = find_optimal_components(embeddings_array)
+
+    reducer = umap.UMAP(n_components=optimal_components, random_state=1)
     embeddings_reduced = reducer.fit_transform(embeddings)
 
     scaler = MinMaxScaler(feature_range=(0, 1))
@@ -216,7 +253,11 @@ def save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, desired_fps, original_fps, output_folder,
         'Embedding_Index': range(len(embeddings))
     }
 
-
+    # Add raw embeddings
+    for i in range(len(embeddings[0])):
+        df_data[f'Raw_Embedding_{i}'] = [embedding[i] for embedding in embeddings]
+
+    for i in range(optimal_components):
         df_data[f'Comp {i + 1}'] = embeddings_reduced_normalized[:, i]
 
     for emotion in emotions:
@@ -226,33 +267,6 @@ def save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, desired_fps, original_fps, output_folder,
 
     return df, largest_cluster
 
-def determine_optimal_anomalies(anomaly_scores, z_threshold=3.5):
-    mean = np.mean(anomaly_scores)
-    std = np.std(anomaly_scores)
-    threshold = mean + z_threshold * std
-    anomalies = anomaly_scores > threshold
-    return anomalies, np.where(anomalies)[0]
-
-def timecode_to_seconds(timecode):
-    h, m, s = map(float, timecode.split(':'))
-    return h * 3600 + m * 60 + s
-
-def group_similar_timecodes(timecodes, scores, threshold_seconds=10):
-    grouped = []
-    current_group = []
-
-    for i, (timecode, score) in enumerate(zip(timecodes, scores)):
-        if not current_group or abs(
-                timecode_to_seconds(timecode) - timecode_to_seconds(current_group[0][0])) <= threshold_seconds:
-            current_group.append((timecode, score, i))
-        else:
-            grouped.append(current_group)
-            current_group = [(timecode, score, i)]
-
-    if current_group:
-        grouped.append(current_group)
-
-    return grouped
 
 class LSTMAutoencoder(nn.Module):
     def __init__(self, input_size, hidden_size=64, num_layers=2):
@@ -268,21 +282,17 @@ class LSTMAutoencoder(nn.Module):
         out = self.fc(outputs)
         return out
 
-def lstm_anomaly_detection(X, feature_columns, epochs=100, batch_size=64):
+
+def lstm_anomaly_detection(X, feature_columns, raw_embedding_columns, epochs=100, batch_size=64):
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
     X = torch.FloatTensor(X).to(device)
     if X.dim() == 2:
         X = X.unsqueeze(0)
     elif X.dim() == 1:
         X = X.unsqueeze(0).unsqueeze(2)
-    elif X.dim() > 3:
-        raise ValueError(f"Input X should be 1D, 2D or 3D, but got {X.dim()} dimensions")
 
     print(f"X shape after reshaping: {X.shape}")
 
-    train_size = int(0.9 * X.shape[1])
-    X_train, X_val = X[:, :train_size, :], X[:, train_size:, :]
-
     model = LSTMAutoencoder(input_size=X.shape[2]).to(device)
     criterion = nn.MSELoss()
     optimizer = optim.Adam(model.parameters())
@@ -290,22 +300,19 @@ def lstm_anomaly_detection(X, feature_columns, epochs=100, batch_size=64):
     for epoch in range(epochs):
         model.train()
         optimizer.zero_grad()
-
-
-
+        output = model(X)
+        loss = criterion(output, X)
+        loss.backward()
         optimizer.step()
 
-
-
-        output_val = model(X_val)
-        loss_val = criterion(output_val, X_val.squeeze(0))
+        if epoch % 10 == 0:
+            print(f"Epoch [{epoch}/{epochs}], Loss: {loss.item():.4f}")
 
     model.eval()
     with torch.no_grad():
         reconstructed = model(X).squeeze(0).cpu().numpy()
 
     mse_all = np.mean(np.power(X.squeeze(0).cpu().numpy() - reconstructed, 2), axis=1)
-    anomalies_all, top_indices_all = determine_optimal_anomalies(mse_all)
 
     component_columns = [col for col in feature_columns if col.startswith('Comp')]
     component_indices = [feature_columns.index(col) for col in component_columns]
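Note (reviewer addition, not part of the commit): the rewritten loop trains on the full sequence and scores each frame by its reconstruction error, dropping the old train/validation split. A self-contained sketch of that scoring idea, with a plain nn.LSTM standing in for the LSTMAutoencoder (shapes are illustrative assumptions):

import torch
import torch.nn as nn

X = torch.randn(1, 120, 8)                        # (batch, frames, features)
model = nn.LSTM(input_size=8, hidden_size=8, batch_first=True)
reconstructed, _ = model(X)                       # stand-in "reconstruction"
mse_per_frame = ((X - reconstructed) ** 2).mean(dim=2).squeeze(0)
print(mse_per_frame.shape)                        # torch.Size([120]), one score per frame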
@@ -316,53 +323,10 @@ def lstm_anomaly_detection(X, feature_columns, epochs=100, batch_size=64):
     else:
         mse_comp = mse_all
 
-
-
-    return (anomalies_all, mse_all, top_indices_all,
-            anomalies_comp, mse_comp, top_indices_comp,
-            model)
-
-def emotion_anomaly_detection(emotion_data, epochs=100, batch_size=64):
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    X = torch.FloatTensor(emotion_data.values).to(device)
-    if X.dim() == 1:
-        X = X.unsqueeze(0).unsqueeze(2)  # Add batch and feature dimensions
-    elif X.dim() == 2:
-        X = X.unsqueeze(0)  # Add batch dimension
-
-    model = LSTMAutoencoder(input_size=1).to(device)
-    criterion = nn.MSELoss()
-    optimizer = optim.Adam(model.parameters())
+    raw_embedding_indices = [feature_columns.index(col) for col in raw_embedding_columns]
+    mse_raw = np.mean(np.power(X.squeeze(0).cpu().numpy()[:, raw_embedding_indices] - reconstructed[:, raw_embedding_indices], 2), axis=1)
 
-
-    model.train()
-    optimizer.zero_grad()
-    output = model(X)
-    loss = criterion(output, X)
-    loss.backward()
-    optimizer.step()
-
-    model.eval()
-    with torch.no_grad():
-        reconstructed = model(X).squeeze(0).cpu().numpy()
-
-    mse = np.mean(np.power(X.squeeze(0).cpu().numpy() - reconstructed, 2), axis=1)
-    anomalies, top_indices = determine_optimal_anomalies(mse)
-
-    return anomalies, mse, top_indices
-
-def normalize_scores(scores):
-    min_score = np.min(scores)
-    max_score = np.max(scores)
-    if max_score == min_score:
-        return np.full_like(scores, 100)
-    return ((scores - min_score) / (max_score - min_score)) * 100
-
-def plot_to_image(fig):
-    buf = io.BytesIO()
-    fig.savefig(buf, format='png', dpi=300, bbox_inches='tight')
-    buf.seek(0)
-    return buf
+    return mse_all, mse_comp, mse_raw
 
 def embedding_anomaly_detection(embeddings, epochs=100, batch_size=64):
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
@@ -389,106 +353,74 @@ def embedding_anomaly_detection(embeddings, epochs=100, batch_size=64):
         reconstructed = model(X).squeeze(0).cpu().numpy()
 
     mse = np.mean(np.power(X.squeeze(0).cpu().numpy() - reconstructed, 2), axis=1)
-
-
-    return anomalies, mse, top_indices
-def plot_anomaly_scores(df, anomaly_scores, top_indices, title, timecodes):
-    plt.figure(figsize=(16, 8), dpi=300)
-    fig, ax = plt.subplots(figsize=(16, 8))
-
-    df['Seconds'] = df['Timecode'].apply(
-        lambda x: sum(float(t) * 60 ** i for i, t in enumerate(reversed(x.split(':')))))
-
-    # Filter out data points without faces
-    valid_indices = [i for i in range(len(anomaly_scores)) if i in df.index]
-    seconds = df['Seconds'].iloc[valid_indices].values
-    scores = anomaly_scores[valid_indices]
-
-    top_indices = [idx for idx in top_indices if idx in valid_indices]
-    ax.scatter(df['Seconds'].iloc[top_indices], anomaly_scores[top_indices], color='red', s=50, zorder=5)
-
-    # Calculate and plot baseline
-    non_anomalous_scores = np.delete(scores, top_indices)
-    baseline = np.mean(non_anomalous_scores)
-    ax.axhline(y=baseline, color='black', linestyle='--', linewidth=2.5)
-    ax.text(df['Seconds'].max(), baseline, f'Baseline ({baseline:.2f})',
-            verticalalignment='bottom', horizontalalignment='right', color='black')
-
-    grouped_timecodes = group_similar_timecodes([df['Timecode'].iloc[idx] for idx in top_indices],
-                                                scores[top_indices])
-
-    for group in grouped_timecodes:
-        max_score_idx = max(range(len(group)), key=lambda i: group[i][1])
-        timecode, score, idx = group[max_score_idx]
-        ax.annotate(timecode,
-                    (df['Seconds'].iloc[top_indices[idx]], score),
-                    xytext=(5, 5), textcoords='offset points',
-                    fontsize=6, color='red')
-
-    max_seconds = df['Seconds'].max()
-    ax.set_xlim(0, max_seconds)
-    num_ticks = 100
-    ax.set_xticks(np.linspace(0, max_seconds, num_ticks))
-    ax.set_xticklabels([f"{int(x // 60):02d}:{int(x % 60):02d}" for x in ax.get_xticks()],
-                       rotation=90, ha='center', va='top')
-
-    ax.set_xlabel('Time')
-    ax.set_ylabel('Anomaly Score')
-    ax.set_title(title)
-
-    ax.grid(True, linestyle='--', alpha=0.7)
-    plt.tight_layout()
-    plt.close()
-    return fig
+    return mse
 
-def 
+def determine_anomalies(mse_values, threshold=3.5):
+    mean = np.mean(mse_values)
+    std = np.std(mse_values)
+    anomalies = mse_values > (mean + threshold * std)
+    return anomalies
+
+def plot_mse(df, mse_values, title, color='blue', time_threshold=1, hide_first_n=3):
     plt.figure(figsize=(16, 8), dpi=300)
     fig, ax = plt.subplots(figsize=(16, 8))
 
     df['Seconds'] = df['Timecode'].apply(
         lambda x: sum(float(t) * 60 ** i for i, t in enumerate(reversed(x.split(':')))))
 
-    #
-
-    seconds = df['Seconds'].iloc[valid_indices].values
-    scores = anomaly_scores[valid_indices]
-
-    #
-
-
-            verticalalignment='bottom', horizontalalignment='right', color='black')
-
-
-            xytext=(5, 5), textcoords='offset points',
-            fontsize=6, color='red')
+    # Plot all points
+    ax.scatter(df['Seconds'], mse_values, color=color, alpha=0.7, s=10)
+
+    # Determine anomalies
+    anomalies = determine_anomalies(mse_values)
+
+    # Hide the first n anomalies
+    visible_anomalies = np.where(anomalies)[0][hide_first_n:]
+    ax.scatter(df['Seconds'].iloc[visible_anomalies], mse_values[visible_anomalies], color='red', s=50, zorder=5)
+
+    # Group closely occurring anomalies and annotate only the highest MSE
+    anomaly_data = list(zip(df['Timecode'].iloc[visible_anomalies],
+                            df['Seconds'].iloc[visible_anomalies],
+                            mse_values[visible_anomalies]))
+    anomaly_data.sort(key=lambda x: x[1])  # Sort by seconds
+
+    grouped_anomalies = []
+    current_group = []
+    for timecode, sec, mse in anomaly_data:
+        if not current_group or sec - current_group[-1][1] <= time_threshold:
+            current_group.append((timecode, sec, mse))
+        else:
+            grouped_anomalies.append(current_group)
+            current_group = [(timecode, sec, mse)]
+    if current_group:
+        grouped_anomalies.append(current_group)
+
+    for group in grouped_anomalies:
+        highest_mse_anomaly = max(group, key=lambda x: x[2])
+        timecode, sec, mse = highest_mse_anomaly
+        ax.annotate(timecode, (sec, mse), textcoords="offset points", xytext=(0, 10),
+                    ha='center', fontsize=8, color='red')
+
+    # Add baseline (mean MSE) line
+    mean_mse = np.mean(mse_values)
+    ax.axhline(y=mean_mse, color='black', linestyle='--', linewidth=1)
+    ax.text(df['Seconds'].max(), mean_mse, f'Baseline ({mean_mse:.6f})',
+            verticalalignment='bottom', horizontalalignment='right', color='black', fontsize=8)
 
+    # Set x-axis labels to timecodes
     max_seconds = df['Seconds'].max()
-    ax.set_xlim(0, max_seconds)
     num_ticks = 100
+    tick_locations = np.linspace(0, max_seconds, num_ticks)
+    tick_labels = [frame_to_timecode(int(s * df['Frame'].max() / max_seconds), df['Frame'].max(), max_seconds)
+                   for s in tick_locations]
+
+    ax.set_xticks(tick_locations)
+    ax.set_xticklabels(tick_labels, rotation=90, ha='center', fontsize=6)
 
     ax.set_xlabel('Time')
-    ax.set_ylabel(
-    ax.set_title(
+    ax.set_ylabel('Mean Squared Error')
+    ax.set_title(title)
 
     ax.grid(True, linestyle='--', alpha=0.7)
     plt.tight_layout()
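Note (reviewer addition, not part of the commit): determine_anomalies reinstates the z-score rule of the removed determine_optimal_anomalies, flagging frames whose MSE exceeds mean + 3.5 standard deviations. A worked example with synthetic values:

import numpy as np

mse_values = np.append(np.full(50, 0.01), 0.09)  # 50 typical frames plus one outlier
mean, std = mse_values.mean(), mse_values.std()
print(round(mean + 3.5 * std, 3))                # threshold, about 0.050
print(np.where(mse_values > mean + 3.5 * std))   # (array([50]),): only the outlier is flagged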
@@ -522,10 +454,18 @@ def get_all_face_samples(organized_faces_folder, output_folder, largest_cluster):
             cv2.imwrite(output_path, small_face)
             face_samples["others"].append(output_path)
     return face_samples
-def process_video(video_path, num_components, desired_fps, batch_size, progress=gr.Progress()):
+
+def process_video(video_path, desired_fps, batch_size, progress=gr.Progress()):
     output_folder = "output"
     os.makedirs(output_folder, exist_ok=True)
 
+    # Initialize plot variables
+    mse_plot_all = None
+    mse_plot_comp = None
+    mse_plot_raw = None
+    emotion_plots = [None] * 6  # For the 6 emotions
+    face_samples = {"most_frequent": [], "others": []}
+
     with tempfile.TemporaryDirectory() as temp_dir:
         aligned_faces_folder = os.path.join(temp_dir, 'aligned_faces')
         organized_faces_folder = os.path.join(temp_dir, 'organized_faces')
@@ -552,7 +492,7 @@ def process_video(video_path, num_components, desired_fps, batch_size, progress=gr.Progress()):
 
         if not aligned_face_paths:
             return ("No faces were extracted from the video.",
-                    None, None, None, None, None, None, None, None)
+                    None, None, None, None, None, None, None, None, None, [], [])
 
         progress(0.6, "Clustering faces")
         embeddings = [embedding for _, embedding in embeddings_by_frame.items()]
@@ -564,7 +504,7 @@ def process_video(video_path, num_components, desired_fps, batch_size, progress=gr.Progress()):
 
         progress(0.8, "Saving person data")
         df, largest_cluster = save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, desired_fps,
-                                                      original_fps, temp_dir,
+                                                      original_fps, temp_dir, video_duration)
 
         progress(0.85, "Getting face samples")
         face_samples = get_all_face_samples(organized_faces_folder, output_folder, largest_cluster)
@@ -572,46 +512,29 @@ def process_video(video_path, num_components, desired_fps, batch_size, progress=gr.Progress()):
         progress(0.9, "Performing anomaly detection")
         feature_columns = [col for col in df.columns if
                            col not in ['Frame', 'Timecode', 'Time (Minutes)', 'Embedding_Index']]
+        raw_embedding_columns = [col for col in df.columns if col.startswith('Raw_Embedding_')]
         X = df[feature_columns].values
 
         try:
-
-            X, feature_columns, batch_size=batch_size)
-
-            anomaly_scores_all = normalize_scores(anomaly_scores_all)
-            anomaly_scores_comp = normalize_scores(anomaly_scores_comp)
+            mse_all, mse_comp, mse_raw = lstm_anomaly_detection(
+                X, feature_columns, raw_embedding_columns, batch_size=batch_size)
 
-
-
-
-
-                'anomalies': anomalies,
-                'scores': normalize_scores(scores),
-                'indices': indices
-            }
+            progress(0.95, "Generating plots")
+            mse_plot_all = plot_mse(df, mse_all, "Facial Features + Emotions", color='blue', hide_first_n=3)
+            mse_plot_comp = plot_mse(df, mse_comp, "Facial Features", color='deepskyblue', hide_first_n=3)
+            mse_plot_raw = plot_mse(df, mse_raw, "Facial Embeddings", color='steelblue', hide_first_n=3)
 
-        except Exception as e:
-            print(f"Error details: {str(e)}")
-            return f"Error in anomaly detection: {str(e)}", None, None, None, None, None, None, None, None
-
-        progress(0.95, "Generating plots")
-        try:
-            anomaly_plot_all = plot_anomaly_scores(df, anomaly_scores_all, top_indices_all,
-                                                   "Facial Features + Emotions",
-                                                   df['Timecode'].iloc[top_indices_all].values)
-            anomaly_plot_comp = plot_anomaly_scores(df, anomaly_scores_comp, top_indices_comp, "Facial Features",
-                                                    df['Timecode'].iloc[top_indices_comp].values)
             emotion_plots = [
-
-
-                    emotion_anomalies[emotion]['indices'],
-                    color,
-                    df['Timecode'].iloc[emotion_anomalies[emotion]['indices']].values)
+                plot_mse(df, embedding_anomaly_detection(df[emotion].values.reshape(-1, 1)),
+                         f"MSE: {emotion.capitalize()}", color=color, hide_first_n=3)
                 for emotion, color in zip(['fear', 'sad', 'angry', 'happy', 'surprise', 'neutral'],
                                           ['purple', 'green', 'orange', 'darkblue', 'gold', 'grey'])
             ]
+
         except Exception as e:
-
+            print(f"Error details: {str(e)}")
+            return (f"Error in anomaly detection: {str(e)}",
+                    None, None, None, None, None, None, None, None, None, [], [])
 
         progress(1.0, "Preparing results")
         results = f"Number of persons/clusters detected: {num_clusters}\n\n"
@@ -619,55 +542,58 @@ def process_video(video_path, num_components, desired_fps, batch_size, progress=gr.Progress()):
         for cluster_id in range(num_clusters):
             results += f"Person/Cluster {cluster_id + 1}: {len([c for c in clusters if c == cluster_id])} frames\n"
 
-
         return (
             results,
-
-
+            mse_plot_all,
+            mse_plot_comp,
+            mse_plot_raw,
             *emotion_plots,
             face_samples["most_frequent"],
             face_samples["others"]
         )
 
-
+# Define gallery outputs
 gallery_outputs = [
     gr.Gallery(label="Most Frequent Person Random Samples", columns=5, rows=2, height="auto"),
     gr.Gallery(label="Other Persons Random Samples", columns=5, rows=1, height="auto")
 ]
 
+# Update the Gradio interface
 iface = gr.Interface(
     fn=process_video,
     inputs=[
         gr.Video(),
-        gr.Slider(minimum=1, maximum=10, step=1, value=5, label="Number of Components"),
         gr.Slider(minimum=1, maximum=20, step=1, value=10, label="Desired FPS"),
         gr.Slider(minimum=1, maximum=32, step=1, value=8, label="Batch Size")
     ],
     outputs=[
        gr.Textbox(label="Anomaly Detection Results"),
-        gr.Plot(label="
-        gr.Plot(label="
-        gr.Plot(label="
-        gr.Plot(label="
-        gr.Plot(label="
-        gr.Plot(label="
-        gr.Plot(label="
-        gr.Plot(label="
+        gr.Plot(label="MSE: Facial Features + Emotions"),
+        gr.Plot(label="MSE: Facial Features (UMAP)"),
+        gr.Plot(label="MSE: Raw Facial Embeddings"),
+        gr.Plot(label="MSE: Fear"),
+        gr.Plot(label="MSE: Sad"),
+        gr.Plot(label="MSE: Angry"),
+        gr.Plot(label="MSE: Happy"),
+        gr.Plot(label="MSE: Surprise"),
+        gr.Plot(label="MSE: Neutral"),
     ] + gallery_outputs,
     title="Facial Expressions Anomaly Detection",
     description="""
     This application detects anomalies in facial expressions and emotions from a video input.
     It identifies distinct persons in the video and provides sample faces for each, with multiple samples for the most frequent person.
 
+    The graphs show Mean Squared Error (MSE) values for different aspects of facial expressions and emotions over time.
+    Each point represents a frame, with red points indicating detected anomalies.
+    Anomalies are annotated with their corresponding timecodes.
+    Higher MSE values indicate more unusual or anomalous expressions or emotions at that point in the video.
+
    Adjust the parameters as needed:
-    - Number of Components: Complexity of the facial expression model
    - Desired FPS: Frames per second to analyze (lower for faster processing)
    - Batch Size: Affects processing speed and memory usage
-
-    Click on any graph to enlarge it.
    """,
    allow_flagging="never"
)
 
-
-iface.launch()
+# Launch the interface
+iface.launch()