reab5555 committed on
Commit 16178c1 · verified · 1 Parent(s): 5b70288

Update app.py

Files changed (1)
  1. app.py +357 -267
app.py CHANGED
@@ -1,3 +1,4 @@
  import os
  import cv2
  import numpy as np
@@ -5,38 +6,35 @@ import torch
  import torch.nn as nn
  import torch.optim as optim
  from facenet_pytorch import InceptionResnetV1, MTCNN
  import mediapipe as mp
  from fer import FER
  from sklearn.cluster import DBSCAN
- from sklearn.preprocessing import MinMaxScaler
- from sklearn.decomposition import PCA
- import umap
  import pandas as pd
  import matplotlib
  import matplotlib.pyplot as plt
  from moviepy.editor import VideoFileClip
  from PIL import Image
  import gradio as gr
  import tempfile
  import shutil
- import tensorflow as tf
-
- print(torch.__version__)
- print(torch.version.cuda)

  matplotlib.rcParams['figure.dpi'] = 500
  matplotlib.rcParams['savefig.dpi'] = 500

  # Initialize models and other global variables
- device = 'cuda'

- mtcnn = MTCNN(keep_all=False, device=device, thresholds=[0.985, 0.985, 0.985], min_face_size=80)
  model = InceptionResnetV1(pretrained='vggface2').eval().to(device)
  mp_face_mesh = mp.solutions.face_mesh
- face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1, min_detection_confidence=0.8)
  emotion_detector = FER(mtcnn=False)

-
  def frame_to_timecode(frame_num, total_frames, duration):
      total_seconds = (frame_num / total_frames) * duration
      hours = int(total_seconds // 3600)
@@ -45,6 +43,15 @@ def frame_to_timecode(frame_num, total_frames, duration):
      milliseconds = int((total_seconds - int(total_seconds)) * 1000)
      return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"


  def get_face_embedding_and_emotion(face_img):
      face_tensor = torch.tensor(face_img).permute(2, 0, 1).unsqueeze(0).float() / 255
@@ -57,11 +64,10 @@ def get_face_embedding_and_emotion(face_img):
      if emotions:
          emotion_dict = emotions[0]['emotions']
      else:
-         emotion_dict = {e: 0 for e in ['angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral']}

      return embedding.cpu().numpy().flatten(), emotion_dict

-
  def alignFace(img):
      img_raw = img.copy()
      results = face_mesh.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
@@ -87,7 +93,6 @@ def alignFace(img):
      new_img = cv2.warpAffine(img_raw, rotation_matrix, (width, height))
      return new_img

-
  def extract_frames(video_path, output_folder, desired_fps, progress_callback=None):
      os.makedirs(output_folder, exist_ok=True)
      clip = VideoFileClip(video_path)
@@ -111,6 +116,19 @@ def extract_frames(video_path, output_folder, desired_fps, progress_callback=Non
      clip.close()
      return frame_count, original_fps


  def process_frames(frames_folder, aligned_faces_folder, frame_count, progress, batch_size):
      embeddings_by_frame = {}
@@ -140,29 +158,29 @@ def process_frames(frames_folder, aligned_faces_folder, frame_count, progress, b
              x1, y1, x2, y2 = [int(b) for b in boxes[0]]
              face = frame[y1:y2, x1:x2]
              if face.size > 0:
-                 aligned_face = alignFace(face)
-                 if aligned_face is not None:
-                     aligned_face_resized = cv2.resize(aligned_face, (160, 160))
-                     output_path = os.path.join(aligned_faces_folder, f"frame_{frame_num}_face.jpg")
-                     cv2.imwrite(output_path, aligned_face_resized)
-                     aligned_face_paths.append(output_path)
-                     embedding, emotion = get_face_embedding_and_emotion(aligned_face_resized)
-                     embeddings_by_frame[frame_num] = embedding
-                     emotions_by_frame[frame_num] = emotion
-
-         progress((i + len(batch_files)) / frame_count,
-                  f"Processing frames {i + 1} to {min(i + len(batch_files), frame_count)} of {frame_count}")

      return embeddings_by_frame, emotions_by_frame, aligned_face_paths

-
  def cluster_faces(embeddings):
      if len(embeddings) < 2:
          print("Not enough faces for clustering. Assigning all to one cluster.")
          return np.zeros(len(embeddings), dtype=int)

      X = np.stack(embeddings)
-
      dbscan = DBSCAN(eps=0.5, min_samples=5, metric='cosine')
      clusters = dbscan.fit_predict(X)

@@ -172,7 +190,6 @@ def cluster_faces(embeddings):

      return clusters

-
  def organize_faces_by_person(embeddings_by_frame, clusters, aligned_faces_folder, organized_faces_folder):
      for (frame_num, embedding), cluster in zip(embeddings_by_frame.items(), clusters):
          person_folder = os.path.join(organized_faces_folder, f"person_{cluster}")
@@ -181,39 +198,11 @@ def organize_faces_by_person(embeddings_by_frame, clusters, aligned_faces_folder
          dst = os.path.join(person_folder, f"frame_{frame_num}_face.jpg")
          shutil.copy(src, dst)

-
- def find_optimal_components(embeddings, max_components=20):
-     pca = PCA(n_components=max_components)
-     pca.fit(embeddings)
-
-     explained_variance_ratio = pca.explained_variance_ratio_
-     cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
-
-     # Plot explained variance ratio
-     plt.figure(figsize=(10, 6))
-     plt.plot(range(1, max_components + 1), cumulative_variance_ratio, 'bo-')
-     plt.xlabel('Number of Components')
-     plt.ylabel('Cumulative Explained Variance Ratio')
-     plt.title('Explained Variance Ratio vs. Number of Components')
-     plt.grid(True)
-
-     # Find elbow point
-     differences = np.diff(cumulative_variance_ratio)
-     elbow_point = np.argmin(differences) + 1
-
-     plt.axvline(x=elbow_point, color='r', linestyle='--', label=f'Elbow point: {elbow_point}')
-     plt.legend()
-
-     return elbow_point, plt
-
-
- def save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, desired_fps, original_fps, output_folder,
-                             video_duration):
-     emotions = ['angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral']
      person_data = {}

-     for (frame_num, embedding), (_, emotion_dict), cluster in zip(embeddings_by_frame.items(),
-                                                                   emotions_by_frame.items(), clusters):
          if cluster not in person_data:
              person_data[cluster] = []
          person_data[cluster].append((frame_num, embedding, {e: emotion_dict[e] for e in emotions}))
@@ -227,33 +216,18 @@ def save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, de
      embeddings_array = np.array(embeddings)
      np.save(os.path.join(output_folder, 'face_embeddings.npy'), embeddings_array)

-     # Find optimal number of components
-     optimal_components, _ = find_optimal_components(embeddings_array)
-
-     reducer = umap.UMAP(n_components=optimal_components, random_state=1)
-     embeddings_reduced = reducer.fit_transform(embeddings)
-
-     scaler = MinMaxScaler(feature_range=(0, 1))
-     embeddings_reduced_normalized = scaler.fit_transform(embeddings_reduced)
-
      total_frames = max(frames)
      timecodes = [frame_to_timecode(frame, total_frames, video_duration) for frame in frames]
-     times_in_minutes = [frame / total_frames * video_duration / 60 for frame in frames]

      df_data = {
          'Frame': frames,
          'Timecode': timecodes,
-         'Time (Minutes)': times_in_minutes,
          'Embedding_Index': range(len(embeddings))
      }

-     # Add raw embeddings
      for i in range(len(embeddings[0])):
          df_data[f'Raw_Embedding_{i}'] = [embedding[i] for embedding in embeddings]

-     for i in range(optimal_components):
-         df_data[f'Comp {i + 1}'] = embeddings_reduced_normalized[:, i]
-
      for emotion in emotions:
          df_data[emotion] = [e[emotion] for e in emotions_data]

@@ -261,123 +235,139 @@ def save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, de

      return df, largest_cluster

-
- class LSTMAutoencoder(nn.Module):
-     def __init__(self, input_size, hidden_size=128, num_layers=2):
-         super(LSTMAutoencoder, self).__init__()
-         self.input_size = input_size
-         self.hidden_size = hidden_size
-         self.num_layers = num_layers
-         self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
-         self.fc = nn.Linear(hidden_size, input_size)

      def forward(self, x):
-         outputs, (hidden, _) = self.lstm(x)
-         out = self.fc(outputs)
-         return out


- def lstm_anomaly_detection(X, feature_columns, raw_embedding_columns, epochs=100, batch_size=8):
-     device = 'cuda'
-     X = torch.FloatTensor(X).to(device)
-     if X.dim() == 2:
-         X = X.unsqueeze(0)
-     elif X.dim() == 1:
-         X = X.unsqueeze(0).unsqueeze(2)

-     print(f"X shape after reshaping: {X.shape}")

-     model = LSTMAutoencoder(input_size=X.shape[2]).to(device)
      criterion = nn.MSELoss()
-     optimizer = optim.Adam(model.parameters())

      for epoch in range(epochs):
-         model.train()
-         optimizer.zero_grad()
-         output = model(X)
-         loss = criterion(output, X)
-         loss.backward()
-         optimizer.step()
-
-         if epoch % 10 == 0:
-             print(f"Epoch [{epoch}/{epochs}], Loss: {loss.item():.4f}")
-
-     model.eval()
      with torch.no_grad():
-         reconstructed = model(X).squeeze(0).cpu().numpy()
-
-     mse_all = np.mean(np.power(X.squeeze(0).cpu().numpy() - reconstructed, 2), axis=1)
-
-     component_columns = [col for col in feature_columns if col.startswith('Comp')]
-     component_indices = [feature_columns.index(col) for col in component_columns]

-     if len(component_indices) > 0:
-         mse_comp = np.mean(
-             np.power(X.squeeze(0).cpu().numpy()[:, component_indices] - reconstructed[:, component_indices], 2), axis=1)
-     else:
-         mse_comp = mse_all
-
-     raw_embedding_indices = [feature_columns.index(col) for col in raw_embedding_columns]
-     mse_raw = np.mean(np.power(X.squeeze(0).cpu().numpy()[:, raw_embedding_indices] - reconstructed[:, raw_embedding_indices], 2), axis=1)

-     return mse_all, mse_comp, mse_raw

- def embedding_anomaly_detection(embeddings, epochs=100, batch_size=8):
-     device = 'cuda'
-     X = torch.FloatTensor(embeddings).to(device)
-     if X.dim() == 2:
-         X = X.unsqueeze(0)
-     elif X.dim() == 1:
-         X = X.unsqueeze(0).unsqueeze(2)
-
-     model = LSTMAutoencoder(input_size=X.shape[2]).to(device)
-     criterion = nn.MSELoss()
-     optimizer = optim.Adam(model.parameters())

-     for epoch in range(epochs):
-         model.train()
-         optimizer.zero_grad()
-         output = model(X)
-         loss = criterion(output, X)
-         loss.backward()
-         optimizer.step()
-
-     model.eval()
-     with torch.no_grad():
-         reconstructed = model(X).squeeze(0).cpu().numpy()

-     mse = np.mean(np.power(X.squeeze(0).cpu().numpy() - reconstructed, 2), axis=1)
-     return mse

- def determine_anomalies(mse_values, threshold=5):
-     mean = np.mean(mse_values)
-     std = np.std(mse_values)
-     anomalies = mse_values > (mean + threshold * std)
-     return anomalies


- def plot_mse(df, mse_values, title, color='blue', time_threshold=1, hide_first_n=2):
-     plt.figure(figsize=(16, 8), dpi=300)
-     fig, ax = plt.subplots(figsize=(16, 8))

-     df['Seconds'] = df['Timecode'].apply(
-         lambda x: sum(float(t) * 60 ** i for i, t in enumerate(reversed(x.split(':')))))

-     # Plot all points
-     ax.scatter(df['Seconds'], mse_values, color=color, alpha=0.7, s=10)

-     # Determine anomalies
-     anomalies = determine_anomalies(mse_values)

-     # Hide the first n anomalies
-     visible_anomalies = np.where(anomalies)[0][hide_first_n:]
-     ax.scatter(df['Seconds'].iloc[visible_anomalies], mse_values[visible_anomalies], color='red', s=50, zorder=5)

-     # Group closely occurring anomalies and annotate only the highest MSE
-     anomaly_data = list(zip(df['Timecode'].iloc[visible_anomalies],
-                             df['Seconds'].iloc[visible_anomalies],
-                             mse_values[visible_anomalies]))
-     anomaly_data.sort(key=lambda x: x[1])  # Sort by seconds

      grouped_anomalies = []
      current_group = []
@@ -390,38 +380,116 @@ def plot_mse(df, mse_values, title, color='blue', time_threshold=1, hide_first_n
      if current_group:
          grouped_anomalies.append(current_group)

      for group in grouped_anomalies:
          highest_mse_anomaly = max(group, key=lambda x: x[2])
          timecode, sec, mse = highest_mse_anomaly
          ax.annotate(timecode, (sec, mse), textcoords="offset points", xytext=(0, 10),
-                     ha='center', fontsize=8, color='red')

-     # Add baseline (mean MSE) line
-     mean_mse = np.mean(mse_values)
-     ax.axhline(y=mean_mse, color='black', linestyle='--', linewidth=1)
-     ax.text(df['Seconds'].max(), mean_mse, f'Baseline ({mean_mse:.6f})',
-             verticalalignment='bottom', horizontalalignment='right', color='black', fontsize=8)
-
-     # Set x-axis labels to timecodes
      max_seconds = df['Seconds'].max()
      num_ticks = 100
      tick_locations = np.linspace(0, max_seconds, num_ticks)
-     tick_labels = [frame_to_timecode(int(s * df['Frame'].max() / max_seconds), df['Frame'].max(), max_seconds)
-                    for s in tick_locations]

      ax.set_xticks(tick_locations)
      ax.set_xticklabels(tick_labels, rotation=90, ha='center', fontsize=6)

-     ax.set_xlabel('Time')
      ax.set_ylabel('Mean Squared Error')
      ax.set_title(title)

      ax.grid(True, linestyle='--', alpha=0.7)
      plt.tight_layout()
      plt.close()
      return fig

- def get_all_face_samples(organized_faces_folder, output_folder, largest_cluster):
      face_samples = {"most_frequent": [], "others": []}
      for cluster_folder in sorted(os.listdir(organized_faces_folder)):
          if cluster_folder.startswith("person_"):
@@ -430,7 +498,7 @@ def get_all_face_samples(organized_faces_folder, output_folder, largest_cluster)
              if face_files:
                  cluster_id = int(cluster_folder.split('_')[1])
                  if cluster_id == largest_cluster:
-                     for i, sample in enumerate(face_files):
                          face_path = os.path.join(person_folder, sample)
                          output_path = os.path.join(output_folder, f"face_sample_most_frequent_{i:04d}.jpg")
                          face_img = cv2.imread(face_path)
@@ -438,27 +506,28 @@ def get_all_face_samples(organized_faces_folder, output_folder, largest_cluster)
                              small_face = cv2.resize(face_img, (160, 160))
                              cv2.imwrite(output_path, small_face)
                              face_samples["most_frequent"].append(output_path)
                  else:
-                     for i, sample in enumerate(face_files):
-                         face_path = os.path.join(person_folder, sample)
-                         output_path = os.path.join(output_folder, f"face_sample_other_{cluster_id:02d}_{i:04d}.jpg")
-                         face_img = cv2.imread(face_path)
-                         if face_img is not None:
-                             small_face = cv2.resize(face_img, (160, 160))
-                             cv2.imwrite(output_path, small_face)
-                             face_samples["others"].append(output_path)
      return face_samples

- def process_video(video_path, desired_fps, batch_size, progress=gr.Progress()):
      output_folder = "output"
      os.makedirs(output_folder, exist_ok=True)
-
-     # Initialize plot variables
-     mse_plot_all = None
-     mse_plot_comp = None
-     mse_plot_raw = None
-     emotion_plots = [None] * 6  # For the 6 emotions
-     face_samples = {"most_frequent": [], "others": []}

      with tempfile.TemporaryDirectory() as temp_dir:
          aligned_faces_folder = os.path.join(temp_dir, 'aligned_faces')
@@ -485,13 +554,12 @@ def process_video(video_path, desired_fps, batch_size, progress=gr.Progress()):
                                                                                   progress, batch_size)

          if not aligned_face_paths:
-             return ("No faces were extracted from the video.",
-                     None, None, None, None, None, None, None, None, None, [], [])

          progress(0.6, "Clustering faces")
          embeddings = [embedding for _, embedding in embeddings_by_frame.items()]
          clusters = cluster_faces(embeddings)
-         num_clusters = len(set(clusters))  # Get the number of unique clusters

          progress(0.7, "Organizing faces")
          organize_faces_by_person(embeddings_by_frame, clusters, aligned_faces_folder, organized_faces_folder)
@@ -500,35 +568,42 @@ def process_video(video_path, desired_fps, batch_size, progress=gr.Progress()):
          df, largest_cluster = save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, desired_fps,
                                                        original_fps, temp_dir, video_duration)

          progress(0.85, "Getting face samples")
          face_samples = get_all_face_samples(organized_faces_folder, output_folder, largest_cluster)

          progress(0.9, "Performing anomaly detection")
-         feature_columns = [col for col in df.columns if
-                            col not in ['Frame', 'Timecode', 'Time (Minutes)', 'Embedding_Index']]
-         raw_embedding_columns = [col for col in df.columns if col.startswith('Raw_Embedding_')]
-         X = df[feature_columns].values

          try:
-             mse_all, mse_comp, mse_raw = lstm_anomaly_detection(
-                 X, feature_columns, raw_embedding_columns, batch_size=batch_size)

              progress(0.95, "Generating plots")
-             mse_plot_all = plot_mse(df, mse_all, "Facial Features + Emotions", color='blue', hide_first_n=2)
-             mse_plot_comp = plot_mse(df, mse_comp, "Facial Features", color='deepskyblue', hide_first_n=2)
-             mse_plot_raw = plot_mse(df, mse_raw, "Facial Embeddings", color='steelblue', hide_first_n=2)

-             emotion_plots = [
-                 plot_mse(df, embedding_anomaly_detection(df[emotion].values.reshape(-1, 1)),
-                          f"MSE: {emotion.capitalize()}", color=color, hide_first_n=5)
-                 for emotion, color in zip(['fear', 'sad', 'angry', 'happy', 'surprise', 'neutral'],
-                                           ['purple', 'green', 'orange', 'darkblue', 'gold', 'grey'])
-             ]

          except Exception as e:
              print(f"Error details: {str(e)}")
-             return (f"Error in anomaly detection: {str(e)}",
-                     None, None, None, None, None, None, None, None, None, [], [])

          progress(1.0, "Preparing results")
          results = f"Number of persons/clusters detected: {num_clusters}\n\n"
@@ -536,58 +611,73 @@ def process_video(video_path, desired_fps, batch_size, progress=gr.Progress()):
          for cluster_id in range(num_clusters):
              results += f"Person/Cluster {cluster_id + 1}: {len([c for c in clusters if c == cluster_id])} frames\n"

          return (
              results,
-             mse_plot_all,
-             mse_plot_comp,
-             mse_plot_raw,
              *emotion_plots,
              face_samples["most_frequent"],
-             face_samples["others"]
          )

- # Define gallery outputs
- gallery_outputs = [
-     gr.Gallery(label="Most Frequent Person Random Samples", columns=10, rows=2, height="auto"),
-     gr.Gallery(label="Other Persons Random Samples", columns=10, rows=1, height="auto")
- ]
-
- # Update the Gradio interface
- iface = gr.Interface(
-     fn=process_video,
-     inputs=[
-         gr.Video(),
-         gr.Slider(minimum=1, maximum=20, step=1, value=10, label="Desired FPS"),
-         gr.Slider(minimum=1, maximum=32, step=1, value=10, label="Batch Size")
-     ],
-     outputs=[
-         gr.Textbox(label="Anomaly Detection Results"),
-         gr.Plot(label="MSE: Facial Features + Emotions"),
-         gr.Plot(label="MSE: Facial Features"),
-         gr.Plot(label="MSE: Facial Embeddings"),
-         gr.Plot(label="MSE: Fear"),
-         gr.Plot(label="MSE: Sad"),
-         gr.Plot(label="MSE: Angry"),
-         gr.Plot(label="MSE: Happy"),
-         gr.Plot(label="MSE: Surprise"),
-         gr.Plot(label="MSE: Neutral"),
-     ] + gallery_outputs,
-     title="Facial Expressions Anomaly Detection",
-     description="""
-     This application detects anomalies in facial expressions and emotions from a video input.
-     It identifies distinct persons in the video and provides sample faces for each, with multiple samples for the most frequent person.
-
-     The graphs show Mean Squared Error (MSE) values for different aspects of facial expressions and emotions over time.
-     Each point represents a frame, with red points indicating detected anomalies.
-     Anomalies are annotated with their corresponding timecodes.
-     Higher MSE values indicate more unusual or anomalous expressions or emotions at that point in the video.
-
-     Adjust the parameters as needed:
-     - Desired FPS: Frames per second to analyze (lower for faster processing)
-     - Batch Size: Affects processing speed and GPU memory usage
-     """,
-     allow_flagging="never"
- )
-
- # Launch the interface
- iface.launch()
 
+ import math
  import os
  import cv2
  import numpy as np
  import torch.nn as nn
  import torch.optim as optim
  from facenet_pytorch import InceptionResnetV1, MTCNN
+ import tensorflow as tf
  import mediapipe as mp
  from fer import FER
  from sklearn.cluster import DBSCAN
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler
  import pandas as pd
  import matplotlib
  import matplotlib.pyplot as plt
+ from matplotlib.patches import Rectangle
  from moviepy.editor import VideoFileClip
  from PIL import Image
  import gradio as gr
  import tempfile
  import shutil
+ import copy
+ import time

  matplotlib.rcParams['figure.dpi'] = 500
  matplotlib.rcParams['savefig.dpi'] = 500

  # Initialize models and other global variables
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'

+ mtcnn = MTCNN(keep_all=False, device=device, thresholds=[0.95, 0.95, 0.95], min_face_size=80)
  model = InceptionResnetV1(pretrained='vggface2').eval().to(device)
  mp_face_mesh = mp.solutions.face_mesh
+ face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1, min_detection_confidence=0.5)
  emotion_detector = FER(mtcnn=False)

  def frame_to_timecode(frame_num, total_frames, duration):
      total_seconds = (frame_num / total_frames) * duration
      hours = int(total_seconds // 3600)

      milliseconds = int((total_seconds - int(total_seconds)) * 1000)
      return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"

+ def seconds_to_timecode(seconds):
+     hours = int(seconds // 3600)
+     minutes = int((seconds % 3600) // 60)
+     seconds = int(seconds % 60)
+     return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
+
+ def timecode_to_seconds(timecode):
+     h, m, s = map(int, timecode.split(':'))
+     return h * 3600 + m * 60 + s

  def get_face_embedding_and_emotion(face_img):
      face_tensor = torch.tensor(face_img).permute(2, 0, 1).unsqueeze(0).float() / 255

      if emotions:
          emotion_dict = emotions[0]['emotions']
      else:
+         emotion_dict = {e: 0 for e in ['angry', 'disgust', 'fear', 'sad', 'happy']}

      return embedding.cpu().numpy().flatten(), emotion_dict

  def alignFace(img):
      img_raw = img.copy()
      results = face_mesh.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

      new_img = cv2.warpAffine(img_raw, rotation_matrix, (width, height))
      return new_img

  def extract_frames(video_path, output_folder, desired_fps, progress_callback=None):
      os.makedirs(output_folder, exist_ok=True)
      clip = VideoFileClip(video_path)

      clip.close()
      return frame_count, original_fps

+ def is_frontal_face(landmarks, threshold=40):
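+     # Frontal-pose check: measures the angle at the nose tip between the vectors to the left and right chin landmarks; the face is treated as frontal when that angle is within threshold degrees of 180.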
+     nose_tip = landmarks[4]
+     left_chin = landmarks[234]
+     right_chin = landmarks[454]
+     nose_to_left = [left_chin.x - nose_tip.x, left_chin.y - nose_tip.y]
+     nose_to_right = [right_chin.x - nose_tip.x, right_chin.y - nose_tip.y]
+     dot_product = nose_to_left[0] * nose_to_right[0] + nose_to_left[1] * nose_to_right[1]
+     magnitude_left = math.sqrt(nose_to_left[0] ** 2 + nose_to_left[1] ** 2)
+     magnitude_right = math.sqrt(nose_to_right[0] ** 2 + nose_to_right[1] ** 2)
+     cos_angle = dot_product / (magnitude_left * magnitude_right)
+     angle = math.acos(cos_angle)
+     angle_degrees = math.degrees(angle)
+     return abs(180 - angle_degrees) < threshold

  def process_frames(frames_folder, aligned_faces_folder, frame_count, progress, batch_size):
      embeddings_by_frame = {}

              x1, y1, x2, y2 = [int(b) for b in boxes[0]]
              face = frame[y1:y2, x1:x2]
              if face.size > 0:
+                 results = face_mesh.process(cv2.cvtColor(face, cv2.COLOR_BGR2RGB))
+                 if results.multi_face_landmarks and is_frontal_face(results.multi_face_landmarks[0].landmark):
+                     aligned_face = alignFace(face)
+                     if aligned_face is not None:
+                         aligned_face_resized = cv2.resize(aligned_face, (160, 160))
+                         output_path = os.path.join(aligned_faces_folder, f"frame_{frame_num}_face.jpg")
+                         cv2.imwrite(output_path, aligned_face_resized)
+                         aligned_face_paths.append(output_path)
+                         embedding, emotion = get_face_embedding_and_emotion(aligned_face_resized)
+                         embeddings_by_frame[frame_num] = embedding
+                         emotions_by_frame[frame_num] = emotion
+
+         progress((i + len(batch_files)) / len(frame_files),
+                  f"Processing frames {i + 1} to {min(i + len(batch_files), len(frame_files))} of {len(frame_files)}")

      return embeddings_by_frame, emotions_by_frame, aligned_face_paths

  def cluster_faces(embeddings):
      if len(embeddings) < 2:
          print("Not enough faces for clustering. Assigning all to one cluster.")
          return np.zeros(len(embeddings), dtype=int)

      X = np.stack(embeddings)
      dbscan = DBSCAN(eps=0.5, min_samples=5, metric='cosine')
      clusters = dbscan.fit_predict(X)


      return clusters

  def organize_faces_by_person(embeddings_by_frame, clusters, aligned_faces_folder, organized_faces_folder):
      for (frame_num, embedding), cluster in zip(embeddings_by_frame.items(), clusters):
          person_folder = os.path.join(organized_faces_folder, f"person_{cluster}")

          dst = os.path.join(person_folder, f"frame_{frame_num}_face.jpg")
          shutil.copy(src, dst)

+ def save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, desired_fps, original_fps, output_folder, video_duration):
+     emotions = ['angry', 'disgust', 'fear', 'sad', 'happy']
      person_data = {}

+     for (frame_num, embedding), (_, emotion_dict), cluster in zip(embeddings_by_frame.items(), emotions_by_frame.items(), clusters):
          if cluster not in person_data:
              person_data[cluster] = []
          person_data[cluster].append((frame_num, embedding, {e: emotion_dict[e] for e in emotions}))

      embeddings_array = np.array(embeddings)
      np.save(os.path.join(output_folder, 'face_embeddings.npy'), embeddings_array)

      total_frames = max(frames)
      timecodes = [frame_to_timecode(frame, total_frames, video_duration) for frame in frames]

      df_data = {
          'Frame': frames,
          'Timecode': timecodes,
          'Embedding_Index': range(len(embeddings))
      }

      for i in range(len(embeddings[0])):
          df_data[f'Raw_Embedding_{i}'] = [embedding[i] for embedding in embeddings]

      for emotion in emotions:
          df_data[emotion] = [e[emotion] for e in emotions_data]


      return df, largest_cluster

+ class Autoencoder(nn.Module):
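+     # Fully connected autoencoder: compresses the input to a 64-dimensional bottleneck and reconstructs it; the per-frame reconstruction error (MSE) is used downstream as the anomaly score.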
+     def __init__(self, input_size):
+         super(Autoencoder, self).__init__()
+         self.encoder = nn.Sequential(
+             nn.Linear(input_size, 512),
+             nn.ReLU(),
+             nn.Linear(512, 256),
+             nn.ReLU(),
+             nn.Linear(256, 128),
+             nn.ReLU(),
+             nn.Linear(128, 64)
+         )
+         self.decoder = nn.Sequential(
+             nn.Linear(64, 128),
+             nn.ReLU(),
+             nn.Linear(128, 256),
+             nn.ReLU(),
+             nn.Linear(256, 512),
+             nn.ReLU(),
+             nn.Linear(512, input_size)
+         )

      def forward(self, x):
+         batch_size, seq_len, _ = x.size()
+         x = x.view(batch_size * seq_len, -1)
+         encoded = self.encoder(x)
+         decoded = self.decoder(encoded)
+         return decoded.view(batch_size, seq_len, -1)
+
+ def determine_anomalies(mse_values, threshold):
+     mean = np.mean(mse_values)
+     std = np.std(mse_values)
+     anomalies = mse_values > (mean + threshold * std)
+     return anomalies

+ def anomaly_detection(X_emotions, X_embeddings, epochs=200, batch_size=8, patience=3):
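+     # Trains two independent autoencoders, one on min-max scaled emotion scores and one on raw facial embeddings, and returns the per-frame reconstruction MSE for each.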
+     device = 'cuda' if torch.cuda.is_available() else 'cpu'

+     # Normalize emotions
+     scaler_emotions = MinMaxScaler()
+     X_emotions_scaled = scaler_emotions.fit_transform(X_emotions)

+     # Process emotions
+     X_emotions_scaled = torch.FloatTensor(X_emotions_scaled).to(device)
+     if X_emotions_scaled.dim() == 2:
+         X_emotions_scaled = X_emotions_scaled.unsqueeze(0)

+     model_emotions = Autoencoder(input_size=X_emotions_scaled.shape[2]).to(device)
      criterion = nn.MSELoss()
+     optimizer_emotions = optim.Adam(model_emotions.parameters())

+     # Train emotions model
      for epoch in range(epochs):
+         model_emotions.train()
+         optimizer_emotions.zero_grad()
+         output_emotions = model_emotions(X_emotions_scaled)
+         loss_emotions = criterion(output_emotions, X_emotions_scaled)
+         loss_emotions.backward()
+         optimizer_emotions.step()
+
+     # Process facial embeddings
+     X_embeddings = torch.FloatTensor(X_embeddings).to(device)
+     if X_embeddings.dim() == 2:
+         X_embeddings = X_embeddings.unsqueeze(0)
+
+     model_embeddings = Autoencoder(input_size=X_embeddings.shape[2]).to(device)
+     optimizer_embeddings = optim.Adam(model_embeddings.parameters())
+
+     # Train embeddings model
+     for epoch in range(epochs):
+         model_embeddings.train()
+         optimizer_embeddings.zero_grad()
+         output_embeddings = model_embeddings(X_embeddings)
+         loss_embeddings = criterion(output_embeddings, X_embeddings)
+         loss_embeddings.backward()
+         optimizer_embeddings.step()
+
+     # Compute MSE for emotions and embeddings
+     model_emotions.eval()
+     model_embeddings.eval()
      with torch.no_grad():
+         reconstructed_emotions = model_emotions(X_emotions_scaled).cpu().numpy()
+         reconstructed_embeddings = model_embeddings(X_embeddings).cpu().numpy()

+     mse_emotions = np.mean(np.power(X_emotions_scaled.cpu().numpy() - reconstructed_emotions, 2), axis=2).squeeze()
+     mse_embeddings = np.mean(np.power(X_embeddings.cpu().numpy() - reconstructed_embeddings, 2), axis=2).squeeze()

+     return mse_emotions, mse_embeddings

+ def plot_mse(df, mse_values, title, color='blue', time_threshold=3, anomaly_threshold=4):
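+     # Scatter of per-frame MSE over time with a rolling mean/std band; frames above mean + anomaly_threshold * std are marked red and grouped into shaded anomaly regions.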
+     plt.figure(figsize=(16, 8), dpi=500)
+     fig, ax = plt.subplots(figsize=(16, 8))

+     if 'Seconds' not in df.columns:
+         df['Seconds'] = df['Timecode'].apply(
+             lambda x: sum(float(t) * 60 ** i for i, t in enumerate(reversed(x.split(':')))))

+     # Ensure df and mse_values have the same length and remove NaN values
+     min_length = min(len(df), len(mse_values))
+     df = df.iloc[:min_length]
+     mse_values = mse_values[:min_length]

+     # Remove NaN values
+     mask = ~np.isnan(mse_values)
+     df = df[mask]
+     mse_values = mse_values[mask]

+     mean = pd.Series(mse_values).rolling(window=10).mean()
+     std = pd.Series(mse_values).rolling(window=10).std()
+     median = np.median(mse_values)

+     ax.scatter(df['Seconds'], mse_values, color=color, alpha=0.3, s=5)
+     ax.plot(df['Seconds'], mean, color=color, linewidth=2)
+     ax.fill_between(df['Seconds'], mean - std, mean + std, color=color, alpha=0.2)

+     # Add median line
+     ax.axhline(y=median, color='black', linestyle='--', label='Baseline')
+     ax.text(ax.get_xlim()[1], median, 'Baseline', verticalalignment='center', horizontalalignment='left', color='black')

+     # Add threshold line
+     threshold = np.mean(mse_values) + anomaly_threshold * np.std(mse_values)
+     ax.axhline(y=threshold, color='red', linestyle='--', label=f'Threshold: {anomaly_threshold:.1f}')
+     ax.text(ax.get_xlim()[1], threshold, f'Threshold: {anomaly_threshold:.1f}', verticalalignment='center', horizontalalignment='left', color='red')

+     anomalies = determine_anomalies(mse_values, anomaly_threshold)
+     anomaly_frames = df['Frame'].iloc[anomalies].tolist()

+     ax.scatter(df['Seconds'].iloc[anomalies], mse_values[anomalies], color='red', s=25, zorder=5)

+     anomaly_data = list(zip(df['Timecode'].iloc[anomalies],
+                             df['Seconds'].iloc[anomalies],
+                             mse_values[anomalies]))
+     anomaly_data.sort(key=lambda x: x[1])

      grouped_anomalies = []
      current_group = []

      if current_group:
          grouped_anomalies.append(current_group)

+     for group in grouped_anomalies:
+         start_sec = group[0][1]
+         end_sec = group[-1][1]
+         rect = Rectangle((start_sec, ax.get_ylim()[0]), end_sec - start_sec, ax.get_ylim()[1] - ax.get_ylim()[0],
+                          facecolor='red', alpha=0.3, zorder=1)
+         ax.add_patch(rect)
+
      for group in grouped_anomalies:
          highest_mse_anomaly = max(group, key=lambda x: x[2])
          timecode, sec, mse = highest_mse_anomaly
          ax.annotate(timecode, (sec, mse), textcoords="offset points", xytext=(0, 10),
+                     ha='center', fontsize=6, color='red')

      max_seconds = df['Seconds'].max()
      num_ticks = 100
      tick_locations = np.linspace(0, max_seconds, num_ticks)
+     tick_labels = [seconds_to_timecode(int(s)) for s in tick_locations]

      ax.set_xticks(tick_locations)
      ax.set_xticklabels(tick_labels, rotation=90, ha='center', fontsize=6)

+     ax.set_xlabel('Timecode')
      ax.set_ylabel('Mean Squared Error')
      ax.set_title(title)

      ax.grid(True, linestyle='--', alpha=0.7)
+     ax.legend()
+     plt.tight_layout()
+     plt.close()
+     return fig, anomaly_frames
+
+ def plot_mse_histogram(mse_values, title, anomaly_threshold, color='blue'):
+     plt.figure(figsize=(16, 8), dpi=500)
+     fig, ax = plt.subplots(figsize=(16, 8))
+
+     ax.hist(mse_values, bins=100, edgecolor='black', color=color, alpha=0.7)
+     ax.set_xlabel('Mean Squared Error')
+     ax.set_ylabel('Number of Samples')
+     ax.set_title(title)
+
+     mean = np.mean(mse_values)
+     std = np.std(mse_values)
+     threshold = mean + anomaly_threshold * std
+
+     ax.axvline(x=threshold, color='red', linestyle='--', linewidth=2)
+
+     # Move annotation to the bottom and away from the line
+     ax.annotate(f'Threshold: {anomaly_threshold:.1f}',
+                 xy=(threshold, ax.get_ylim()[0]),
+                 xytext=(0, -20),
+                 textcoords='offset points',
+                 ha='center', va='top',
+                 bbox=dict(boxstyle='round,pad=0.5', fc='white', ec='none', alpha=0.7),
+                 color='red')
+
+     plt.tight_layout()
+     plt.close()
+     return fig
+
+
+ def plot_emotion(df, emotion, color, anomaly_threshold):
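+     # Per-frame probability for one emotion with a rolling mean/std band; the 1-7 threshold slider is mapped to a 0-1 probability and points at or above it are highlighted in red.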
+     plt.figure(figsize=(16, 8), dpi=500)
+     fig, ax = plt.subplots(figsize=(16, 8))
+
+     df['Seconds'] = df['Timecode'].apply(
+         lambda x: sum(float(t) * 60 ** i for i, t in enumerate(reversed(x.split(':')))))
+
+     mean = df[emotion].rolling(window=10).mean()
+     std = df[emotion].rolling(window=10).std()
+     median = df[emotion].median()
+
+     ax.scatter(df['Seconds'], df[emotion], color=color, alpha=0.3, s=5)
+     ax.plot(df['Seconds'], mean, color=color, linewidth=2)
+     ax.fill_between(df['Seconds'], mean - std, mean + std, color=color, alpha=0.2)
+
+     # Add median line
+     ax.axhline(y=median, color='black', linestyle='--', label='Baseline')
+     ax.text(ax.get_xlim()[1], median, 'Baseline', verticalalignment='center', horizontalalignment='left', color='black')
+
+     # Convert anomaly threshold to probability
+     probability_threshold = (anomaly_threshold - 1) / 6  # Convert 1-7 scale to 0-1 probability
+
+     # Add threshold line and detect anomalies
+     ax.axhline(y=probability_threshold, color='red', linestyle='--', label=f'Threshold: {probability_threshold:.2f}')
+     ax.text(ax.get_xlim()[1], probability_threshold, f'Threshold: {probability_threshold:.2f}',
+             verticalalignment='center', horizontalalignment='left', color='red')
+
+     # Detect and highlight anomalies
+     anomalies = df[emotion] >= probability_threshold
+     ax.scatter(df['Seconds'][anomalies], df[emotion][anomalies], color='red', s=25, zorder=5)
+
+     max_seconds = df['Seconds'].max()
+     num_ticks = 100
+     tick_locations = np.linspace(0, max_seconds, num_ticks)
+     tick_labels = [seconds_to_timecode(int(s)) for s in tick_locations]
+
+     ax.set_xticks(tick_locations)
+     ax.set_xticklabels(tick_labels, rotation=90, ha='center', fontsize=6)
+
+     ax.set_xlabel('Timecode')
+     ax.set_ylabel('Emotion Probability')
+     ax.set_title(f"{emotion.capitalize()} Over Time")
+
+     ax.grid(True, linestyle='--', alpha=0.7)
+     ax.legend()
      plt.tight_layout()
      plt.close()
      return fig

+ def get_all_face_samples(organized_faces_folder, output_folder, largest_cluster, max_samples=500):
      face_samples = {"most_frequent": [], "others": []}
      for cluster_folder in sorted(os.listdir(organized_faces_folder)):
          if cluster_folder.startswith("person_"):

              if face_files:
                  cluster_id = int(cluster_folder.split('_')[1])
                  if cluster_id == largest_cluster:
+                     for i, sample in enumerate(face_files[:max_samples]):
                          face_path = os.path.join(person_folder, sample)
                          output_path = os.path.join(output_folder, f"face_sample_most_frequent_{i:04d}.jpg")
                          face_img = cv2.imread(face_path)

                              small_face = cv2.resize(face_img, (160, 160))
                              cv2.imwrite(output_path, small_face)
                              face_samples["most_frequent"].append(output_path)
+                             if len(face_samples["most_frequent"]) >= max_samples:
+                                 break
                  else:
+                     remaining_samples = max_samples - len(face_samples["others"])
+                     if remaining_samples > 0:
+                         for i, sample in enumerate(face_files[:remaining_samples]):
+                             face_path = os.path.join(person_folder, sample)
+                             output_path = os.path.join(output_folder, f"face_sample_other_{cluster_id:02d}_{i:04d}.jpg")
+                             face_img = cv2.imread(face_path)
+                             if face_img is not None:
+                                 small_face = cv2.resize(face_img, (160, 160))
+                                 cv2.imwrite(output_path, small_face)
+                                 face_samples["others"].append(output_path)
+                                 if len(face_samples["others"]) >= max_samples:
+                                     break
      return face_samples

+ def process_video(video_path, anomaly_threshold, desired_fps, progress=gr.Progress()):
+     start_time = time.time()
      output_folder = "output"
      os.makedirs(output_folder, exist_ok=True)
+     batch_size = 16

      with tempfile.TemporaryDirectory() as temp_dir:
          aligned_faces_folder = os.path.join(temp_dir, 'aligned_faces')

                                                                                   progress, batch_size)

          if not aligned_face_paths:
+             return ("No faces were extracted from the video.",) + (None,) * 10

          progress(0.6, "Clustering faces")
          embeddings = [embedding for _, embedding in embeddings_by_frame.items()]
          clusters = cluster_faces(embeddings)
+         num_clusters = len(set(clusters))

          progress(0.7, "Organizing faces")
          organize_faces_by_person(embeddings_by_frame, clusters, aligned_faces_folder, organized_faces_folder)

          df, largest_cluster = save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, desired_fps,
                                                        original_fps, temp_dir, video_duration)

+         # Add 'Seconds' column to df
+         df['Seconds'] = df['Timecode'].apply(
+             lambda x: sum(float(t) * 60 ** i for i, t in enumerate(reversed(x.split(':')))))
+
          progress(0.85, "Getting face samples")
          face_samples = get_all_face_samples(organized_faces_folder, output_folder, largest_cluster)

          progress(0.9, "Performing anomaly detection")
+         emotion_columns = ['angry', 'disgust', 'fear', 'sad', 'happy']
+         embedding_columns = [col for col in df.columns if col.startswith('Raw_Embedding_')]
+
+         X_emotions = df[emotion_columns].values
+         X_embeddings = df[embedding_columns].values

          try:
+             mse_emotions, mse_embeddings = anomaly_detection(X_emotions, X_embeddings, batch_size=batch_size)

              progress(0.95, "Generating plots")
+             mse_plot_embeddings, anomaly_frames_embeddings = plot_mse(df, mse_embeddings, "Facial Embeddings",
+                                                                       color='green',
+                                                                       anomaly_threshold=anomaly_threshold)
+             mse_histogram_embeddings = plot_mse_histogram(mse_embeddings, "MSE Distribution: Facial Embeddings",
+                                                           anomaly_threshold, color='green')

+             # Add emotion plots
+             emotion_plots = []
+             for emotion, color in zip(emotion_columns, ['purple', 'brown', 'green', 'orange', 'darkblue']):
+                 emotion_plot = plot_emotion(df, emotion, color, anomaly_threshold)
+                 emotion_plots.append(emotion_plot)
+
+             mse_var_emotions = np.var(mse_emotions)
+             mse_var_embeddings = np.var(mse_embeddings)

          except Exception as e:
              print(f"Error details: {str(e)}")
+             return (f"Error in anomaly detection: {str(e)}",) + (None,) * 15

          progress(1.0, "Preparing results")
          results = f"Number of persons/clusters detected: {num_clusters}\n\n"

          for cluster_id in range(num_clusters):
              results += f"Person/Cluster {cluster_id + 1}: {len([c for c in clusters if c == cluster_id])} frames\n"

+         end_time = time.time()
+         execution_time = end_time - start_time
+
+         # Load anomaly frames as images
+         anomaly_faces_embeddings = [
+             cv2.imread(os.path.join(aligned_faces_folder, f"frame_{frame}_face.jpg"))
+             for frame in anomaly_frames_embeddings
+             if os.path.exists(os.path.join(aligned_faces_folder, f"frame_{frame}_face.jpg"))
+         ]
+         anomaly_faces_embeddings = [cv2.cvtColor(face, cv2.COLOR_BGR2RGB) for face in anomaly_faces_embeddings if face is not None]
+
          return (
+             execution_time,
              results,
+             df,
+             mse_embeddings,
+             mse_emotions,
+             mse_plot_embeddings,
+             mse_histogram_embeddings,
              *emotion_plots,
              face_samples["most_frequent"],
+             face_samples["others"],
+             anomaly_faces_embeddings,
+             aligned_faces_folder
          )

+ with gr.Blocks() as iface:
+     gr.Markdown("# Facial Expressions Anomaly Detection")
+
+     with gr.Row():
+         video_input = gr.Video()
+         anomaly_threshold = gr.Slider(minimum=1, maximum=7, step=0.1, value=4.5, label="Anomaly Detection Threshold")
+         fps_slider = gr.Slider(minimum=10, maximum=20, step=5, value=20, label="Frames Per Second")
+
+     process_btn = gr.Button("Process Video")
+
+     execution_time = gr.Number(label="Execution Time (seconds)")
+     results_text = gr.Textbox(label="Anomaly Detection Results")
+
+     anomaly_frames_embeddings = gr.Gallery(label="Anomaly Frames (Facial Embeddings)", columns=6, rows=2, height="auto")
+
+     mse_embeddings_plot = gr.Plot(label="MSE: Facial Embeddings")
+     mse_embeddings_hist = gr.Plot(label="MSE Distribution: Facial Embeddings")
+
+     # Add emotion plots
+     emotion_plots = [gr.Plot(label=f"{emotion.capitalize()} Over Time") for emotion in ['angry', 'disgust', 'fear', 'sad', 'happy']]
+
+     face_samples_most_frequent = gr.Gallery(label="Most Frequent Person Samples (Target)", columns=6, rows=2, height="auto")
+     face_samples_others = gr.Gallery(label="Other Persons Samples", columns=6, rows=1, height="auto")
+
+     # Hidden components to store intermediate results
+     df_store = gr.State()
+     mse_emotions_store = gr.State()
+     mse_embeddings_store = gr.State()
+     aligned_faces_folder_store = gr.State()
+
+     process_btn.click(
+         process_video,
+         inputs=[video_input, anomaly_threshold, fps_slider],
+         outputs=[
+             execution_time, results_text, df_store, mse_embeddings_store, mse_emotions_store,
+             mse_embeddings_plot, mse_embeddings_hist,
+             *emotion_plots,
+             face_samples_most_frequent, face_samples_others, anomaly_frames_embeddings,
+             aligned_faces_folder_store
+         ]
+     )
+
+ if __name__ == "__main__":
+     iface.launch()