reab5555 committed on
Commit 1831948 · verified
1 Parent(s): 21dc0af

Update app.py

Files changed (1)
  1. app.py +147 -221
app.py CHANGED
@@ -12,6 +12,7 @@ from scipy import interpolate
 from sklearn.cluster import DBSCAN, KMeans
 from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from sklearn.metrics import silhouette_score
+from sklearn.decomposition import PCA
 import umap
 import pandas as pd
 import matplotlib
@@ -41,6 +42,7 @@ mp_face_mesh = mp.solutions.face_mesh
 face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1, min_detection_confidence=0.5)
 emotion_detector = FER(mtcnn=False)
 
+
 def frame_to_timecode(frame_num, total_frames, duration):
     total_seconds = (frame_num / total_frames) * duration
     hours = int(total_seconds // 3600)
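For orientation, the timecode math in frame_to_timecode is plain proportional scaling: the frame's position within the clip, times the clip duration. A standalone sanity check with made-up numbers (frame 450 of 900 in a 60-second clip); the minutes and seconds formulas are inferred from the return format, since the hunk context only shows the hours line:

    total_seconds = (450 / 900) * 60.0                               # 30.0
    hours = int(total_seconds // 3600)                               # 0
    minutes = int((total_seconds % 3600) // 60)                      # 0
    seconds = int(total_seconds % 60)                                # 30
    milliseconds = int((total_seconds - int(total_seconds)) * 1000)  # 0
    print(f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}")  # 00:00:30.000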
@@ -49,6 +51,7 @@ def frame_to_timecode(frame_num, total_frames, duration):
     milliseconds = int((total_seconds - int(total_seconds)) * 1000)
     return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
 
+
 def get_face_embedding_and_emotion(face_img):
     face_tensor = torch.tensor(face_img).permute(2, 0, 1).unsqueeze(0).float() / 255
     face_tensor = (face_tensor - 0.5) / 0.5
@@ -64,6 +67,7 @@ def get_face_embedding_and_emotion(face_img):
 
     return embedding.cpu().numpy().flatten(), emotion_dict
 
+
 def alignFace(img):
     img_raw = img.copy()
     results = face_mesh.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
@@ -89,6 +93,7 @@ def alignFace(img):
     new_img = cv2.warpAffine(img_raw, rotation_matrix, (width, height))
     return new_img
 
+
def extract_frames(video_path, output_folder, desired_fps, progress_callback=None):
    os.makedirs(output_folder, exist_ok=True)
    clip = VideoFileClip(video_path)
@@ -112,6 +117,7 @@ def extract_frames(video_path, output_folder, desired_fps, progress_callback=Non
     clip.close()
     return frame_count, original_fps
 
+
 def process_frames(frames_folder, aligned_faces_folder, frame_count, progress, batch_size):
     embeddings_by_frame = {}
     emotions_by_frame = {}
@@ -155,6 +161,7 @@ def process_frames(frames_folder, aligned_faces_folder, frame_count, progress, b
 
     return embeddings_by_frame, emotions_by_frame, aligned_face_paths
 
+
 def cluster_faces(embeddings):
     if len(embeddings) < 2:
         print("Not enough faces for clustering. Assigning all to one cluster.")
@@ -171,6 +178,7 @@ def cluster_faces(embeddings):
 
     return clusters
 
+
 def organize_faces_by_person(embeddings_by_frame, clusters, aligned_faces_folder, organized_faces_folder):
     for (frame_num, embedding), cluster in zip(embeddings_by_frame.items(), clusters):
         person_folder = os.path.join(organized_faces_folder, f"person_{cluster}")
@@ -179,8 +187,34 @@ def organize_faces_by_person(embeddings_by_frame, clusters, aligned_faces_folder
         dst = os.path.join(person_folder, f"frame_{frame_num}_face.jpg")
         shutil.copy(src, dst)
 
+
+def find_optimal_components(embeddings, max_components=10):
+    pca = PCA(n_components=max_components)
+    pca.fit(embeddings)
+
+    explained_variance_ratio = pca.explained_variance_ratio_
+    cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
+
+    # Plot explained variance ratio
+    plt.figure(figsize=(10, 6))
+    plt.plot(range(1, max_components + 1), cumulative_variance_ratio, 'bo-')
+    plt.xlabel('Number of Components')
+    plt.ylabel('Cumulative Explained Variance Ratio')
+    plt.title('Explained Variance Ratio vs. Number of Components')
+    plt.grid(True)
+
+    # Find elbow point
+    differences = np.diff(cumulative_variance_ratio)
+    elbow_point = np.argmin(differences) + 1
+
+    plt.axvline(x=elbow_point, color='r', linestyle='--', label=f'Elbow point: {elbow_point}')
+    plt.legend()
+
+    return elbow_point, plt
+
+
 def save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, desired_fps, original_fps, output_folder,
-                            num_components, video_duration):
+                            video_duration):
     emotions = ['angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral']
     person_data = {}
 
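A note on the new find_optimal_components heuristic: differences = np.diff(cumulative_variance_ratio) holds the per-component gains in explained variance, so np.argmin(differences) + 1 picks the component after which the next gain is smallest. Because those gains usually shrink monotonically, this tends to land near the top of the scanned range rather than at a visual elbow. A common alternative, shown here only as a sketch and not what this commit does, is to keep the smallest number of components that reaches a target cumulative variance:

    import numpy as np

    def components_for_variance(explained_variance_ratio, target=0.95):
        # Hypothetical helper, not part of the commit: smallest k whose
        # cumulative explained variance reaches `target`.
        cumulative = np.cumsum(explained_variance_ratio)
        # searchsorted returns the first index where cumulative >= target
        return int(np.searchsorted(cumulative, target)) + 1

    print(components_for_variance([0.6, 0.2, 0.1, 0.05, 0.05], target=0.9))  # 3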
@@ -199,7 +233,10 @@ def save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, de
     embeddings_array = np.array(embeddings)
     np.save(os.path.join(output_folder, 'face_embeddings.npy'), embeddings_array)
 
-    reducer = umap.UMAP(n_components=num_components, random_state=1)
+    # Find optimal number of components
+    optimal_components, _ = find_optimal_components(embeddings_array)
+
+    reducer = umap.UMAP(n_components=optimal_components, random_state=1)
     embeddings_reduced = reducer.fit_transform(embeddings)
 
     scaler = MinMaxScaler(feature_range=(0, 1))
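The reduce-then-rescale step above feeds the `Comp i` columns of the per-frame DataFrame. A minimal sketch of the same two calls on toy data (the array shapes are illustrative, not taken from the app):

    import numpy as np
    import umap
    from sklearn.preprocessing import MinMaxScaler

    X = np.random.rand(100, 512)  # stand-in for 100 face embeddings
    reduced = umap.UMAP(n_components=3, random_state=1).fit_transform(X)
    scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(reduced)
    print(scaled.shape)           # (100, 3): one row per frame, one column per component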
@@ -216,7 +253,11 @@ def save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, de
         'Embedding_Index': range(len(embeddings))
     }
 
-    for i in range(num_components):
+    # Add raw embeddings
+    for i in range(len(embeddings[0])):
+        df_data[f'Raw_Embedding_{i}'] = [embedding[i] for embedding in embeddings]
+
+    for i in range(optimal_components):
         df_data[f'Comp {i + 1}'] = embeddings_reduced_normalized[:, i]
 
     for emotion in emotions:
@@ -226,33 +267,6 @@ def save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, de
 
     return df, largest_cluster
 
-def determine_optimal_anomalies(anomaly_scores, z_threshold=3.5):
-    mean = np.mean(anomaly_scores)
-    std = np.std(anomaly_scores)
-    threshold = mean + z_threshold * std
-    anomalies = anomaly_scores > threshold
-    return anomalies, np.where(anomalies)[0]
-
-def timecode_to_seconds(timecode):
-    h, m, s = map(float, timecode.split(':'))
-    return h * 3600 + m * 60 + s
-
-def group_similar_timecodes(timecodes, scores, threshold_seconds=10):
-    grouped = []
-    current_group = []
-
-    for i, (timecode, score) in enumerate(zip(timecodes, scores)):
-        if not current_group or abs(
-                timecode_to_seconds(timecode) - timecode_to_seconds(current_group[0][0])) <= threshold_seconds:
-            current_group.append((timecode, score, i))
-        else:
-            grouped.append(current_group)
-            current_group = [(timecode, score, i)]
-
-    if current_group:
-        grouped.append(current_group)
-
-    return grouped
 
 class LSTMAutoencoder(nn.Module):
     def __init__(self, input_size, hidden_size=64, num_layers=2):
@@ -268,21 +282,17 @@ class LSTMAutoencoder(nn.Module):
         out = self.fc(outputs)
         return out
 
-def lstm_anomaly_detection(X, feature_columns, epochs=100, batch_size=64):
+
+def lstm_anomaly_detection(X, feature_columns, raw_embedding_columns, epochs=100, batch_size=64):
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
     X = torch.FloatTensor(X).to(device)
     if X.dim() == 2:
         X = X.unsqueeze(0)
     elif X.dim() == 1:
         X = X.unsqueeze(0).unsqueeze(2)
-    elif X.dim() > 3:
-        raise ValueError(f"Input X should be 1D, 2D or 3D, but got {X.dim()} dimensions")
 
     print(f"X shape after reshaping: {X.shape}")
 
-    train_size = int(0.9 * X.shape[1])
-    X_train, X_val = X[:, :train_size, :], X[:, train_size:, :]
-
     model = LSTMAutoencoder(input_size=X.shape[2]).to(device)
     criterion = nn.MSELoss()
     optimizer = optim.Adam(model.parameters())
@@ -290,22 +300,19 @@ def lstm_anomaly_detection(X, feature_columns, epochs=100, batch_size=64):
     for epoch in range(epochs):
         model.train()
         optimizer.zero_grad()
-        output_train = model(X_train)
-        loss_train = criterion(output_train, X_train.squeeze(0))
-        loss_train.backward()
+        output = model(X)
+        loss = criterion(output, X)
+        loss.backward()
         optimizer.step()
 
-        model.eval()
-        with torch.no_grad():
-            output_val = model(X_val)
-            loss_val = criterion(output_val, X_val.squeeze(0))
+        if epoch % 10 == 0:
+            print(f"Epoch [{epoch}/{epochs}], Loss: {loss.item():.4f}")
 
     model.eval()
     with torch.no_grad():
         reconstructed = model(X).squeeze(0).cpu().numpy()
 
     mse_all = np.mean(np.power(X.squeeze(0).cpu().numpy() - reconstructed, 2), axis=1)
-    anomalies_all, top_indices_all = determine_optimal_anomalies(mse_all)
 
     component_columns = [col for col in feature_columns if col.startswith('Comp')]
     component_indices = [feature_columns.index(col) for col in component_columns]
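The rewritten loop above trains the autoencoder to reproduce its own input (the train/validation split is dropped), then scores every timestep by reconstruction error. A minimal self-contained sketch of that pattern, with toy tensor sizes and a plain linear layer standing in for the LSTM autoencoder:

    import numpy as np
    import torch
    import torch.nn as nn

    torch.manual_seed(0)
    X = torch.randn(1, 50, 4)              # (batch=1, timesteps=50, features=4)
    model = nn.Linear(4, 4)                # stand-in for LSTMAutoencoder
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.MSELoss()

    for epoch in range(100):
        optimizer.zero_grad()
        loss = criterion(model(X), X)      # reconstruct the input
        loss.backward()
        optimizer.step()

    with torch.no_grad():
        recon = model(X).squeeze(0).numpy()
    mse_per_step = np.mean((X.squeeze(0).numpy() - recon) ** 2, axis=1)
    print(mse_per_step.shape)              # (50,): one error score per timestep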
@@ -316,53 +323,10 @@ def lstm_anomaly_detection(X, feature_columns, epochs=100, batch_size=64):
     else:
         mse_comp = mse_all
 
-    anomalies_comp, top_indices_comp = determine_optimal_anomalies(mse_comp)
-
-    return (anomalies_all, mse_all, top_indices_all,
-            anomalies_comp, mse_comp, top_indices_comp,
-            model)
-
-def emotion_anomaly_detection(emotion_data, epochs=100, batch_size=64):
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    X = torch.FloatTensor(emotion_data.values).to(device)
-    if X.dim() == 1:
-        X = X.unsqueeze(0).unsqueeze(2)  # Add batch and feature dimensions
-    elif X.dim() == 2:
-        X = X.unsqueeze(0)  # Add batch dimension
-
-    model = LSTMAutoencoder(input_size=1).to(device)
-    criterion = nn.MSELoss()
-    optimizer = optim.Adam(model.parameters())
+    raw_embedding_indices = [feature_columns.index(col) for col in raw_embedding_columns]
+    mse_raw = np.mean(np.power(X.squeeze(0).cpu().numpy()[:, raw_embedding_indices] - reconstructed[:, raw_embedding_indices], 2), axis=1)
 
-    for epoch in range(epochs):
-        model.train()
-        optimizer.zero_grad()
-        output = model(X)
-        loss = criterion(output, X)
-        loss.backward()
-        optimizer.step()
-
-    model.eval()
-    with torch.no_grad():
-        reconstructed = model(X).squeeze(0).cpu().numpy()
-
-    mse = np.mean(np.power(X.squeeze(0).cpu().numpy() - reconstructed, 2), axis=1)
-    anomalies, top_indices = determine_optimal_anomalies(mse)
-
-    return anomalies, mse, top_indices
-
-def normalize_scores(scores):
-    min_score = np.min(scores)
-    max_score = np.max(scores)
-    if max_score == min_score:
-        return np.full_like(scores, 100)
-    return ((scores - min_score) / (max_score - min_score)) * 100
-
-def plot_to_image(fig):
-    buf = io.BytesIO()
-    fig.savefig(buf, format='png', dpi=300, bbox_inches='tight')
-    buf.seek(0)
-    return buf
+    return mse_all, mse_comp, mse_raw
 
 def embedding_anomaly_detection(embeddings, epochs=100, batch_size=64):
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
@@ -389,106 +353,74 @@ def embedding_anomaly_detection(embeddings, epochs=100, batch_size=64):
         reconstructed = model(X).squeeze(0).cpu().numpy()
 
     mse = np.mean(np.power(X.squeeze(0).cpu().numpy() - reconstructed, 2), axis=1)
-    anomalies, top_indices = determine_optimal_anomalies(mse)
-
-    return anomalies, mse, top_indices
-
-def plot_anomaly_scores(df, anomaly_scores, top_indices, title, timecodes):
-    plt.figure(figsize=(16, 8), dpi=300)
-    fig, ax = plt.subplots(figsize=(16, 8))
-
-    df['Seconds'] = df['Timecode'].apply(
-        lambda x: sum(float(t) * 60 ** i for i, t in enumerate(reversed(x.split(':')))))
-
-    # Filter out data points without faces
-    valid_indices = [i for i in range(len(anomaly_scores)) if i in df.index]
-    seconds = df['Seconds'].iloc[valid_indices].values
-    scores = anomaly_scores[valid_indices]
-
-    ax.scatter(seconds, scores, color='blue', alpha=0.7, s=10)
-
-    top_indices = [idx for idx in top_indices if idx in valid_indices]
-    ax.scatter(df['Seconds'].iloc[top_indices], anomaly_scores[top_indices], color='red', s=50, zorder=5)
-
-    # Calculate and plot baseline
-    non_anomalous_scores = np.delete(scores, top_indices)
-    baseline = np.mean(non_anomalous_scores)
-    ax.axhline(y=baseline, color='black', linestyle='--', linewidth=2.5)
-    ax.text(df['Seconds'].max(), baseline, f'Baseline ({baseline:.2f})',
-            verticalalignment='bottom', horizontalalignment='right', color='black')
-
-    grouped_timecodes = group_similar_timecodes([df['Timecode'].iloc[idx] for idx in top_indices],
-                                                scores[top_indices])
-
-    for group in grouped_timecodes:
-        max_score_idx = max(range(len(group)), key=lambda i: group[i][1])
-        timecode, score, idx = group[max_score_idx]
-        ax.annotate(timecode,
-                    (df['Seconds'].iloc[top_indices[idx]], score),
-                    xytext=(5, 5), textcoords='offset points',
-                    fontsize=6, color='red')
-
-    max_seconds = df['Seconds'].max()
-    ax.set_xlim(0, max_seconds)
-    num_ticks = 100
-    ax.set_xticks(np.linspace(0, max_seconds, num_ticks))
-    ax.set_xticklabels([f"{int(x // 60):02d}:{int(x % 60):02d}" for x in ax.get_xticks()],
-                       rotation=90, ha='center', va='top')
-
-    ax.set_xlabel('Time')
-    ax.set_ylabel('Anomaly Score')
-    ax.set_title(title)
-
-    ax.grid(True, linestyle='--', alpha=0.7)
-    plt.tight_layout()
-    plt.close()
-    return fig
+    return mse
 
-def plot_emotion(df, emotion, anomaly_scores, top_indices, color, timecodes):
+
+def determine_anomalies(mse_values, threshold=3.5):
+    mean = np.mean(mse_values)
+    std = np.std(mse_values)
+    anomalies = mse_values > (mean + threshold * std)
+    return anomalies
+
+
+def plot_mse(df, mse_values, title, color='blue', time_threshold=1, hide_first_n=3):
     plt.figure(figsize=(16, 8), dpi=300)
     fig, ax = plt.subplots(figsize=(16, 8))
 
     df['Seconds'] = df['Timecode'].apply(
         lambda x: sum(float(t) * 60 ** i for i, t in enumerate(reversed(x.split(':')))))
 
-    # Filter out data points without faces
-    valid_indices = [i for i in range(len(anomaly_scores)) if i in df.index]
-    seconds = df['Seconds'].iloc[valid_indices].values
-    scores = anomaly_scores[valid_indices]
-
-    ax.scatter(seconds, scores, color=color, alpha=0.7, s=10)
-
-    top_indices = [idx for idx in top_indices if idx in valid_indices]
-    ax.scatter(df['Seconds'].iloc[top_indices], anomaly_scores[top_indices], color='red', s=50, zorder=5)
-
-    # Calculate and plot baseline
-    non_anomalous_scores = np.delete(anomaly_scores, top_indices)
-    baseline = np.mean(non_anomalous_scores)
-    ax.axhline(y=baseline, color='black', linestyle='--', linewidth=2.5)
-    ax.text(df['Seconds'].max(), baseline, f'Baseline ({baseline:.2f})',
-            verticalalignment='bottom', horizontalalignment='right', color='black')
-
-    grouped_timecodes = group_similar_timecodes([df['Timecode'].iloc[idx] for idx in top_indices],
-                                                anomaly_scores[top_indices])
-
-    for group in grouped_timecodes:
-        max_score_idx = max(range(len(group)), key=lambda i: group[i][1])
-        timecode, score, idx = group[max_score_idx]
-        ax.annotate(timecode,
-                    (df['Seconds'].iloc[top_indices[idx]], score),
-                    xytext=(5, 5), textcoords='offset points',
-                    fontsize=6, color='red')
+    # Plot all points
+    ax.scatter(df['Seconds'], mse_values, color=color, alpha=0.7, s=10)
+
+    # Determine anomalies
+    anomalies = determine_anomalies(mse_values)
+
+    # Hide the first n anomalies
+    visible_anomalies = np.where(anomalies)[0][hide_first_n:]
+    ax.scatter(df['Seconds'].iloc[visible_anomalies], mse_values[visible_anomalies], color='red', s=50, zorder=5)
+
+    # Group closely occurring anomalies and annotate only the highest MSE
+    anomaly_data = list(zip(df['Timecode'].iloc[visible_anomalies],
+                            df['Seconds'].iloc[visible_anomalies],
+                            mse_values[visible_anomalies]))
+    anomaly_data.sort(key=lambda x: x[1])  # Sort by seconds
+
+    grouped_anomalies = []
+    current_group = []
+    for timecode, sec, mse in anomaly_data:
+        if not current_group or sec - current_group[-1][1] <= time_threshold:
+            current_group.append((timecode, sec, mse))
+        else:
+            grouped_anomalies.append(current_group)
+            current_group = [(timecode, sec, mse)]
+    if current_group:
+        grouped_anomalies.append(current_group)
+
+    for group in grouped_anomalies:
+        highest_mse_anomaly = max(group, key=lambda x: x[2])
+        timecode, sec, mse = highest_mse_anomaly
+        ax.annotate(timecode, (sec, mse), textcoords="offset points", xytext=(0, 10),
+                    ha='center', fontsize=8, color='red')
+
+    # Add baseline (mean MSE) line
+    mean_mse = np.mean(mse_values)
+    ax.axhline(y=mean_mse, color='black', linestyle='--', linewidth=1)
+    ax.text(df['Seconds'].max(), mean_mse, f'Baseline ({mean_mse:.6f})',
+            verticalalignment='bottom', horizontalalignment='right', color='black', fontsize=8)
 
+    # Set x-axis labels to timecodes
     max_seconds = df['Seconds'].max()
-    ax.set_xlim(0, max_seconds)
     num_ticks = 100
-    ax.set_xticks(np.linspace(0, max_seconds, num_ticks))
-    ax.set_xticklabels([f"{int(x // 60):02d}:{int(x % 60):02d}" for x in ax.get_xticks()],
-                       rotation=90, ha='center', va='top')
+    tick_locations = np.linspace(0, max_seconds, num_ticks)
+    tick_labels = [frame_to_timecode(int(s * df['Frame'].max() / max_seconds), df['Frame'].max(), max_seconds)
+                   for s in tick_locations]
+
+    ax.set_xticks(tick_locations)
+    ax.set_xticklabels(tick_labels, rotation=90, ha='center', fontsize=6)
 
     ax.set_xlabel('Time')
-    ax.set_ylabel(f'{emotion.capitalize()} Anomaly Score')
-    ax.set_title(f'{emotion.capitalize()} Anomaly Scores')
+    ax.set_ylabel('Mean Squared Error')
+    ax.set_title(title)
 
     ax.grid(True, linestyle='--', alpha=0.7)
     plt.tight_layout()
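The new determine_anomalies flags any score more than `threshold` standard deviations above the mean of the same series. Worth keeping in mind when reading the plots: because the cutoff is derived from the data it filters, very short series can never trip it. A quick check with made-up numbers:

    import numpy as np

    mse_values = np.array([0.10, 0.12, 0.11, 0.13, 0.95])   # illustrative only
    cutoff = mse_values.mean() + 3.5 * mse_values.std()
    print(mse_values > cutoff)   # all False: with n points, the largest possible
                                 # z-score is (n - 1) / sqrt(n), about 1.79 for n = 5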
@@ -522,10 +454,18 @@ def get_all_face_samples(organized_faces_folder, output_folder, largest_cluster)
             cv2.imwrite(output_path, small_face)
             face_samples["others"].append(output_path)
     return face_samples
-def process_video(video_path, num_components, desired_fps, batch_size, progress=gr.Progress()):
+
+def process_video(video_path, desired_fps, batch_size, progress=gr.Progress()):
     output_folder = "output"
     os.makedirs(output_folder, exist_ok=True)
 
+    # Initialize plot variables
+    mse_plot_all = None
+    mse_plot_comp = None
+    mse_plot_raw = None
+    emotion_plots = [None] * 6  # For the 6 emotions
+    face_samples = {"most_frequent": [], "others": []}
+
     with tempfile.TemporaryDirectory() as temp_dir:
         aligned_faces_folder = os.path.join(temp_dir, 'aligned_faces')
         organized_faces_folder = os.path.join(temp_dir, 'organized_faces')
@@ -552,7 +492,7 @@ def process_video(video_path, num_components, desired_fps, batch_size, progress=
 
         if not aligned_face_paths:
             return ("No faces were extracted from the video.",
-                    None, None, None, None, None, None, None, None)
+                    None, None, None, None, None, None, None, None, None, [], [])
 
         progress(0.6, "Clustering faces")
         embeddings = [embedding for _, embedding in embeddings_by_frame.items()]
@@ -564,7 +504,7 @@ def process_video(video_path, num_components, desired_fps, batch_size, progress=
 
         progress(0.8, "Saving person data")
         df, largest_cluster = save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, desired_fps,
-                                                      original_fps, temp_dir, num_components, video_duration)
+                                                      original_fps, temp_dir, video_duration)
 
         progress(0.85, "Getting face samples")
         face_samples = get_all_face_samples(organized_faces_folder, output_folder, largest_cluster)
@@ -572,46 +512,29 @@ def process_video(video_path, num_components, desired_fps, batch_size, progress=
         progress(0.9, "Performing anomaly detection")
         feature_columns = [col for col in df.columns if
                            col not in ['Frame', 'Timecode', 'Time (Minutes)', 'Embedding_Index']]
+        raw_embedding_columns = [col for col in df.columns if col.startswith('Raw_Embedding_')]
         X = df[feature_columns].values
 
         try:
-            anomalies_all, anomaly_scores_all, top_indices_all, anomalies_comp, anomaly_scores_comp, top_indices_comp, _ = lstm_anomaly_detection(
-                X, feature_columns, batch_size=batch_size)
-
-            anomaly_scores_all = normalize_scores(anomaly_scores_all)
-            anomaly_scores_comp = normalize_scores(anomaly_scores_comp)
-
-            emotion_anomalies = {}
-            for emotion in ['fear', 'sad', 'angry', 'happy', 'surprise', 'neutral']:
-                anomalies, scores, indices = emotion_anomaly_detection(df[emotion])
-                emotion_anomalies[emotion] = {
-                    'anomalies': anomalies,
-                    'scores': normalize_scores(scores),
-                    'indices': indices
-                }
-
-        except Exception as e:
-            print(f"Error details: {str(e)}")
-            return f"Error in anomaly detection: {str(e)}", None, None, None, None, None, None, None, None
-
-        progress(0.95, "Generating plots")
-        try:
-            anomaly_plot_all = plot_anomaly_scores(df, anomaly_scores_all, top_indices_all,
-                                                   "Facial Features + Emotions",
-                                                   df['Timecode'].iloc[top_indices_all].values)
-            anomaly_plot_comp = plot_anomaly_scores(df, anomaly_scores_comp, top_indices_comp, "Facial Features",
-                                                    df['Timecode'].iloc[top_indices_comp].values)
+            mse_all, mse_comp, mse_raw = lstm_anomaly_detection(
+                X, feature_columns, raw_embedding_columns, batch_size=batch_size)
+
+            progress(0.95, "Generating plots")
+            mse_plot_all = plot_mse(df, mse_all, "Facial Features + Emotions", color='blue', hide_first_n=3)
+            mse_plot_comp = plot_mse(df, mse_comp, "Facial Features", color='deepskyblue', hide_first_n=3)
+            mse_plot_raw = plot_mse(df, mse_raw, "Facial Embeddings", color='steelblue', hide_first_n=3)
+
             emotion_plots = [
-                plot_emotion(df, emotion,
-                             emotion_anomalies[emotion]['scores'],
-                             emotion_anomalies[emotion]['indices'],
-                             color,
-                             df['Timecode'].iloc[emotion_anomalies[emotion]['indices']].values)
+                plot_mse(df, embedding_anomaly_detection(df[emotion].values.reshape(-1, 1)),
+                         f"MSE: {emotion.capitalize()}", color=color, hide_first_n=3)
                 for emotion, color in zip(['fear', 'sad', 'angry', 'happy', 'surprise', 'neutral'],
                                           ['purple', 'green', 'orange', 'darkblue', 'gold', 'grey'])
             ]
+
         except Exception as e:
-            return f"Error generating plots: {str(e)}", None, None, None, None, None, None, None, None
+            print(f"Error details: {str(e)}")
+            return (f"Error in anomaly detection: {str(e)}",
+                    None, None, None, None, None, None, None, None, None, [], [])
 
         progress(1.0, "Preparing results")
         results = f"Number of persons/clusters detected: {num_clusters}\n\n"
@@ -619,55 +542,58 @@ def process_video(video_path, num_components, desired_fps, batch_size, progress=
         for cluster_id in range(num_clusters):
             results += f"Person/Cluster {cluster_id + 1}: {len([c for c in clusters if c == cluster_id])} frames\n"
 
-
         return (
             results,
-            anomaly_plot_all,
-            anomaly_plot_comp,
+            mse_plot_all,
+            mse_plot_comp,
+            mse_plot_raw,
             *emotion_plots,
             face_samples["most_frequent"],
             face_samples["others"]
         )
 
-
+# Define gallery outputs
 gallery_outputs = [
     gr.Gallery(label="Most Frequent Person Random Samples", columns=5, rows=2, height="auto"),
     gr.Gallery(label="Other Persons Random Samples", columns=5, rows=1, height="auto")
 ]
 
+# Update the Gradio interface
 iface = gr.Interface(
     fn=process_video,
     inputs=[
         gr.Video(),
-        gr.Slider(minimum=1, maximum=10, step=1, value=5, label="Number of Components"),
         gr.Slider(minimum=1, maximum=20, step=1, value=10, label="Desired FPS"),
         gr.Slider(minimum=1, maximum=32, step=1, value=8, label="Batch Size")
     ],
     outputs=[
         gr.Textbox(label="Anomaly Detection Results"),
-        gr.Plot(label="Anomaly Scores (Facial Features + Emotions)"),
-        gr.Plot(label="Anomaly Scores (Facial Features)"),
-        gr.Plot(label="Fear Anomalies"),
-        gr.Plot(label="Sad Anomalies"),
-        gr.Plot(label="Angry Anomalies"),
-        gr.Plot(label="Happy Anomalies"),
-        gr.Plot(label="Surprise Anomalies"),
-        gr.Plot(label="Neutral Anomalies"),
+        gr.Plot(label="MSE: Facial Features + Emotions"),
+        gr.Plot(label="MSE: Facial Features (UMAP)"),
+        gr.Plot(label="MSE: Raw Facial Embeddings"),
+        gr.Plot(label="MSE: Fear"),
+        gr.Plot(label="MSE: Sad"),
+        gr.Plot(label="MSE: Angry"),
+        gr.Plot(label="MSE: Happy"),
+        gr.Plot(label="MSE: Surprise"),
+        gr.Plot(label="MSE: Neutral"),
     ] + gallery_outputs,
     title="Facial Expressions Anomaly Detection",
     description="""
     This application detects anomalies in facial expressions and emotions from a video input.
     It identifies distinct persons in the video and provides sample faces for each, with multiple samples for the most frequent person.
 
+    The graphs show Mean Squared Error (MSE) values for different aspects of facial expressions and emotions over time.
+    Each point represents a frame, with red points indicating detected anomalies.
+    Anomalies are annotated with their corresponding timecodes.
+    Higher MSE values indicate more unusual or anomalous expressions or emotions at that point in the video.
+
     Adjust the parameters as needed:
-    - Number of Components: Complexity of the facial expression model
     - Desired FPS: Frames per second to analyze (lower for faster processing)
     - Batch Size: Affects processing speed and memory usage
-
-    Click on any graph to enlarge it.
     """,
     allow_flagging="never"
 )
 
-
-iface.launch()
+# Launch the interface
+iface.launch()
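Finally, for anyone tracing the wiring: process_video now returns twelve values (the results text, three MSE plots, six emotion plots, and two galleries), matching the twelve output components declared in the interface, and both error paths return the same shape. A minimal sketch of that contract under the standard Gradio API (the function and component arguments here are illustrative):

    import gradio as gr

    def fake_process_video(video, fps, batch):
        # 1 textbox + 9 plots + 2 galleries = 12 outputs, as in the interface above
        return ("ok",) + (None,) * 9 + ([], [])

    demo = gr.Interface(
        fn=fake_process_video,
        inputs=[gr.Video(), gr.Slider(1, 20), gr.Slider(1, 32)],
        outputs=[gr.Textbox()] + [gr.Plot() for _ in range(9)] + [gr.Gallery(), gr.Gallery()],
    )
    # demo.launch()  # uncomment to serve locally; Gradio prints a local URL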