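"""Gradio front end for multimodal behavioral anomaly detection.

Wires the process_video pipeline (facial feature embeddings, body posture,
and voice analysis) into a Blocks UI: a video input, threshold and FPS
sliders, and tabbed plots, galleries, and an annotated heatmap video.
"""
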
import gradio as gr
from video_processing import process_video
from PIL import Image
import matplotlib.pyplot as plt

# Ensure high DPI plots
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300

def process_and_show_completion(video_input_path, anomaly_threshold_input, fps, progress=gr.Progress()):
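    """Run process_video and reshape its outputs for the Gradio callback.

    Returns a 17-item list: execution time, results summary, then the plots,
    galleries, and file paths consumed by on_button_click. On failure the
    first item is an error message string and the remaining 16 are None.
    """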
    try:
        print("Starting video processing...")
        results = process_video(video_input_path, anomaly_threshold_input, fps, progress=progress)
        print("Video processing completed.")

        if isinstance(results[0], str) and results[0].startswith("Error"):
            print(f"Error occurred: {results[0]}")
            # Pad to the same 17-item shape as the success output below.
            return [results[0]] + [None] * 16

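        # process_video returns 23 values; the DataFrame, raw MSE arrays, and
        # folder paths are unpacked for completeness but not shown in the UI.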
        exec_time, results_summary, df, mse_embeddings, mse_posture, mse_voice, \
            mse_plot_embeddings, mse_plot_posture, mse_plot_voice, \
            mse_histogram_embeddings, mse_histogram_posture, mse_histogram_voice, \
            mse_heatmap_embeddings, mse_heatmap_posture, mse_heatmap_voice, \
            face_samples_frequent, \
            anomaly_faces_embeddings, anomaly_frames_posture_images, \
            aligned_faces_folder, frames_folder, \
            heatmap_video_path, combined_mse_plot, correlation_heatmap = results

        anomaly_faces_embeddings_pil = [Image.fromarray(face) for face in anomaly_faces_embeddings] if anomaly_faces_embeddings is not None else []
        anomaly_frames_posture_pil = [Image.fromarray(frame) for frame in anomaly_frames_posture_images] if anomaly_frames_posture_images is not None else []

        face_samples_frequent = [Image.open(path) for path in face_samples_frequent] if face_samples_frequent is not None else []

        output = [
            exec_time, results_summary,
            mse_plot_embeddings, mse_plot_posture, mse_plot_voice,
            mse_histogram_embeddings, mse_histogram_posture, mse_histogram_voice,
            mse_heatmap_embeddings, mse_heatmap_posture, mse_heatmap_voice,
            anomaly_faces_embeddings_pil, anomaly_frames_posture_pil,
            face_samples_frequent,
            heatmap_video_path, combined_mse_plot, correlation_heatmap
        ]

        return output

    except Exception as e:
        error_message = f"An error occurred: {str(e)}"
        print(error_message)
        import traceback
        traceback.print_exc()
        return [error_message] + [None] * 16

def on_button_click(video, threshold, fps, progress=gr.Progress()):
    # Gradio injects a live Progress tracker here because this function is the
    # registered event handler; pass it through to the processing pipeline.
    results = process_and_show_completion(video, threshold, fps, progress=progress)

    # On failure, results[0] is an error string and everything else is None;
    # surface the message in the results text area instead of feeding a string
    # into the numeric execution-time field.
    if isinstance(results[0], str):
        return {
            execution_time: gr.update(visible=False),
            results_tab: gr.update(visible=True),
            description_tab: gr.update(visible=False),
            results_text: results[0],
        }

    return {
        execution_time: gr.update(visible=True, value=results[0]),
        results_tab: gr.update(visible=True),
        description_tab: gr.update(visible=False),
        results_text: results[1],
        mse_features_plot: results[2],
        mse_posture_plot: results[3],
        mse_voice_plot: results[4],
        mse_features_hist: results[5],
        mse_posture_hist: results[6],
        mse_voice_hist: results[7],
        mse_features_heatmap: results[8],
        mse_posture_heatmap: results[9],
        mse_voice_heatmap: results[10],
        anomaly_frames_features: results[11],
        anomaly_frames_posture: results[12],
        face_samples_most_frequent: results[13],
        heatmap_video: results[14],
        combined_mse_plot: results[15],
        correlation_heatmap_plot: results[16]
    }

with gr.Blocks() as iface:
    gr.Markdown("""
    # Multimodal Behavioral Anomaly Detection

    This tool detects anomalies in facial expressions, body language, and voice over the timeline of a video.
    It extracts faces, postures, and voice features from the video, then analyzes them with time series analysis and a variational autoencoder (VAE) to identify anomalies.
    """)

    with gr.Row():
        video_input = gr.Video()

    anomaly_threshold = gr.Slider(minimum=1, maximum=5, step=0.1, value=3, label="Anomaly Detection Threshold (Standard deviation)")
    fps_slider = gr.Slider(minimum=5, maximum=20, step=1, value=10, label="Frames Per Second (FPS)")
    process_btn = gr.Button("Detect Anomalies")
    
    execution_time = gr.Number(label="Execution Time (seconds)", visible=False)

    with gr.Tabs() as tabs:
        with gr.TabItem("Description", id="description_tab") as description_tab:
            with gr.Column():
                gr.Markdown("""
                # Multimodal Behavioral Anomaly Detection

                The purpose of this tool is to detect anomalies in facial expressions, body language, and voice over the timeline of a video.

                It extracts faces, postures, and voice features from video frames, computes facial feature embeddings, body posture metrics, and speaker embeddings, and analyzes them for anomalies using time series analysis, specifically a variational autoencoder (VAE) approach.

                ## Applications

                - Identify suspicious behavior in surveillance footage.
                - Analyze micro-expressions.
                - Monitor and assess emotional states in communications.
                - Evaluate changes in vocal tone and speech patterns.

                ## Features

                - **Face Extraction**: Extracts faces from video frames using the MTCNN model.
                - **Feature Embeddings**: Extracts facial feature embeddings using the InceptionResnetV1 model.
                - **Body Posture Analysis**: Evaluates body postures using MediaPipe Pose.
                - **Voice Analysis**: Extracts and segments speaker embeddings from audio using PyAnnote.
                - **Anomaly Detection**: Uses a variational autoencoder (VAE) to detect anomalies in facial expressions, body postures, and voice features over time (see the sketch below).
                - **Visualization**: Represents changes in facial expressions, body postures, and vocal tone over time, marking anomaly key points.
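
                A minimal sketch of the flagging step (illustrative only; `flag_anomalies`, `mse_per_frame`, and `k` are hypothetical names, not the pipeline's actual code): a frame is flagged when its VAE reconstruction error exceeds the mean error by more than the chosen number of standard deviations.

                ```python
                import numpy as np

                def flag_anomalies(mse_per_frame: np.ndarray, k: float = 3.0) -> np.ndarray:
                    # Hypothetical sketch: flag frames whose reconstruction MSE is
                    # more than k standard deviations above the mean MSE, where k
                    # is the "Anomaly Detection Threshold" slider value.
                    threshold = mse_per_frame.mean() + k * mse_per_frame.std()
                    return mse_per_frame > threshold
                ```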

                ## Limitations

                - **Evaluation Challenges**: Since this is an unsupervised method, there is no labeled data to compare against.
                - **Subjectivity**: The concept of what constitutes an "anomaly" can be subjective and context-dependent.
                - **Lighting and Resolution**: Variability in lighting conditions and camera resolution can affect the quality of detected features.
                - **Audio Quality**: Background noise and poor audio quality can affect the accuracy of voice analysis.
                - **Generalization**: The model may not generalize well to all types of videos and contexts.
                - **Computationally Intensive**: Processing high-resolution video frames can be computationally demanding.

                ## Conclusion
                This tool automates the detection of behavioral anomalies in video content; users should be aware of the limitations above and interpret results with caution.
                """)

        with gr.TabItem("Results", id="results_tab", visible=False) as results_tab:
            with gr.Tabs():
                with gr.TabItem("Facial Features"):
                    results_text = gr.TextArea(label="Faces Breakdown", lines=5)
                    mse_features_plot = gr.Plot(label="MSE: Facial Features")
                    mse_features_hist = gr.Plot(label="MSE Distribution: Facial Features")
                    mse_features_heatmap = gr.Plot(label="MSE Heatmap: Facial Features")
                    anomaly_frames_features = gr.Gallery(label="Anomaly Frames (Facial Features)", columns=6, rows=2, height="auto")
                    face_samples_most_frequent = gr.Gallery(label="Most Frequent Person Samples", columns=10, rows=2, height="auto")

                with gr.TabItem("Body Posture"):
                    mse_posture_plot = gr.Plot(label="MSE: Body Posture")
                    mse_posture_hist = gr.Plot(label="MSE Distribution: Body Posture")
                    mse_posture_heatmap = gr.Plot(label="MSE Heatmap: Body Posture")
                    anomaly_frames_posture = gr.Gallery(label="Anomaly Frames (Body Posture)", columns=6, rows=2, height="auto")

                with gr.TabItem("Voice"):
                    mse_voice_plot = gr.Plot(label="MSE: Voice")
                    mse_voice_hist = gr.Plot(label="MSE Distribution: Voice")
                    mse_voice_heatmap = gr.Plot(label="MSE Heatmap: Voice")

                with gr.TabItem("Combined"):
                    heatmap_video = gr.Video(label="Video with Anomaly Heatmap")
                    combined_mse_plot = gr.Plot(label="Combined MSE Plot")
                    correlation_heatmap_plot = gr.Plot(label="Correlation Heatmap")

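    # The outputs list must include every component that on_button_click's
    # returned dict may reference; Gradio matches dict keys to components.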
    process_btn.click(
        fn=on_button_click,
        inputs=[video_input, anomaly_threshold, fps_slider],
        outputs=[
            execution_time, results_tab, description_tab,
            results_text, mse_features_plot, mse_posture_plot, mse_voice_plot,
            mse_features_hist, mse_posture_hist, mse_voice_hist,
            mse_features_heatmap, mse_posture_heatmap, mse_voice_heatmap,
            anomaly_frames_features, anomaly_frames_posture,
            face_samples_most_frequent, heatmap_video, combined_mse_plot,
            correlation_heatmap_plot
        ]
    )

if __name__ == "__main__":
    iface.launch()