import gradio as gr
import time
from video_processing import process_video
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Ensure high DPI plots
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300

def process_and_show_completion(video_input_path, anomaly_threshold_input, fps, progress=gr.Progress()):
    try:
        print("Starting video processing...")
        results = process_video(video_input_path, anomaly_threshold_input, fps, progress=progress)
        print("Video processing completed.")

        if isinstance(results[0], str) and results[0].startswith("Error"):
            print(f"Error occurred: {results[0]}")
            return [results[0]] + [None] * 27

        exec_time, results_summary, df, mse_embeddings, mse_posture, mse_voice, \
            mse_plot_embeddings, mse_plot_posture, mse_plot_voice, \
            mse_histogram_embeddings, mse_histogram_posture, mse_histogram_voice, \
            mse_heatmap_embeddings, mse_heatmap_posture, mse_heatmap_voice, \
            face_samples_frequent, \
            anomaly_faces_embeddings, anomaly_frames_posture_images, \
            aligned_faces_folder, frames_folder, \
            heatmap_video_path, combined_mse_plot, correlation_heatmap = results

        anomaly_faces_embeddings_pil = [Image.fromarray(face) for face in anomaly_faces_embeddings] if anomaly_faces_embeddings is not None else []
        anomaly_frames_posture_pil = [Image.fromarray(frame) for frame in anomaly_frames_posture_images] if anomaly_frames_posture_images is not None else []

        face_samples_frequent = [Image.open(path) for path in face_samples_frequent] if face_samples_frequent is not None else []

        output = [
            exec_time, results_summary,
            mse_plot_embeddings, mse_plot_posture, mse_plot_voice,
            mse_histogram_embeddings, mse_histogram_posture, mse_histogram_voice,
            mse_heatmap_embeddings, mse_heatmap_posture, mse_heatmap_voice,
            anomaly_faces_embeddings_pil, anomaly_frames_posture_pil,
            face_samples_frequent,
            heatmap_video_path, combined_mse_plot, correlation_heatmap
        ]

        return output

    except Exception as e:
        error_message = f"An error occurred: {str(e)}"
        print(error_message)
        import traceback
        traceback.print_exc()
        return [error_message] + [None] * 16

def on_button_click(video, threshold, fps):
    start_time = time.time()
    
    # Show execution time immediately and hide description
    yield {
        execution_time: gr.update(visible=True, value=0),
        description: gr.update(visible=False),
        results: gr.update(visible=True)
    }
    
    results = process_and_show_completion(video, threshold, fps)
    end_time = time.time()
    exec_time = end_time - start_time
    
    return {
        execution_time: gr.update(visible=True, value=exec_time),
        results_text: results[1],
        mse_features_plot: results[2],
        mse_posture_plot: results[3],
        mse_voice_plot: results[4],
        mse_features_hist: results[5],
        mse_posture_hist: results[6],
        mse_voice_hist: results[7],
        mse_features_heatmap: results[8],
        mse_posture_heatmap: results[9],
        mse_voice_heatmap: results[10],
        anomaly_frames_features: results[11],
        anomaly_frames_posture: results[12],
        face_samples_most_frequent: results[13],
        heatmap_video: results[14],
        combined_mse_plot: results[15],
        correlation_heatmap_plot: results[16],
        video_display_facial: video,
        video_display_body: video,
        video_display_voice: video
    }

with gr.Blocks() as iface:
    gr.Markdown("""
    # Multimodal Behavioral Anomalies Detection

    This tool detects anomalies in facial expressions, body language, and voice over the timeline of a video.
    It extracts faces, postures, and voice from video frames, and analyzes them to identify anomalies using time series analysis and a variational autoencoder (VAE) approach.
    """)

    video_input = gr.Video(label="Input Video", visible=True)

    anomaly_threshold = gr.Slider(minimum=1, maximum=5, step=0.1, value=3, label="Anomaly Detection Threshold (Standard deviation)")
    fps_slider = gr.Slider(minimum=5, maximum=20, step=1, value=10, label="Frames Per Second (FPS)")
    process_btn = gr.Button("Detect Anomalies")
    progress_bar = gr.Progress()
    
    execution_time = gr.Number(label="Execution Time (seconds)", visible=False)

    description = gr.Markdown("""
    # Multimodal Behavioral Anomalies Detection
    
    The purpose of this tool is to detect anomalies in facial expressions, body language, and voice over the timeline of a video.   
    
    It extracts faces, postures, and voice features from video frames, detects unique facial features, body postures, and speaker embeddings, and analyzes them to identify anomalies using time series analysis, specifically utilizing a variational autoencoder (VAE) approach.   
    
    ## Applications
    
    - Identify suspicious behavior in surveillance footage.
    - Analyze micro-expressions.
    - Monitor and assess emotional states in communications.
    - Evaluate changes in vocal tone and speech patterns.
    
    ## Features
    
    - **Face Extraction**: Extracts faces from video frames using the MTCNN model.
    - **Feature Embeddings**: Extracts facial feature embeddings using the InceptionResnetV1 model.
    - **Body Posture Analysis**: Evaluates body postures using MediaPipe Pose.
    - **Voice Analysis**: Extracts and segment speaker embeddings from audio using PyAnnote.
    - **Anomaly Detection**: Uses Variational Autoencoder (VAE) to detect anomalies in facial expressions, body postures, and voice features over time.
    - **Visualization**: Represents changes in facial expressions, body postures, and vocal tone over time, marking anomaly key points.
    
    ## Limitations
    
    - **Evaluation Challenges**: Since this is an unsupervised method, there is no labeled data to compare against.
    - **Subjectivity**: The concept of what constitutes an "anomaly" can be subjective and context-dependent.
    - **Lighting and Resolution**: Variability in lighting conditions and camera resolution can affect the quality of detected features.
    - **Audio Quality**: Background noise and poor audio quality can affect the accuracy of voice analysis.
    - **Generalization**: The model may not generalize well to all types of videos and contexts.
    - **Computationally Intensive**: Processing high-resolution video frames can be computationally demanding.
    
    ## Conclusion
    This tool offers solutions for detecting behavioral anomalies in video content. However, users should be aware of its limitations and interpret results with caution.
    """, visible=True)

    with gr.Tabs(visible=False) as results:
        with gr.TabItem("Facial Features"):
            video_display_facial = gr.Video(label="Input Video")
            results_text = gr.TextArea(label="Faces Breakdown", lines=5)
            mse_features_plot = gr.Plot(label="MSE: Facial Features")
            mse_features_hist = gr.Plot(label="MSE Distribution: Facial Features")
            mse_features_heatmap = gr.Plot(label="MSE Heatmap: Facial Features")
            anomaly_frames_features = gr.Gallery(label="Anomaly Frames (Facial Features)", columns=6, rows=2, height="auto")
            face_samples_most_frequent = gr.Gallery(label="Most Frequent Person Samples", columns=10, rows=2, height="auto")

        with gr.TabItem("Body Posture"):
            video_display_body = gr.Video(label="Input Video")
            mse_posture_plot = gr.Plot(label="MSE: Body Posture")
            mse_posture_hist = gr.Plot(label="MSE Distribution: Body Posture")
            mse_posture_heatmap = gr.Plot(label="MSE Heatmap: Body Posture")
            anomaly_frames_posture = gr.Gallery(label="Anomaly Frames (Body Posture)", columns=6, rows=2, height="auto")

        with gr.TabItem("Voice"):
            video_display_voice = gr.Video(label="Input Video")
            mse_voice_plot = gr.Plot(label="MSE: Voice")
            mse_voice_hist = gr.Plot(label="MSE Distribution: Voice")
            mse_voice_heatmap = gr.Plot(label="MSE Heatmap: Voice")

        with gr.TabItem("Combined"):
            heatmap_video = gr.Video(label="Video with Anomaly Heatmap")
            combined_mse_plot = gr.Plot(label="Combined MSE Plot")
            correlation_heatmap_plot = gr.Plot(label="Correlation Heatmap")

    process_btn.click(
        fn=on_button_click,
        inputs=[video_input, anomaly_threshold, fps_slider],
        outputs=[
            execution_time, description, results,
            results_text, mse_features_plot, mse_posture_plot, mse_voice_plot,
            mse_features_hist, mse_posture_hist, mse_voice_hist,
            mse_features_heatmap, mse_posture_heatmap, mse_voice_heatmap,
            anomaly_frames_features, anomaly_frames_posture,
            face_samples_most_frequent, heatmap_video, combined_mse_plot,
            correlation_heatmap_plot, video_display_facial, video_display_body, video_display_voice
        ]
    )

if __name__ == "__main__":
    iface.launch()