import streamlit as st
import tempfile
import cv2
import torch
import librosa
import speech_recognition as sr
import noisereduce as nr
import pandas as pd
import plotly.express as px
from deepface import DeepFace
from pydub import AudioSegment
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Ensure pydub uses ffmpeg (adjust the path if ffmpeg lives elsewhere on your system)
AudioSegment.converter = "/usr/bin/ffmpeg"

# Title & Instructions
st.title("🤗 AI Child Behavior Assessment")
st.markdown(
    """
    ### How to Use:
    1️⃣ Choose an **analysis type** below.
    2️⃣ Upload the required file(s).
    3️⃣ Click the **Analyze** button to process the data.
    """
)

# Load AI Model for Speech Recognition.
# Cached so the (large) model is downloaded and initialized once per session,
# not on every Streamlit rerun.
@st.cache_resource
def load_speech_model():
    processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    return processor, model

st.write("⏳ Loading AI Speech Model...")
try:
    processor, model = load_speech_model()
    st.success("✅ AI Speech Model Loaded!")
except Exception as e:
    st.error(f"❌ Error loading speech model: {e}")
    st.stop()  # the rest of the app needs the model, so don't continue without it

# ======================== DEFINE VIDEO ANALYSIS FUNCTION ========================
def analyze_video(video_path):
    """Extracts emotions from video frames, visualizes them, and returns the list of detected emotions."""
    st.write("🔎 Analyzing Emotions in Video...")
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    emotions_detected = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % 10 == 0:  # Analyze every 10th frame to keep processing time manageable
            try:
                analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False)
                emotions_detected.append(analysis[0]['dominant_emotion'])
            except Exception as e:
                st.error(f"⚠️ DeepFace error: {e}")
        frame_count += 1
    cap.release()

    if emotions_detected:
        most_common_emotion = max(set(emotions_detected), key=emotions_detected.count)
        st.success(f"🧐 Most detected emotion: {most_common_emotion}")
        # Visualization
        emotion_counts = pd.Series(emotions_detected).value_counts()
        emotion_df = pd.DataFrame({'Emotion': emotion_counts.index, 'Count': emotion_counts.values})
        fig = px.bar(emotion_df, x='Emotion', y='Count', title="Emotion Distribution in Video", color='Emotion')
        st.plotly_chart(fig)
    else:
        st.warning("⚠️ No emotions detected. Try a different video.")
    return emotions_detected  # needed by the multimodal analysis below
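
# Optional refinement (sketch): the fixed "every 10th frame" stride above means the
# real-time sampling density depends on the video's frame rate. A minimal helper,
# assuming cv2 reports a valid FPS for the file (it can return 0 for some codecs),
# that computes a stride of roughly one analyzed frame per second. The name
# `frame_stride_per_second` is illustrative and not part of the original app;
# its result could replace the hard-coded 10 in analyze_video.
def frame_stride_per_second(cap, fallback=10):
    fps = cap.get(cv2.CAP_PROP_FPS)
    return int(fps) if fps and fps > 0 else fallback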
# ======================== DEFINE AUDIO ANALYSIS FUNCTION ========================
def transcribe_audio(audio_path):
    """Transcribes speech in an audio file, visualizes word frequency, and returns the transcription."""
    try:
        st.write("🔎 Processing Audio File...")
        # `sample_rate` avoids shadowing the speech_recognition module imported as `sr`
        speech, sample_rate = librosa.load(audio_path, sr=16000)

        # Enhanced preprocessing: noise reduction, silence trimming, normalization
        speech = nr.reduce_noise(y=speech, sr=sample_rate, prop_decrease=0.4)
        speech = librosa.effects.trim(speech)[0]
        speech = librosa.util.normalize(speech)

        st.write("🤖 Processing audio with AI model...")
        input_values = processor(speech, sampling_rate=sample_rate, return_tensors="pt").input_values
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]
        st.success(f"📝 Transcription (AI Model): {transcription}")

        # Visualization
        word_count = pd.Series(transcription.split()).value_counts()
        word_df = pd.DataFrame({'Word': word_count.index, 'Count': word_count.values})
        fig = px.bar(word_df, x='Word', y='Count', title="Word Frequency in Transcription", color='Word')
        st.plotly_chart(fig)
        return transcription  # needed by the multimodal analysis below
    except Exception as e:
        st.error(f"⚠️ Error in AI Speech Processing: {e}")
        return None
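
# Optional fallback (sketch): the speech_recognition package imported above as `sr`
# can cross-check the Wav2Vec2 output via the free Google Web Speech API. This is an
# illustrative helper, not part of the main pipeline; it assumes a WAV/AIFF/FLAC
# input and an internet connection, and the name `transcribe_audio_fallback` is ours.
def transcribe_audio_fallback(audio_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio_data = recognizer.record(source)  # read the entire file
    try:
        return recognizer.recognize_google(audio_data)
    except (sr.UnknownValueError, sr.RequestError) as e:
        st.warning(f"⚠️ Fallback transcription failed: {e}")
        return None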
# ======================== DEFINE MULTIMODAL ANALYSIS FUNCTION ========================
def analyze_multimodal(multimodal_path):
    """Runs both pipelines on a single video file and compares the results."""
    st.write("🔎 Extracting Video & Audio...")

    # Extract video emotions
    video_emotions = analyze_video(multimodal_path)

    # Extract speech transcription. Note: librosa decodes the video's audio track
    # directly here, which relies on an ffmpeg-backed loader; see the optional
    # pydub extraction helper sketched at the end of this file.
    audio_transcription = transcribe_audio(multimodal_path)

    # Multimodal Analysis Visualization
    st.header("🔍 Multimodal Analysis Results")
    if not video_emotions or not audio_transcription:
        st.error("❌ Could not extract both Video & Audio insights.")
        return

    # Emotion-Speech Comparison: a simple keyword heuristic over the transcription
    speech_emotion = "Neutral"
    if any(word in audio_transcription.lower() for word in ["angry", "mad"]):
        speech_emotion = "Angry"
    elif any(word in audio_transcription.lower() for word in ["happy", "excited"]):
        speech_emotion = "Happy"
    elif any(word in audio_transcription.lower() for word in ["sad", "crying"]):
        speech_emotion = "Sad"

    dominant_video_emotion = max(set(video_emotions), key=video_emotions.count)
    fig = px.pie(
        names=["Video Emotion", "Speech Emotion"],
        values=[len(video_emotions), 1],
        title=f"Comparison: Video ({dominant_video_emotion}) vs. Speech ({speech_emotion})"
    )
    st.plotly_chart(fig)

# ======================== USER SELECTS ANALYSIS MODE ========================
analysis_option = st.radio(
    "Select Analysis Type:",
    ["📹 Video Only (Facial Emotion)", "🎤 Audio Only (Speech Analysis)", "🎬 Video & Audio (Multimodal)"]
)

# ======================== VIDEO ONLY ANALYSIS ========================
if analysis_option == "📹 Video Only (Facial Emotion)":
    st.header("📂 Upload a Video for Emotion Analysis")
    video_file = st.file_uploader("Upload a video file", type=["mp4", "avi", "mov"])
    if video_file:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
            temp_video.write(video_file.read())
            video_path = temp_video.name
        st.success("📂 Video uploaded successfully!")
        if st.button("Analyze Video"):
            analyze_video(video_path)

# ======================== AUDIO ONLY ANALYSIS ========================
elif analysis_option == "🎤 Audio Only (Speech Analysis)":
    st.header("🎤 Upload an Audio File for Speech Analysis")
    audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3"])
    if audio_file:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            temp_audio.write(audio_file.read())
            audio_path = temp_audio.name
        st.success("🎤 Audio uploaded successfully!")
        if st.button("Analyze Audio"):
            transcribe_audio(audio_path)

# ======================== MULTIMODAL ANALYSIS (VIDEO + AUDIO) ========================
elif analysis_option == "🎬 Video & Audio (Multimodal)":
    st.header("🎥 Upload a **Single File** for Video & Audio Combined Analysis")
    multimodal_file = st.file_uploader("Upload a **video file with audio**", type=["mp4", "avi", "mov"])
    if multimodal_file:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_file:
            temp_file.write(multimodal_file.read())
            multimodal_path = temp_file.name
        st.success("✅ Multimodal file uploaded successfully!")
        if st.button("Analyze Video & Audio Together"):
            analyze_multimodal(multimodal_path)
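
# Optional helper (sketch): if librosa cannot decode the uploaded video container
# directly, the pydub import above (already configured to use ffmpeg) can extract
# the audio track to a temporary WAV first. A minimal sketch under that assumption;
# the name `extract_audio_track` is ours.
# Usage: transcribe_audio(extract_audio_track(multimodal_path))
def extract_audio_track(video_path):
    audio = AudioSegment.from_file(video_path)
    audio = audio.set_channels(1).set_frame_rate(16000)  # match the ASR pipeline's 16 kHz mono input
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_wav:
        audio.export(temp_wav.name, format="wav")
        return temp_wav.name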