import streamlit as st
import tempfile
import os
import cv2
import numpy as np
import torch
import librosa
import noisereduce as nr
import pandas as pd
import plotly.express as px
from deepface import DeepFace
from pydub import AudioSegment
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Ensure Pydub uses ffmpeg
AudioSegment.converter = "/usr/bin/ffmpeg"
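# Hedged fallback (an addition, not part of the original app): if ffmpeg is
# not at /usr/bin/ffmpeg, resolve it from PATH so pydub can still convert audio.
import shutil
if not os.path.exists(AudioSegment.converter):
    AudioSegment.converter = shutil.which("ffmpeg") or AudioSegment.converter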

# Title & Instructions
st.title("πŸ€— AI Child Behavior Assessment")
st.markdown(
    """
    ### How to Use:
    1️⃣ Choose an **analysis type** below.  
    2️⃣ Upload the required file(s).  
    3️⃣ Click the **Analyze** button to process the data.  
    """
)

# Load AI Model for Speech Recognition.
# st.cache_resource keeps the processor/model in memory across Streamlit
# reruns, so they are downloaded and loaded only once per session.
@st.cache_resource
def load_speech_model():
    processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    return processor, model

st.write("⏳ Loading AI Speech Model...")
try:
    processor, model = load_speech_model()
    st.success("βœ… AI Speech Model Loaded!")
except Exception as e:
    st.error(f"❌ Error loading speech model: {e}")
    st.stop()  # everything below needs the model; stop instead of crashing later

# ======================== DEFINE VIDEO ANALYSIS FUNCTION ========================
def analyze_video(video_path):
    """Processes video and extracts emotions with visualization"""
    st.write("πŸ”Ž Analyzing Emotions in Video...")
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    emotions_detected = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % 10 == 0:  # Analyze every 10th frame
            try:
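                # enforce_detection=False keeps the loop alive on frames where
                # no face is found; DeepFace then scores the whole frame, so an
                # occasional spurious label is possible.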
                analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False)
                emotions_detected.append(analysis[0]['dominant_emotion'])
            except Exception as e:
                st.error(f"⚠️ DeepFace error: {e}")
        frame_count += 1

    cap.release()
    if emotions_detected:
        most_common_emotion = max(set(emotions_detected), key=emotions_detected.count)
        st.success(f"🧐 Most detected emotion: {most_common_emotion}")

        # Visualization
        emotion_counts = pd.Series(emotions_detected).value_counts()
        emotion_df = pd.DataFrame({'Emotion': emotion_counts.index, 'Count': emotion_counts.values})
        fig = px.bar(emotion_df, x='Emotion', y='Count', title="Emotion Distribution in Video", color='Emotion')
        st.plotly_chart(fig)
    else:
        st.warning("⚠️ No emotions detected. Try a different video.")
    return emotions_detected  # consumed by the multimodal analysis below

# ======================== DEFINE AUDIO ANALYSIS FUNCTION ========================
def transcribe_audio(audio_path):
    """Processes audio and extracts transcription with visualization"""
    try:
        st.write(f"πŸ”Ž Processing Audio File...")
        speech, sr = librosa.load(audio_path, sr=16000)

        # Enhanced Preprocessing
        speech = nr.reduce_noise(y=speech, sr=sr, prop_decrease=0.4)
        speech = librosa.effects.trim(speech)[0]
        speech = librosa.util.normalize(speech)

        st.write("πŸ€– Processing audio with AI model...")
        input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values

        with torch.no_grad():
            logits = model(input_values).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]

        st.success(f"πŸ“ Transcription (AI Model): {transcription}")

        # Visualization
        word_count = pd.Series(transcription.split()).value_counts()
        word_df = pd.DataFrame({'Word': word_count.index, 'Count': word_count.values})
        fig = px.bar(word_df, x='Word', y='Count', title="Word Frequency in Transcription", color='Word')
        st.plotly_chart(fig)
    except Exception as e:
        st.error(f"⚠️ Error in AI Speech Processing: {e}")

# ======================== USER SELECTS ANALYSIS MODE ========================
analysis_option = st.radio(
    "Select Analysis Type:",
    ["πŸ“Ή Video Only (Facial Emotion)", "🎀 Audio Only (Speech Analysis)", "🎬 Video & Audio (Multimodal)"]
)

# ======================== VIDEO ONLY ANALYSIS ========================
if analysis_option == "πŸ“Ή Video Only (Facial Emotion)":
    st.header("πŸ“‚ Upload a Video for Emotion Analysis")
    video_file = st.file_uploader("Upload a video file", type=["mp4", "avi", "mov"])

    if video_file:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
            temp_video.write(video_file.read())
            video_path = temp_video.name
        st.success("πŸ“‚ Video uploaded successfully!")

        if st.button("Analyze Video"):
            analyze_video(video_path)

# ======================== AUDIO ONLY ANALYSIS ========================
elif analysis_option == "🎀 Audio Only (Speech Analysis)":
    st.header("🎀 Upload an Audio File for Speech Analysis")
    audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3"])

    if audio_file:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            temp_audio.write(audio_file.read())
            audio_path = temp_audio.name
        st.success("🎀 Audio uploaded successfully!")

        if st.button("Analyze Audio"):
            transcribe_audio(audio_path)

# ======================== MULTIMODAL ANALYSIS (VIDEO + AUDIO) ========================
elif analysis_option == "🎬 Video & Audio (Multimodal)":
    st.header("πŸŽ₯ Upload a **Single File** for Video & Audio Combined Analysis")
    multimodal_file = st.file_uploader("Upload a **video file with audio**", type=["mp4", "avi", "mov"])

    if multimodal_file:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_file:
            temp_file.write(multimodal_file.read())
            multimodal_path = temp_file.name

        st.success("βœ… Multimodal file uploaded successfully!")

        if st.button("Analyze Video & Audio Together"):
            def analyze_multimodal(multimodal_path):
                st.write("πŸ”Ž Extracting Video & Audio...")

                # Facial emotions from the video frames
                video_emotions = analyze_video(multimodal_path)

                # Speech transcription from the extracted audio track
                audio_transcription = transcribe_audio(extract_audio(multimodal_path))

                # Multimodal Analysis Visualization
                st.header("πŸ” Multimodal Analysis Results")
                if not video_emotions or not audio_transcription:
                    st.error("❌ Could not extract both Video & Audio insights.")
                    return

                # Emotion-Speech Comparison
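                # Naive keyword heuristic: infer a coarse speech emotion from
                # the transcription text (a placeholder, not a trained classifier).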
                speech_emotion = "Neutral"
                if any(word in audio_transcription.lower() for word in ["angry", "mad"]):
                    speech_emotion = "Angry"
                elif any(word in audio_transcription.lower() for word in ["happy", "excited"]):
                    speech_emotion = "Happy"
                elif any(word in audio_transcription.lower() for word in ["sad", "crying"]):
                    speech_emotion = "Sad"

                dominant_video_emotion = max(set(video_emotions), key=video_emotions.count)
                # Rough comparison: frames with a detected emotion vs. the single speech label
                fig = px.pie(
                    names=["Video Emotion", "Speech Emotion"],
                    values=[len(video_emotions), 1],
                    title=f"Comparison: Video ({dominant_video_emotion}) vs. Speech ({speech_emotion})"
                )
                st.plotly_chart(fig)

            analyze_multimodal(multimodal_path)