import streamlit as st
import tempfile
import os
import cv2
import numpy as np
import torch
import librosa
import speech_recognition as sr
import noisereduce as nr
import pandas as pd
import plotly.express as px
from deepface import DeepFace
from pydub import AudioSegment
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Point pydub at a fixed ffmpeg binary instead of letting it search for one.
# NOTE(review): the path is hard-coded; confirm ffmpeg is installed at this
# location on the deployment host.
AudioSegment.converter = "/usr/bin/ffmpeg"
# Title & Instructions
st.title("πŸ€— AI Child Behavior Assessment")
# The how-to text is markdown, so it has to be handed to Streamlit as a string;
# left bare at statement level it is a syntax error. Blank lines separate the
# steps so that each one renders as its own paragraph.
st.markdown(
    """
### How to Use:
1️⃣ Choose an **analysis type** below.

2️⃣ Upload the required file(s).

3️⃣ Click the **Analyze** button to process the data.
"""
)
# Load AI Model for Speech Recognition
@st.cache_resource(show_spinner=False)
def _load_speech_model(model_name):
    """Load the Wav2Vec2 processor and CTC model for *model_name*.

    Streamlit re-executes the whole script on every widget interaction;
    caching the loaded objects as a resource avoids re-reading the weights
    on each rerun.

    Args:
        model_name: Hugging Face model identifier to load.

    Returns:
        A ``(processor, model)`` tuple.
    """
    return (
        Wav2Vec2Processor.from_pretrained(model_name),
        Wav2Vec2ForCTC.from_pretrained(model_name),
    )


# Keep both names defined even when loading fails, so later code sees a
# well-defined "not loaded" state rather than a missing global.
processor = None
model = None
st.write("⏳ Loading AI Speech Model...")
try:
    processor, model = _load_speech_model("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    st.success("βœ… AI Speech Model Loaded!")
except Exception as e:
    # Top-level UI boundary: report the failure on the page and keep the app
    # running, since the video-only analysis does not use the speech model.
    st.error(f"❌ Error loading speech model: {e}")
# ======================== DEFINE VIDEO ANALYSIS FUNCTION ========================
def analyze_video(video_path):
    """Detect facial emotions in a video and chart their distribution.

    Every 10th frame is passed to ``DeepFace.analyze``; the dominant emotion
    of each analysed frame is collected. The most frequent emotion and a bar
    chart of the counts are written to the Streamlit page, or a warning when
    nothing was detected.

    Args:
        video_path: Path of the video file to open with OpenCV.

    Returns:
        The list of dominant-emotion labels, one per successfully analysed
        frame, in frame order. Empty when no emotion could be detected.
    """
    st.write("πŸ”Ž Analyzing Emotions in Video...")
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    emotions_detected = []
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # End of stream (or read failure): without this the loop
                # would never terminate.
                break
            if frame_count % 10 == 0:  # Analyze every 10th frame
                try:
                    analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False)
                    # Depending on the DeepFace version the result is either a
                    # dict or a list with one dict per detected face.
                    result = analysis[0] if isinstance(analysis, list) else analysis
                    emotions_detected.append(result['dominant_emotion'])
                except Exception as e:
                    # Best effort: report the frame-level failure and go on.
                    st.error(f"⚠️ DeepFace error: {e}")
            frame_count += 1
    finally:
        # Always give the capture handle back, even if analysis blew up.
        cap.release()

    if emotions_detected:
        # value_counts() sorts by descending frequency, so the first index
        # entry is the most common emotion.
        emotion_counts = pd.Series(emotions_detected).value_counts()
        most_common_emotion = emotion_counts.index[0]
        st.success(f"🧐 Most detected emotion: {most_common_emotion}")
        # Visualization
        emotion_df = pd.DataFrame({'Emotion': emotion_counts.index, 'Count': emotion_counts.values})
        fig = px.bar(emotion_df, x='Emotion', y='Count', title="Emotion Distribution in Video", color='Emotion')
        # The figure has to be handed to Streamlit to appear on the page.
        st.plotly_chart(fig)
    else:
        st.warning("⚠️ No emotions detected. Try a different video.")
    return emotions_detected
# ======================== DEFINE AUDIO ANALYSIS FUNCTION ========================
def transcribe_audio(audio_path):
    """Transcribe speech from an audio file and chart word frequencies.

    The audio is loaded at 16 kHz, noise-reduced, trimmed and normalised,
    then decoded with the module-level ``processor`` / ``model`` pair. The
    transcription and a word-frequency bar chart are written to the
    Streamlit page.

    Args:
        audio_path: Path of the file to load with ``librosa.load``.

    Returns:
        The transcription string, or ``""`` when processing failed (the
        error is reported on the page instead of being raised).
    """
    st.write("πŸ”Ž Processing Audio File...")
    try:
        # Named sample_rate (not sr) so the module-level speech_recognition
        # alias `sr` is not shadowed inside this function.
        speech, sample_rate = librosa.load(audio_path, sr=16000)
        # Enhanced Preprocessing
        speech = nr.reduce_noise(y=speech, sr=sample_rate, prop_decrease=0.4)
        speech = librosa.effects.trim(speech)[0]
        speech = librosa.util.normalize(speech)
        st.write("πŸ€– Processing audio with AI model...")
        # Pass the rate the audio was actually loaded at, rather than
        # repeating the literal.
        input_values = processor(speech, sampling_rate=sample_rate, return_tensors="pt").input_values
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]
        st.success(f"πŸ“ Transcription (AI Model): {transcription}")
        # Visualization
        words = transcription.split()
        if words:  # nothing to chart for an empty transcription
            word_count = pd.Series(words).value_counts()
            word_df = pd.DataFrame({'Word': word_count.index, 'Count': word_count.values})
            fig = px.bar(word_df, x='Word', y='Count', title="Word Frequency in Transcription", color='Word')
            # The figure has to be handed to Streamlit to appear on the page.
            st.plotly_chart(fig)
        return transcription
    except Exception as e:
        # UI boundary: surface the failure on the page and return a falsy
        # string so callers can test the result without special-casing None.
        st.error(f"⚠️ Error in AI Speech Processing: {e}")
        return ""
# ======================== USER SELECTS ANALYSIS MODE ========================
# The selected label is compared by equality in the branches below, so these
# strings must stay identical to the ones used there.
analysis_option = st.radio(
    "Select Analysis Type:",
    ["πŸ“Ή Video Only (Facial Emotion)", "🎀 Audio Only (Speech Analysis)", "🎬 Video & Audio (Multimodal)"],
)
# ======================== VIDEO ONLY ANALYSIS ========================
def _run_on_temp_copy(uploaded_file, fallback_suffix, analyze):
    """Write an upload to a temp file, run *analyze* on its path, then delete it.

    Args:
        uploaded_file: Object returned by ``st.file_uploader``.
        fallback_suffix: File suffix to use when the upload's name has none.
        analyze: Callable taking the temp file path.

    Returns:
        Whatever *analyze* returns.
    """
    # Keep the real extension so decoders that look at the file name see the
    # right format (an .avi or .mp3 upload is not renamed to .mp4 / .wav).
    suffix = os.path.splitext(uploaded_file.name)[1] or fallback_suffix
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        # getvalue() returns the whole buffer regardless of the current read
        # position, so the copy is complete on every script rerun.
        tmp.write(uploaded_file.getvalue())
        tmp_path = tmp.name
    try:
        return analyze(tmp_path)
    finally:
        # delete=False is needed so the path can be reopened by other
        # libraries; remove the file ourselves so reruns do not pile up.
        os.remove(tmp_path)


if analysis_option == "πŸ“Ή Video Only (Facial Emotion)":
    st.header("πŸ“‚ Upload a Video for Emotion Analysis")
    video_file = st.file_uploader("Upload a video file", type=["mp4", "avi", "mov"])
    if video_file:
        st.success("πŸ“‚ Video uploaded successfully!")
        if st.button("Analyze Video"):
            _run_on_temp_copy(video_file, ".mp4", analyze_video)
# ======================== AUDIO ONLY ANALYSIS ========================
elif analysis_option == "🎀 Audio Only (Speech Analysis)":
    st.header("🎀 Upload an Audio File for Speech Analysis")
    audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3"])
    if audio_file:
        st.success("🎀 Audio uploaded successfully!")
        if st.button("Analyze Audio"):
            _run_on_temp_copy(audio_file, ".wav", transcribe_audio)
elif analysis_option == "🎬 Video & Audio (Multimodal)":
st.header("πŸŽ₯ Upload a **Single File** for Video & Audio Combined Analysis")
multimodal_file = st.file_uploader("Upload a **video file with audio**", type=["mp4", "avi", "mov"])
if multimodal_file:
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_file:
multimodal_path = temp_file.name
st.success("βœ… Multimodal file uploaded successfully!")
if st.button("Analyze Video & Audio Together"):
def analyze_multimodal(multimodal_path):
st.write("πŸ”Ž Extracting Video & Audio...")
# Extract Video Emotion
video_emotions = analyze_video(multimodal_path)
# Extract Audio for Speech Processing
audio_transcription = transcribe_audio(multimodal_path)
# Multimodal Analysis Visualization
st.header("πŸ” Multimodal Analysis Results")
if not video_emotions or not audio_transcription:
st.error("❌ Could not extract both Video & Audio insights.")
# Emotion-Speech Comparison
speech_emotion = "Neutral"
if any(word in audio_transcription.lower() for word in ["angry", "mad"]):
speech_emotion = "Angry"
elif any(word in audio_transcription.lower() for word in ["happy", "excited"]):
speech_emotion = "Happy"
elif any(word in audio_transcription.lower() for word in ["sad", "crying"]):
speech_emotion = "Sad"
fig = px.pie(
names=["Video Emotion", "Speech Emotion"],
values=[len(video_emotions), 1],
title=f"Comparison: Video ({video_emotions[0]}) vs. Speech ({speech_emotion})"