import streamlit as st
import tempfile
import os
import cv2
import torch
import librosa
import noisereduce as nr
import pandas as pd
import plotly.express as px
from deepface import DeepFace
from pydub import AudioSegment
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Ensure Pydub uses ffmpeg
AudioSegment.converter = "/usr/bin/ffmpeg"
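
# The hard-coded /usr/bin/ffmpeg path above is an assumption about the host image.
# A more portable sketch (assuming ffmpeg is somewhere on PATH) would resolve it dynamically:
#
#     import shutil
#     ffmpeg_path = shutil.which("ffmpeg")
#     if ffmpeg_path:
#         AudioSegment.converter = ffmpeg_path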
# Title & Instructions
st.title("🤖 AI Child Behavior Assessment")
st.markdown(
    """
    ### How to Use:
    1️⃣ Choose an **analysis type** below.
    2️⃣ Upload the required file(s).
    3️⃣ Click the **Analyze** button to process the data.
    """
)
# Load AI Model for Speech Recognition
st.write("⏳ Loading AI Speech Model...")
try:
    processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    st.success("✅ AI Speech Model Loaded!")
except Exception as e:
    st.error(f"❌ Error loading speech model: {e}")
    st.stop()  # Stop here: the rest of the app depends on `processor` and `model`
# ======================== DEFINE VIDEO ANALYSIS FUNCTION ========================
def analyze_video(video_path):
    """Processes the video, extracts per-frame emotions, and visualizes them."""
    st.write("🔍 Analyzing Emotions in Video...")
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    emotions_detected = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % 10 == 0:  # Analyze every 10th frame (see the sampling sketch below)
            try:
                analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False)
                emotions_detected.append(analysis[0]['dominant_emotion'])
            except Exception as e:
                st.error(f"⚠️ DeepFace error: {e}")
        frame_count += 1
    cap.release()

    if emotions_detected:
        most_common_emotion = max(set(emotions_detected), key=emotions_detected.count)
        st.success(f"🧠 Most detected emotion: {most_common_emotion}")

        # Visualization
        emotion_counts = pd.Series(emotions_detected).value_counts()
        emotion_df = pd.DataFrame({'Emotion': emotion_counts.index, 'Count': emotion_counts.values})
        fig = px.bar(emotion_df, x='Emotion', y='Count', title="Emotion Distribution in Video", color='Emotion')
        st.plotly_chart(fig)
    else:
        st.warning("⚠️ No emotions detected. Try a different video.")

    return emotions_detected
# ======================== DEFINE AUDIO ANALYSIS FUNCTION ========================
def transcribe_audio(audio_path):
    """Processes the audio, transcribes it with Wav2Vec2, and visualizes word frequency."""
    try:
        st.write("🔊 Processing Audio File...")
        speech, sample_rate = librosa.load(audio_path, sr=16000)

        # Preprocessing: reduce background noise, trim leading/trailing silence, normalize amplitude
        speech = nr.reduce_noise(y=speech, sr=sample_rate, prop_decrease=0.4)
        speech = librosa.effects.trim(speech)[0]
        speech = librosa.util.normalize(speech)

        st.write("🎤 Processing audio with AI model...")
        # Single-pass inference over the whole clip (see the chunking sketch below for long recordings)
        input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]
        st.success(f"📝 Transcription (AI Model): {transcription}")

        # Visualization
        word_count = pd.Series(transcription.split()).value_counts()
        word_df = pd.DataFrame({'Word': word_count.index, 'Count': word_count.values})
        fig = px.bar(word_df, x='Word', y='Count', title="Word Frequency in Transcription", color='Word')
        st.plotly_chart(fig)

        return transcription
    except Exception as e:
        st.error(f"⚠️ Error in AI Speech Processing: {e}")
        return None
# ======================== USER SELECTS ANALYSIS MODE ========================
analysis_option = st.radio(
    "Select Analysis Type:",
    ["📹 Video Only (Facial Emotion)", "🎤 Audio Only (Speech Analysis)", "🎬 Video & Audio (Multimodal)"]
)
# ======================== VIDEO ONLY ANALYSIS ========================
if analysis_option == "📹 Video Only (Facial Emotion)":
    st.header("📹 Upload a Video for Emotion Analysis")
    video_file = st.file_uploader("Upload a video file", type=["mp4", "avi", "mov"])

    if video_file:
        # Keep the uploaded file's extension so OpenCV gets the right container hint
        suffix = os.path.splitext(video_file.name)[1] or ".mp4"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_video:
            temp_video.write(video_file.read())
            video_path = temp_video.name
        st.success("🎉 Video uploaded successfully!")

        if st.button("Analyze Video"):
            analyze_video(video_path)
# ======================== AUDIO ONLY ANALYSIS ========================
elif analysis_option == "🎤 Audio Only (Speech Analysis)":
    st.header("🎤 Upload an Audio File for Speech Analysis")
    audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3"])

    if audio_file:
        # Keep the uploaded file's extension so the audio decoder can pick the right format
        suffix = os.path.splitext(audio_file.name)[1] or ".wav"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_audio:
            temp_audio.write(audio_file.read())
            audio_path = temp_audio.name
        st.success("🎤 Audio uploaded successfully!")

        if st.button("Analyze Audio"):
            transcribe_audio(audio_path)
# ======================== MULTIMODAL ANALYSIS (VIDEO + AUDIO) ========================
elif analysis_option == "🎬 Video & Audio (Multimodal)":
    st.header("🎥 Upload a **Single File** for Video & Audio Combined Analysis")
    multimodal_file = st.file_uploader("Upload a **video file with audio**", type=["mp4", "avi", "mov"])

    if multimodal_file:
        suffix = os.path.splitext(multimodal_file.name)[1] or ".mp4"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            temp_file.write(multimodal_file.read())
            multimodal_path = temp_file.name
        st.success("✅ Multimodal file uploaded successfully!")

        def analyze_multimodal(path):
            """Runs facial-emotion and speech analysis on a single video file."""
            st.write("🔍 Extracting Video & Audio...")

            # Extract Video Emotion
            video_emotions = analyze_video(path)

            # Extract Audio for Speech Processing: pull the audio track out with pydub/ffmpeg
            audio_path = path.rsplit(".", 1)[0] + ".wav"
            try:
                AudioSegment.from_file(path).export(audio_path, format="wav")
            except Exception as e:
                st.error(f"❌ Could not extract the audio track: {e}")
                return
            audio_transcription = transcribe_audio(audio_path)

            # Multimodal Analysis Visualization
            st.header("📊 Multimodal Analysis Results")
            if not video_emotions or not audio_transcription:
                st.error("❌ Could not extract both Video & Audio insights.")
                return

            # Emotion-Speech Comparison: rough keyword-based label for the transcribed speech
            speech_emotion = "Neutral"
            text = audio_transcription.lower()
            if any(word in text for word in ["angry", "mad"]):
                speech_emotion = "Angry"
            elif any(word in text for word in ["happy", "excited"]):
                speech_emotion = "Happy"
            elif any(word in text for word in ["sad", "crying"]):
                speech_emotion = "Sad"

            dominant_video_emotion = max(set(video_emotions), key=video_emotions.count)
            fig = px.pie(
                names=["Video Emotion", "Speech Emotion"],
                values=[len(video_emotions), 1],
                title=f"Comparison: Video ({dominant_video_emotion}) vs. Speech ({speech_emotion})"
            )
            st.plotly_chart(fig)

        if st.button("Analyze Video & Audio Together"):
            analyze_multimodal(multimodal_path)
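
# Deployment note (an assumption inferred from the imports above, not part of the app
# logic): running this script as a Space typically needs a requirements.txt listing
# streamlit, opencv-python-headless, torch, librosa, noisereduce, pandas, plotly,
# deepface, pydub, and transformers, plus a packages.txt containing "ffmpeg" so pydub
# and the multimodal audio extraction can decode mp4/mp3 input.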