import streamlit as st
import tempfile
import os
import cv2
import numpy as np
import torch
import librosa
import speech_recognition as sr
import noisereduce as nr
import pandas as pd
import plotly.express as px
from deepface import DeepFace
from pydub import AudioSegment
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Ensure Pydub uses ffmpeg
AudioSegment.converter = "/usr/bin/ffmpeg"
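# Note: the hardcoded path above assumes ffmpeg lives at /usr/bin/ffmpeg (typical on
# Debian/Ubuntu images). A slightly more portable sketch is to resolve it from PATH:
# import shutil
# AudioSegment.converter = shutil.which("ffmpeg") or "/usr/bin/ffmpeg"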
# Title & Instructions
st.title("🤖 AI Child Behavior Assessment")
st.markdown(
    """
    ### How to Use:
    1️⃣ Choose an **analysis type** below.
    2️⃣ Upload the required file(s).
    3️⃣ Click the **Analyze** button to process the data.
    """
)
# Load AI Model for Speech Recognition
st.write("β³ Loading AI Speech Model...")
try:
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
st.success("β
AI Speech Model Loaded!")
except Exception as e:
st.error(f"β Error loading speech model: {e}")
# ======================== DEFINE VIDEO ANALYSIS FUNCTION ========================
def analyze_video(video_path):
    """Processes video and extracts emotions with visualization."""
    st.write("🎥 Analyzing Emotions in Video...")
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    emotions_detected = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % 10 == 0:  # Analyze every 10th frame
            try:
                analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False)
                emotions_detected.append(analysis[0]['dominant_emotion'])
            except Exception as e:
                st.error(f"⚠️ DeepFace error: {e}")
        frame_count += 1
    cap.release()
    if emotions_detected:
        most_common_emotion = max(set(emotions_detected), key=emotions_detected.count)
        st.success(f"🧠 Most detected emotion: {most_common_emotion}")
        # Visualization
        emotion_counts = pd.Series(emotions_detected).value_counts()
        emotion_df = pd.DataFrame({'Emotion': emotion_counts.index, 'Count': emotion_counts.values})
        fig = px.bar(emotion_df, x='Emotion', y='Count', title="Emotion Distribution in Video", color='Emotion')
        st.plotly_chart(fig)
    else:
        st.warning("⚠️ No emotions detected. Try a different video.")
    return emotions_detected  # Returned so the multimodal mode can reuse the per-frame results
# ======================== DEFINE AUDIO ANALYSIS FUNCTION ========================
def transcribe_audio(audio_path):
    """Processes audio and extracts transcription with visualization."""
    try:
        st.write("🎵 Processing Audio File...")
        speech, sample_rate = librosa.load(audio_path, sr=16000)
        # Enhanced preprocessing: noise reduction, silence trimming, normalization
        speech = nr.reduce_noise(y=speech, sr=sample_rate, prop_decrease=0.4)
        speech = librosa.effects.trim(speech)[0]
        speech = librosa.util.normalize(speech)
        st.write("🎤 Processing audio with AI model...")
        input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]
        st.success(f"📝 Transcription (AI Model): {transcription}")
        # Visualization
        word_count = pd.Series(transcription.split()).value_counts()
        word_df = pd.DataFrame({'Word': word_count.index, 'Count': word_count.values})
        fig = px.bar(word_df, x='Word', y='Count', title="Word Frequency in Transcription", color='Word')
        st.plotly_chart(fig)
        return transcription  # Returned so the multimodal mode can reuse the text
    except Exception as e:
        st.error(f"⚠️ Error in AI Speech Processing: {e}")
        return None
# ======================== USER SELECTS ANALYSIS MODE ========================
analysis_option = st.radio(
    "Select Analysis Type:",
    ["🎥 Video Only (Facial Emotion)", "🎤 Audio Only (Speech Analysis)", "🎬 Video & Audio (Multimodal)"]
)
# ======================== VIDEO ONLY ANALYSIS ========================
if analysis_option == "🎥 Video Only (Facial Emotion)":
    st.header("📂 Upload a Video for Emotion Analysis")
    video_file = st.file_uploader("Upload a video file", type=["mp4", "avi", "mov"])
    if video_file:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
            temp_video.write(video_file.read())
            video_path = temp_video.name
        st.success("🎥 Video uploaded successfully!")
        if st.button("Analyze Video"):
            analyze_video(video_path)
# ======================== AUDIO ONLY ANALYSIS ========================
elif analysis_option == "🎤 Audio Only (Speech Analysis)":
    st.header("🎤 Upload an Audio File for Speech Analysis")
    audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3"])
    if audio_file:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            temp_audio.write(audio_file.read())
            audio_path = temp_audio.name
        st.success("🎤 Audio uploaded successfully!")
        if st.button("Analyze Audio"):
            transcribe_audio(audio_path)
# ======================== MULTIMODAL ANALYSIS (VIDEO + AUDIO) ========================
elif analysis_option == "🎬 Video & Audio (Multimodal)":
    st.header("🎥 Upload a **Single File** for Video & Audio Combined Analysis")
    multimodal_file = st.file_uploader("Upload a **video file with audio**", type=["mp4", "avi", "mov"])
    if multimodal_file:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_file:
            temp_file.write(multimodal_file.read())
            multimodal_path = temp_file.name
        st.success("✅ Multimodal file uploaded successfully!")

        def analyze_multimodal(multimodal_path):
            st.write("🔄 Extracting Video & Audio...")
            # Extract per-frame video emotions
            video_emotions = analyze_video(multimodal_path)
            # Extract audio transcription from the same file
            audio_transcription = transcribe_audio(multimodal_path)
            # Multimodal Analysis Visualization
            st.header("📊 Multimodal Analysis Results")
            if not video_emotions or not audio_transcription:
                st.error("❌ Could not extract both Video & Audio insights.")
                return
            # Emotion-Speech Comparison: simple keyword-based guess from the transcription
            speech_emotion = "Neutral"
            if any(word in audio_transcription.lower() for word in ["angry", "mad"]):
                speech_emotion = "Angry"
            elif any(word in audio_transcription.lower() for word in ["happy", "excited"]):
                speech_emotion = "Happy"
            elif any(word in audio_transcription.lower() for word in ["sad", "crying"]):
                speech_emotion = "Sad"
            dominant_video_emotion = max(set(video_emotions), key=video_emotions.count)
            fig = px.pie(
                names=["Video Emotion", "Speech Emotion"],
                values=[len(video_emotions), 1],
                title=f"Comparison: Video ({dominant_video_emotion}) vs. Speech ({speech_emotion})"
            )
            st.plotly_chart(fig)

        if st.button("Analyze Video & Audio Together"):
            analyze_multimodal(multimodal_path)
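# Note: transcribe_audio() receives the .mp4 path directly, which relies on librosa's
# ffmpeg/audioread fallback to read the audio track. An explicit alternative (an
# assumption, not part of the original flow) is to extract a 16 kHz mono WAV with pydub:
# wav_path = multimodal_path.replace(".mp4", ".wav")
# AudioSegment.from_file(multimodal_path).set_frame_rate(16000).set_channels(1).export(wav_path, format="wav")
# transcribe_audio(wav_path)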