import streamlit as st
import tempfile
import os
import cv2
import torch
import librosa
import noisereduce as nr
import pandas as pd
import plotly.express as px
from deepface import DeepFace
from pydub import AudioSegment
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Ensure Pydub uses ffmpeg
AudioSegment.converter = "/usr/bin/ffmpeg"
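
# The hard-coded /usr/bin/ffmpeg path above is an assumption about the host image.
# A more portable sketch (assuming ffmpeg is somewhere on PATH) would resolve it dynamically:
#
#     import shutil
#     ffmpeg_path = shutil.which("ffmpeg")
#     if ffmpeg_path:
#         AudioSegment.converter = ffmpeg_path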
# Title & Instructions
st.title("🤖 AI Child Behavior Assessment")
st.markdown(
    """
    ### How to Use:
    1️⃣ Choose an **analysis type** below.
    2️⃣ Upload the required file(s).
    3️⃣ Click the **Analyze** button to process the data.
    """
)
# Load AI Model for Speech Recognition
st.write("⏳ Loading AI Speech Model...")
try:
    processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    st.success("✅ AI Speech Model Loaded!")
except Exception as e:
    st.error(f"❌ Error loading speech model: {e}")
    st.stop()  # Stop here: the rest of the app depends on `processor` and `model`
# ======================== DEFINE VIDEO ANALYSIS FUNCTION ========================
def analyze_video(video_path):
    """Processes the video, extracts per-frame emotions, and visualizes them."""
    st.write("🔍 Analyzing Emotions in Video...")
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    emotions_detected = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % 10 == 0:  # Analyze every 10th frame (see the sampling sketch below)
            try:
                analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False)
                emotions_detected.append(analysis[0]['dominant_emotion'])
            except Exception as e:
                st.error(f"⚠️ DeepFace error: {e}")
        frame_count += 1
    cap.release()

    if emotions_detected:
        most_common_emotion = max(set(emotions_detected), key=emotions_detected.count)
        st.success(f"🧠 Most detected emotion: {most_common_emotion}")

        # Visualization
        emotion_counts = pd.Series(emotions_detected).value_counts()
        emotion_df = pd.DataFrame({'Emotion': emotion_counts.index, 'Count': emotion_counts.values})
        fig = px.bar(emotion_df, x='Emotion', y='Count', title="Emotion Distribution in Video", color='Emotion')
        st.plotly_chart(fig)
    else:
        st.warning("⚠️ No emotions detected. Try a different video.")

    return emotions_detected
# ======================== DEFINE AUDIO ANALYSIS FUNCTION ========================
def transcribe_audio(audio_path):
    """Processes the audio, transcribes it with Wav2Vec2, and visualizes word frequency."""
    try:
        st.write("🔊 Processing Audio File...")
        speech, sample_rate = librosa.load(audio_path, sr=16000)

        # Preprocessing: reduce background noise, trim leading/trailing silence, normalize amplitude
        speech = nr.reduce_noise(y=speech, sr=sample_rate, prop_decrease=0.4)
        speech = librosa.effects.trim(speech)[0]
        speech = librosa.util.normalize(speech)

        st.write("🎤 Processing audio with AI model...")
        # Single-pass inference over the whole clip (see the chunking sketch below for long recordings)
        input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]
        st.success(f"📝 Transcription (AI Model): {transcription}")

        # Visualization
        word_count = pd.Series(transcription.split()).value_counts()
        word_df = pd.DataFrame({'Word': word_count.index, 'Count': word_count.values})
        fig = px.bar(word_df, x='Word', y='Count', title="Word Frequency in Transcription", color='Word')
        st.plotly_chart(fig)

        return transcription
    except Exception as e:
        st.error(f"⚠️ Error in AI Speech Processing: {e}")
        return None
# ======================== USER SELECTS ANALYSIS MODE ========================
analysis_option = st.radio(
    "Select Analysis Type:",
    ["📹 Video Only (Facial Emotion)", "🎤 Audio Only (Speech Analysis)", "🎬 Video & Audio (Multimodal)"]
)
# ======================== VIDEO ONLY ANALYSIS ========================
if analysis_option == "📹 Video Only (Facial Emotion)":
    st.header("📹 Upload a Video for Emotion Analysis")
    video_file = st.file_uploader("Upload a video file", type=["mp4", "avi", "mov"])

    if video_file:
        # Keep the uploaded file's extension so OpenCV gets the right container hint
        suffix = os.path.splitext(video_file.name)[1] or ".mp4"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_video:
            temp_video.write(video_file.read())
            video_path = temp_video.name
        st.success("🎉 Video uploaded successfully!")

        if st.button("Analyze Video"):
            analyze_video(video_path)
# ======================== AUDIO ONLY ANALYSIS ========================
elif analysis_option == "🎤 Audio Only (Speech Analysis)":
    st.header("🎤 Upload an Audio File for Speech Analysis")
    audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3"])

    if audio_file:
        # Keep the uploaded file's extension so the audio decoder can pick the right format
        suffix = os.path.splitext(audio_file.name)[1] or ".wav"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_audio:
            temp_audio.write(audio_file.read())
            audio_path = temp_audio.name
        st.success("🎤 Audio uploaded successfully!")

        if st.button("Analyze Audio"):
            transcribe_audio(audio_path)
# ======================== MULTIMODAL ANALYSIS (VIDEO + AUDIO) ========================
elif analysis_option == "🎬 Video & Audio (Multimodal)":
    st.header("🎥 Upload a **Single File** for Video & Audio Combined Analysis")
    multimodal_file = st.file_uploader("Upload a **video file with audio**", type=["mp4", "avi", "mov"])

    if multimodal_file:
        suffix = os.path.splitext(multimodal_file.name)[1] or ".mp4"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            temp_file.write(multimodal_file.read())
            multimodal_path = temp_file.name
        st.success("✅ Multimodal file uploaded successfully!")

        def analyze_multimodal(path):
            """Runs facial-emotion and speech analysis on a single video file."""
            st.write("🔍 Extracting Video & Audio...")

            # Extract Video Emotion
            video_emotions = analyze_video(path)

            # Extract Audio for Speech Processing: pull the audio track out with pydub/ffmpeg
            audio_path = path.rsplit(".", 1)[0] + ".wav"
            try:
                AudioSegment.from_file(path).export(audio_path, format="wav")
            except Exception as e:
                st.error(f"❌ Could not extract the audio track: {e}")
                return
            audio_transcription = transcribe_audio(audio_path)

            # Multimodal Analysis Visualization
            st.header("📊 Multimodal Analysis Results")
            if not video_emotions or not audio_transcription:
                st.error("❌ Could not extract both Video & Audio insights.")
                return

            # Emotion-Speech Comparison: rough keyword-based label for the transcribed speech
            speech_emotion = "Neutral"
            text = audio_transcription.lower()
            if any(word in text for word in ["angry", "mad"]):
                speech_emotion = "Angry"
            elif any(word in text for word in ["happy", "excited"]):
                speech_emotion = "Happy"
            elif any(word in text for word in ["sad", "crying"]):
                speech_emotion = "Sad"

            dominant_video_emotion = max(set(video_emotions), key=video_emotions.count)
            fig = px.pie(
                names=["Video Emotion", "Speech Emotion"],
                values=[len(video_emotions), 1],
                title=f"Comparison: Video ({dominant_video_emotion}) vs. Speech ({speech_emotion})"
            )
            st.plotly_chart(fig)

        if st.button("Analyze Video & Audio Together"):
            analyze_multimodal(multimodal_path)
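
# Deployment note (an assumption inferred from the imports above, not part of the app
# logic): running this script as a Space typically needs a requirements.txt listing
# streamlit, opencv-python-headless, torch, librosa, noisereduce, pandas, plotly,
# deepface, pydub, and transformers, plus a packages.txt containing "ffmpeg" so pydub
# and the multimodal audio extraction can decode mp4/mp3 input.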