Durganihantri committed · Commit 99188db · unverified · 1 Parent(s): 77efb88

Update app.py

Files changed (1):
  1. backend/app.py +171 -32
backend/app.py CHANGED
@@ -1,42 +1,181 @@
- from flask import Flask, request, jsonify
  import cv2
  import speech_recognition as sr
- import soundfile as sf
  from deepface import DeepFace

- app = Flask(__name__)

- @app.route('/analyze_face', methods=['POST'])
- def analyze_face():
-     file = request.files['video']
-     video_path = "uploaded_video.mp4"
-     file.save(video_path)
-
      cap = cv2.VideoCapture(video_path)
-     emotions = []
-
-     while True:
          ret, frame = cap.read()
          if not ret:
              break
-         analysis = DeepFace.analyze(frame, actions=['emotion'])
-         emotions.append(analysis[0]['dominant_emotion'])
-
      cap.release()
-     return jsonify({"emotions": emotions})
-
- @app.route('/analyze_speech', methods=['POST'])
- def analyze_speech():
-     file = request.files['audio']
-     audio_path = "uploaded_audio.wav"
-     file.save(audio_path)
-
-     recognizer = sr.Recognizer()
-     with sr.AudioFile(audio_path) as source:
-         audio = recognizer.record(source)
-         text = recognizer.recognize_google(audio)
-
-     return jsonify({"transcribed_text": text})
-
- if __name__ == '__main__':
-     app.run(debug=True)

+ import streamlit as st
+ import tempfile
+ import os
  import cv2
+ import numpy as np
+ import torch
+ import librosa
  import speech_recognition as sr
+ import noisereduce as nr
+ import pandas as pd
+ import plotly.express as px
  from deepface import DeepFace
+ from pydub import AudioSegment
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

+ # Ensure Pydub uses ffmpeg
+ AudioSegment.converter = "/usr/bin/ffmpeg"

+ # Title & Instructions
+ st.title("🤗 AI Child Behavior Assessment")
+ st.markdown(
+     """
+     ### How to Use:
+     1️⃣ Choose an **analysis type** below.
+     2️⃣ Upload the required file(s).
+     3️⃣ Click the **Analyze** button to process the data.
+     """
+ )
+
+ # Load AI Model for Speech Recognition
+ st.write("⏳ Loading AI Speech Model...")
+ try:
+     processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
+     model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
+     st.success("✅ AI Speech Model Loaded!")
+ except Exception as e:
+     st.error(f"❌ Error loading speech model: {e}")
+
+ # ======================== DEFINE VIDEO ANALYSIS FUNCTION ========================
+ def analyze_video(video_path):
+     """Processes video and extracts emotions with visualization"""
+     st.write("🔎 Analyzing Emotions in Video...")
      cap = cv2.VideoCapture(video_path)
+     frame_count = 0
+     emotions_detected = []
+
+     while cap.isOpened():
          ret, frame = cap.read()
          if not ret:
              break
+         if frame_count % 10 == 0:  # Analyze every 10th frame
+             try:
+                 analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False)
+                 emotions_detected.append(analysis[0]['dominant_emotion'])
+             except Exception as e:
+                 st.error(f"⚠️ DeepFace error: {e}")
+         frame_count += 1
+
      cap.release()
+     if emotions_detected:
+         most_common_emotion = max(set(emotions_detected), key=emotions_detected.count)
+         st.success(f"🧐 Most detected emotion: {most_common_emotion}")
+
+         # Visualization
+         emotion_counts = pd.Series(emotions_detected).value_counts()
+         emotion_df = pd.DataFrame({'Emotion': emotion_counts.index, 'Count': emotion_counts.values})
+         fig = px.bar(emotion_df, x='Emotion', y='Count', title="Emotion Distribution in Video", color='Emotion')
+         st.plotly_chart(fig)
+     else:
+         st.warning("⚠️ No emotions detected. Try a different video.")
+     return emotions_detected  # return the per-frame emotions so the multimodal mode can reuse them
+
+ # ======================== DEFINE AUDIO ANALYSIS FUNCTION ========================
+ def transcribe_audio(audio_path):
+     """Processes audio and extracts transcription with visualization"""
+     try:
+         st.write(f"🔎 Processing Audio File...")
+         speech, sr = librosa.load(audio_path, sr=16000)
+
+         # Enhanced Preprocessing
+         speech = nr.reduce_noise(y=speech, sr=sr, prop_decrease=0.4)
+         speech = librosa.effects.trim(speech)[0]
+         speech = librosa.util.normalize(speech)
+
+         st.write("🤖 Processing audio with AI model...")
+         input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
+
+         with torch.no_grad():
+             logits = model(input_values).logits
+
+         predicted_ids = torch.argmax(logits, dim=-1)
+         transcription = processor.batch_decode(predicted_ids)[0]
+
+         st.success(f"📝 Transcription (AI Model): {transcription}")
+
+         # Visualization
+         word_count = pd.Series(transcription.split()).value_counts()
+         word_df = pd.DataFrame({'Word': word_count.index, 'Count': word_count.values})
+         fig = px.bar(word_df, x='Word', y='Count', title="Word Frequency in Transcription", color='Word')
+         st.plotly_chart(fig)
+         return transcription  # return the text so the multimodal mode can reuse it
+     except Exception as e:
+         st.error(f"⚠️ Error in AI Speech Processing: {e}")
+
+ # ======================== USER SELECTS ANALYSIS MODE ========================
+ analysis_option = st.radio(
+     "Select Analysis Type:",
+     ["📹 Video Only (Facial Emotion)", "🎤 Audio Only (Speech Analysis)", "🎬 Video & Audio (Multimodal)"]
+ )
+
+ # ======================== VIDEO ONLY ANALYSIS ========================
+ if analysis_option == "📹 Video Only (Facial Emotion)":
+     st.header("📂 Upload a Video for Emotion Analysis")
+     video_file = st.file_uploader("Upload a video file", type=["mp4", "avi", "mov"])
+
+     if video_file:
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
+             temp_video.write(video_file.read())
+             video_path = temp_video.name
+         st.success("📂 Video uploaded successfully!")
+
+         if st.button("Analyze Video"):
+             analyze_video(video_path)
+
+ # ======================== AUDIO ONLY ANALYSIS ========================
+ elif analysis_option == "🎤 Audio Only (Speech Analysis)":
+     st.header("🎤 Upload an Audio File for Speech Analysis")
+     audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3"])
+
+     if audio_file:
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
+             temp_audio.write(audio_file.read())
+             audio_path = temp_audio.name
+         st.success("🎤 Audio uploaded successfully!")
+
+         if st.button("Analyze Audio"):
+             transcribe_audio(audio_path)
+
+ # ======================== MULTIMODAL ANALYSIS (VIDEO + AUDIO) ========================
+ elif analysis_option == "🎬 Video & Audio (Multimodal)":
+     st.header("🎥 Upload a **Single File** for Video & Audio Combined Analysis")
+     multimodal_file = st.file_uploader("Upload a **video file with audio**", type=["mp4", "avi", "mov"])
+
+     if multimodal_file:
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_file:
+             temp_file.write(multimodal_file.read())
+             multimodal_path = temp_file.name
+
+         st.success("✅ Multimodal file uploaded successfully!")
+
+         if st.button("Analyze Video & Audio Together"):
+             def analyze_multimodal(multimodal_path):
+                 st.write("🔎 Extracting Video & Audio...")
+
+                 # Extract Video Emotion
+                 video_emotions = analyze_video(multimodal_path)
+
+                 # Extract Audio for Speech Processing
+                 audio_transcription = transcribe_audio(multimodal_path)
+
+                 # Multimodal Analysis Visualization
+                 st.header("🔍 Multimodal Analysis Results")
+                 if not video_emotions or not audio_transcription:
+                     st.error("❌ Could not extract both Video & Audio insights.")
+                     return
+
+                 # Emotion-Speech Comparison
+                 speech_emotion = "Neutral"
+                 if any(word in audio_transcription.lower() for word in ["angry", "mad"]):
+                     speech_emotion = "Angry"
+                 elif any(word in audio_transcription.lower() for word in ["happy", "excited"]):
+                     speech_emotion = "Happy"
+                 elif any(word in audio_transcription.lower() for word in ["sad", "crying"]):
+                     speech_emotion = "Sad"
+
+                 fig = px.pie(
+                     names=["Video Emotion", "Speech Emotion"],
+                     values=[len(video_emotions), 1],
+                     title=f"Comparison: Video ({video_emotions[0]}) vs. Speech ({speech_emotion})"
+                 )
+                 st.plotly_chart(fig)
+
+             analyze_multimodal(multimodal_path)
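
Editor's note (not part of the commit): the new file imports pydub's AudioSegment and points it at ffmpeg but never calls it, and the multimodal mode hands the uploaded .mp4 straight to librosa.load, which only works when librosa's decoding backend can open the container. A minimal sketch of how the already-imported AudioSegment could split out a 16 kHz mono WAV first is shown below, together with an optional st.cache_resource wrapper so the Wav2Vec2 checkpoint is loaded once per session rather than on every Streamlit rerun. The helper names extract_audio_track and load_speech_model are hypothetical, and the cache decorator assumes a Streamlit version that provides st.cache_resource.

import tempfile

import streamlit as st
from pydub import AudioSegment
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

def extract_audio_track(video_path):
    """Decode the uploaded container with ffmpeg (via pydub) and write a 16 kHz mono WAV."""
    audio = AudioSegment.from_file(video_path)             # any format ffmpeg understands
    audio = audio.set_frame_rate(16000).set_channels(1)    # match the Wav2Vec2 input rate
    wav_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
    audio.export(wav_path, format="wav")
    return wav_path

@st.cache_resource  # assumes Streamlit >= 1.18; caches the loaded weights across reruns
def load_speech_model():
    """Load the Wav2Vec2 processor and model once per session."""
    processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    return processor, model

# Possible use inside analyze_multimodal (hypothetical):
#     audio_transcription = transcribe_audio(extract_audio_track(multimodal_path))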