Update app.py
app.py CHANGED
@@ -6,7 +6,6 @@ import json
 import ffmpeg
 import speech_recognition as sr
 from transformers import AutoModelForCausalLM, AutoTokenizer
-import tensorflow as tf
 from tensorflow.keras.preprocessing.text import tokenizer_from_json
 from tensorflow.keras.models import load_model
 from tensorflow.keras.preprocessing.sequence import pad_sequences
@@ -15,17 +14,17 @@ from collections import Counter
 import os
 
 # Load necessary models and files
-text_model = load_model('model_for_text_emotion_updated(1).keras') #
+text_model = load_model('model_for_text_emotion_updated(1).keras') # Text emotion model
 with open('tokenizer.json') as json_file:
     tokenizer = tokenizer_from_json(json.load(json_file)) # Tokenizer for text emotion
-audio_model = load_model('my_model.h5') #
-image_model = load_model('model_emotion.h5') #
+audio_model = load_model('my_model.h5') # Audio emotion model
+image_model = load_model('model_emotion.h5') # Image emotion model
 
 # Load LLM model from Hugging Face
-llama_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") #
+llama_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") # Small OPT model
 llama_tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
 
-# Emotion mapping
+# Emotion mapping
 emotion_mapping = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}
 
 # Preprocess text for emotion prediction
@@ -45,10 +44,12 @@ def predict_text_emotion(text):
 # Extract audio features and predict emotion
 def extract_audio_features(audio_data, sample_rate):
     if not isinstance(audio_data, np.ndarray):
-        audio_data = np.array(audio_data)
-
-    mfcc = np.mean(librosa.feature.mfcc(y=audio_data, sr=sample_rate).T, axis=0)
-
+        audio_data = np.array(audio_data)
+
+    mfcc = np.mean(librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=704).T, axis=0)
+    features = np.expand_dims(mfcc, axis=0)
+    features = np.reshape(features, (1, 704))
+    return features
 
 def predict_audio_emotion(audio_data, sample_rate):
     features = extract_audio_features(audio_data, sample_rate)
@@ -114,16 +115,11 @@ def transcribe_and_predict_video(video_path):
     image_emotion = process_video(video_path)
 
     # Predict emotion from audio (sound-based)
-
-
-    # Debugging print statements
-    print(f"Type of audio_data: {type(audio_data)}") # Ensure audio_data is numpy.ndarray
-    print(f"Sample rate: {sample_rate}")
-
+    sample_rate, audio_data = librosa.load(audio_file, sr=None)
     audio_emotion = predict_audio_emotion(audio_data, sample_rate)
 
-    # Combine
-    final_emotion = image_emotion #
+    # Combine detected emotions for final output (majority voting can be implemented)
+    final_emotion = image_emotion # Using image emotion as primary
 
     # Get response from LLM
     llm_response = interact_with_llm(final_emotion, text)
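
The rewritten extract_audio_features requests 704 MFCC coefficients and reshapes the time-averaged result to (1, 704), presumably to match the audio model's input layer. librosa derives MFCCs from the mel spectrogram, which uses 128 mel bands by default, so it is worth confirming on a throwaway signal that 704 coefficients actually come back; a small shape check that mirrors the committed call:

import numpy as np
import librosa

sample_rate = 16000
audio_data = np.random.randn(sample_rate).astype(np.float32)  # 1 second of noise as a stand-in

# Same call as in the diff: mfcc is (n_mfcc, frames); transpose, then average over time
mfcc = np.mean(librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=704).T, axis=0)
print(mfcc.shape)  # must be (704,) for np.reshape(features, (1, 704)) to succeed;
                   # if fewer coefficients come back, n_mels may need to be raised as well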
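
One detail worth double-checking in the new audio-loading line: librosa.load returns the waveform first and the sample rate second, so the unpacking sample_rate, audio_data = librosa.load(audio_file, sr=None) would leave the two values swapped. A minimal sketch of that step with the return values in librosa's order, assuming the audio track has already been extracted from the video to a hypothetical audio.wav (for example with ffmpeg):

import librosa

# librosa.load returns (waveform, sample_rate); sr=None keeps the file's native rate
audio_data, sample_rate = librosa.load("audio.wav", sr=None)

print(type(audio_data), audio_data.dtype, sample_rate)  # numpy.ndarray, float32, native rate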
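
The diff keeps facebook/opt-125m as the causal LM that turns the detected emotion and the transcript into a reply, but interact_with_llm itself lies outside the changed hunks. A hypothetical sketch of what such a call could look like with the loaded model and tokenizer (the prompt wording and function name are illustrative, not the app's actual implementation):

from transformers import AutoModelForCausalLM, AutoTokenizer

llama_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
llama_tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

def interact_with_llm_sketch(emotion, transcript):
    # Fold the detected emotion into the prompt and let the model continue it
    prompt = f'The speaker sounds {emotion}. They said: "{transcript}". A supportive reply:'
    inputs = llama_tokenizer(prompt, return_tensors="pt")
    output_ids = llama_model.generate(**inputs, max_new_tokens=60, do_sample=True, top_p=0.9)
    return llama_tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(interact_with_llm_sketch("sadness", "I failed my exam today"))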