Update app.py
app.py CHANGED
@@ -6,7 +6,6 @@ import json
 import ffmpeg
 import speech_recognition as sr
 from transformers import AutoModelForCausalLM, AutoTokenizer
-import tensorflow as tf
 from tensorflow.keras.preprocessing.text import tokenizer_from_json
 from tensorflow.keras.models import load_model
 from tensorflow.keras.preprocessing.sequence import pad_sequences
@@ -15,17 +14,17 @@ from collections import Counter
 import os
 
 # Load necessary models and files
-text_model = load_model('model_for_text_emotion_updated(1).keras') #
+text_model = load_model('model_for_text_emotion_updated(1).keras') # Text emotion model
 with open('tokenizer.json') as json_file:
     tokenizer = tokenizer_from_json(json.load(json_file)) # Tokenizer for text emotion
-audio_model = load_model('my_model.h5') #
-image_model = load_model('model_emotion.h5') #
+audio_model = load_model('my_model.h5') # Audio emotion model
+image_model = load_model('model_emotion.h5') # Image emotion model
 
 # Load LLM model from Hugging Face
-llama_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") #
+llama_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") # Small OPT model
 llama_tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
 
-# Emotion mapping
+# Emotion mapping
 emotion_mapping = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}
 
 # Preprocess text for emotion prediction
@@ -45,10 +44,12 @@ def predict_text_emotion(text):
 # Extract audio features and predict emotion
 def extract_audio_features(audio_data, sample_rate):
     if not isinstance(audio_data, np.ndarray):
-        audio_data = np.array(audio_data)
-
-    mfcc = np.mean(librosa.feature.mfcc(y=audio_data, sr=sample_rate).T, axis=0)
-
+        audio_data = np.array(audio_data)
+
+    mfcc = np.mean(librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=704).T, axis=0)
+    features = np.expand_dims(mfcc, axis=0)
+    features = np.reshape(features, (1, 704))
+    return features
 
 def predict_audio_emotion(audio_data, sample_rate):
     features = extract_audio_features(audio_data, sample_rate)
@@ -114,16 +115,11 @@ def transcribe_and_predict_video(video_path):
     image_emotion = process_video(video_path)
 
     # Predict emotion from audio (sound-based)
-
-
-    # Debugging print statements
-    print(f"Type of audio_data: {type(audio_data)}") # Ensure audio_data is numpy.ndarray
-    print(f"Sample rate: {sample_rate}")
-
+    sample_rate, audio_data = librosa.load(audio_file, sr=None)
     audio_emotion = predict_audio_emotion(audio_data, sample_rate)
 
-    # Combine
-    final_emotion = image_emotion #
+    # Combine detected emotions for final output (majority voting can be implemented)
+    final_emotion = image_emotion # Using image emotion as primary
 
     # Get response from LLM
     llm_response = interact_with_llm(final_emotion, text)
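
The rewritten extract_audio_features requests 704 MFCC coefficients and reshapes the time-averaged result to (1, 704), presumably to match the audio model's input layer. librosa derives MFCCs from the mel spectrogram, which uses 128 mel bands by default, so it is worth confirming on a throwaway signal that 704 coefficients actually come back; a small shape check that mirrors the committed call:

import numpy as np
import librosa

sample_rate = 16000
audio_data = np.random.randn(sample_rate).astype(np.float32)  # 1 second of noise as a stand-in

# Same call as in the diff: mfcc is (n_mfcc, frames); transpose, then average over time
mfcc = np.mean(librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=704).T, axis=0)
print(mfcc.shape)  # must be (704,) for np.reshape(features, (1, 704)) to succeed;
                   # if fewer coefficients come back, n_mels may need to be raised as well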
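
One detail worth double-checking in the new audio-loading line: librosa.load returns the waveform first and the sample rate second, so the unpacking sample_rate, audio_data = librosa.load(audio_file, sr=None) would leave the two values swapped. A minimal sketch of that step with the return values in librosa's order, assuming the audio track has already been extracted from the video to a hypothetical audio.wav (for example with ffmpeg):

import librosa

# librosa.load returns (waveform, sample_rate); sr=None keeps the file's native rate
audio_data, sample_rate = librosa.load("audio.wav", sr=None)

print(type(audio_data), audio_data.dtype, sample_rate)  # numpy.ndarray, float32, native rate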
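
The diff keeps facebook/opt-125m as the causal LM that turns the detected emotion and the transcript into a reply, but interact_with_llm itself lies outside the changed hunks. A hypothetical sketch of what such a call could look like with the loaded model and tokenizer (the prompt wording and function name are illustrative, not the app's actual implementation):

from transformers import AutoModelForCausalLM, AutoTokenizer

llama_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
llama_tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

def interact_with_llm_sketch(emotion, transcript):
    # Fold the detected emotion into the prompt and let the model continue it
    prompt = f'The speaker sounds {emotion}. They said: "{transcript}". A supportive reply:'
    inputs = llama_tokenizer(prompt, return_tensors="pt")
    output_ids = llama_model.generate(**inputs, max_new_tokens=60, do_sample=True, top_p=0.9)
    return llama_tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(interact_with_llm_sketch("sadness", "I failed my exam today"))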