Update app.py
app.py CHANGED
@@ -1,140 +1,127 @@
--- a/app.py
import gradio as gr
import numpy as np
-import librosa
import cv2
-import
-import
import speech_recognition as sr
-
-
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
-
from collections import Counter
-import

-#
-
with open('tokenizer.json') as json_file:
-
-
-
-
-# Load
-
-
-
-
-

# Preprocess text for emotion prediction
def preprocess_text(text):
-    tokens =
-
-
-
-def predict_text_emotion(text):
-    preprocessed_text = preprocess_text(text)
-    seq = tokenizer.texts_to_sequences([preprocessed_text])
-    padded_seq = pad_sequences(seq, maxlen=35)
-    prediction = text_model.predict(padded_seq)
-    emotion_index = np.argmax(prediction)
-    return emotion_mapping[emotion_index]

# Extract audio features and predict emotion
-def extract_audio_features(
-
-
-
-
-
-
-
-
-
-
-
-
-
-    features = np.reshape(features, (1, 40))  # Match model expected input
-    prediction = audio_model.predict(features)
    emotion_index = np.argmax(prediction)
-
-
-
-
-    cap = cv2.VideoCapture(video_path)
-    frame_rate = cap.get(cv2.CAP_PROP_FPS)
-    predictions = []
-
-    while cap.isOpened():
-        ret, frame = cap.read()
-        if not ret:
-            break
-        if int(cap.get(cv2.CAP_PROP_POS_FRAMES)) % int(frame_rate) == 0:
-            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
-            frame = cv2.resize(frame, (48, 48))
-            frame = img_to_array(frame) / 255.0
-            frame = np.expand_dims(frame, axis=0)
-            prediction = image_model.predict(frame)
-            predictions.append(np.argmax(prediction))
-
-    cap.release()
-    most_common_emotion = Counter(predictions).most_common(1)[0][0]
-    return emotion_mapping[most_common_emotion]
-
-# Extract audio from video using ffmpeg-python
-def extract_audio_from_video(video_path):
-    audio_file = 'audio.wav'
-    (ffmpeg
-     .input(video_path)
-     .output(audio_file, format='wav', acodec='pcm_s16le')
-     .run(overwrite_output=True))
-    return audio_file

-
    recognizer = sr.Recognizer()
-    with sr.AudioFile(
-
-

-#
-
-    prompt = f"The user is feeling {emotion}. Respond to their question in an empathetic and appropriate manner: {user_input}"

-
-
-

-
-
-# Main function to process video and predict emotions
-def transcribe_and_predict_video(video_path):
-    # Extract audio from video and predict text-based emotion
-    audio_file = extract_audio_from_video(video_path)
-    text = transcribe_audio(audio_file)
-    text_emotion = predict_text_emotion(text)
-
-    # Predict emotion from video frames (image-based)
-    image_emotion = process_video(video_path)
-
-    # Predict emotion from audio (sound-based)
-    audio_data, sample_rate = librosa.load(audio_file, sr=None)
-    audio_emotion = predict_audio_emotion(audio_data, sample_rate)
-
-    # Combine detected emotions for final output (majority voting can be implemented)
-    final_emotion = image_emotion  # Using image emotion as primary

-#
-

-

-
-
-                     inputs=gr.Video(),
                     outputs="text",
-                     title="Emotion
-                     description="Upload a video

iface.launch()
+++ b/app.py
import gradio as gr
import numpy as np
import cv2
+import librosa
+import moviepy.editor as mp  # needed for mp.VideoFileClip in process_video (moviepy 1.x API)
+import tempfile
+import wave
+import os
import speech_recognition as sr
+import pickle
+import json
from tensorflow.keras.models import load_model
+from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
+import nltk
from collections import Counter
+from transformers import LlamaTokenizer, LlamaForCausalLM

+# Initialize necessary models and tools
+# Load the tokenizer and model for text-based emotion prediction
with open('tokenizer.json') as json_file:
+    tokenizer_json = json.load(json_file)
+    tokenizer = tokenizer_from_json(tokenizer_json)
+text_model = load_model('model_for_text_emotion_updated(1).keras')
+
+# Load the audio emotion model and scaler
+with open('encoder.pkl', 'rb') as file:
+    encoder = pickle.load(file)
+with open('scaler.pkl', 'rb') as file:
+    scaler = pickle.load(file)
+audio_model = load_model('my_model.h5')
+
+# Load the LLaMA model for question answering
+llama_tokenizer = LlamaTokenizer.from_pretrained('huggingface/llama-7b')
+llama_model = LlamaForCausalLM.from_pretrained('huggingface/llama-7b')
+
+# Initialize NLTK tools
+nltk.download('punkt')
+nltk.download('wordnet')
+nltk.download('stopwords')
+lemmatizer = nltk.WordNetLemmatizer()
+stop_words = set(nltk.corpus.stopwords.words('english'))

# Preprocess text for emotion prediction
def preprocess_text(text):
+    tokens = nltk.word_tokenize(text.lower())
+    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
+    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
+    return ' '.join(lemmatized_tokens)

# Extract audio features and predict emotion
+def extract_audio_features(data, sample_rate):
+    result = np.array([])
+    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
+    result = np.hstack((result, zcr))
+    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
+    result = np.hstack((result, mfcc))
+    return result
+
+def predict_emotion_from_audio(audio_data):
+    sample_rate, data = audio_data
+    features = extract_audio_features(data, sample_rate)
+    features = np.expand_dims(features, axis=0)
+    scaled_features = scaler.transform(features)
+    prediction = audio_model.predict(scaled_features)
    emotion_index = np.argmax(prediction)
+    emotion_array = np.zeros((1, len(encoder.categories_[0])))
+    emotion_array[0, emotion_index] = 1
+    emotion_label = encoder.inverse_transform(emotion_array)[0]
+    return emotion_label

+# Extract text from audio (speech recognition)
+def extract_text_from_audio(audio_path):
    recognizer = sr.Recognizer()
+    with sr.AudioFile(audio_path) as source:
+        audio_data = recognizer.record(source)
+        text = recognizer.recognize_google(audio_data)
+    return text
+
+# Use LLaMA to answer questions based on the text
+def ask_llama(question, context):
+    inputs = llama_tokenizer(question, context, return_tensors="pt")
+    outputs = llama_model.generate(inputs['input_ids'], max_length=150)
+    answer = llama_tokenizer.decode(outputs[0], skip_special_tokens=True)
+    return answer
+
+# Process the video and extract text, emotion, and context for LLaMA
+def process_video(video_path):
+    # Extract audio from the video
+    video = mp.VideoFileClip(video_path)
+    if video.audio is None:
+        raise ValueError("No audio found in the video.")
+
+    audio = video.audio
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
+        temp_audio_path = temp_audio_file.name
+    audio.write_audiofile(temp_audio_path)

+    # Extract text from the audio
+    video_text = extract_text_from_audio(temp_audio_path)

+    # Predict emotions from the text and audio
+    preprocessed_text = preprocess_text(video_text)
+    title_seq = tokenizer.texts_to_sequences([preprocessed_text])
+    padded_title_seq = pad_sequences(title_seq, maxlen=35, padding='post', truncating='post')
+    text_emotion_prediction = text_model.predict(np.array(padded_title_seq))
+    text_emotion = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'][np.argmax(text_emotion_prediction)]

+    audio_data = audio.to_soundarray(fps=audio.fps)
+    if audio_data.ndim > 1:  # to_soundarray returns (n_samples, n_channels); average to mono for librosa
+        audio_data = audio_data.mean(axis=1)
+    audio_emotion = predict_emotion_from_audio((audio.fps, audio_data))

+    # Answer user queries based on the video text
+    context = video_text
+    return context, text_emotion, audio_emotion

+# Define Gradio Interface
+def video_query_interface(video, question):
+    context, text_emotion, audio_emotion = process_video(video)
+    answer = ask_llama(question, context)
+    return f"Text Emotion: {text_emotion}, Audio Emotion: {audio_emotion}\nAnswer: {answer}"

+iface = gr.Interface(fn=video_query_interface,
+                     inputs=[gr.Video(), gr.Textbox()],
                     outputs="text",
+                     title="Video Emotion and Q&A",
+                     description="Upload a video and ask a question based on the audio content.")

iface.launch()
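
For local sanity checking, a minimal sketch of how the new pipeline composes outside the Gradio UI. It assumes the functions defined in the new app.py above are in scope; "sample.mp4" and the question string are placeholders, not files or prompts that ship with the Space.

# Hypothetical smoke test: exercises the same path as the Gradio callback video_query_interface.
if __name__ == "__main__":
    context, text_emotion, audio_emotion = process_video("sample.mp4")  # placeholder clip with speech
    print("Transcript:", context)
    print("Text emotion:", text_emotion, "| Audio emotion:", audio_emotion)
    print("Answer:", ask_llama("What is the speaker talking about?", context))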