import json
import gradio as gr
import numpy as np
import librosa
import cv2
import moviepy.editor as mp
import speech_recognition as sr
from transformers import AutoModelForCausalLM, AutoTokenizer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from collections import Counter
# Load necessary models and files
text_model = load_model('model_for_text_emotion_updated(1).keras')  # Text emotion model
with open('tokenizer.json') as json_file:
    tokenizer_data = json.load(json_file)
    # Tokenizer.to_json() output may be stored as a raw JSON string or as a parsed object;
    # tokenizer_from_json expects the JSON string, so re-serialize if needed.
    if not isinstance(tokenizer_data, str):
        tokenizer_data = json.dumps(tokenizer_data)
    tokenizer = tokenizer_from_json(tokenizer_data)  # Tokenizer for text emotion

audio_model = load_model('my_model.h5')  # Audio emotion model
image_model = load_model('model_emotion.h5')  # Image emotion model

# Load LLM model from Hugging Face
llama_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # Example: small OPT model
llama_tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

# Emotion mapping (model output index -> label)
emotion_mapping = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}
# Preprocess text for emotion prediction
def preprocess_text(text):
    tokens = [word for word in text.lower().split() if word.isalnum()]
    return ' '.join(tokens)

# Predict emotion from text
def predict_text_emotion(text):
    preprocessed_text = preprocess_text(text)
    seq = tokenizer.texts_to_sequences([preprocessed_text])
    padded_seq = pad_sequences(seq, maxlen=35)
    prediction = text_model.predict(padded_seq)
    emotion_index = np.argmax(prediction)
    return emotion_mapping[emotion_index]
# Extract audio features and predict emotion
def extract_audio_features(audio_data, sample_rate):
    mfcc = np.mean(librosa.feature.mfcc(y=audio_data, sr=sample_rate).T, axis=0)
    return np.expand_dims(mfcc, axis=0)

def predict_audio_emotion(audio_data, sample_rate):
    features = extract_audio_features(audio_data, sample_rate)
    prediction = audio_model.predict(features)
    emotion_index = np.argmax(prediction)
    return emotion_mapping[emotion_index]
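# Note: librosa.feature.mfcc defaults to n_mfcc=20, so the mean-MFCC feature above has shape
# (1, 20); this assumes the audio emotion model ('my_model.h5') was trained on features of the
# same shape and on the same 7-label ordering as emotion_mapping.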
# Process video and predict emotions from frames (sampling roughly one frame per second)
def process_video(video_path):
    cap = cv2.VideoCapture(video_path)
    frame_rate = cap.get(cv2.CAP_PROP_FPS)
    predictions = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if int(cap.get(cv2.CAP_PROP_POS_FRAMES)) % int(frame_rate) == 0:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            frame = cv2.resize(frame, (48, 48))
            frame = img_to_array(frame) / 255.0
            frame = np.expand_dims(frame, axis=0)
            prediction = image_model.predict(frame)
            predictions.append(np.argmax(prediction))
    cap.release()
    most_common_emotion = Counter(predictions).most_common(1)[0][0]
    return emotion_mapping[most_common_emotion]
# Extract audio from the video and write it to a WAV file
def extract_audio_from_video(video_path):
    video = mp.VideoFileClip(video_path)
    audio = video.audio
    audio_file = 'audio.wav'
    audio.write_audiofile(audio_file)
    return audio_file

# Transcribe the extracted audio (uses Google's free web speech API, so network access is required)
def transcribe_audio(audio_file):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio_record = recognizer.record(source)
    return recognizer.recognize_google(audio_record)
# Integrate with the LLM to adjust responses based on the detected emotion
def interact_with_llm(emotion, user_input):
    prompt = f"The user is feeling {emotion}. Respond to their question in an empathetic and appropriate manner: {user_input}"
    inputs = llama_tokenizer(prompt, return_tensors="pt")
    outputs = llama_model.generate(**inputs, max_length=200)
    response = llama_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response
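# Note: max_length=200 counts the prompt tokens as well, so a long transcript leaves little room
# for the reply; generate(**inputs, max_new_tokens=150), for example, would cap only the newly
# generated tokens. The decoded output also echoes the prompt, since OPT is a plain causal LM.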
# Main function: process a video and predict emotions
def transcribe_and_predict_video(video_path):
    # Extract audio from the video and predict text-based emotion from the transcript
    audio_file = extract_audio_from_video(video_path)
    text = transcribe_audio(audio_file)
    text_emotion = predict_text_emotion(text)

    # Predict emotion from video frames (image-based)
    image_emotion = process_video(video_path)

    # Predict emotion from audio (sound-based); librosa.load returns (audio_data, sample_rate)
    audio_data, sample_rate = librosa.load(audio_file, sr=None)
    audio_emotion = predict_audio_emotion(audio_data, sample_rate)

    # Combine the detected emotions for the final output. The image-based prediction is used
    # directly here; a majority vote across the three modalities is sketched after this function.
    final_emotion = image_emotion

    # Get a response from the LLM conditioned on the detected emotion
    llm_response = interact_with_llm(final_emotion, text)
    return f"Emotion Detected: {final_emotion}\nLLM Response: {llm_response}"
# Create Gradio interface
iface = gr.Interface(fn=transcribe_and_predict_video,
inputs=gr.Video(),
outputs="text",
title="Emotion-Responsive LLM for Video",
description="Upload a video to get emotion predictions and LLM responses based on detected emotions.")
iface.launch()