Update app.py

app.py
CHANGED
@@ -103,7 +103,7 @@ def find_emotion_using_text(sample_rate, audio_data, recognizer):
 
     os.remove(temp_audio_path)
     max_index = text_prediction.argmax()
-    return mapping[max_index]
+    return mapping[max_index],text
 
 # Predict emotion from audio
 def predict_emotion(audio_data):
@@ -244,14 +244,58 @@ def process_audio_from_video(video_path):
         print(f"Error processing audio features: {e}")
         audio_emotion = "Error in audio processing"
 
-    return text_emotion, audio_emotion
+    return text_emotion, audio_emotion,text
 
 
-
-
+
+
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import gradio as gr
+
+# Load Mistral 7B
+model_name = "mistralai/Mistral-7B-Instruct"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
+
+
+
+def transcribe_and_predict_video(video, chat_history=[]):
+    # Process the video for emotions
     image_emotion = process_video(video)
-    text_emotion, audio_emotion = process_audio_from_video(video)
-
+    text_emotion, audio_emotion, user_input = process_audio_from_video(video)
+    em = [image_emotion, text_emotion, audio_emotion]
+
+    # Format the conversation history
+    history_text = "".join([f"User ({msg[2]}): {msg[0]}\nBot: {msg[1]}\n" for msg in chat_history])
+
+    # Construct the prompt with emotion context and history
+    prompt = f"""
+    You are a helpful AI assistant. Respond like a human while considering the user's emotion.
+
+    User's Emotion: {em}
+
+    Conversation History:
+    {history_text}
+
+    User ({em}): {user_input}
+    Bot:"""
+
+    # Tokenize input
+    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+
+    # Generate response
+    output = model.generate(**inputs, max_length=512, temperature=0.7, top_p=0.9, do_sample=True)
+    response = tokenizer.decode(output[0], skip_special_tokens=True).split("Bot:")[-1].strip()
+
+    # Store the current emotion for the user input (You can modify the emotion detection based on your needs)
+    emotion = detect_emotion(user_input) # Assuming `detect_emotion` is a function that returns the user's emotion
+
+    # Update the chat history with the current conversation and emotion
+    chat_history.append((user_input, response, emotion))
+
+    return response, chat_history
 
 # Create Gradio interface
 iface = gr.Interface(fn=transcribe_and_predict_video,
@@ -261,3 +305,5 @@ iface = gr.Interface(fn=transcribe_and_predict_video,
                      description="Upload a video to get text, audio, and image emotion predictions.")
 
 iface.launch()
+
+
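The chatbot step added by this commit can be exercised on its own, without the video pipeline, which is handy for checking that the model loads and the prompt format produces sensible replies. The sketch below is only illustrative: generate_reply, emotions, user_input, and history_text are hypothetical names, the versioned hub id "mistralai/Mistral-7B-Instruct-v0.2" is an assumption (the bare id used in the commit may need a version suffix), and max_new_tokens is used in place of the committed max_length=512 so that only the reply length is bounded rather than prompt plus reply.

# Illustrative sketch only: the emotion-conditioned prompt and generation step
# from this commit, run in isolation (assumes a GPU and access to the model).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumed versioned repo id; the commit itself uses "mistralai/Mistral-7B-Instruct".
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto"
)

def generate_reply(emotions, user_input, history_text=""):
    # Same prompt shape as the committed transcribe_and_predict_video.
    prompt = (
        "You are a helpful AI assistant. Respond like a human while "
        "considering the user's emotion.\n\n"
        f"User's Emotion: {emotions}\n\n"
        f"Conversation History:\n{history_text}\n\n"
        f"User ({emotions}): {user_input}\nBot:"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # max_new_tokens caps only the generated reply, unlike max_length,
    # which also counts the prompt tokens.
    output = model.generate(**inputs, max_new_tokens=256,
                            temperature=0.7, top_p=0.9, do_sample=True)
    text = tokenizer.decode(output[0], skip_special_tokens=True)
    return text.split("Bot:")[-1].strip()

print(generate_reply(["happy", "happy", "neutral"], "I finally passed my exam!"))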
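Note that transcribe_and_predict_video now takes a chat_history argument and returns (response, chat_history), while the gr.Interface call in the file keeps its original single-purpose description. One possible way to wire the new signature into Gradio is sketched below; it is not the committed interface. The component choices, labels, and description string are illustrative, and gr.State is used to persist the history across turns instead of relying on the mutable default argument chat_history=[].

# Possible wiring for the new (response, chat_history) return value; a sketch,
# not the committed gr.Interface call.
import gradio as gr

iface = gr.Interface(
    fn=transcribe_and_predict_video,       # defined in this commit
    inputs=[gr.Video(), gr.State([])],     # video plus persisted chat history
    outputs=[gr.Textbox(label="Bot reply"), gr.State()],
    description="Upload a video to get an emotion-aware chatbot reply.",
)

iface.launch()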