Update app.py

app.py
CHANGED
@@ -103,7 +103,7 @@ def find_emotion_using_text(sample_rate, audio_data, recognizer):
 
     os.remove(temp_audio_path)
     max_index = text_prediction.argmax()
-    return mapping[max_index]
+    return mapping[max_index],text
 
 # Predict emotion from audio
 def predict_emotion(audio_data):
@@ -244,14 +244,58 @@ def process_audio_from_video(video_path):
         print(f"Error processing audio features: {e}")
         audio_emotion = "Error in audio processing"
 
-    return text_emotion, audio_emotion
+    return text_emotion, audio_emotion,text
 
 
-
-
+
+
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import gradio as gr
+
+# Load Mistral 7B
+model_name = "mistralai/Mistral-7B-Instruct"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
+
+
+
+def transcribe_and_predict_video(video, chat_history=[]):
+    # Process the video for emotions
     image_emotion = process_video(video)
-    text_emotion, audio_emotion = process_audio_from_video(video)
-
+    text_emotion, audio_emotion, user_input = process_audio_from_video(video)
+    em = [image_emotion, text_emotion, audio_emotion]
+
+    # Format the conversation history
+    history_text = "".join([f"User ({msg[2]}): {msg[0]}\nBot: {msg[1]}\n" for msg in chat_history])
+
+    # Construct the prompt with emotion context and history
+    prompt = f"""
+    You are a helpful AI assistant. Respond like a human while considering the user's emotion.
+
+    User's Emotion: {em}
+
+    Conversation History:
+    {history_text}
+
+    User ({em}): {user_input}
+    Bot:"""
+
+    # Tokenize input
+    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+
+    # Generate response
+    output = model.generate(**inputs, max_length=512, temperature=0.7, top_p=0.9, do_sample=True)
+    response = tokenizer.decode(output[0], skip_special_tokens=True).split("Bot:")[-1].strip()
+
+    # Store the current emotion for the user input (You can modify the emotion detection based on your needs)
+    emotion = detect_emotion(user_input) # Assuming `detect_emotion` is a function that returns the user's emotion
+
+    # Update the chat history with the current conversation and emotion
+    chat_history.append((user_input, response, emotion))
+
+    return response, chat_history
 
 # Create Gradio interface
 iface = gr.Interface(fn=transcribe_and_predict_video,
@@ -261,3 +305,5 @@ iface = gr.Interface(fn=transcribe_and_predict_video,
                      description="Upload a video to get text, audio, and image emotion predictions.")
 
 iface.launch()
+
+
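The chatbot step added by this commit can be exercised on its own, without the video pipeline, which is handy for checking that the model loads and the prompt format produces sensible replies. The sketch below is only illustrative: generate_reply, emotions, user_input, and history_text are hypothetical names, the versioned hub id "mistralai/Mistral-7B-Instruct-v0.2" is an assumption (the bare id used in the commit may need a version suffix), and max_new_tokens is used in place of the committed max_length=512 so that only the reply length is bounded rather than prompt plus reply.

# Illustrative sketch only: the emotion-conditioned prompt and generation step
# from this commit, run in isolation (assumes a GPU and access to the model).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumed versioned repo id; the commit itself uses "mistralai/Mistral-7B-Instruct".
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto"
)

def generate_reply(emotions, user_input, history_text=""):
    # Same prompt shape as the committed transcribe_and_predict_video.
    prompt = (
        "You are a helpful AI assistant. Respond like a human while "
        "considering the user's emotion.\n\n"
        f"User's Emotion: {emotions}\n\n"
        f"Conversation History:\n{history_text}\n\n"
        f"User ({emotions}): {user_input}\nBot:"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # max_new_tokens caps only the generated reply, unlike max_length,
    # which also counts the prompt tokens.
    output = model.generate(**inputs, max_new_tokens=256,
                            temperature=0.7, top_p=0.9, do_sample=True)
    text = tokenizer.decode(output[0], skip_special_tokens=True)
    return text.split("Bot:")[-1].strip()

print(generate_reply(["happy", "happy", "neutral"], "I finally passed my exam!"))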
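Note that transcribe_and_predict_video now takes a chat_history argument and returns (response, chat_history), while the gr.Interface call in the file keeps its original single-purpose description. One possible way to wire the new signature into Gradio is sketched below; it is not the committed interface. The component choices, labels, and description string are illustrative, and gr.State is used to persist the history across turns instead of relying on the mutable default argument chat_history=[].

# Possible wiring for the new (response, chat_history) return value; a sketch,
# not the committed gr.Interface call.
import gradio as gr

iface = gr.Interface(
    fn=transcribe_and_predict_video,       # defined in this commit
    inputs=[gr.Video(), gr.State([])],     # video plus persisted chat history
    outputs=[gr.Textbox(label="Bot reply"), gr.State()],
    description="Upload a video to get an emotion-aware chatbot reply.",
)

iface.launch()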