Update app.py
app.py
CHANGED
@@ -249,13 +249,17 @@ def process_audio_from_video(video_path):
 
 
 
-
+import torch
 import gradio as gr
 from huggingface_hub import InferenceClient
 from transformers import AutoTokenizer, AutoModelForCausalLM
-from huggingface_hub import InferenceClient
 
-
+# Hugging Face Inference Client (equivalent to the reference code's client)
+client = InferenceClient("TheBloke/Mistral-7B-Instruct-v0.1-GPTQ")
+
+# Tokenizer and model loading (still necessary if you want to process locally)
+tokenizer = AutoTokenizer.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.1-GPTQ")
+model = AutoModelForCausalLM.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.1-GPTQ")
 
 
 def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
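Not part of the commit, but for context: a minimal sketch of how the client and the locally loaded tokenizer/model from this hunk might be called. The generate_reply name is hypothetical; the sketch assumes the hosted endpoint accepts chat requests for this repo id, and that the GPTQ checkpoint actually loads locally (which in practice also needs auto-gptq/optimum and a GPU).

# Sketch only (assumptions noted above): remote call first, local fallback second.
def generate_reply(prompt, max_tokens=256, temperature=0.7, top_p=0.95):
    messages = [{"role": "user", "content": prompt}]
    try:
        # Remote path via the Hugging Face Inference API client set up above
        result = client.chat_completion(
            messages, max_tokens=max_tokens, temperature=temperature, top_p=top_p
        )
        return result.choices[0].message.content
    except Exception:
        # Local fallback using the tokenizer/model loaded above
        inputs = tokenizer(prompt, return_tensors="pt")
        output_ids = model.generate(
            **inputs, max_new_tokens=max_tokens, do_sample=True,
            temperature=temperature, top_p=top_p
        )
        return tokenizer.decode(output_ids[0], skip_special_tokens=True)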
@@ -286,10 +290,10 @@ def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
 
 
 # Function to handle video processing and interaction
-def transcribe_and_predict_video(video, chat_history=[]):
+def transcribe_and_predict_video(video, user_input, chat_history=[]):
     # Process the video for emotions (use your own emotion detection functions)
     image_emotion = process_video(video)
-    text_emotion, audio_emotion
+    text_emotion, audio_emotion = process_audio_from_video(video)
     em = [image_emotion, text_emotion, audio_emotion]
 
     # Format the conversation history
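The hunk cuts off before the rest of the function body, so for context only: a hedged sketch of how the remaining body could combine the three detected emotions, the new user_input argument, and the chat history, and return values matching the Interface outputs added below. The prompt format and the generate_reply helper are assumptions, not code from this commit.

# Hypothetical continuation of transcribe_and_predict_video (not in the diff):
# build a prompt from emotions + history + the new message, get a reply, and
# return (response, chat_history) to match outputs=[gr.Textbox(), gr.State()].
def transcribe_and_predict_video(video, user_input, chat_history=[]):
    image_emotion = process_video(video)                            # defined earlier in app.py
    text_emotion, audio_emotion = process_audio_from_video(video)   # defined earlier in app.py
    em = [image_emotion, text_emotion, audio_emotion]

    # Format the conversation history plus the current turn
    history_text = "\n".join(f"User: {u}\nBot: {b}" for u, b in chat_history)
    prompt = (
        f"Detected emotions (image, text, audio): {em}\n"
        f"{history_text}\nUser: {user_input}\nBot:"
    )

    response = generate_reply(prompt)   # hypothetical helper sketched above
    chat_history.append((user_input, response))
    return response, chat_history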
@@ -324,20 +328,17 @@ def transcribe_and_predict_video(video, chat_history=[]):
 
 
 # Gradio interface setup
-
-
-
-
-
-
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
-    ],
+iface = gr.Interface(
+    fn=transcribe_and_predict_video,
+    inputs=[gr.Video(), gr.Textbox(), gr.State()],  # Accepting video input, user text, and chat history
+    outputs=[gr.Textbox(), gr.State()],  # Output is the response and updated chat history
+    title="Multimodal Emotion Recognition from Video",
+    description="Upload a video to get text, audio, and image emotion predictions and interact with the chatbot."
 )
 
 # Launch the Gradio interface
 if __name__ == "__main__":
-
-
+    iface.launch()
 
 
 
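One optional follow-up, not part of this commit: video and audio processing can be slow, and a common Gradio pattern is to enable the request queue before launching so the interface stays responsive under load.

# Optional pattern (assumption, not in this commit): queue long-running
# video/audio requests before launching.
if __name__ == "__main__":
    iface.queue().launch()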