import gradio as gr
import numpy as np
import cv2
import librosa
import tempfile
import wave
import os
import speech_recognition as sr
import pickle
import json
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from collections import Counter
from transformers import LlamaTokenizer, LlamaForCausalLM
import moviepy.editor as mp  # needed below for VideoFileClip; moviepy requires ffmpeg to be installed
# Initialize necessary models and tools
# Load the tokenizer and model for text-based emotion prediction
with open('tokenizer.json') as json_file:
    tokenizer_json = json.load(json_file)
# tokenizer_from_json expects a JSON string, so re-serialize if json.load returned a dict
if isinstance(tokenizer_json, dict):
    tokenizer_json = json.dumps(tokenizer_json)
tokenizer = tokenizer_from_json(tokenizer_json)
text_model = load_model('model_for_text_emotion_updated(1).keras')
# Load the audio emotion model and scaler
with open('encoder.pkl', 'rb') as file:
    encoder = pickle.load(file)
with open('scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)
audio_model = load_model('my_model.h5')
# Load the LLaMA model for question answering
llama_tokenizer = LlamaTokenizer.from_pretrained('huggingface/llama-7b')
llama_model = LlamaForCausalLM.from_pretrained('huggingface/llama-7b')
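# Note: the repo id above must point to LLaMA weights the Space actually has access to (the official
# checkpoints are gated). Loading a 7B model in full precision needs roughly 28 GB of RAM; passing
# torch_dtype=torch.float16 to from_pretrained (with `import torch`) is one way to roughly halve that.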
# Initialize NLTK tools
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = nltk.WordNetLemmatizer()
stop_words = set(nltk.corpus.stopwords.words('english'))
# Preprocess text for emotion prediction
def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized_tokens)
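# Example: "I am SO happy today!" -> tokens ["happy", "today"] -> "happy today"
# (punctuation and English stop words are dropped before lemmatization).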
# Extract audio features and predict emotion
def extract_audio_features(data, sample_rate):
    result = np.array([])
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    result = np.hstack((result, zcr))
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc))
    return result
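# The resulting vector is 1 ZCR value + 20 MFCCs (librosa's default n_mfcc) = 21 features;
# this assumes the pickled scaler and audio model were trained on the same 21-dimensional layout.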
def predict_emotion_from_audio(audio_data):
    sample_rate, data = audio_data
    features = extract_audio_features(data, sample_rate)
    features = np.expand_dims(features, axis=0)
    scaled_features = scaler.transform(features)
    prediction = audio_model.predict(scaled_features)
    emotion_index = np.argmax(prediction)
    emotion_array = np.zeros((1, len(encoder.categories_[0])))
    emotion_array[0, emotion_index] = 1
    emotion_label = encoder.inverse_transform(emotion_array)[0]
    return emotion_label
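# Decoding assumes `encoder` is a fitted sklearn OneHotEncoder whose single feature holds the emotion
# labels: a one-hot row built at the argmax index is mapped back to its label via inverse_transform.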
# Extract text from audio (speech recognition)
def extract_text_from_audio(audio_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio_data = recognizer.record(source)
        text = recognizer.recognize_google(audio_data)
    return text
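# recognize_google sends the recording to Google's free Web Speech API, so this step needs network
# access and can raise sr.UnknownValueError or sr.RequestError; callers may want to handle those.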
# Use LLaMA to answer questions based on the text
def ask_llama(question, context):
    # Build a single prompt string; passing (question, context) as a text pair is not meaningful for
    # a causal LM, and max_new_tokens avoids errors when the context exceeds a fixed max_length.
    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
    inputs = llama_tokenizer(prompt, return_tensors="pt")
    outputs = llama_model.generate(inputs['input_ids'], max_new_tokens=150)
    answer = llama_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer
# Process the video and extract text, emotion, and context for LLaMA
def process_video(video_path):
    # Extract audio from the video (requires moviepy/ffmpeg)
    video = mp.VideoFileClip(video_path)
    if video.audio is None:
        raise ValueError("No audio found in the video.")
    audio = video.audio
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        temp_audio_path = temp_audio_file.name
    audio.write_audiofile(temp_audio_path)

    # Extract text from the audio
    video_text = extract_text_from_audio(temp_audio_path)
    os.remove(temp_audio_path)  # clean up the temporary WAV file

    # Predict emotion from the text
    preprocessed_text = preprocess_text(video_text)
    title_seq = tokenizer.texts_to_sequences([preprocessed_text])
    padded_title_seq = pad_sequences(title_seq, maxlen=35, padding='post', truncating='post')
    text_emotion_prediction = text_model.predict(np.array(padded_title_seq))
    # The label order below is assumed to match the order used when the text model was trained
    text_emotion = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'][np.argmax(text_emotion_prediction)]

    # Predict emotion from the audio (downmix stereo to mono so the feature shapes stay consistent)
    audio_data = audio.to_soundarray(fps=audio.fps)
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    audio_emotion = predict_emotion_from_audio((audio.fps, audio_data))

    # The recognized transcript serves as the context for LLaMA question answering
    context = video_text
    return context, text_emotion, audio_emotion
# Define Gradio Interface
def video_query_interface(video, question):
    context, text_emotion, audio_emotion = process_video(video)
    answer = ask_llama(question, context)
    return f"Text Emotion: {text_emotion}, Audio Emotion: {audio_emotion}\nAnswer: {answer}"
iface = gr.Interface(fn=video_query_interface,
                     inputs=[gr.Video(), gr.Textbox()],
                     outputs="text",
                     title="Video Emotion and Q&A",
                     description="Upload a video and ask a question based on the audio content.")
iface.launch()
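# launch() starts the Gradio server; on a hosted Space this is all that is needed.
# For local testing, launch(share=True) would also create a temporary public link.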