import gradio as gr
import numpy as np
import librosa
import tempfile
import pickle
import json
import speech_recognition as sr
import moviepy.editor as mp  # required: process_video below calls mp.VideoFileClip (moviepy 1.x API)
import nltk
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import LlamaTokenizer, LlamaForCausalLM

# Initialize necessary models and tools
# Load the tokenizer and model for text-based emotion prediction
with open('tokenizer.json') as json_file:
    tokenizer_json = json.load(json_file)
tokenizer = tokenizer_from_json(tokenizer_json)
text_model = load_model('model_for_text_emotion_updated(1).keras')
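# The text model is fed sequences padded to maxlen=35 (see process_video below).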

# Load the audio emotion model and scaler
with open('encoder.pkl', 'rb') as file:
    encoder = pickle.load(file)
with open('scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)
audio_model = load_model('my_model.h5')
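# NOTE (assumption): the scaler and audio model are expected to have been fit on the same
# 21-value feature layout produced by extract_audio_features below.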

# Load the LLaMA model for question answering
llama_tokenizer = LlamaTokenizer.from_pretrained('huggingface/llama-7b')
llama_model = LlamaForCausalLM.from_pretrained('huggingface/llama-7b')
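# The repo id must point to a LLaMA checkpoint you can actually load (a local directory or a
# Hub model you have access to); a 7B model also needs several GB of RAM or VRAM.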

# Initialize NLTK tools
nltk.download('punkt')
nltk.download('punkt_tab')  # needed by word_tokenize on newer NLTK releases
nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = nltk.WordNetLemmatizer()
stop_words = set(nltk.corpus.stopwords.words('english'))

# Preprocess text for emotion prediction
def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized_tokens)

# Extract audio features and predict emotion
def extract_audio_features(data, sample_rate):
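    # Summarize the clip as one flat vector: mean zero-crossing rate (1 value)
    # plus the mean of librosa's default 20 MFCCs, i.e. 21 features per clip.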
    result = np.array([])
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    result = np.hstack((result, zcr))
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc))
    return result

def predict_emotion_from_audio(audio_data):
    sample_rate, data = audio_data
    features = extract_audio_features(data, sample_rate)
    features = np.expand_dims(features, axis=0)
    scaled_features = scaler.transform(features)
    prediction = audio_model.predict(scaled_features)
    emotion_index = np.argmax(prediction)
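    # Rebuild a one-hot row so the fitted encoder (OneHotEncoder-style) can map the argmax back to a label.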
    emotion_array = np.zeros((1, len(encoder.categories_[0])))
    emotion_array[0, emotion_index] = 1
    emotion_label = encoder.inverse_transform(emotion_array)[0][0]  # unwrap to a plain label string
    return emotion_label

# Extract text from audio (speech recognition)
def extract_text_from_audio(audio_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio_data = recognizer.record(source)
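    # recognize_google sends the audio to Google's Web Speech API; it needs network access
    # and raises sr.UnknownValueError / sr.RequestError when transcription fails.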
    text = recognizer.recognize_google(audio_data)
    return text

# Use LLaMA to answer questions based on the text
def ask_llama(question, context):
    # Build a plain prompt; cap the newly generated tokens rather than the total length,
    # since max_length would also count the (possibly long) prompt.
    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
    inputs = llama_tokenizer(prompt, return_tensors="pt")
    outputs = llama_model.generate(inputs['input_ids'], max_new_tokens=150)
    # Decode only the generated continuation, not the echoed prompt.
    answer = llama_tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    return answer

# Process the video and extract text, emotion, and context for LLaMA
def process_video(video_path):
    # Extract audio from the video
    video = mp.VideoFileClip(video_path)
    if video.audio is None:
        raise ValueError("No audio found in the video.")
    
    audio = video.audio
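    # delete=False keeps the temporary WAV on disk after the with-block so speech
    # recognition can reopen it by path; the file is not cleaned up automatically.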
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        temp_audio_path = temp_audio_file.name
        audio.write_audiofile(temp_audio_path)

    # Extract text from the audio
    video_text = extract_text_from_audio(temp_audio_path)
    
    # Predict emotions from the text and audio
    preprocessed_text = preprocess_text(video_text)
    title_seq = tokenizer.texts_to_sequences([preprocessed_text])
    padded_title_seq = pad_sequences(title_seq, maxlen=35, padding='post', truncating='post')
    text_emotion_prediction = text_model.predict(np.array(padded_title_seq))
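    # NOTE (assumption): this label order must match the encoding used when training the text model.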
    text_emotion = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'][np.argmax(text_emotion_prediction)]
    
    audio_data = audio.to_soundarray(fps=audio.fps)
    if audio_data.ndim > 1:  # to_soundarray returns (n_samples, n_channels); average to mono for librosa
        audio_data = audio_data.mean(axis=1)
    audio_emotion = predict_emotion_from_audio((audio.fps, audio_data))

    # Answer user queries based on the video text
    context = video_text
    return context, text_emotion, audio_emotion

# Define Gradio Interface
def video_query_interface(video, question):
    context, text_emotion, audio_emotion = process_video(video)
    answer = ask_llama(question, context)
    return f"Text Emotion: {text_emotion}, Audio Emotion: {audio_emotion}\nAnswer: {answer}"

iface = gr.Interface(fn=video_query_interface, 
                     inputs=[gr.Video(), gr.Textbox()], 
                     outputs="text", 
                     title="Video Emotion and Q&A",
                     description="Upload a video and ask a question based on the audio content.")

iface.launch()