Garvitj committed on
Commit 6fc3e25 · verified · 1 Parent(s): 5a48e57

Update app.py

Files changed (1)
app.py +12 -361
app.py CHANGED
@@ -1,366 +1,17 @@
 import gradio as gr
-import numpy as np
-import cv2
-import librosa
-import speech_recognition as sr
-import tempfile
-import wave
-import optimum
-import os
-import tensorflow as tf
-from tensorflow.keras.preprocessing.text import tokenizer_from_json
-from tensorflow.keras.models import load_model, model_from_json
-from sklearn.preprocessing import StandardScaler
-from tensorflow.keras.preprocessing.sequence import pad_sequences
-import nltk
-from nltk.corpus import stopwords
-from nltk.stem import WordNetLemmatizer
-import pickle
-import json
-from tensorflow.keras.preprocessing.image import img_to_array, load_img
-from collections import Counter
-from pydub import AudioSegment
-import ffmpeg
-
-nltk.download('punkt')      # Tokenizer
-nltk.download('wordnet')    # WordNet lemmatizer
-nltk.download('stopwords')  # Stopwords
-
-# Load the text model
-with open('model_architecture_for_text_emotion_updated_json.json', 'r') as json_file:
-    model_json = json_file.read()
-text_model = model_from_json(model_json)
-text_model.load_weights("model_for_text_emotion_updated(1).keras")
-
-# Load the encoder and scaler for audio
-with open('encoder.pkl', 'rb') as file:
-    encoder = pickle.load(file)
-with open('scaler.pkl', 'rb') as file:
-    scaler = pickle.load(file)
-
-# Load the tokenizer for text
-with open('tokenizer.json') as json_file:
-    tokenizer_json = json.load(json_file)
-    tokenizer = tokenizer_from_json(tokenizer_json)
-
-# Load the audio model
-audio_model = load_model('my_model.h5')
-
-# Load the image model
-image_model = load_model('model_emotion.h5')
-
-# Initialize NLTK
-lemmatizer = WordNetLemmatizer()
-stop_words = set(stopwords.words('english'))
-
-# Preprocess text function
-def preprocess_text(text):
-    tokens = nltk.word_tokenize(text.lower())
-    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
-    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
-    return ' '.join(lemmatized_tokens)
-
-# Extract features from audio
-import numpy as np
-import torch
-import torchaudio
-import torchaudio.transforms as T
-
-def extract_features(data, sample_rate):
-    # List to collect all features
-    features = []
-
-    # Zero Crossing Rate (ZCR)
-    zcr = T.ZeroCrossingRate()(data)
-    features.append(torch.mean(zcr).numpy())
-
-    # Chroma Short-Time Fourier Transform (STFT)
-    stft = T.MelSpectrogram(sample_rate)(data)
-    chroma_stft = torch.mean(stft, dim=-1).numpy()  # Take mean across the time dimension
-    features.append(chroma_stft)
-
-    # Mel Frequency Cepstral Coefficients (MFCC)
-    mfcc_transform = T.MFCC(sample_rate=sample_rate, melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 23})
-    mfcc = mfcc_transform(data)
-    mfcc = torch.mean(mfcc, dim=-1).numpy()  # Take mean across the time dimension
-    features.append(mfcc)
-
-    # Root Mean Square Energy (RMS)
-    rms = torch.mean(T.MelSpectrogram(sample_rate)(data), dim=-1)  # Same as RMS feature extraction
-    features.append(rms.numpy())
-
-    # Mel Spectrogram
-    mel = T.MelSpectrogram(sample_rate)(data)
-    mel = torch.mean(mel, dim=-1).numpy()  # Take mean across the time dimension
-    features.append(mel)
-
-    # Convert list of features to a single numpy array
-    result = np.hstack(features)
-
-    return result
-
-
-# Predict emotion from text
-def find_emotion_using_text(sample_rate, audio_data, recognizer):
-    mapping = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
-        temp_audio_path = temp_audio_file.name
-
-    with wave.open(temp_audio_path, 'w') as wf:
-        wf.setnchannels(1)
-        wf.setsampwidth(2)
-        wf.setframerate(sample_rate)
-        wf.writeframes(audio_data.tobytes())
-
-    with sr.AudioFile(temp_audio_path) as source:
-        audio_record = recognizer.record(source)
-        text = recognizer.recognize_google(audio_record)
-        pre_text = preprocess_text(text)
-        title_seq = tokenizer.texts_to_sequences([pre_text])
-        padded_title_seq = pad_sequences(title_seq, maxlen=35, padding='post', truncating='post')
-        inp1 = np.array(padded_title_seq)
-        text_prediction = text_model.predict(inp1)
-
-    os.remove(temp_audio_path)
-    max_index = text_prediction.argmax()
-    return mapping[max_index], text
-
-# Predict emotion from audio
-def predict_emotion(audio_data):
-    sample_rate, data = audio_data
-    data = data.flatten()
-
-    if data.dtype != np.float32:
-        data = data.astype(np.float32)
-    data = data / np.max(np.abs(data))
-
-    features = extract_features(data, sample_rate)
-    features = np.expand_dims(features, axis=0)
-
-    if features.ndim == 3:
-        features = np.squeeze(features, axis=2)
-    elif features.ndim != 2:
-        raise ValueError("Features array has unexpected dimensions.")
-
-    scaled_features = scaler.transform(features)
-    scaled_features = np.expand_dims(scaled_features, axis=2)
-
-    prediction = audio_model.predict(scaled_features)
-    emotion_index = np.argmax(prediction)
-
-    num_classes = len(encoder.categories_[0])
-    emotion_array = np.zeros((1, num_classes))
-    emotion_array[0, emotion_index] = 1
-
-    emotion_label = encoder.inverse_transform(emotion_array)[0]
-    return emotion_label
-
-def preprocess_image(image):
-    image = load_img(image, target_size=(48, 48), color_mode="grayscale")
-    image = img_to_array(image)
-    image = np.expand_dims(image, axis=0)
-    image = image / 255.0
-    return image
-
-# Predict emotion from image
-def predict_emotion_from_image(image):
-    preprocessed_image = preprocess_image(image)
-    prediction = image_model.predict(preprocessed_image)
-    emotion_index = np.argmax(prediction)
-
-    mapping = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}
-    return mapping[emotion_index]
-
-def process_video(video_path):
-    cap = cv2.VideoCapture(video_path)
-    frame_rate = cap.get(cv2.CAP_PROP_FPS)
-
-    frame_count = 0
-    predictions = []
-
-    while cap.isOpened():
-        ret, frame = cap.read()
-        if not ret:
-            break
-
-        # Process every nth frame (to speed up processing)
-        if frame_count % int(frame_rate) == 0:
-            # Convert frame to grayscale as required by your model
-            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
-            frame = cv2.resize(frame, (48, 48))  # Resize to match model input size
-            frame = img_to_array(frame)
-            frame = np.expand_dims(frame, axis=0) / 255.0
-
-            # Predict emotion
-            prediction = image_model.predict(frame)
-            predictions.append(np.argmax(prediction))
-
-        frame_count += 1
-
-    cap.release()
-    cv2.destroyAllWindows()
-
-    # Find the most common prediction
-    most_common_emotion = Counter(predictions).most_common(1)[0][0]
-    mapping = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}
-    return mapping[most_common_emotion]
-
-
-
-def process_audio_from_video(video_path):
-    text_emotion = "Error in text processing"  # Initialize text_emotion
-    text = ""
-    try:
-        # Load the video using an alternative library (e.g., ffmpeg or cv2)
-        import ffmpeg
-
-        audio_output = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
-        ffmpeg.input(video_path).output(audio_output, format="wav").run(quiet=True)
-
-        recognizer = sr.Recognizer()
-
-        with sr.AudioFile(audio_output) as source:
-            audio_record = recognizer.record(source)
-            text = recognizer.recognize_google(audio_record)
-            pre_text = preprocess_text(text)
-            title_seq = tokenizer.texts_to_sequences([pre_text])
-            padded_title_seq = pad_sequences(title_seq, maxlen=35, padding='post', truncating='post')
-            inp1 = np.array(padded_title_seq)
-            text_prediction = text_model.predict(inp1)
-
-        os.remove(audio_output)
-
-        max_index = text_prediction.argmax()
-        text_emotion = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}[max_index]
-
-    except Exception as e:
-        print(f"Error processing text from audio: {e}")
-        text_emotion = "Error in text processing"
-
-    try:
-        # Extract audio features for emotion recognition
-        sample_rate, data = librosa.load(video_path, sr=None, mono=True)
-        data = data.flatten()
-
-        if data.dtype != np.float32:
-            data = data.astype(np.float32)
-        data = data / np.max(np.abs(data))
-
-        features = extract_features(data, sample_rate)
-        features = np.expand_dims(features, axis=0)
-        scaled_features = scaler.transform(features)
-        scaled_features = np.expand_dims(scaled_features, axis=2)
-
-        prediction = audio_model.predict(scaled_features)
-        emotion_index = np.argmax(prediction)
-
-        num_classes = len(encoder.categories_[0])
-        emotion_array = np.zeros((1, num_classes))
-        emotion_array[0, emotion_index] = 1
-
-        audio_emotion = encoder.inverse_transform(emotion_array)[0]
-
-    except Exception as e:
-        print(f"Error processing audio features: {e}")
-        audio_emotion = "Error in audio processing"
-
-    return text_emotion, audio_emotion, text
-
-
-
-import torch
-import gradio as gr
-from huggingface_hub import InferenceClient
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-# Hugging Face Inference Client (equivalent to the reference code's client)
-client = InferenceClient("TheBloke/Mistral-7B-Instruct-v0.1-GPTQ")
-
-# Tokenizer and model loading (still necessary if you want to process locally)
-tokenizer = AutoTokenizer.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.1-GPTQ")
-model = AutoModelForCausalLM.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.1-GPTQ")
-
-
-def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
-    messages = [{"role": "system", "content": system_message}]
-
-    # Format history with user and bot messages
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    # Stream response from the model
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-
-
-# Function to handle video processing and interaction
-def transcribe_and_predict_video(video, user_input, chat_history=[]):
-    # Process the video for emotions (use your own emotion detection functions)
-
-    if chat_history is None:
-        chat_history = []
-    image_emotion = process_video(video)
-    text_emotion, audio_emotion, text = process_audio_from_video(video)
-    em = [image_emotion, text_emotion, audio_emotion]
-
-    # Format the conversation history
-    history_text = "".join([f"User ({msg[2]}): {msg[0]}\nBot: {msg[1]}\n" for msg in chat_history])
-
-    # Construct the prompt with emotion context and history
-    prompt = f"""
-    You are a helpful AI assistant. Respond like a human while considering the user's emotion.
-
-    User's Emotion: {em}
-    video text context: {text}
-    Conversation History:
-    {history_text}
-
-    User ({em}): {user_input}
-    Bot:"""
-
-    # Tokenize input
-    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
-
-    # Generate response
-    output = model.generate(**inputs, max_length=512, temperature=0.7, top_p=0.9, do_sample=True)
-    response = tokenizer.decode(output[0], skip_special_tokens=True).split("Bot:")[-1].strip()
-
-    # Store the current emotion for the user input (modify emotion detection as needed)
-    emotion = detect_emotion(user_input)  # Assuming `detect_emotion` is a function that returns the user's emotion
-
-    # Update the chat history with the current conversation and emotion
-    chat_history.append((user_input, response, emotion))
-
-    return response, chat_history
-
-
-# Gradio interface setup
-iface = gr.Interface(
-    fn=transcribe_and_predict_video,
-    inputs=[gr.Video(), gr.Textbox(), gr.State()],  # Accepting video input, user text, and chat history
-    outputs=[gr.Textbox(), gr.State()],  # Output is the response and updated chat history
-    title="Multimodal Emotion Recognition from Video",
-    description="Upload a video to get text, audio, and image emotion predictions and interact with the chatbot."
-)

-# Launch the Gradio interface
-if __name__ == "__main__":
-    iface.launch()
+from transformers import pipeline
+
+# Load the DeepSeek model
+pipe = pipeline("text-generation", model="deepseek-ai/DeepSeek-R1", trust_remote_code=True)
+
+# Function to interact with the chatbot
+def chat_with_bot(message, chat_history):
+    messages = [{"role": "user", "content": message}]
+    response = pipe(messages, max_length=512)
+    return response[0]["generated_text"]
+
+# Create Gradio UI
+interface = gr.ChatInterface(fn=chat_with_bot, title="DeepSeek AI Chatbot")
+
+# Launch the chatbot
+interface.launch()
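
Note (not part of the commit): with recent transformers releases, calling a text-generation pipeline with a list of role/content messages typically applies the model's chat template and returns the whole updated conversation under "generated_text", so chat_with_bot above may hand gr.ChatInterface a list rather than the reply string, and max_length also counts prompt tokens. Below is a minimal sketch, under those assumptions, of unpacking only the assistant turn and bounding generation with max_new_tokens; it keeps the same model id as the commit, and the comment about swapping in a smaller distilled checkpoint is an assumption, not something the commit does.

import gradio as gr
from transformers import pipeline

# Same model id as the commit; a smaller distilled checkpoint could be swapped in
# (assumption) if the full DeepSeek-R1 weights do not fit the target hardware.
pipe = pipeline("text-generation", model="deepseek-ai/DeepSeek-R1", trust_remote_code=True)

def chat_with_bot(message, chat_history):
    messages = [{"role": "user", "content": message}]
    outputs = pipe(messages, max_new_tokens=512)
    generated = outputs[0]["generated_text"]
    # Chat-style input usually yields the full message list; return only the new assistant turn.
    if isinstance(generated, list):
        return generated[-1]["content"]
    return generated  # fall back if the pipeline returned a plain string

gr.ChatInterface(fn=chat_with_bot, title="DeepSeek AI Chatbot").launch()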