welcometoFightclub committed
Commit 3f871dd · verified · 1 Parent(s): 90a9f0d

Update app.py

Files changed (1)
  1. app.py +219 -226
app.py CHANGED
@@ -1,227 +1,220 @@
- import gradio as gr
- import torch
- import cv2
- import speech_recognition as sr
- from groq import Groq
- import os
- import time
- import base64
- from io import BytesIO
- from gtts import gTTS
-
- # Set device
- device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
- print(f"Using device: {device}")
-
- # Clear GPU memory if using GPU
- if torch.cuda.is_available():
-     torch.cuda.empty_cache()
-
- # Groq API client with API key (stored as environment variable for security)
- GROQ_API_KEY = os.getenv("GROQ_API_KEY", "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")  # Replace with your key or use env variable
- try:
-     client = Groq(api_key=GROQ_API_KEY)
-     print("Groq client initialized successfully")
- except Exception as e:
-     print(f"Error initializing Groq client: {str(e)}")
-     raise
-
- # Functions
- def predict_text_emotion(text):
-     prompt = f"The user has entered text '{text}' classify user's emotion as happy or sad or anxious or angry. Respond in only one word."
-     try:
-         completion = client.chat.completions.create(
-             model="llama-3.2-90b-vision-preview",
-             messages=[{"role": "user", "content": prompt}],
-             temperature=1,
-             max_completion_tokens=64,
-             top_p=1,
-             stream=False,
-             stop=None,
-         )
-         return completion.choices[0].message.content
-     except Exception as e:
-         return f"Error with Groq API: {str(e)}"
-
- def transcribe_audio(audio_path):
-     r = sr.Recognizer()
-     with sr.AudioFile(audio_path) as source:
-         audio_text = r.listen(source)
-     try:
-         text = r.recognize_google(audio_text)
-         return text
-     except sr.UnknownValueError:
-         return "I didn’t catch that—could you try again?"
-     except sr.RequestError:
-         return "Speech recognition unavailable—try typing instead."
-
- def capture_webcam_frame():
-     cap = cv2.VideoCapture(0)
-     if not cap.isOpened():
-         return None
-     start_time = time.time()
-     while time.time() - start_time < 2:
-         ret, frame = cap.read()
-         if ret:
-             _, buffer = cv2.imencode('.jpg', frame)
-             img_base64 = base64.b64encode(buffer).decode('utf-8')
-             img_url = f"data:image/jpeg;base64,{img_base64}"
-             cap.release()
-             return img_url
-     cap.release()
-     return None
-
- def detect_facial_emotion():
-     img_url = capture_webcam_frame()
-     if not img_url:
-         return "neutral"
-     try:
-         completion = client.chat.completions.create(
-             model="llama-3.2-90b-vision-preview",
-             messages=[
-                 {
-                     "role": "user",
-                     "content": [
-                         {"type": "text", "text": "Identify user's facial emotion into happy or sad or anxious or angry. Respond in one word only"},
-                         {"type": "image_url", "image_url": {"url": img_url}}
-                     ]
-                 }
-             ],
-             temperature=1,
-             max_completion_tokens=20,
-             top_p=1,
-             stream=False,
-             stop=None,
-         )
-         emotion = completion.choices[0].message.content.strip().lower()
-         if emotion not in ["happy", "sad", "anxious", "angry"]:
-             return "neutral"
-         return emotion
-     except Exception as e:
-         print(f"Error with Groq facial detection: {str(e)}")
-         return "neutral"
-
- def generate_response(user_input, emotion):
-     prompt = f"The user is feeling {emotion}. They said: '{user_input}'. Respond in a friendly caring manner with the user so the user feels being loved."
-     try:
-         completion = client.chat.completions.create(
-             model="llama-3.2-90b-vision-preview",
-             messages=[{"role": "user", "content": prompt}],
-             temperature=1,
-             max_completion_tokens=64,
-             top_p=1,
-             stream=False,
-             stop=None,
-         )
-         return completion.choices[0].message.content
-     except Exception as e:
-         return f"Error with Groq API: {str(e)}"
-
- def text_to_speech(text):
-     try:
-         tts = gTTS(text=text, lang='en', slow=False)
-         audio_buffer = BytesIO()
-         tts.write_to_fp(audio_buffer)
-         audio_buffer.seek(0)
-         return audio_buffer
-     except Exception as e:
-         print(f"Error generating speech: {str(e)}")
-         return None
-
- # Chat function for Gradio with voice output
- def chat_function(input_type, text_input, audio_input, chat_history):
-     if input_type == "text" and text_input:
-         user_input = text_input
-     elif input_type == "voice" and audio_input:
-         user_input = transcribe_audio(audio_input)
-     else:
-         return chat_history, "Please provide text or voice input.", gr.update(value=text_input), None
-
-     text_emotion = predict_text_emotion(user_input)
-     if not chat_history:
-         gr.Info("Please look at the camera for emotion detection...")
-         facial_emotion = detect_facial_emotion()
-     else:
-         facial_emotion = "neutral"
-
-     emotions = [e for e in [text_emotion, facial_emotion] if e and e != "neutral"]
-     combined_emotion = emotions[0] if emotions else "neutral"
-
-     response = generate_response(user_input, combined_emotion)
-     chat_history.append({"role": "user", "content": user_input})
-     chat_history.append({"role": "assistant", "content": response})
-
-     audio_output = text_to_speech(response)
-     return chat_history, f"Detected Emotion: {combined_emotion}", "", audio_output
-
- # Custom CSS for better styling
- css = """
- <style>
- .chatbot .message-user {
-     background-color: #e3f2fd;
-     border-radius: 10px;
-     padding: 10px;
-     margin: 5px 0;
- }
- .chatbot .message-assistant {
-     background-color: #c8e6c9;
-     border-radius: 10px;
-     padding: 10px;
-     margin: 5px 0;
- }
- .input-container {
-     padding: 10px;
-     background-color: #f9f9f9;
-     border-radius: 10px;
-     margin-top: 10px;
- }
- </style>
- """
-
- # Build the Gradio interface
- with gr.Blocks(theme=gr.themes.Soft(), css=css) as app:
-     gr.Markdown(
-         """
-         # Multimodal Mental Health AI Agent
-         Chat with our empathetic AI designed to support you by understanding your emotions through text and facial expressions.
-         """
-     )
-
-     with gr.Row():
-         with gr.Column(scale=1):
-             emotion_display = gr.Textbox(label="Emotion", interactive=False, placeholder="Detected emotion will appear here")
-
-         with gr.Column(scale=3):
-             chatbot = gr.Chatbot(label="Conversation History", height=500, type="messages", elem_classes="chatbot")
-
-     with gr.Row(elem_classes="input-container"):
-         input_type = gr.Radio(["text", "voice"], label="Input Method", value="text")
-         text_input = gr.Textbox(label="Type Your Message", placeholder="How are you feeling today?", visible=True)
-         audio_input = gr.Audio(type="filepath", label="Record Your Message", visible=False)
-         submit_btn = gr.Button("Send", variant="primary")
-         clear_btn = gr.Button("Clear Chat", variant="secondary")
-         audio_output = gr.Audio(label="Assistant Response", type="filepath", interactive=False, autoplay=True)
-
-     # Dynamic visibility based on input type
-     def update_visibility(input_type):
-         return gr.update(visible=input_type == "text"), gr.update(visible=input_type == "voice")
-
-     input_type.change(fn=update_visibility, inputs=input_type, outputs=[text_input, audio_input])
-
-     # Submit action with voice output
-     submit_btn.click(
-         fn=chat_function,
-         inputs=[input_type, text_input, audio_input, chatbot],
-         outputs=[chatbot, emotion_display, text_input, audio_output]
-     )
-
-     # Clear chat and audio
-     clear_btn.click(
-         lambda: ([], "", "", None),
-         inputs=None,
-         outputs=[chatbot, emotion_display, text_input, audio_output]
-     )
-
- # Launch the app (for local testing; deployment will handle this differently)
- if __name__ == "__main__":
 
+ import gradio as gr
+ import cv2  # required by capture_webcam_frame below
+ import speech_recognition as sr
+ from groq import Groq
+ import os
+ import time
+ import base64
+ from io import BytesIO
+ from gtts import gTTS
+
+ # Groq API client with API key (stored as environment variable for security)
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY", "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")  # Replace with your key or use env variable
+ try:
+     client = Groq(api_key=GROQ_API_KEY)
+     print("Groq client initialized successfully")
+ except Exception as e:
+     print(f"Error initializing Groq client: {str(e)}")
+     raise
+
+ # Functions
+ def predict_text_emotion(text):
+     prompt = f"The user has entered text '{text}' classify user's emotion as happy or sad or anxious or angry. Respond in only one word."
+     try:
+         completion = client.chat.completions.create(
+             model="llama-3.2-90b-vision-preview",
+             messages=[{"role": "user", "content": prompt}],
+             temperature=1,
+             max_completion_tokens=64,
+             top_p=1,
+             stream=False,
+             stop=None,
+         )
+         return completion.choices[0].message.content
+     except Exception as e:
+         return f"Error with Groq API: {str(e)}"
+
+ def transcribe_audio(audio_path):
+     r = sr.Recognizer()
+     with sr.AudioFile(audio_path) as source:
+         audio_text = r.listen(source)
+     try:
+         text = r.recognize_google(audio_text)
+         return text
+     except sr.UnknownValueError:
+         return "I didn’t catch that—could you try again?"
+     except sr.RequestError:
+         return "Speech recognition unavailable—try typing instead."
+
+ def capture_webcam_frame():
+     cap = cv2.VideoCapture(0)
+     if not cap.isOpened():
+         return None
+     start_time = time.time()
+     while time.time() - start_time < 2:
+         ret, frame = cap.read()
+         if ret:
+             _, buffer = cv2.imencode('.jpg', frame)
+             img_base64 = base64.b64encode(buffer).decode('utf-8')
+             img_url = f"data:image/jpeg;base64,{img_base64}"
+             cap.release()
+             return img_url
+     cap.release()
+     return None
+
+ def detect_facial_emotion():
+     img_url = capture_webcam_frame()
+     if not img_url:
+         return "neutral"
+     try:
+         completion = client.chat.completions.create(
+             model="llama-3.2-90b-vision-preview",
+             messages=[
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": "Identify user's facial emotion into happy or sad or anxious or angry. Respond in one word only"},
+                         {"type": "image_url", "image_url": {"url": img_url}}
+                     ]
+                 }
+             ],
+             temperature=1,
+             max_completion_tokens=20,
+             top_p=1,
+             stream=False,
+             stop=None,
+         )
+         emotion = completion.choices[0].message.content.strip().lower()
+         if emotion not in ["happy", "sad", "anxious", "angry"]:
+             return "neutral"
+         return emotion
+     except Exception as e:
+         print(f"Error with Groq facial detection: {str(e)}")
+         return "neutral"
+
+ def generate_response(user_input, emotion):
+     prompt = f"The user is feeling {emotion}. They said: '{user_input}'. Respond in a friendly caring manner with the user so the user feels being loved."
+     try:
+         completion = client.chat.completions.create(
+             model="llama-3.2-90b-vision-preview",
+             messages=[{"role": "user", "content": prompt}],
+             temperature=1,
+             max_completion_tokens=64,
+             top_p=1,
+             stream=False,
+             stop=None,
+         )
+         return completion.choices[0].message.content
+     except Exception as e:
+         return f"Error with Groq API: {str(e)}"
+
+ def text_to_speech(text):
+     try:
+         tts = gTTS(text=text, lang='en', slow=False)
+         audio_buffer = BytesIO()
+         tts.write_to_fp(audio_buffer)
+         audio_buffer.seek(0)
+         return audio_buffer
+     except Exception as e:
+         print(f"Error generating speech: {str(e)}")
+         return None
+
+ # Chat function for Gradio with voice output
+ def chat_function(input_type, text_input, audio_input, chat_history):
+     if input_type == "text" and text_input:
+         user_input = text_input
+     elif input_type == "voice" and audio_input:
+         user_input = transcribe_audio(audio_input)
+     else:
+         return chat_history, "Please provide text or voice input.", gr.update(value=text_input), None
+
+     text_emotion = predict_text_emotion(user_input)
+     if not chat_history:
+         gr.Info("Please look at the camera for emotion detection...")
+         facial_emotion = detect_facial_emotion()
+     else:
+         facial_emotion = "neutral"
+
+     emotions = [e for e in [text_emotion, facial_emotion] if e and e != "neutral"]
+     combined_emotion = emotions[0] if emotions else "neutral"
+
+     response = generate_response(user_input, combined_emotion)
+     chat_history.append({"role": "user", "content": user_input})
+     chat_history.append({"role": "assistant", "content": response})
+
+     audio_output = text_to_speech(response)
+     return chat_history, f"Detected Emotion: {combined_emotion}", "", audio_output
+
+ # Custom CSS for better styling
+ css = """
+ <style>
+ .chatbot .message-user {
+     background-color: #e3f2fd;
+     border-radius: 10px;
+     padding: 10px;
+     margin: 5px 0;
+ }
+ .chatbot .message-assistant {
+     background-color: #c8e6c9;
+     border-radius: 10px;
+     padding: 10px;
+     margin: 5px 0;
+ }
+ .input-container {
+     padding: 10px;
+     background-color: #f9f9f9;
+     border-radius: 10px;
+     margin-top: 10px;
+ }
+ </style>
+ """
+
+ # Build the Gradio interface
+ with gr.Blocks(theme=gr.themes.Soft(), css=css) as app:
+     gr.Markdown(
+         """
+         # Multimodal Mental Health AI Agent
+         Chat with our empathetic AI designed to support you by understanding your emotions through text and facial expressions.
+         """
+     )
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             emotion_display = gr.Textbox(label="Emotion", interactive=False, placeholder="Detected emotion will appear here")
+
+         with gr.Column(scale=3):
+             chatbot = gr.Chatbot(label="Conversation History", height=500, type="messages", elem_classes="chatbot")
+
+     with gr.Row(elem_classes="input-container"):
+         input_type = gr.Radio(["text", "voice"], label="Input Method", value="text")
+         text_input = gr.Textbox(label="Type Your Message", placeholder="How are you feeling today?", visible=True)
+         audio_input = gr.Audio(type="filepath", label="Record Your Message", visible=False)
+         submit_btn = gr.Button("Send", variant="primary")
+         clear_btn = gr.Button("Clear Chat", variant="secondary")
+         audio_output = gr.Audio(label="Assistant Response", type="filepath", interactive=False, autoplay=True)
+
+     # Dynamic visibility based on input type
+     def update_visibility(input_type):
+         return gr.update(visible=input_type == "text"), gr.update(visible=input_type == "voice")
+
+     input_type.change(fn=update_visibility, inputs=input_type, outputs=[text_input, audio_input])
+
+     # Submit action with voice output
+     submit_btn.click(
+         fn=chat_function,
+         inputs=[input_type, text_input, audio_input, chatbot],
+         outputs=[chatbot, emotion_display, text_input, audio_output]
+     )
+
+     # Clear chat and audio
+     clear_btn.click(
+         lambda: ([], "", "", None),
+         inputs=None,
+         outputs=[chatbot, emotion_display, text_input, audio_output]
+     )
+
+ # Launch the app (for local testing; deployment will handle this differently)
+ if __name__ == "__main__":
      app.launch(server_name="0.0.0.0", server_port=7860)
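
A note on the key handling in both revisions: the os.getenv call ships a hardcoded fallback key, so the "stored as environment variable for security" comment only holds if that fallback is removed. A minimal sketch of env-only loading, reusing the diff's GROQ_API_KEY and client names (the fail-fast error message is illustrative, not from the commit):

import os

from groq import Groq

# Read the key strictly from the environment; never commit a fallback secret.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    # Fail fast with a clear message instead of sending requests with a dead key.
    raise RuntimeError("GROQ_API_KEY is not set; export it or configure it as a Space secret.")

client = Groq(api_key=GROQ_API_KEY)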
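
Separately, cv2.VideoCapture(0) in capture_webcam_frame opens a camera on the machine running the app, so on a hosted deployment such as a Hugging Face Space it sees no device at all; the visitor's camera is only reachable from the browser. A hedged sketch of a browser-side alternative follows, assuming Gradio 4's sources parameter on gr.Image and using Pillow instead of OpenCV for the JPEG encoding (frame_to_data_url and demo are illustrative names, not from the commit):

import base64
from io import BytesIO

import gradio as gr
from PIL import Image


def frame_to_data_url(frame):
    """Encode a webcam frame (numpy RGB array from gr.Image) as a base64 data URL."""
    if frame is None:
        return None
    buffer = BytesIO()
    Image.fromarray(frame).save(buffer, format="JPEG")
    img_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
    return f"data:image/jpeg;base64,{img_base64}"


with gr.Blocks() as demo:
    # The browser owns the camera; Gradio streams the captured frame to the server.
    webcam = gr.Image(sources=["webcam"], type="numpy", label="Camera")
    url_box = gr.Textbox(label="Frame as data URL")
    webcam.change(fn=frame_to_data_url, inputs=webcam, outputs=url_box)

if __name__ == "__main__":
    demo.launch()

The resulting data URL has the same shape as the one capture_webcam_frame builds, so it could feed detect_facial_emotion unchanged.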
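
One further mismatch worth flagging: text_to_speech returns a BytesIO, while audio_output is declared as gr.Audio(type="filepath"), which expects a path string. A sketch that bridges this by saving the MP3 to a temporary file (the tempfile handling is an assumption on my part; gTTS.save is the library's documented file-writing call):

import tempfile

from gtts import gTTS


def text_to_speech(text):
    """Synthesize speech and return a filepath, matching gr.Audio(type="filepath")."""
    try:
        tts = gTTS(text=text, lang="en", slow=False)
        # delete=False so the file survives for Gradio to read after we return.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
            temp_path = f.name
        tts.save(temp_path)
        return temp_path
    except Exception as e:
        print(f"Error generating speech: {str(e)}")
        return None

With a path returned, chat_function's final return value already lines up with the audio_output component as wired in the diff.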