|
import base64
import os
import tempfile
import time

import cv2
import gradio as gr
import speech_recognition as sr
from groq import Groq
from gtts import gTTS
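
# The imports above assume these PyPI packages are installed:
# gradio, SpeechRecognition, opencv-python, groq, gTTS.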
|
|
# The Groq API key is read from the environment (GROQ_API_KEY).
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

try:
    client = Groq(api_key=GROQ_API_KEY)
    print("Groq client initialized successfully")
except Exception as e:
    print(f"Error initializing Groq client: {str(e)}")
    raise
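
# Example of supplying the key before launching the app (shell); the value
# shown is a placeholder, not a real credential:
#   export GROQ_API_KEY="gsk_..."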
|
|
|
|
|
def predict_text_emotion(text):
    prompt = (
        f"The user has entered the text '{text}'. Classify the user's emotion as "
        "happy, sad, anxious, or angry. Respond with only one word."
    )
    try:
        completion = client.chat.completions.create(
            model="llama-3.2-90b-vision-preview",
            messages=[{"role": "user", "content": prompt}],
            temperature=1,
            max_completion_tokens=64,
            top_p=1,
            stream=False,
            stop=None,
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error with Groq API: {str(e)}"
|
|
def transcribe_audio(audio_path):
    r = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        # Read the whole recording from the file (record, rather than listen,
        # is the appropriate call for file sources).
        audio_data = r.record(source)
    try:
        text = r.recognize_google(audio_data)
        return text
    except sr.UnknownValueError:
        return "I didn’t catch that—could you try again?"
    except sr.RequestError:
        return "Speech recognition unavailable—try typing instead."
|
|
def capture_webcam_frame():
    # Try for up to two seconds to grab a single frame from the default webcam
    # and return it as a base64-encoded data URL (or None on failure).
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        return None
    start_time = time.time()
    while time.time() - start_time < 2:
        ret, frame = cap.read()
        if ret:
            _, buffer = cv2.imencode('.jpg', frame)
            img_base64 = base64.b64encode(buffer).decode('utf-8')
            img_url = f"data:image/jpeg;base64,{img_base64}"
            cap.release()
            return img_url
    cap.release()
    return None
|
|
def detect_facial_emotion():
    img_url = capture_webcam_frame()
    if not img_url:
        return "neutral"
    try:
        completion = client.chat.completions.create(
            model="llama-3.2-90b-vision-preview",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Classify the user's facial emotion as happy, sad, anxious, or angry. Respond with only one word."},
                        {"type": "image_url", "image_url": {"url": img_url}}
                    ]
                }
            ],
            temperature=1,
            max_completion_tokens=20,
            top_p=1,
            stream=False,
            stop=None,
        )
        emotion = completion.choices[0].message.content.strip().lower()
        if emotion not in ["happy", "sad", "anxious", "angry"]:
            return "neutral"
        return emotion
    except Exception as e:
        print(f"Error with Groq facial emotion detection: {str(e)}")
        return "neutral"
|
|
def generate_response(user_input, emotion):
    prompt = (
        f"The user is feeling {emotion}. They said: '{user_input}'. "
        "Respond in a friendly, caring manner so the user feels heard and supported."
    )
    try:
        completion = client.chat.completions.create(
            model="llama-3.2-90b-vision-preview",
            messages=[{"role": "user", "content": prompt}],
            temperature=1,
            max_completion_tokens=64,
            top_p=1,
            stream=False,
            stop=None,
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error with Groq API: {str(e)}"
|
|
def text_to_speech(text):
    try:
        tts = gTTS(text=text, lang='en', slow=False)
        # Write the speech to a temporary MP3 file and return its path, which
        # the Gradio audio output component can play directly.
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        tmp.close()
        tts.save(tmp.name)
        return tmp.name
    except Exception as e:
        print(f"Error generating speech: {str(e)}")
        return None
|
|
def chat_function(input_type, text_input, audio_input, chat_history):
    # Resolve the user's message from either the textbox or the recorded audio.
    if input_type == "text" and text_input:
        user_input = text_input
    elif input_type == "voice" and audio_input:
        user_input = transcribe_audio(audio_input)
    else:
        return chat_history, "Please provide text or voice input.", gr.update(value=text_input), None

    # Detect emotion from the text, and from the webcam on the first turn only.
    text_emotion = predict_text_emotion(user_input)
    if not chat_history:
        gr.Info("Please look at the camera for emotion detection...")
        facial_emotion = detect_facial_emotion()
    else:
        facial_emotion = "neutral"

    # Prefer whichever signal is not neutral; the text emotion takes priority.
    emotions = [e for e in [text_emotion, facial_emotion] if e and e != "neutral"]
    combined_emotion = emotions[0] if emotions else "neutral"

    response = generate_response(user_input, combined_emotion)
    chat_history.append({"role": "user", "content": user_input})
    chat_history.append({"role": "assistant", "content": response})

    audio_output = text_to_speech(response)
    return chat_history, f"Detected Emotion: {combined_emotion}", "", audio_output
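
# Illustrative only (not part of the app): chat_function can be exercised
# directly for a quick smoke test, assuming GROQ_API_KEY is set. The webcam
# step runs on the first turn and falls back to "neutral" if no camera is found.
#
#   history, emotion_label, _, audio_path = chat_function(
#       "text", "I had a rough day at work", None, []
#   )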
|
|
|
|
|
# Custom CSS for the chat UI; gr.Blocks expects raw CSS (no <style> wrapper).
css = """
.chatbot .message-user {
    background-color: #e3f2fd;
    border-radius: 10px;
    padding: 10px;
    margin: 5px 0;
}
.chatbot .message-assistant {
    background-color: #c8e6c9;
    border-radius: 10px;
    padding: 10px;
    margin: 5px 0;
}
.input-container {
    padding: 10px;
    background-color: #f9f9f9;
    border-radius: 10px;
    margin-top: 10px;
}
"""
|
|
with gr.Blocks(theme=gr.themes.Soft(), css=css) as app:
    gr.Markdown(
        """
        # Multimodal Mental Health AI Agent
        Chat with our empathetic AI designed to support you by understanding your emotions through text and facial expressions.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            emotion_display = gr.Textbox(label="Emotion", interactive=False, placeholder="Detected emotion will appear here")
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="Conversation History", height=500, type="messages", elem_classes="chatbot")

    with gr.Row(elem_classes="input-container"):
        input_type = gr.Radio(["text", "voice"], label="Input Method", value="text")
        text_input = gr.Textbox(label="Type Your Message", placeholder="How are you feeling today?", visible=True)
        audio_input = gr.Audio(type="filepath", label="Record Your Message", visible=False)
        submit_btn = gr.Button("Send", variant="primary")
        clear_btn = gr.Button("Clear Chat", variant="secondary")
        audio_output = gr.Audio(label="Assistant Response", type="filepath", interactive=False, autoplay=True)

    def update_visibility(input_type):
        return gr.update(visible=input_type == "text"), gr.update(visible=input_type == "voice")

    input_type.change(fn=update_visibility, inputs=input_type, outputs=[text_input, audio_input])

    submit_btn.click(
        fn=chat_function,
        inputs=[input_type, text_input, audio_input, chatbot],
        outputs=[chatbot, emotion_display, text_input, audio_output]
    )

    clear_btn.click(
        lambda: ([], "", "", None),
        inputs=None,
        outputs=[chatbot, emotion_display, text_input, audio_output]
    )
|
|
if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)
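
# Note: server_name="0.0.0.0" binds on all network interfaces. For an optional
# public demo link (requires outbound network access), Gradio also supports:
#   app.launch(server_name="0.0.0.0", server_port=7860, share=True)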