|
import base64
import os
import tempfile
import time

import cv2
import gradio as gr
import speech_recognition as sr
from groq import Groq
from gtts import gTTS
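
# The imports above assume these PyPI packages are installed:
# gradio, SpeechRecognition, opencv-python, groq, gTTS.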
|
|
# The Groq API key is read from the environment (GROQ_API_KEY).
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

try:
    client = Groq(api_key=GROQ_API_KEY)
    print("Groq client initialized successfully")
except Exception as e:
    print(f"Error initializing Groq client: {str(e)}")
    raise
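
# Example of supplying the key before launching the app (shell); the value
# shown is a placeholder, not a real credential:
#   export GROQ_API_KEY="gsk_..."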
|
|
|
|
|
def predict_text_emotion(text):
    prompt = (
        f"The user has entered the text '{text}'. Classify the user's emotion as "
        "happy, sad, anxious, or angry. Respond with only one word."
    )
    try:
        completion = client.chat.completions.create(
            model="llama-3.2-90b-vision-preview",
            messages=[{"role": "user", "content": prompt}],
            temperature=1,
            max_completion_tokens=64,
            top_p=1,
            stream=False,
            stop=None,
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error with Groq API: {str(e)}"
|
|
def transcribe_audio(audio_path):
    r = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        # Read the whole recording from the file (record, rather than listen,
        # is the appropriate call for file sources).
        audio_data = r.record(source)
    try:
        text = r.recognize_google(audio_data)
        return text
    except sr.UnknownValueError:
        return "I didn’t catch that—could you try again?"
    except sr.RequestError:
        return "Speech recognition unavailable—try typing instead."
|
|
def capture_webcam_frame():
    # Try for up to two seconds to grab a single frame from the default webcam
    # and return it as a base64-encoded data URL (or None on failure).
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        return None
    start_time = time.time()
    while time.time() - start_time < 2:
        ret, frame = cap.read()
        if ret:
            _, buffer = cv2.imencode('.jpg', frame)
            img_base64 = base64.b64encode(buffer).decode('utf-8')
            img_url = f"data:image/jpeg;base64,{img_base64}"
            cap.release()
            return img_url
    cap.release()
    return None
|
|
def detect_facial_emotion():
    img_url = capture_webcam_frame()
    if not img_url:
        return "neutral"
    try:
        completion = client.chat.completions.create(
            model="llama-3.2-90b-vision-preview",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Classify the user's facial emotion as happy, sad, anxious, or angry. Respond with only one word."},
                        {"type": "image_url", "image_url": {"url": img_url}}
                    ]
                }
            ],
            temperature=1,
            max_completion_tokens=20,
            top_p=1,
            stream=False,
            stop=None,
        )
        emotion = completion.choices[0].message.content.strip().lower()
        if emotion not in ["happy", "sad", "anxious", "angry"]:
            return "neutral"
        return emotion
    except Exception as e:
        print(f"Error with Groq facial emotion detection: {str(e)}")
        return "neutral"
|
|
def generate_response(user_input, emotion):
    prompt = (
        f"The user is feeling {emotion}. They said: '{user_input}'. "
        "Respond in a friendly, caring manner so the user feels heard and supported."
    )
    try:
        completion = client.chat.completions.create(
            model="llama-3.2-90b-vision-preview",
            messages=[{"role": "user", "content": prompt}],
            temperature=1,
            max_completion_tokens=64,
            top_p=1,
            stream=False,
            stop=None,
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error with Groq API: {str(e)}"
|
|
def text_to_speech(text):
    try:
        tts = gTTS(text=text, lang='en', slow=False)
        # Write the speech to a temporary MP3 file and return its path, which
        # the Gradio audio output component can play directly.
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        tmp.close()
        tts.save(tmp.name)
        return tmp.name
    except Exception as e:
        print(f"Error generating speech: {str(e)}")
        return None
|
|
def chat_function(input_type, text_input, audio_input, chat_history):
    # Resolve the user's message from either the textbox or the recorded audio.
    if input_type == "text" and text_input:
        user_input = text_input
    elif input_type == "voice" and audio_input:
        user_input = transcribe_audio(audio_input)
    else:
        return chat_history, "Please provide text or voice input.", gr.update(value=text_input), None

    # Detect emotion from the text, and from the webcam on the first turn only.
    text_emotion = predict_text_emotion(user_input)
    if not chat_history:
        gr.Info("Please look at the camera for emotion detection...")
        facial_emotion = detect_facial_emotion()
    else:
        facial_emotion = "neutral"

    # Prefer whichever signal is not neutral; the text emotion takes priority.
    emotions = [e for e in [text_emotion, facial_emotion] if e and e != "neutral"]
    combined_emotion = emotions[0] if emotions else "neutral"

    response = generate_response(user_input, combined_emotion)
    chat_history.append({"role": "user", "content": user_input})
    chat_history.append({"role": "assistant", "content": response})

    audio_output = text_to_speech(response)
    return chat_history, f"Detected Emotion: {combined_emotion}", "", audio_output
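
# Illustrative only (not part of the app): chat_function can be exercised
# directly for a quick smoke test, assuming GROQ_API_KEY is set. The webcam
# step runs on the first turn and falls back to "neutral" if no camera is found.
#
#   history, emotion_label, _, audio_path = chat_function(
#       "text", "I had a rough day at work", None, []
#   )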
|
|
|
|
|
# Custom CSS for the chat UI; gr.Blocks expects raw CSS (no <style> wrapper).
css = """
.chatbot .message-user {
    background-color: #e3f2fd;
    border-radius: 10px;
    padding: 10px;
    margin: 5px 0;
}
.chatbot .message-assistant {
    background-color: #c8e6c9;
    border-radius: 10px;
    padding: 10px;
    margin: 5px 0;
}
.input-container {
    padding: 10px;
    background-color: #f9f9f9;
    border-radius: 10px;
    margin-top: 10px;
}
"""
|
|
with gr.Blocks(theme=gr.themes.Soft(), css=css) as app:
    gr.Markdown(
        """
        # Multimodal Mental Health AI Agent
        Chat with our empathetic AI designed to support you by understanding your emotions through text and facial expressions.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            emotion_display = gr.Textbox(label="Emotion", interactive=False, placeholder="Detected emotion will appear here")
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="Conversation History", height=500, type="messages", elem_classes="chatbot")

    with gr.Row(elem_classes="input-container"):
        input_type = gr.Radio(["text", "voice"], label="Input Method", value="text")
        text_input = gr.Textbox(label="Type Your Message", placeholder="How are you feeling today?", visible=True)
        audio_input = gr.Audio(type="filepath", label="Record Your Message", visible=False)
        submit_btn = gr.Button("Send", variant="primary")
        clear_btn = gr.Button("Clear Chat", variant="secondary")
        audio_output = gr.Audio(label="Assistant Response", type="filepath", interactive=False, autoplay=True)

    def update_visibility(input_type):
        return gr.update(visible=input_type == "text"), gr.update(visible=input_type == "voice")

    input_type.change(fn=update_visibility, inputs=input_type, outputs=[text_input, audio_input])

    submit_btn.click(
        fn=chat_function,
        inputs=[input_type, text_input, audio_input, chatbot],
        outputs=[chatbot, emotion_display, text_input, audio_output]
    )

    clear_btn.click(
        lambda: ([], "", "", None),
        inputs=None,
        outputs=[chatbot, emotion_display, text_input, audio_output]
    )
|
|
if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)
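
# Note: server_name="0.0.0.0" binds on all network interfaces. For an optional
# public demo link (requires outbound network access), Gradio also supports:
#   app.launch(server_name="0.0.0.0", server_port=7860, share=True)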