Spaces:

Artificial-superintelligence
/

ATHAI

Running

ATHAI / app.py

Ath

Update app.py

1732969 verified about 1 year ago

4.75 kB

	import os
	import requests
	import google.generativeai as genai
	import gradio as gr
	from tempfile import NamedTemporaryFile
	from google.cloud import speech

	# Configure the Google Generative AI client with the key taken from the
	# GOOGLE_API_KEY environment variable.
	genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

	# Point the Google Cloud Speech-to-Text client at a credentials file.
	# setdefault() keeps a value already supplied by the deployment (for example
	# a configured secret) instead of clobbering it with the placeholder below;
	# the placeholder is only used when nothing else has been configured.
	os.environ.setdefault(
	    "GOOGLE_APPLICATION_CREDENTIALS",
	    "path_to_your_google_cloud_credentials.json",
	)

	# Sampling settings handed to the generative model.
	generation_config = dict(
	    temperature=1,
	    top_p=0.95,
	    top_k=64,
	    max_output_tokens=8192,
	    response_mime_type="text/plain",
	)

	# The system instruction defines the "Ath" persona used for every reply.
	model = genai.GenerativeModel(
	    model_name="gemini-1.5-pro",
	    generation_config=generation_config,
	    system_instruction=(
	        "You are a human that lives on earth, and you also speak friendly "
	        "and use informal language. Your name is Ath, and you speak like a "
	        "100% human."
	    ),
	)

	# One module-level chat session, started with an empty history; both chat
	# handlers below send their messages through it.
	chat_session = model.start_chat(history=[])

	# Eleven Labs text-to-speech settings: the API key is read from the
	# environment, and the endpoint is the v1 text-to-speech route for one
	# fixed voice ID.
	_ELEVEN_LABS_VOICE_ID = "aEO01A4wXwd1O8GPgGlF"
	eleven_labs_api_key = os.environ.get("ELEVEN_LABS_API_KEY")
	eleven_labs_url = f"https://api.elevenlabs.io/v1/text-to-speech/{_ELEVEN_LABS_VOICE_ID}"

	def chat_and_tts_text(user_input, history):
	    """Send a typed message to the shared chat session and record the reply.

	    Args:
	        user_input: Text typed by the user. Blank input is ignored.
	        history: List of ``(user_message, bot_message)`` tuples kept in the
	            Gradio state. ``None`` is treated as an empty history.

	    Returns:
	        A ``(chatbot_value, state_value)`` pair. Both elements are the same
	        updated history list, on success and on failure alike, so the state
	        output always remains a list.
	    """
	    history = [] if history is None else history

	    # Nothing to send: leave the conversation untouched.
	    if not user_input or not user_input.strip():
	        return history, history

	    try:
	        response_text = chat_session.send_message(user_input).text
	    except Exception as e:
	        # UI boundary: show the failure as the bot's reply. Returning the
	        # message in place of the history would overwrite the state with a
	        # string and break every later call to history.append().
	        response_text = f"Error: {e}"

	    history.append((user_input, response_text))
	    return history, history

	def convert_audio_to_text(audio_file, *, sample_rate_hertz=16000, language_code="en-US"):
	    """Transcribe an uploaded audio clip with Google Cloud Speech-to-Text.

	    Args:
	        audio_file: The upload, given as raw ``bytes``, a filesystem path
	            (``str`` or ``os.PathLike``), or an object whose ``name``
	            attribute holds the path.
	        sample_rate_hertz: Sample rate declared in the recognition request.
	        language_code: Language tag declared in the recognition request.

	    Returns:
	        The recognized text, with the top alternative of every result joined
	        by single spaces. On any failure, a string that starts with
	        ``"Error in audio to text conversion:"`` is returned instead of
	        raising.
	    """
	    try:
	        if audio_file is None:
	            raise ValueError("no audio file provided")

	        # Read the audio first so bad input fails before a client is created.
	        if isinstance(audio_file, (bytes, bytearray)):
	            content = bytes(audio_file)
	        else:
	            if isinstance(audio_file, (str, os.PathLike)):
	                path = audio_file
	            else:
	                path = audio_file.name
	            with open(path, "rb") as audio_fh:
	                content = audio_fh.read()

	        client = speech.SpeechClient()
	        audio = speech.RecognitionAudio(content=content)
	        # The request always declares LINEAR16 encoding.
	        config = speech.RecognitionConfig(
	            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
	            sample_rate_hertz=sample_rate_hertz,
	            language_code=language_code,
	        )

	        response = client.recognize(config=config, audio=audio)

	        # Use every result rather than only the first, and skip results that
	        # carry no alternatives so indexing cannot fail.
	        pieces = [
	            result.alternatives[0].transcript.strip()
	            for result in response.results
	            if result.alternatives
	        ]
	        if not pieces:
	            raise ValueError("no speech was recognized")
	        return " ".join(pieces)

	    except Exception as e:
	        return f"Error in audio to text conversion: {str(e)}"

	def _synthesize_speech(text, timeout=60):
	    """Request speech for ``text`` from Eleven Labs and save it as an MP3.

	    Returns the path of the temporary ``.mp3`` file, or ``None`` when the API
	    answers with a status other than 200.
	    """
	    payload = {
	        "text": text,
	        "voice_settings": {
	            "stability": 0,
	            "similarity_boost": 0,
	        },
	    }
	    headers = {
	        "xi-api-key": eleven_labs_api_key,
	        "Content-Type": "application/json",
	    }

	    # A timeout keeps a stalled connection from blocking the handler forever.
	    tts_response = requests.post(
	        eleven_labs_url, json=payload, headers=headers, timeout=timeout
	    )
	    if tts_response.status_code != 200:
	        return None

	    # delete=False: the file must still exist after this function returns so
	    # the returned path can be read by the caller.
	    with NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
	        temp_audio.write(tts_response.content)
	        return temp_audio.name


	def chat_and_tts_audio(audio_file, history):
	    """Transcribe an uploaded clip, chat about it, and voice the reply.

	    Args:
	        audio_file: The uploaded audio, passed on to
	            ``convert_audio_to_text``.
	        history: List of ``(user_message, bot_message)`` tuples kept in the
	            Gradio state. ``None`` is treated as an empty history.

	    Returns:
	        A ``(chatbot_value, state_value, audio_path)`` triple. The first two
	        are the same updated history list, on success and on failure alike.
	        ``audio_path`` is the path of the generated MP3, or ``None`` when no
	        audio could be produced.
	    """
	    history = [] if history is None else history

	    try:
	        user_input = convert_audio_to_text(audio_file)

	        # The converter reports failures as a prefixed string; show that
	        # message instead of sending it to the model as if the user said it.
	        if user_input.startswith("Error in audio to text conversion:"):
	            history.append((None, user_input))
	            return history, history, None

	        response_text = chat_session.send_message(user_input).text
	    except Exception as e:
	        # UI boundary: record the failure as a bot message. Returning the
	        # message in place of the history would overwrite the state with a
	        # string and break every later call to history.append().
	        history.append((None, f"Error: {e}"))
	        return history, history, None

	    history.append((user_input, response_text))

	    # Speech is best effort: a failed request or file write still leaves the
	    # text reply visible, matching how a non-200 response is handled.
	    try:
	        audio_path = _synthesize_speech(response_text)
	    except (requests.RequestException, OSError):
	        audio_path = None

	    return history, history, audio_path

	# Build the Gradio UI: a text chat column, an audio upload column, and a
	# player for the spoken reply, all sharing one conversation history.
	with gr.Blocks() as demo:
	    gr.Markdown("<h1 style='text-align: center;'>Chat with Ath</h1>")
	    gr.Markdown("Ask any question by typing or upload an audio file to receive a response from Ath in text and audio format.")

	    with gr.Row():
	        with gr.Column(scale=2):
	            chatbot = gr.Chatbot(label="Chat History")
	            user_input_text = gr.Textbox(placeholder="Type your question...", label="Text Input")
	            submit_btn_text = gr.Button("Send")

	        with gr.Column(scale=2):
	            # "filepath" hands the handler a path to the upload. The previous
	            # "binary" setting delivered raw bytes, which have no ``.name``
	            # for the transcription step to open.
	            user_input_audio = gr.File(label="Upload Audio", type="filepath")
	            submit_btn_audio = gr.Button("Send")

	        with gr.Column(scale=1):
	            audio_output = gr.Audio(label="Response Audio", type="filepath")

	    # Conversation history as a list of (user_message, bot_message) tuples.
	    state = gr.State([])

	    # Each handler returns the history twice: once for display in the chatbot
	    # and once to store back into the state.
	    submit_btn_text.click(chat_and_tts_text, inputs=[user_input_text, state], outputs=[chatbot, state])
	    submit_btn_audio.click(chat_and_tts_audio, inputs=[user_input_audio, state], outputs=[chatbot, state, audio_output])

	demo.launch()