ATHAI / app.py
Ath
Update app.py
1732969 verified
raw
history blame
4.75 kB
import os
import requests
import google.generativeai as genai
import gradio as gr
from tempfile import NamedTemporaryFile
from google.cloud import speech
# Configure the Google Generative AI client from the GOOGLE_API_KEY
# environment variable.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Set up Google Cloud credentials for Speech-to-Text.
# setdefault() keeps a credentials path that is already exported in the
# environment; the placeholder below is only a fallback and must be replaced
# with a real service-account file for transcription to work.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "path_to_your_google_cloud_credentials.json",
)

# Generation settings handed to the model below.
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

# Create the model with the persona given in the system instruction.
model = genai.GenerativeModel(
    model_name="gemini-1.5-pro",
    generation_config=generation_config,
    system_instruction="You are a human that lives on earth, and you also speak friendly and use informal language. Your name is Ath, and you speak like a 100% human.",
)

# NOTE: this is one module-level chat session; both chat handlers in this
# file send their messages through it, so they share a single conversation.
chat_session = model.start_chat(history=[])

# Eleven Labs API configuration. The key is read from the environment and is
# None when ELEVEN_LABS_API_KEY is unset.
eleven_labs_api_key = os.getenv("ELEVEN_LABS_API_KEY")
# NOTE(review): the last path segment looks like the Eleven Labs voice id --
# confirm against the Eleven Labs text-to-speech API reference.
eleven_labs_url = "https://api.elevenlabs.io/v1/text-to-speech/aEO01A4wXwd1O8GPgGlF"
def chat_and_tts_text(user_input, history):
    """Send a typed message to the chat session and record the exchange.

    Args:
        user_input: Text typed by the user.
        history: List of ``(user_message, reply)`` tuples; it is appended to
            in place.

    Returns:
        A ``(history, history)`` pair: the same list is returned twice, once
        for the chatbot display and once for the stored state. Both elements
        are always the history list, including when the request fails.
    """
    # Nothing to send: leave the conversation exactly as it is.
    if not user_input or not user_input.strip():
        return history, history

    try:
        # Send the user's text input to the chat session
        response = chat_session.send_message(user_input)
        reply = response.text
    except Exception as e:
        # Report the failure as a chat entry. Returning the message in place
        # of the history list would replace the stored state with a string
        # and break the next call's history.append().
        reply = f"Error: {str(e)}"

    # Update the chat history with text input and response
    history.append((user_input, reply))
    return history, history
def _load_audio_bytes(audio_file):
    """Return the raw audio bytes for bytes, a filesystem path, or an object with ``.name``."""
    if isinstance(audio_file, (bytes, bytearray)):
        return bytes(audio_file)
    if isinstance(audio_file, (str, os.PathLike)):
        path = audio_file
    else:
        # Tempfile-style upload wrappers expose their on-disk path as .name.
        path = audio_file.name
    with open(path, "rb") as audio_fh:
        return audio_fh.read()


def convert_audio_to_text(audio_file):
    """Transcribe an uploaded audio clip with Google Cloud Speech-to-Text.

    Args:
        audio_file: The upload, given as raw ``bytes``/``bytearray``, a
            filesystem path, or an object whose ``.name`` is a path.
            (The upload widget in this file is configured with
            ``type="binary"``, which is expected to hand over bytes --
            verify against the installed Gradio version.)

    Returns:
        The transcript, built by joining the top alternative of every
        recognized segment with single spaces. This function does not raise:
        on any failure it returns a string starting with
        ``"Error in audio to text conversion: "``.
    """
    error_prefix = "Error in audio to text conversion: "

    if audio_file is None:
        return error_prefix + "no audio provided"

    try:
        content = _load_audio_bytes(audio_file)
        if not content:
            return error_prefix + "audio is empty"

        client = speech.SpeechClient()
        recognition_audio = speech.RecognitionAudio(content=content)
        # NOTE(review): the request is pinned to 16 kHz LINEAR16 en-US audio;
        # uploads in other formats will presumably fail or transcribe poorly.
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code="en-US",
        )
        response = client.recognize(config=config, audio=recognition_audio)

        # A clip can come back as several sequential results; keep the top
        # alternative of each instead of only the first segment.
        segments = [
            result.alternatives[0].transcript
            for result in response.results
            if result.alternatives
        ]
        if not segments:
            return error_prefix + "no speech recognized"
        return " ".join(segment.strip() for segment in segments)
    except Exception as e:
        return f"Error in audio to text conversion: {str(e)}"
def _synthesize_reply_audio(text):
    """Request speech for ``text`` from Eleven Labs; return an .mp3 path, or None if unavailable."""
    # Eleven Labs text-to-speech request payload
    payload = {
        "text": text,
        "voice_settings": {
            "stability": 0,
            "similarity_boost": 0
        }
    }
    headers = {
        "xi-api-key": eleven_labs_api_key,
        "Content-Type": "application/json"
    }
    try:
        # A timeout keeps a stalled TTS request from hanging the handler.
        tts_response = requests.post(
            eleven_labs_url, json=payload, headers=headers, timeout=60
        )
        if tts_response.status_code != 200:
            return None
        # delete=False: the file has to outlive this function so the caller
        # can hand its path on.
        with NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
            temp_audio.write(tts_response.content)
            return temp_audio.name
    except (requests.RequestException, OSError):
        # Audio is best-effort, matching the non-200 case: the text reply is
        # still delivered without it.
        return None


def chat_and_tts_audio(audio_file, history):
    """Transcribe an audio upload, chat with it, and synthesize the reply.

    Args:
        audio_file: The uploaded audio, passed through unchanged to
            ``convert_audio_to_text``.
        history: List of ``(user_message, reply)`` tuples; it is appended to
            in place.

    Returns:
        ``(history, history, audio_path)``. The first two elements are always
        the history list (display and stored state). ``audio_path`` is the
        path of a temporary ``.mp3`` holding the spoken reply, or ``None``
        when no audio could be produced.
    """
    # No upload: leave the conversation exactly as it is.
    if audio_file is None:
        return history, history, None

    user_input = None
    try:
        # Convert uploaded audio file to text
        transcript = convert_audio_to_text(audio_file)
        # convert_audio_to_text reports failures as a prefixed string rather
        # than raising; show that to the user instead of sending it to the
        # model as if it were something the user said.
        if transcript.startswith("Error in audio to text conversion:"):
            history.append((None, transcript))
            return history, history, None
        user_input = transcript

        # Send the user's audio input to the chat session
        response = chat_session.send_message(user_input)
        response_text = response.text
    except Exception as e:
        # Keep the state a list: returning the message in its place would
        # break the next call's history.append().
        history.append((user_input, f"Error: {str(e)}"))
        return history, history, None

    audio_path = _synthesize_reply_audio(response_text)

    # Update the chat history with audio input and response
    history.append((user_input, response_text))
    return history, history, audio_path
# Build the Gradio interface: a chat column, an audio-upload column and a
# column for the spoken reply, all in one row.
with gr.Blocks() as demo:
    gr.Markdown(value="<h1 style='text-align: center;'>Chat with Ath</h1>")
    gr.Markdown(
        value="Ask any question by typing or upload an audio file to receive a response from Ath in text and audio format."
    )

    with gr.Row():
        # Text chat: transcript display, input box and its send button.
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Chat History")
            user_input_text = gr.Textbox(
                placeholder="Type your question...",
                label="Text Input",
            )
            submit_btn_text = gr.Button("Send")

        # Audio chat: file upload and its send button.
        with gr.Column(scale=2):
            user_input_audio = gr.File(label="Upload Audio", type="binary")
            submit_btn_audio = gr.Button("Send")

        # Player for the synthesized reply.
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Response Audio", type="filepath")

    # Conversation history carried between handler calls.
    state = gr.State(value=[])

    # Typed message -> updated chat display and state.
    submit_btn_text.click(
        fn=chat_and_tts_text,
        inputs=[user_input_text, state],
        outputs=[chatbot, state],
    )
    # Uploaded audio -> updated chat display, state and reply audio.
    submit_btn_audio.click(
        fn=chat_and_tts_audio,
        inputs=[user_input_audio, state],
        outputs=[chatbot, state, audio_output],
    )

demo.launch()