# ATHAI / app.py
# Ath
# Update app.py
# f58d77b verified
# raw
# history blame
# 3.85 kB
import os
import requests
import google.generativeai as genai
import gradio as gr
from tempfile import NamedTemporaryFile
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError
# Authenticate the Google Generative AI client with the key from the environment
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))

# Generation settings handed to the model below
generation_config = dict(
    temperature=1,
    top_p=0.95,
    top_k=64,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

# Gemini model configured with the "Ath" persona as its system instruction
model = genai.GenerativeModel(
    model_name="gemini-1.5-pro",
    generation_config=generation_config,
    system_instruction="You are a human that lives on earth, and you also speak friendly and use informal language. Your name is Ath, and you speak like a 100% human.",
)

# Single module-level chat session, started with an empty history
chat_session = model.start_chat(history=[])

# Eleven Labs text-to-speech settings: endpoint URL and API key from the environment
eleven_labs_url = "https://api.elevenlabs.io/v1/text-to-speech/aEO01A4wXwd1O8GPgGlF"
eleven_labs_api_key = os.environ.get("ELEVEN_LABS_API_KEY")
def chat_and_tts_text(user_input, history):
    """Send a typed message to the shared chat session and record the exchange.

    Args:
        user_input: The text the user typed.
        history: List of ``(user_message, reply)`` pairs; it is extended in
            place with the new exchange.

    Returns:
        A ``(history, reply_text)`` tuple: the same list that was passed in
        (now one pair longer) and the text of the model's reply.
    """
    # Ask the model and keep only the text of its answer
    reply = chat_session.send_message(user_input).text

    # Record this turn alongside the earlier ones
    history.append((user_input, reply))
    return history, reply
def convert_audio_to_text(audio_file):
    """Decode an audio file and re-encode it as WAV.

    Despite its name, this function performs no speech recognition: it loads
    ``audio_file`` with pydub and returns the result of exporting it as WAV
    without an output path (pydub returns the output file handle in that
    case).

    Args:
        audio_file: Whatever ``AudioSegment.from_file`` accepts (a path or a
            file object). ``None`` or an empty string means nothing was
            supplied.

    Returns:
        The value returned by ``AudioSegment.export(format="wav")``, or
        ``None`` when no file was supplied or pydub could not decode it.
    """
    # Nothing uploaded: report "no audio" the same way as an undecodable file
    # instead of letting pydub raise on a missing path.
    if audio_file is None or audio_file == "":
        return None
    try:
        sound = AudioSegment.from_file(audio_file)
        return sound.export(format="wav")  # Export as WAV (adjust format as needed)
    except CouldntDecodeError:
        return None
def chat_and_tts_audio(audio_file):
    """Send an uploaded audio clip to the chat session and voice the reply.

    The clip is converted to WAV by ``convert_audio_to_text``, passed to the
    shared chat session as an inline ``audio/wav`` part, and the model's text
    reply is then sent to the Eleven Labs text-to-speech endpoint.

    Args:
        audio_file: The uploaded audio, as accepted by
            ``convert_audio_to_text``.

    Returns:
        A ``(response_text, audio_path)`` tuple. ``audio_path`` is the path of
        a temporary ``.mp3`` file holding the spoken reply, or ``None`` when
        the text-to-speech request did not succeed. If the upload cannot be
        decoded, returns ``("Error: Could not decode audio file.", None)``.
    """
    # Convert the uploaded audio file to WAV
    converted_audio = convert_audio_to_text(audio_file)
    if not converted_audio:
        return "Error: Could not decode audio file.", None

    # The converter hands back an open file object, which the chat API cannot
    # take directly: read the WAV bytes out of it and release the handle.
    if hasattr(converted_audio, "read"):
        try:
            wav_bytes = converted_audio.read()
        finally:
            converted_audio.close()
    else:
        wav_bytes = converted_audio
    if not wav_bytes:
        return "Error: Could not decode audio file.", None

    # Send the audio to the chat session as an inline audio part
    response = chat_session.send_message(
        [{"mime_type": "audio/wav", "data": wav_bytes}]
    )
    response_text = response.text

    # Eleven Labs text-to-speech request payload
    payload = {
        "text": response_text,
        "voice_settings": {
            "stability": 0,
            "similarity_boost": 0,
        },
    }
    headers = {
        "xi-api-key": eleven_labs_api_key,
        "Content-Type": "application/json",
    }

    # Make the request to the Eleven Labs API. The timeout keeps a stalled
    # request from hanging the handler; a transport failure is treated like a
    # non-200 answer so the text reply is still returned.
    try:
        tts_response = requests.post(
            eleven_labs_url, json=payload, headers=headers, timeout=60
        )
    except requests.RequestException:
        return response_text, None

    # On success, save the audio content to a temporary file that outlives
    # this call (delete=False) so the caller can read it by path.
    audio_path = None
    if tts_response.status_code == 200:
        with NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
            temp_audio.write(tts_response.content)
            audio_path = temp_audio.name
    return response_text, audio_path
# Create the Gradio UI
def _text_turn(user_input, history):
    """Run one typed turn and return the history for both chatbot and state."""
    # Only the first element (the history list) is used; the state must hold
    # the list, not the reply string, or the next turn cannot append to it.
    result = chat_and_tts_text(user_input, list(history or []))
    updated = result[0]
    return updated, updated


def _audio_turn(audio_file, history):
    """Run one audio turn; return history (twice) and the reply audio path."""
    reply_text, audio_path = chat_and_tts_audio(audio_file)
    # The chatbot expects a list of (user, bot) pairs; None on the user side
    # shows the reply without a user bubble.
    updated = list(history or []) + [(None, reply_text)]
    return updated, updated, audio_path


with gr.Blocks() as demo:
    gr.Markdown("<h1 style='text-align: center;'>Chat with Ath</h1>")
    gr.Markdown("Ask any question by typing or upload an audio file to receive a response from Ath in text and audio format.")
    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Chat History")
            user_input_text = gr.Textbox(placeholder="Type your question...", label="Text Input")
            submit_btn_text = gr.Button("Send")
        with gr.Column(scale=2):
            # "audio" is not a valid `type` for gr.File; gr.Audio with
            # type="filepath" passes the handler a path to the uploaded clip.
            user_input_audio = gr.Audio(label="Upload Audio", type="filepath")
            submit_btn_audio = gr.Button("Send")
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Response Audio", type="filepath")
    # Conversation history shared by the text and audio handlers
    state = gr.State([])
    submit_btn_text.click(
        _text_turn,
        inputs=[user_input_text, state],
        outputs=[chatbot, state],
    )
    submit_btn_audio.click(
        _audio_turn,
        inputs=[user_input_audio, state],
        outputs=[chatbot, state, audio_output],
    )
demo.launch()