# Chat with Ath

import os
import requests
import google.generativeai as genai
import gradio as gr
from tempfile import NamedTemporaryFile
from google.cloud import speech

# Configure your Google Generative AI API key.
# The key is read from the GOOGLE_API_KEY environment variable; os.getenv
# yields None when the variable is unset.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Set up Google Cloud credentials for Speech-to-Text.
# setdefault keeps a GOOGLE_APPLICATION_CREDENTIALS value that is already
# exported in the environment; the placeholder below is only a fallback and
# must be replaced with the path to a real credentials JSON file.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "path_to_your_google_cloud_credentials.json",
)

# Create the model
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
    model_name="gemini-1.5-pro",
    generation_config=generation_config,
    system_instruction="You are a human that lives on earth, and you also speak friendly and use informal language. Your name is Ath, and you speak like a 100% human.",
)

# One module-level chat session: both the text handler and the audio handler
# in this file send their messages through this single object.
chat_session = model.start_chat(history=[])

# Eleven Labs API configuration.
# The key is read from the ELEVEN_LABS_API_KEY environment variable (None when
# unset). NOTE(review): the last URL path segment is presumably the voice ID
# to synthesize with -- confirm against the Eleven Labs account in use.
eleven_labs_api_key = os.getenv("ELEVEN_LABS_API_KEY")
eleven_labs_url = "https://api.elevenlabs.io/v1/text-to-speech/aEO01A4wXwd1O8GPgGlF"

def chat_and_tts_text(user_input, history):
    """Send a typed message to the chat session and record the exchange.

    Args:
        user_input: The text typed by the user.
        history: List of ``(user_message, reply)`` tuples shown in the
            chatbot; it is appended to in place.

    Returns:
        A ``(chatbot_value, state_value)`` pair. Both elements are the same
        updated history list, so the stored state always remains a list,
        including when the request fails.
    """
    if history is None:
        history = []

    # Nothing to send: leave the conversation untouched.
    if not user_input or not user_input.strip():
        return history, history

    try:
        # Send the user's text input to the chat session
        response = chat_session.send_message(user_input)
        history.append((user_input, response.text))
    except Exception as e:
        # Show the failure as the reply instead of returning it in place of
        # the history: the second output is fed back in as `history` on the
        # next call and must stay a list.
        history.append((user_input, f"Error: {str(e)}"))

    return history, history

def _read_audio_bytes(audio_file):
    """Return the raw bytes behind *audio_file*.

    Accepts raw ``bytes``/``bytearray``, a filesystem path (``str`` or
    ``os.PathLike``), or any object exposing its path as ``.name``.
    """
    if isinstance(audio_file, (bytes, bytearray)):
        return bytes(audio_file)
    if isinstance(audio_file, (str, os.PathLike)):
        path = audio_file
    else:
        path = audio_file.name
    with open(path, "rb") as audio:
        return audio.read()


def convert_audio_to_text(audio_file):
    """Transcribe an uploaded audio file with Google Cloud Speech-to-Text.

    Args:
        audio_file: Raw audio bytes, a path to the audio file, or a file-like
            object whose ``.name`` is the path.

    Returns:
        The transcript as a string. Failures are not raised: they are
        reported as a string starting with
        ``"Error in audio to text conversion: "``.
    """
    error_prefix = "Error in audio to text conversion: "

    if audio_file is None:
        return error_prefix + "No audio file provided"

    try:
        # Read the upload before creating the API client so that bad input
        # is reported without touching the network.
        content = _read_audio_bytes(audio_file)
        if not content:
            return error_prefix + "Audio file is empty"

        client = speech.SpeechClient()
        audio = speech.RecognitionAudio(content=content)
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code="en-US",
        )

        response = client.recognize(config=config, audio=audio)

        # Join the top alternative of every result instead of indexing
        # results[0]: there may be several results, or none at all.
        transcript = " ".join(
            result.alternatives[0].transcript.strip()
            for result in response.results
            if result.alternatives
        ).strip()
        if not transcript:
            return error_prefix + "No speech detected in the audio"
        return transcript

    except Exception as e:
        return f"{error_prefix}{str(e)}"

def _synthesize_speech(text, timeout=60):
    """Ask Eleven Labs for an MP3 rendering of *text*.

    Returns the path of a temporary ``.mp3`` file holding the audio, or
    ``None`` when the API answers with a non-200 status. Network errors
    raised by ``requests`` propagate to the caller.
    """
    # Eleven Labs text-to-speech request payload
    payload = {
        "text": text,
        "voice_settings": {
            "stability": 0,
            "similarity_boost": 0
        }
    }
    headers = {
        "xi-api-key": eleven_labs_api_key,
        "Content-Type": "application/json"
    }

    # A timeout keeps a stalled connection from blocking the handler forever.
    tts_response = requests.post(
        eleven_labs_url, json=payload, headers=headers, timeout=timeout
    )
    if tts_response.status_code != 200:
        return None

    # delete=False: the file has to outlive this function because its path is
    # handed back to the caller for playback.
    with NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
        temp_audio.write(tts_response.content)
        return temp_audio.name


def chat_and_tts_audio(audio_file, history):
    """Transcribe an uploaded audio question, answer it, and voice the answer.

    Args:
        audio_file: The uploaded audio, passed on to ``convert_audio_to_text``.
        history: List of ``(user_message, reply)`` tuples shown in the
            chatbot; it is appended to in place.

    Returns:
        A ``(chatbot_value, state_value, audio_path)`` triple. The first two
        elements are the same updated history list, so the stored state
        always remains a list. ``audio_path`` is the path of the generated
        MP3, or ``None`` when no audio could be produced.
    """
    # convert_audio_to_text reports failures by returning a string with this
    # prefix rather than raising.
    transcription_error_prefix = "Error in audio to text conversion:"

    if history is None:
        history = []

    # Nothing uploaded: leave the conversation untouched.
    if audio_file is None:
        return history, history, None

    try:
        # Convert uploaded audio file to text
        user_input = convert_audio_to_text(audio_file)
        if user_input.startswith(transcription_error_prefix):
            # Show the transcription failure instead of sending the error
            # text to the model as if the user had said it.
            history.append((None, user_input))
            return history, history, None

        # Send the transcribed audio input to the chat session
        response = chat_session.send_message(user_input)
        response_text = response.text
    except Exception as e:
        # Keep the state a list; surface the failure as a chat message.
        history.append((None, f"Error: {str(e)}"))
        return history, history, None

    # Record the exchange before attempting speech synthesis so that a
    # text-to-speech failure cannot discard the text reply.
    history.append((user_input, response_text))

    try:
        audio_path = _synthesize_speech(response_text)
    except Exception as e:
        history.append((None, f"Error: {str(e)}"))
        audio_path = None

    return history, history, audio_path

# Create the Gradio UI: a chat column with typed input, an audio-upload
# column, and a player for the spoken reply.
with gr.Blocks() as demo:
    gr.Markdown("<h1 style='text-align: center;'>Chat with Ath</h1>")
    gr.Markdown("Ask any question by typing or upload an audio file to receive a response from Ath in text and audio format.")

    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Chat History")
            user_input_text = gr.Textbox(placeholder="Type your question...", label="Text Input")
            submit_btn_text = gr.Button("Send")

        with gr.Column(scale=2):
            # "filepath" hands the handler a path to the upload. The previous
            # "binary" setting delivered raw bytes, which have no `.name` for
            # the transcription step to open.
            # NOTE(review): "filepath" is the Gradio 4+ spelling -- confirm
            # the installed Gradio version accepts it.
            user_input_audio = gr.File(label="Upload Audio", type="filepath")
            submit_btn_audio = gr.Button("Send")

        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Response Audio", type="filepath")

    # Conversation history, passed into and returned from both handlers.
    state = gr.State([])

    submit_btn_text.click(chat_and_tts_text, inputs=[user_input_text, state], outputs=[chatbot, state])
    submit_btn_audio.click(chat_and_tts_audio, inputs=[user_input_audio, state], outputs=[chatbot, state, audio_output])

demo.launch()