Spaces:

Artificial-superintelligence
/

ATHAI

Running

File size: 4,751 Bytes

fe12fb0
 
 
 
b0d06ab
1732969
fe12fb0
 
1129b43
fe12fb0
1732969
 
 
fe12fb0
 
d436d1d
 
 
 
 
fe12fb0
 
 
d436d1d
 
b0d06ab
fe12fb0
 
 
 
 
1129b43
5fbb19b
fe12fb0
b0d06ab
575cf6f
 
 
 
 
 
 
 
 
 
 
b0d06ab
 
1732969
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b0d06ab
97d1d38
575cf6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe12fb0
575cf6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe12fb0
 
d436d1d
6bc8e69
b0d06ab
d436d1d
 
 
 
b0d06ab
 
 
 
23ea754
b0d06ab
d436d1d
 
2f5c55f
d436d1d
 
 
b0d06ab
97d1d38
fe12fb0
575cf6f

import os
import requests
import google.generativeai as genai
import gradio as gr
from tempfile import NamedTemporaryFile
from google.cloud import speech

# Configure the Google Generative AI client with the key from the environment.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Set up Google Cloud credentials for Speech-to-Text.
# setdefault keeps a credentials path that is already exported in the
# environment; the placeholder below is only used when nothing is set, so a
# real deployment configuration is never overwritten by this file.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "path_to_your_google_cloud_credentials.json",
)

# Generation settings handed to the Gemini model below.
generation_config = dict(
    temperature=1,
    top_p=0.95,
    top_k=64,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

# Build the model with the "Ath" persona as its system instruction.
model = genai.GenerativeModel(
    system_instruction="You are a human that lives on earth, and you also speak friendly and use informal language. Your name is Ath, and you speak like a 100% human.",
    generation_config=generation_config,
    model_name="gemini-1.5-pro",
)

# One module-level chat session, started with an empty history; both chat
# handlers in this file send their messages through it.
chat_session = model.start_chat(history=[])

# Eleven Labs text-to-speech settings.
# The endpoint is fixed; its last path segment is presumably the voice ID
# (confirm against the Eleven Labs API reference). The key comes from the
# environment and is None when ELEVEN_LABS_API_KEY is not set.
eleven_labs_url = "https://api.elevenlabs.io/v1/text-to-speech/aEO01A4wXwd1O8GPgGlF"
eleven_labs_api_key = os.environ.get("ELEVEN_LABS_API_KEY")

def chat_and_tts_text(user_input, history):
    """Send a typed message to the chat session and record the exchange.

    Args:
        user_input: Text typed by the user.
        history: List of ``(user_message, reply)`` tuples. It is mutated in
            place; ``None`` is treated as an empty history.

    Returns:
        A ``(chatbot_value, state_value)`` pair. Both elements are the same
        updated history list, including when the request fails, so the state
        output always stays a list that later calls can append to.
    """
    if history is None:
        history = []

    # Nothing to send: leave the conversation untouched.
    if not user_input or not user_input.strip():
        return history, history

    try:
        # Send the user's text input to the chat session
        response = chat_session.send_message(user_input)
        reply = response.text
    except Exception as e:
        # UI boundary: show the failure as the reply instead of returning a
        # bare string, which would replace the history list held in state.
        reply = f"Error: {str(e)}"

    # Update the chat history with the text input and the reply (or error)
    history.append((user_input, reply))
    return history, history

def convert_audio_to_text(audio_file):
    """Transcribe an audio upload with Google Cloud Speech-to-Text.

    Args:
        audio_file: The uploaded audio. Accepted forms are raw ``bytes``,
            a filesystem path (``str`` or ``os.PathLike``), or any object
            exposing the path through a ``.name`` attribute.

    Returns:
        The recognized transcript. On any failure (missing input, unreadable
        file, API error, nothing recognized) a string starting with
        ``"Error in audio to text conversion:"`` is returned instead of
        raising.
    """
    try:
        if audio_file is None:
            raise ValueError("no audio file was provided")

        # Obtain the raw audio bytes whatever form the upload arrives in.
        if isinstance(audio_file, (bytes, bytearray)):
            content = bytes(audio_file)
        else:
            if isinstance(audio_file, (str, os.PathLike)):
                path = audio_file
            else:
                path = audio_file.name
            with open(path, "rb") as audio_handle:
                content = audio_handle.read()

        client = speech.SpeechClient()
        recognition_audio = speech.RecognitionAudio(content=content)
        # NOTE(review): the request is fixed to 16 kHz LINEAR16 en-US audio;
        # uploads in another format presumably fail or transcribe poorly.
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code="en-US",
        )

        response = client.recognize(config=config, audio=recognition_audio)

        # Each result covers one consecutive portion of the audio; keep the
        # top alternative of every portion rather than only the first one.
        segments = [
            result.alternatives[0].transcript.strip()
            for result in response.results
            if result.alternatives
        ]
        transcript = " ".join(segment for segment in segments if segment)
        if not transcript:
            raise ValueError("no speech was recognized in the audio")
        return transcript

    except Exception as e:
        return f"Error in audio to text conversion: {str(e)}"

def _synthesize_speech(text):
    """Return the path of an MP3 rendering of *text*, or None if TTS fails."""
    # Eleven Labs text-to-speech request payload
    payload = {
        "text": text,
        "voice_settings": {
            "stability": 0,
            "similarity_boost": 0
        }
    }
    headers = {
        "xi-api-key": eleven_labs_api_key,
        "Content-Type": "application/json"
    }

    try:
        # Without a timeout a stalled connection would block the handler
        # indefinitely.
        tts_response = requests.post(
            eleven_labs_url, json=payload, headers=headers, timeout=60
        )
    except requests.RequestException:
        # Speech is best-effort: the caller still has the text reply.
        return None

    if tts_response.status_code != 200:
        return None

    # delete=False keeps the file on disk after the handle closes so the
    # returned path can still be read by whoever receives it.
    with NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
        temp_audio.write(tts_response.content)
        return temp_audio.name


def chat_and_tts_audio(audio_file, history):
    """Transcribe an audio upload, get a chat reply, and voice that reply.

    Args:
        audio_file: The uploaded audio, passed on to
            ``convert_audio_to_text``. ``None`` means nothing was uploaded.
        history: List of ``(user_message, reply)`` tuples. It is mutated in
            place; ``None`` is treated as an empty history.

    Returns:
        A ``(chatbot_value, state_value, audio_path)`` triple. The first two
        elements are always the same history list, so the state output never
        turns into an error string. ``audio_path`` is the path of the
        generated MP3, or ``None`` when no speech could be produced.
    """
    if history is None:
        history = []

    # Nothing was uploaded: leave the conversation untouched.
    if audio_file is None:
        return history, history, None

    user_input = None
    audio_path = None
    try:
        # Convert uploaded audio file to text
        user_input = convert_audio_to_text(audio_file)

        # convert_audio_to_text reports failures as a prefixed string; show
        # that message instead of sending it to the model as user speech.
        if user_input.startswith("Error in audio to text conversion:"):
            history.append((None, user_input))
            return history, history, None

        # Send the transcribed input to the chat session
        response = chat_session.send_message(user_input)
        response_text = response.text

        audio_path = _synthesize_speech(response_text)

        # Update the chat history with audio input and response
        history.append((user_input, response_text))
    except Exception as e:
        # UI boundary: record the failure as a chat reply and keep the state
        # a list so later calls can still append to it.
        history.append((user_input, f"Error: {str(e)}"))

    return history, history, audio_path

# Build the Gradio interface: a chat column, an audio-upload column and a
# column that plays back the spoken reply.
with gr.Blocks() as demo:
    # Page title and a short usage hint.
    gr.Markdown("<h1 style='text-align: center;'>Chat with Ath</h1>")
    gr.Markdown("Ask any question by typing or upload an audio file to receive a response from Ath in text and audio format.")

    with gr.Row():
        # Chat transcript plus the typed-question controls.
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Chat History")
            user_input_text = gr.Textbox(
                label="Text Input",
                placeholder="Type your question...",
            )
            submit_btn_text = gr.Button(value="Send")

        # Audio upload controls.
        # NOTE(review): confirm that the value Gradio passes for
        # type="binary" is what convert_audio_to_text expects.
        with gr.Column(scale=2):
            user_input_audio = gr.File(type="binary", label="Upload Audio")
            submit_btn_audio = gr.Button(value="Send")

        # Playback of the synthesized reply.
        with gr.Column(scale=1):
            audio_output = gr.Audio(type="filepath", label="Response Audio")

    # Conversation history carried between handler calls; starts empty.
    state = gr.State([])

    # Typed question -> chat display and state.
    submit_btn_text.click(
        fn=chat_and_tts_text,
        inputs=[user_input_text, state],
        outputs=[chatbot, state],
    )
    # Uploaded audio -> chat display, state and spoken reply.
    submit_btn_audio.click(
        fn=chat_and_tts_audio,
        inputs=[user_input_audio, state],
        outputs=[chatbot, state, audio_output],
    )

demo.launch()