File size: 4,751 Bytes
fe12fb0 b0d06ab 1732969 fe12fb0 1129b43 fe12fb0 1732969 fe12fb0 d436d1d fe12fb0 d436d1d b0d06ab fe12fb0 1129b43 5fbb19b fe12fb0 b0d06ab 575cf6f b0d06ab 1732969 b0d06ab 97d1d38 575cf6f fe12fb0 575cf6f fe12fb0 d436d1d 6bc8e69 b0d06ab d436d1d b0d06ab 23ea754 b0d06ab d436d1d 2f5c55f d436d1d b0d06ab 97d1d38 fe12fb0 575cf6f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import os
import requests
import google.generativeai as genai
import gradio as gr
from tempfile import NamedTemporaryFile
from google.cloud import speech
# --- Google Generative AI (Gemini) -----------------------------------------
# The API key is read from the GOOGLE_API_KEY environment variable.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# --- Google Cloud Speech-to-Text credentials -------------------------------
# Fall back to the placeholder path only when the environment does not already
# point at a credentials file; assigning unconditionally would clobber a
# correctly configured deployment with a path that does not exist.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "path_to_your_google_cloud_credentials.json",
)

# --- Model ------------------------------------------------------------------
# Generation settings handed to the Gemini model below.
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
    model_name="gemini-1.5-pro",
    generation_config=generation_config,
    system_instruction="You are a human that lives on earth, and you also speak friendly and use informal language. Your name is Ath, and you speak like a 100% human.",
)

# NOTE(review): this is a single module-level chat session used by both the
# text and the audio handler, so every caller shares one conversation --
# confirm that is intended before serving more than one user.
chat_session = model.start_chat(history=[])

# --- Eleven Labs text-to-speech ----------------------------------------------
# The API key is read from the ELEVEN_LABS_API_KEY environment variable; the
# last path segment of the URL is the voice id.
eleven_labs_api_key = os.getenv("ELEVEN_LABS_API_KEY")
eleven_labs_url = "https://api.elevenlabs.io/v1/text-to-speech/aEO01A4wXwd1O8GPgGlF"
def chat_and_tts_text(user_input, history):
    """Send typed text to the shared chat session and record the exchange.

    Args:
        user_input: Text typed by the user.
        history: List of ``(user_message, reply)`` tuples accumulated so far.
            ``None`` is treated as an empty history.

    Returns:
        A ``(chatbot_value, state_value)`` pair. Both elements are the same
        history list, so the stored state always stays a list. Blank input
        leaves the history unchanged; a failure while talking to the model is
        recorded as an ``"Error: ..."`` reply instead of being raised.
    """
    if history is None:
        history = []

    # Nothing to send: do not waste a model call on blank input.
    if not user_input or not user_input.strip():
        return history, history

    try:
        response = chat_session.send_message(user_input)
        reply = response.text
    except Exception as e:
        # UI boundary: surface the failure in the chat window. The error must
        # go into the history, not into the state slot, otherwise the next
        # call would try to append to a string.
        reply = f"Error: {str(e)}"

    history.append((user_input, reply))
    return history, history
def convert_audio_to_text(audio_file):
    """Transcribe an audio upload with Google Cloud Speech-to-Text.

    Args:
        audio_file: The uploaded audio, given as raw ``bytes``/``bytearray``,
            as a filesystem path, or as an object whose ``name`` attribute is
            the path of the file on disk.

    Returns:
        The transcript: the top alternative of every recognised segment,
        joined with single spaces. On any failure (missing or empty input,
        unreadable file, API error, nothing recognised) a string starting
        with ``"Error in audio to text conversion:"`` is returned instead of
        raising.
    """
    try:
        # Validate and load the audio before creating a client, so bad input
        # never costs an API round trip.
        if audio_file is None:
            raise ValueError("no audio file provided")

        if isinstance(audio_file, (bytes, bytearray)):
            content = bytes(audio_file)
        else:
            # File-like uploads expose their location via ``.name``; anything
            # else is treated as the path itself.
            path = getattr(audio_file, "name", audio_file)
            with open(path, "rb") as audio_fh:
                content = audio_fh.read()

        if not content:
            raise ValueError("audio file is empty")

        client = speech.SpeechClient()
        recognition_audio = speech.RecognitionAudio(content=content)
        # NOTE(review): the request is fixed to 16 kHz LINEAR16 en-US audio --
        # confirm uploads actually match this format.
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code="en-US",
        )
        response = client.recognize(config=config, audio=recognition_audio)

        # Keep every recognised segment rather than only the first one.
        segments = [
            result.alternatives[0].transcript.strip()
            for result in response.results
            if result.alternatives
        ]
        transcript = " ".join(segment for segment in segments if segment)
        if not transcript:
            raise ValueError("no speech was recognized")
        return transcript
    except Exception as e:
        return f"Error in audio to text conversion: {str(e)}"
def _synthesize_speech(text):
    """Render *text* to an MP3 via Eleven Labs; return its path or ``None``."""
    if not eleven_labs_api_key:
        # Without a key the request cannot succeed; skip it.
        return None

    payload = {
        "text": text,
        "voice_settings": {
            "stability": 0,
            "similarity_boost": 0,
        },
    }
    headers = {
        "xi-api-key": eleven_labs_api_key,
        "Content-Type": "application/json",
    }
    try:
        # A timeout keeps a stalled TTS service from hanging the handler.
        tts_response = requests.post(
            eleven_labs_url, json=payload, headers=headers, timeout=60
        )
        if tts_response.status_code != 200:
            return None
        # delete=False: the path is returned to the caller, so the file has
        # to outlive this function.
        with NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
            temp_audio.write(tts_response.content)
            return temp_audio.name
    except (requests.RequestException, OSError):
        # Speech output is best-effort; the text reply is still usable.
        return None


def chat_and_tts_audio(audio_file, history):
    """Transcribe an audio upload, chat with the model, and voice the reply.

    Args:
        audio_file: The uploaded audio, passed through unchanged to
            ``convert_audio_to_text``.
        history: List of ``(user_message, reply)`` tuples accumulated so far.
            ``None`` is treated as an empty history.

    Returns:
        A ``(chatbot_value, state_value, audio_path)`` triple. The first two
        are the same history list, so the stored state always stays a list.
        ``audio_path`` is the path of an MP3 file with the spoken reply, or
        ``None`` when no reply was produced or speech synthesis failed.
        Failures are recorded in the history as an error reply rather than
        raised.
    """
    if history is None:
        history = []

    if audio_file is None:
        history.append((None, "Error: no audio file was uploaded."))
        return history, history, None

    user_input = None
    try:
        # Convert uploaded audio file to text
        user_input = convert_audio_to_text(audio_file)

        # The converter reports failures as a prefixed string; such a string
        # must not be forwarded to the model as if the user had said it.
        if user_input.startswith("Error in audio to text conversion:"):
            history.append((None, user_input))
            return history, history, None

        # Send the transcript to the chat session
        response = chat_session.send_message(user_input)
        response_text = response.text
    except Exception as e:
        # UI boundary: show the failure in the chat window and keep the state
        # a list so later calls can still append to it.
        history.append((user_input, f"Error: {str(e)}"))
        return history, history, None

    # A failed TTS call only drops the audio, never the text reply.
    audio_path = _synthesize_speech(response_text)

    # Update the chat history with the transcript and the reply
    history.append((user_input, response_text))
    return history, history, audio_path
# Create the Gradio UI: one row with a chat/text column, an audio upload
# column, and a column that plays the spoken response.
with gr.Blocks() as demo:
    gr.Markdown("<h1 style='text-align: center;'>Chat with Ath</h1>")
    gr.Markdown("Ask any question by typing or upload an audio file to receive a response from Ath in text and audio format.")

    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Chat History")
            user_input_text = gr.Textbox(placeholder="Type your question...", label="Text Input")
            submit_btn_text = gr.Button("Send")
        with gr.Column(scale=2):
            # NOTE(review): type="binary" hands the handler the raw file
            # bytes rather than a path -- confirm convert_audio_to_text
            # accepts that. file_types keeps non-audio uploads out.
            user_input_audio = gr.File(
                label="Upload Audio",
                type="binary",
                file_types=["audio"],
            )
            submit_btn_audio = gr.Button("Send")
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Response Audio", type="filepath")

    # Conversation history, kept as a list of (user_message, reply) tuples.
    state = gr.State([])

    # Text path: the button and pressing Enter in the textbox do the same thing.
    submit_btn_text.click(chat_and_tts_text, inputs=[user_input_text, state], outputs=[chatbot, state])
    user_input_text.submit(chat_and_tts_text, inputs=[user_input_text, state], outputs=[chatbot, state])

    # Audio path: also fills the audio player with the spoken reply.
    submit_btn_audio.click(chat_and_tts_audio, inputs=[user_input_audio, state], outputs=[chatbot, state, audio_output])

demo.launch()