"""Text and voice chat demo.

Wires together a Gemini chat session (``google.generativeai``), Google Cloud
Speech-to-Text for transcribing audio input, the ElevenLabs text-to-speech
HTTP API for spoken replies, and a Gradio UI.
"""

import os
from tempfile import NamedTemporaryFile

import requests
import google.generativeai as genai
import gradio as gr
from google.cloud import speech

# Configure your Google Generative AI API key
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Set up Google Cloud credentials for Speech-to-Text
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "path_to_your_google_cloud_credentials.json"

# Create the model
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
    model_name="gemini-1.5-pro",
    generation_config=generation_config,
    system_instruction="You are a human that lives on earth, and you also speak friendly and use informal language. Your name is Ath, and you speak like a 100% human.",
)

# NOTE(review): a single module-level session means every caller of the
# functions below shares one conversation -- confirm this is intended.
chat_session = model.start_chat(history=[])

# Eleven Labs API configuration
eleven_labs_api_key = os.getenv("ELEVEN_LABS_API_KEY")
eleven_labs_url = "https://api.elevenlabs.io/v1/text-to-speech/aEO01A4wXwd1O8GPgGlF"

# Upper bound (seconds) on the text-to-speech HTTP call so a stalled
# connection cannot block the caller forever.
_TTS_TIMEOUT_SECONDS = 60


def chat_and_tts_text(user_input, history):
    """Send a text message to the chat session and record the exchange.

    Args:
        user_input: The message text to send to the model.
        history: List of ``(user_text, reply_text)`` tuples; ``None`` is
            treated as an empty history. The list is appended to in place.

    Returns:
        ``(history, history)`` on success, or ``(history, "Error: ...")`` if
        the model call fails (history is left unchanged in that case).
    """
    if history is None:
        history = []

    try:
        # Send the user's text input to the chat session
        response = chat_session.send_message(user_input)
        response_text = response.text
    except Exception as e:
        return history, f"Error: {str(e)}"

    # Update the chat history with text input and response
    history.append((user_input, response_text))
    return history, history


def _transcribe_audio(audio_file):
    """Transcribe an audio file with Google Cloud Speech, raising on failure.

    Args:
        audio_file: Either a filesystem path or an object whose ``name``
            attribute is the path of the audio file.

    Returns:
        The recognized text, with the top alternative of every result joined
        by single spaces.

    Raises:
        ValueError: If no file was supplied or no speech was recognized.
        Exception: Whatever file I/O or the Speech client raises.
    """
    # Accept both a plain path string and a file wrapper exposing ``.name``.
    audio_path = getattr(audio_file, "name", audio_file)
    if not audio_path:
        raise ValueError("No audio file was provided.")

    client = speech.SpeechClient()
    with open(audio_path, "rb") as audio_stream:
        content = audio_stream.read()

    recognition_audio = speech.RecognitionAudio(content=content)
    # NOTE(review): encoding and sample rate are fixed; input recorded in a
    # different format may be rejected or mis-transcribed -- confirm against
    # what the UI actually produces.
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )
    response = client.recognize(config=config, audio=recognition_audio)

    # The response may contain several results; keep all of them instead of
    # only the first, and skip any that carry no alternative.
    transcript = " ".join(
        result.alternatives[0].transcript.strip()
        for result in response.results
        if result.alternatives
    ).strip()
    if not transcript:
        raise ValueError("No speech could be recognized in the audio.")
    return transcript


def convert_audio_to_text(audio_file):
    """Transcribe an audio file, reporting failures as a message string.

    Args:
        audio_file: Either a filesystem path or an object whose ``name``
            attribute is the path of the audio file.

    Returns:
        The transcript on success, otherwise a string starting with
        ``"Error in audio to text conversion: "``.
    """
    try:
        return _transcribe_audio(audio_file)
    except Exception as e:
        return f"Error in audio to text conversion: {str(e)}"


def _synthesize_speech(text):
    """Request speech audio for ``text`` from Eleven Labs.

    Returns:
        Path of a temporary ``.mp3`` file holding the audio, or ``None`` when
        the request fails or returns a non-200 status.
    """
    # Eleven Labs text-to-speech request payload
    payload = {
        "text": text,
        "voice_settings": {
            "stability": 0,
            "similarity_boost": 0,
        },
    }
    headers = {
        "xi-api-key": eleven_labs_api_key,
        "Content-Type": "application/json",
    }

    try:
        tts_response = requests.post(
            eleven_labs_url,
            json=payload,
            headers=headers,
            timeout=_TTS_TIMEOUT_SECONDS,
        )
    except requests.RequestException:
        # Treat transport failures like a non-200 reply: no audio, but the
        # caller can still deliver the text response.
        return None

    if tts_response.status_code != 200:
        return None

    # delete=False keeps the file on disk after the handle closes, since only
    # its path is handed back to the caller.
    with NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
        temp_audio.write(tts_response.content)
        return temp_audio.name


def chat_and_tts_audio(audio_file, history):
    """Transcribe spoken input, chat with the model, and synthesize the reply.

    Args:
        audio_file: Either a filesystem path or an object whose ``name``
            attribute is the path of the recorded audio.
        history: List of ``(user_text, reply_text)`` tuples; ``None`` is
            treated as an empty history. The list is appended to in place.

    Returns:
        ``(history, history, audio_path)`` on success, where ``audio_path`` is
        the synthesized ``.mp3`` path or ``None`` if speech synthesis failed;
        ``(history, "Error: ...", None)`` if transcription or the model call
        fails.
    """
    if history is None:
        history = []

    try:
        # Convert uploaded audio file to text. The raising helper is used so
        # a transcription failure is reported as an error instead of being
        # forwarded to the model as if the user had said it.
        user_input = _transcribe_audio(audio_file)

        # Send the user's audio input to the chat session
        response = chat_session.send_message(user_input)
        response_text = response.text

        audio_path = _synthesize_speech(response_text)

        # Update the chat history with audio input and response
        history.append((user_input, response_text))
        return history, history, audio_path
    except Exception as e:
        return history, f"Error: {str(e)}", None


# Create the Gradio UI with gr.Blocks() as demo: gr.Markdown("