import os
import tempfile
import time

import gradio as gr
from google import genai
from google.genai import types
from gtts import gTTS

# Configure the Gemini API client.
# The key is read from the environment; never hard-code credentials.
GOOGLE_API_KEY = os.getenv("gemini_api")
client = genai.Client(api_key=GOOGLE_API_KEY)

# Single model name used for both transcription and chat.
MODEL_NAME = "gemini-2.0-flash"


def transcribe_audio(audio_path):
    """Transcribe an audio file to text using Gemini.

    Args:
        audio_path: Filesystem path to the recorded audio clip.

    Returns:
        The transcription text produced by the model.
    """
    # BUG FIX: the original passed the raw path *string* as a content part,
    # which the API would treat as literal text rather than audio. Upload
    # the file first so the model receives the actual audio bytes.
    audio_file = client.files.upload(file=audio_path)
    response = client.models.generate_content(
        model=MODEL_NAME,
        contents=[
            'Transcribe the input audio & return the transcription only '
            'Example - Audio file is transcribed to Hello then just return Hello',
            audio_file,
        ],
    )
    print(response.text)
    return response.text


def text_to_speech(text):
    """Convert *text* to speech with gTTS; return the MP3 file path."""
    # BUG FIX: the original saved into a NamedTemporaryFile while the
    # handle was still open, which fails on Windows (file locked). Create
    # the file, close the descriptor, then let gTTS write to the path.
    fd, path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)
    tts = gTTS(text=text, lang='en')
    tts.save(path)
    return path


def chat_with_gemini(user_input, history):
    """Send a message to Gemini with prior conversation context.

    Args:
        user_input: The user's latest message.
        history: Flat list alternating [user, assistant, user, ...] turns.

    Returns:
        Tuple of (response_text, updated_history, audio_path) where
        audio_path points to an MP3 of the spoken response.
    """
    if not history:
        history = []

    # BUG FIX: the original created a fresh chat on every call and never
    # replayed `history`, so the model had no memory of the conversation.
    # Rebuild the chat seeded with the prior turns (even index = user,
    # odd index = model, matching how `history` is appended below).
    prior_turns = [
        types.Content(
            role="user" if i % 2 == 0 else "model",
            parts=[types.Part(text=msg)],
        )
        for i, msg in enumerate(history)
    ]
    chat = client.chats.create(model=MODEL_NAME, history=prior_turns)

    print("History is", history)
    print("User input is ", user_input)

    response = chat.send_message(user_input)
    response_text = response.text
    print("Response text is ", response_text)

    # Record both sides of the exchange.
    history.append(user_input)
    history.append(response_text)

    # Generate the spoken version of the reply.
    audio_path = text_to_speech(response_text)

    return response_text, history, audio_path


def process_audio(audio, history):
    """Transcribe microphone input and route it through the chatbot.

    Returns (response_text, updated_history, audio_path); passes the
    history through untouched when no audio was captured.
    """
    if audio is None:
        return "No audio detected", history, None

    # Convert speech to text, then reuse the shared chat pipeline.
    user_input = transcribe_audio(audio)
    response_text, new_history, audio_path = chat_with_gemini(user_input, history)
    return response_text, new_history, audio_path


def process_text(text_input, history):
    """Route typed input through the chatbot.

    Returns (response_text, updated_history, audio_path); passes the
    history through untouched when the input is blank.
    """
    if not text_input.strip():
        return "No input detected", history, None

    response_text, new_history, audio_path = chat_with_gemini(text_input, history)
    return response_text, new_history, audio_path


def display_history(history):
    """Format the alternating [user, assistant, ...] list for display."""
    if not history:
        return "No conversation history yet."

    display_text = ""
    # Step two at a time: even index is the user turn, odd the assistant's.
    for i in range(0, len(history), 2):
        display_text += f"You: {history[i]}\n\n"
        if i + 1 < len(history):
            display_text += f"Assistant: {history[i+1]}\n\n"
    return display_text


# Create the Gradio interface
with gr.Blocks(title="Gemini Audio Chatbot") as demo:
    gr.Markdown("# Gemini Audio Chatbot")
    gr.Markdown("Talk or type your message, and the assistant will respond with text and audio.")

    # State for conversation history
    history = gr.State([])

    with gr.Row():
        with gr.Column(scale=7):
            # Chat history display
            chat_display = gr.Markdown("No conversation history yet.")
        with gr.Column(scale=3):
            # Info and instructions
            gr.Markdown("""
            ## How to use:
            1. Speak using the microphone or type your message
            2. Wait for the assistant's response
            3. The conversation history will be displayed on the left
            """)

    with gr.Row():
        # Audio input
        audio_input = gr.Audio(
            sources=["microphone"],
            type="filepath",
            label="Audio Input"
        )

    with gr.Row():
        # Assistant's response
        response_text = gr.Textbox(label="Assistant's Response")

    with gr.Row():
        # Audio output
        audio_output = gr.Audio(label="Assistant's Voice")

    # Buttons
    with gr.Row():
        clear_btn = gr.Button("Clear Conversation")

    audio_input.change(
        process_audio,
        inputs=[audio_input, history],
        outputs=[response_text, history, audio_output]
    ).then(
        display_history,
        inputs=[history],
        outputs=[chat_display]
    )

    clear_btn.click(
        lambda: ([], "No conversation history yet.", "", None),
        outputs=[history, chat_display, response_text, audio_output]
    )

# BUG FIX: launch only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()