import os
import tempfile

import gradio as gr
from google import genai
from gtts import gTTS

# Configure the Gemini client. The API key must be exported in the
# environment variable `gemini_api`.
GOOGLE_API_KEY = os.getenv("gemini_api")  # Ensure your API key is set
client = genai.Client(api_key=GOOGLE_API_KEY)

# Lazily-created Gemini chat session, shared across requests so the model
# keeps conversational context between turns. Reset by clear_conversation().
chat = None


def transcribe_audio(audio_path):
    """Transcribe the audio file at ``audio_path`` using the Gemini API.

    Returns the transcription text, or the literal string
    "Error in transcription" on failure — callers display the result
    directly, so errors are reported in-band rather than raised.
    """
    try:
        print("Audio Path is", audio_path)
        # Upload the raw audio so Gemini can consume it as a file part.
        myfile = client.files.upload(file=audio_path)
        # Ask the model to return only the transcription, no commentary.
        response = client.models.generate_content(
            model='gemini-2.0-flash',
            contents=['Transcribe the input audio & return only the transcription.', myfile]
        )
        print("Transcription Response:", response.text)
        return response.text
    except Exception as e:
        print("Error in transcription:", str(e))
        return "Error in transcription"


def text_to_speech(text):
    """Convert ``text`` to speech with gTTS and return the MP3 file path.

    NOTE(review): the temp file is created with delete=False and never
    removed here; Gradio needs the file to outlive this call, so files
    accumulate in the temp dir until the OS cleans it — confirm acceptable.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        tts = gTTS(text=text, lang='en')
        tts.save(fp.name)
        return fp.name


def chat_with_gemini(user_input, history):
    """Send ``user_input`` through the shared Gemini chat session.

    Returns ``(response_text, history, audio_path)`` where ``history`` is
    the (possibly newly initialized) list of (user, assistant) pairs with
    this turn appended, and ``audio_path`` is a spoken MP3 of the reply.
    """
    global chat
    # Gradio may hand us None / a stale value; make sure history is a list.
    if history is None or not isinstance(history, list):
        history = []
    if chat is None:
        # Create the session once; it accumulates context server-side.
        chat = client.chats.create(model="gemini-2.0-flash")
    print("User input:", user_input)
    response = chat.send_message(user_input)
    response_text = response.text
    print("Response text:", response_text)
    history.append((user_input, response_text))
    audio_path = text_to_speech(response_text)
    return response_text, history, audio_path


def process_audio(audio, history):
    """Transcribe recorded ``audio`` and route the text through Gemini."""
    if audio is None:
        # Keep the existing history; just report that nothing was heard.
        return "No audio detected", history, None
    user_input = transcribe_audio(audio)
    return chat_with_gemini(user_input, history)


def process_text(text_input, history):
    """Route typed ``text_input`` through Gemini.

    Currently unused by the UI (the text box is disabled) but kept so a
    text input can be re-enabled without code changes.
    """
    if not text_input.strip():
        return "No input detected", history, None
    return chat_with_gemini(text_input, history)


def display_history(history):
    """Format (user, assistant) pairs for the Markdown transcript pane."""
    if not history:
        return "No conversation history yet."
    return "\n".join([f"You: {msg[0]}\nAssistant: {msg[1]}\n" for msg in history])


def clear_conversation():
    """Reset the UI state AND drop the server-side chat session.

    Bug fix: the previous clear handler only reset the UI widgets, so the
    global Gemini ``chat`` object silently kept the old conversational
    context and the "cleared" history still influenced later replies.
    """
    global chat
    chat = None
    return [], "No conversation history yet.", "", None


# Build the Gradio interface.
with gr.Blocks(title="Gemini Audio Chatbot") as demo:
    gr.Markdown("# Gemini Audio Chatbot")
    gr.Markdown("Talk or type your message, and the assistant will respond with text and audio.")

    # State for conversation history — persists across event callbacks.
    history = gr.State([])

    with gr.Row():
        with gr.Column(scale=7):
            chat_display = gr.Markdown("No conversation history yet.")
        with gr.Column(scale=3):
            gr.Markdown("""
            ## How to use:
            1. Speak using the microphone or type your message
            2. Wait for the assistant's response
            3. The conversation history will be displayed on the left
            """)

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone"],
            type="filepath",
            label="Audio Input"
        )
    with gr.Row():
        response_text = gr.Textbox(label="Assistant's Response")
    with gr.Row():
        audio_output = gr.Audio(label="Assistant's Voice")
    with gr.Row():
        clear_btn = gr.Button("Clear Conversation")

    # Audio input: process the recording, then refresh the transcript pane.
    audio_input.change(
        process_audio,
        inputs=[audio_input, history],
        outputs=[response_text, history, audio_output]
    ).then(
        display_history,
        inputs=[history],
        outputs=[chat_display]
    )

    # Clear conversation — also resets the Gemini session (see clear_conversation).
    clear_btn.click(
        clear_conversation,
        outputs=[history, chat_display, response_text, audio_output]
    )

demo.launch()