import os
import tempfile

import gradio as gr
from google import genai
from gtts import gTTS

# Configure the Gemini API client; the key must be set in the environment.
GOOGLE_API_KEY = os.getenv("gemini_api")
client = genai.Client(api_key=GOOGLE_API_KEY)
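
# When running locally, export the key first (the variable name "gemini_api"
# matches what this app reads), e.g.:
#
#   export gemini_api="YOUR_API_KEY"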

# Global chat session, created lazily on the first message so one Gemini
# context is shared across the whole conversation.
chat = None

def transcribe_audio(audio_path):
    """Transcribe the audio file using the Gemini API."""
    try:
        print("Audio path is", audio_path)
        # Upload the recording, then ask Gemini for a plain transcription.
        myfile = client.files.upload(file=audio_path)
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=["Transcribe the input audio & return only the transcription.", myfile],
        )
        print("Transcription response:", response.text)
        return response.text
    except Exception as e:
        print("Error in transcription:", str(e))
        return "Error in transcription"

def text_to_speech(text):
    """Convert text to speech using gTTS and return the path to the audio file."""
    # delete=False keeps the file on disk so Gradio can serve it afterwards.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        tts = gTTS(text=text, lang="en")
        tts.save(fp.name)
        return fp.name
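
# Note: these temp .mp3 files are never deleted, so they accumulate over a
# long session; remembering the previous path and os.remove()-ing it before
# saving the next reply would be a minimal cleanup.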

def chat_with_gemini(user_input, history):
    """Send user input to the Gemini chat and return the response."""
    global chat
    # Gradio may pass None on the first call; make sure history is a list.
    if not isinstance(history, list):
        history = []
    # Create the chat session once; later calls reuse it so context persists.
    if chat is None:
        chat = client.chats.create(model="gemini-2.0-flash")
    print("User input:", user_input)
    response = chat.send_message(user_input)
    response_text = response.text
    print("Response text:", response_text)
    # Record the turn as a (user, assistant) pair for display_history.
    history.append((user_input, response_text))
    # Generate the spoken version of the reply.
    audio_path = text_to_speech(response_text)
    return response_text, history, audio_path
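
# Helper for the Clear button below: emptying only the UI state would leave
# the global Gemini chat with its memory of earlier turns, so reset both.
def reset_conversation():
    """Drop the chat session and return cleared values for all outputs."""
    global chat
    chat = None
    return [], "No conversation history yet.", "", None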

def process_audio(audio, history):
    """Transcribe recorded audio and pass the text to the chatbot."""
    if audio is None:
        return "No audio detected", history, None  # Keep existing history.
    user_input = transcribe_audio(audio)
    response_text, new_history, audio_path = chat_with_gemini(user_input, history)
    return response_text, new_history, audio_path

def process_text(text_input, history):
    """Handle typed input (wired to the commented-out Textbox below)."""
    if not text_input.strip():
        return "No input detected", history, None
    return chat_with_gemini(text_input, history)

def display_history(history):
    """Format the (user, assistant) history pairs for the Markdown display."""
    if not history:
        return "No conversation history yet."
    return "\n".join(f"You: {user}\nAssistant: {assistant}\n" for user, assistant in history)

# Build the Gradio interface
with gr.Blocks(title="Gemini Audio Chatbot") as demo:
    gr.Markdown("# Gemini Audio Chatbot")
    gr.Markdown("Speak into the microphone, and the assistant will respond with text and audio.")

    # State for the conversation history, persisted across interactions.
    history = gr.State([])

    with gr.Row():
        with gr.Column(scale=7):
            chat_display = gr.Markdown("No conversation history yet.")
        with gr.Column(scale=3):
            gr.Markdown("""
            ## How to use:
            1. Record your message with the microphone
            2. Wait for the assistant's response
            3. The conversation history will be displayed on the left
            """)

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone"],
            type="filepath",
            label="Audio Input",
        )
    # with gr.Row():
    #     text_input = gr.Textbox(label="Type your message here")
    with gr.Row():
        response_text = gr.Textbox(label="Assistant's Response")
    with gr.Row():
        audio_output = gr.Audio(label="Assistant's Voice")

    # Buttons
    with gr.Row():
        clear_btn = gr.Button("Clear Conversation")
    # When a new recording arrives, transcribe it and get a reply, then
    # refresh the history display.
    audio_input.change(
        process_audio,
        inputs=[audio_input, history],
        outputs=[response_text, history, audio_output],
    ).then(
        display_history,
        inputs=[history],
        outputs=[chat_display],
    )
    # text_input.submit(
    #     process_text,
    #     inputs=[text_input, history],
    #     outputs=[response_text, history, audio_output],
    # ).then(
    #     display_history,
    #     inputs=[history],
    #     outputs=[chat_display],
    # )
    # Clear the conversation: reset UI state and the Gemini chat session.
    clear_btn.click(
        reset_conversation,
        outputs=[history, chat_display, response_text, audio_output],
    )

if __name__ == "__main__":
    demo.launch()