import os
import gradio as gr
import google.generativeai as genai
from gtts import gTTS
import tempfile
# Configure the Gemini API: the key is read from the `gemini_api` environment
# variable (set it as a secret in your deployment environment)
GOOGLE_API_KEY = os.getenv("gemini_api")
genai.configure(api_key=GOOGLE_API_KEY)
# Initialize the model
model = genai.GenerativeModel('gemini-pro')
def transcribe_audio(audio_path):
    """
    Placeholder for speech-to-text transcription.
    A real application would call a proper STT service here (see the
    sketch after this function); for this demo we return a fixed message.
    """
    return ("This is a placeholder for speech-to-text transcription. In a real "
            "application, this would be the transcribed text from your audio.")
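# A minimal sketch of what a real implementation could look like, assuming the
# third-party SpeechRecognition package (pip install SpeechRecognition) and its
# free Google Web Speech endpoint; this is illustrative, not part of this app:
#
#   import speech_recognition as sr
#
#   def transcribe_audio(audio_path):
#       recognizer = sr.Recognizer()
#       with sr.AudioFile(audio_path) as source:  # expects WAV/AIFF/FLAC input
#           audio_data = recognizer.record(source)
#       return recognizer.recognize_google(audio_data)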
def text_to_speech(text):
    """Convert text to speech using gTTS and return the path to the MP3 file."""
    # delete=False keeps the file on disk after the handle closes so that
    # Gradio can serve it; the OS temp directory handles eventual cleanup.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        tts = gTTS(text=text, lang='en')
        tts.save(fp.name)
    return fp.name
def chat_with_gemini(user_input, history):
    """
    Process user input through the Gemini API and return the response.
    """
    # Initialize a new conversation or continue the existing one
    if not history:
        history = []
        chat = model.start_chat(history=[])
    else:
        # Reconstruct the chat session from history; turns alternate, so even
        # indices are the user's messages and odd indices are the model's
        chat = model.start_chat(history=[
            {"role": "user" if i % 2 == 0 else "model", "parts": [msg]}
            for i, msg in enumerate(history)
        ])
    # Generate the response
    response = chat.send_message(user_input)
    response_text = response.text
    # Update the history
    history.append(user_input)
    history.append(response_text)
    # Generate the audio response
    audio_path = text_to_speech(response_text)
    return response_text, history, audio_path
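# chat.send_message can fail at runtime (network, quota, safety blocks); a
# defensive variant of the call above, sketched under the assumption that the
# google.generativeai SDK raises ordinary exceptions on such failures:
#
#   try:
#       response = chat.send_message(user_input)
#   except Exception as exc:
#       return f"Sorry, the API call failed: {exc}", history, None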
def process_audio(audio, history):
    """Process audio input, convert it to text, and get a response."""
    if audio is None:
        return "No audio detected", history, None
    # Convert audio to text
    user_input = transcribe_audio(audio)
    # Get the response from Gemini
    response_text, new_history, audio_path = chat_with_gemini(user_input, history)
    return response_text, new_history, audio_path
def process_text(text_input, history):
    """Process text input and get a response."""
    if not text_input.strip():
        return "No input detected", history, None
    # Get the response from Gemini
    response_text, new_history, audio_path = chat_with_gemini(text_input, history)
    return response_text, new_history, audio_path
def display_history(history):
    """Format the conversation history for display."""
    if not history:
        return "No conversation history yet."
    display_text = ""
    # Entries alternate: even indices are the user, odd indices the assistant
    for i in range(0, len(history), 2):
        display_text += f"You: {history[i]}\n\n"
        if i + 1 < len(history):
            display_text += f"Assistant: {history[i+1]}\n\n"
    return display_text
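# Example: with history = ["Hi", "Hello! How can I help?"], display_history
# returns "You: Hi\n\nAssistant: Hello! How can I help?\n\n".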
# Create the Gradio interface
with gr.Blocks(title="Gemini Audio Chatbot") as demo:
    gr.Markdown("# Gemini Audio Chatbot")
    gr.Markdown("Talk or type your message, and the assistant will respond with text and audio.")
    # State for conversation history
    history = gr.State([])
    with gr.Row():
        with gr.Column(scale=7):
            # Chat history display
            chat_display = gr.Markdown("No conversation history yet.")
        with gr.Column(scale=3):
            # Info and instructions
            gr.Markdown("""
            ## How to use:
            1. Speak using the microphone or type your message
            2. Wait for the assistant's response
            3. The conversation history will be displayed on the left
            """)
    with gr.Row():
        # Text input
        text_input = gr.Textbox(
            placeholder="Type your message here...",
            label="Text Input"
        )
    with gr.Row():
        # Audio input
        audio_input = gr.Audio(
            sources=["microphone"],
            type="filepath",
            label="Audio Input"
        )
    with gr.Row():
        # Assistant's response
        response_text = gr.Textbox(label="Assistant's Response")
    with gr.Row():
        # Audio output
        audio_output = gr.Audio(label="Assistant's Voice")
    # Buttons
    with gr.Row():
        clear_btn = gr.Button("Clear Conversation")
    # Event handlers
    text_input.submit(
        process_text,
        inputs=[text_input, history],
        outputs=[response_text, history, audio_output]
    ).then(
        display_history,
        inputs=[history],
        outputs=[chat_display]
    ).then(
        lambda: "",
        outputs=[text_input]
    )
    audio_input.change(
        process_audio,
        inputs=[audio_input, history],
        outputs=[response_text, history, audio_output]
    ).then(
        display_history,
        inputs=[history],
        outputs=[chat_display]
    )
    clear_btn.click(
        lambda: ([], "No conversation history yet.", "", None),
        outputs=[history, chat_display, response_text, audio_output]
    )

demo.launch()
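# demo.launch() serves the app locally; demo.launch(share=True) is the standard
# Gradio option for a temporary public link if you need to test remotely.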