# Gemini Audio Chatbot — Gradio app (speech in, Gemini reply as text + speech)
import os | |
import gradio as gr | |
from google import genai | |
from gtts import gTTS | |
import tempfile | |
import time | |
# Configure the Gemini API client.
# The key is read from the "gemini_api" environment variable; set it before
# launching the app (e.g. as a secret in the hosting environment). If unset,
# GOOGLE_API_KEY is None and genai.Client will fail at request time.
GOOGLE_API_KEY = os.getenv("gemini_api")
client = genai.Client(api_key=GOOGLE_API_KEY)
def transcribe_audio(audio_path):
    """Transcribe a local audio file using the Gemini API.

    Args:
        audio_path: Filesystem path to the recorded audio clip
            (Gradio supplies this because the Audio input uses type="filepath").

    Returns:
        The transcription text returned by the model.
    """
    # Passing the bare path string inside `contents` would send the *path
    # text* to the model, not the audio data. Upload the file first and
    # reference the returned file handle instead.
    audio_file = client.files.upload(file=audio_path)
    response = client.models.generate_content(
        model='gemini-2.0-flash',
        contents=[
            'Transcribe the input audio & return the transcription only '
            'Example - Audio file is transcribed to Hello then just return Hello',
            audio_file,
        ],
    )
    print(response.text)
    return response.text
def text_to_speech(text):
    """Synthesize *text* to an MP3 file with gTTS and return its path."""
    # Reserve a uniquely named temp file that outlives this function so the
    # caller (and Gradio) can stream it; delete=False keeps it on disk.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    tmp.close()
    gTTS(text=text, lang='en').save(tmp.name)
    return tmp.name
def chat_with_gemini(user_input, history):
    """Send *user_input* to Gemini, seeded with the prior conversation.

    Args:
        user_input: The user's latest message.
        history: Flat list alternating [user, assistant, user, assistant, ...].

    Returns:
        Tuple of (response_text, updated_history, path_to_mp3_reply).
    """
    if not history:
        history = []
    # Rebuild the chat session from the stored turns so the model keeps
    # context. Previously a fresh, empty chat was created on every call and
    # `history` was never sent, so the assistant had no memory of earlier
    # messages. Even indices are user turns, odd indices are model replies.
    past_turns = [
        genai.types.Content(
            role='user' if i % 2 == 0 else 'model',
            parts=[genai.types.Part(text=msg)],
        )
        for i, msg in enumerate(history)
    ]
    chat = client.chats.create(model="gemini-2.0-flash", history=past_turns)
    print("History is", history)
    print("User input is ", user_input)
    # Generate the reply for the latest message.
    response = chat.send_message(user_input)
    response_text = response.text
    print("Response text is ", response_text)
    # Record both sides of the exchange.
    history.append(user_input)
    history.append(response_text)
    # Speak the reply as well.
    audio_path = text_to_speech(response_text)
    return response_text, history, audio_path
def process_audio(audio, history):
    """Transcribe recorded audio and route the text through the chatbot.

    Returns (response_text, updated_history, reply_audio_path).
    """
    # Gradio passes None when the recorder is empty/cleared — nothing to do.
    if audio is None:
        return "No audio detected", history, None
    transcript = transcribe_audio(audio)
    return chat_with_gemini(transcript, history)
def process_text(text_input, history):
    """Forward typed input to the chatbot, ignoring blank submissions.

    Returns (response_text, updated_history, reply_audio_path).
    """
    # Whitespace-only input: leave the conversation state untouched.
    if not text_input.strip():
        return "No input detected", history, None
    return chat_with_gemini(text_input, history)
def display_history(history):
    """Render the flat [user, assistant, ...] history as display text.

    Args:
        history: Flat list alternating user and assistant messages.

    Returns:
        A "You: ... / Assistant: ..." transcript, or a placeholder string
        when there is no history yet.
    """
    if not history:
        return "No conversation history yet."
    parts = []
    # Walk the list in user/assistant pairs. (The original also checked
    # `i < len(history)`, which is always true for indices produced by
    # range.) The final user turn may not have a reply yet, hence the
    # i + 1 bounds check.
    for i in range(0, len(history), 2):
        parts.append(f"You: {history[i]}\n\n")
        if i + 1 < len(history):
            parts.append(f"Assistant: {history[i + 1]}\n\n")
    return "".join(parts)
# ---------------------------------------------------------------------------
# Gradio UI: wires the microphone recorder to the chatbot and renders the
# running transcript. NOTE(review): process_text is defined above but never
# wired to any component — there is no text input box in this layout; confirm
# whether one was intended.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Gemini Audio Chatbot") as demo:
    gr.Markdown("# Gemini Audio Chatbot")
    gr.Markdown("Talk or type your message, and the assistant will respond with text and audio.")
    # Per-session conversation state: flat list [user, assistant, ...].
    history = gr.State([])
    with gr.Row():
        with gr.Column(scale=7):
            # Running transcript of the conversation.
            chat_display = gr.Markdown("No conversation history yet.")
        with gr.Column(scale=3):
            # Usage instructions.
            gr.Markdown("""
            ## How to use:
            1. Speak using the microphone or type your message
            2. Wait for the assistant's response
            3. The conversation history will be displayed on the left
            """)
    with gr.Row():
        # Microphone input; type="filepath" hands process_audio a path on disk.
        audio_input = gr.Audio(
            sources=["microphone"],
            type="filepath",
            label="Audio Input"
        )
    with gr.Row():
        # Latest assistant reply as text.
        response_text = gr.Textbox(label="Assistant's Response")
    with gr.Row():
        # Spoken version of the reply (mp3 produced by text_to_speech).
        audio_output = gr.Audio(label="Assistant's Voice")
    # Buttons
    with gr.Row():
        clear_btn = gr.Button("Clear Conversation")
    # Fires whenever the recorded audio changes, then refreshes the transcript
    # from the updated history state.
    audio_input.change(
        process_audio,
        inputs=[audio_input, history],
        outputs=[response_text, history, audio_output]
    ).then(
        display_history,
        inputs=[history],
        outputs=[chat_display]
    )
    # Reset the state and every output widget to its initial value.
    clear_btn.click(
        lambda: ([], "No conversation history yet.", "", None),
        outputs=[history, chat_display, response_text, audio_output]
    )
demo.launch()