# Gemini Audio Chatbot — Gradio app (Hugging Face Space entry point)
import os
import tempfile

import gradio as gr
from google import genai
from gtts import gTTS

# Configure the Gemini API client.
# NOTE(review): os.getenv returns None when the "gemini_api" secret is unset,
# in which case the client will fail on its first request — confirm the
# environment/secret is configured in the deployment.
GOOGLE_API_KEY = os.getenv("gemini_api")
client = genai.Client(api_key=GOOGLE_API_KEY)

# Lazily-created chat session, shared across requests (see chat_with_gemini).
chat = None
def transcribe_audio(audio_path):
    """Upload an audio file to Gemini and return its transcription text.

    On any failure the error is printed and the sentinel string
    "Error in transcription" is returned instead of raising.
    """
    try:
        print("Audio Path is", audio_path)
        # Upload the recording, then ask the model for a bare transcription.
        uploaded = client.files.upload(file=audio_path)
        result = client.models.generate_content(
            model='gemini-2.0-flash',
            contents=['Transcribe the input audio & return only the transcription.', uploaded],
        )
        print("Transcription Response:", result.text)
        return result.text
    except Exception as exc:
        print("Error in transcription:", str(exc))
        return "Error in transcription"
def text_to_speech(text):
    """Synthesize *text* to an mp3 with gTTS and return the temp file's path."""
    speech = gTTS(text=text, lang='en')
    # delete=False so the file survives for Gradio to stream back to the user.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as out:
        speech.save(out.name)
    return out.name
def chat_with_gemini(user_input, history):
    """Send *user_input* to the shared Gemini chat session.

    Returns a tuple of (response_text, updated_history, audio_file_path),
    where history is a list of (user, assistant) pairs.
    """
    global chat
    # Guard against missing/invalid history arriving from the UI state.
    # (isinstance already rejects None, so a separate None check is redundant.)
    if not isinstance(history, list):
        history = []
    # Create the chat session on first use; reuse it for the whole process.
    if chat is None:
        chat = client.chats.create(model="gemini-2.0-flash")
    print("User input:", user_input)
    reply = chat.send_message(user_input).text
    print("Response text:", reply)
    history.append((user_input, reply))
    # Voice the reply so the UI can play it back.
    return reply, history, text_to_speech(reply)
def process_audio(audio, history):
    """Transcribe a recorded clip and route the text through the chatbot."""
    if audio is None:
        # Nothing recorded: report it without touching the existing history.
        return "No audio detected", history, None
    transcript = transcribe_audio(audio)
    # chat_with_gemini already returns (text, history, audio_path).
    return chat_with_gemini(transcript, history)
def process_text(text_input, history):
    """Route typed input to the chatbot; blank submissions are ignored."""
    if not text_input.strip():
        return "No input detected", history, None
    # chat_with_gemini already returns (text, history, audio_path).
    return chat_with_gemini(text_input, history)
def display_history(history):
    """Render (user, assistant) pairs as a readable transcript string."""
    if not history:
        return "No conversation history yet."
    turns = [f"You: {user}\nAssistant: {assistant}\n" for user, assistant in history]
    return "\n".join(turns)
# Create the Gradio interface.
with gr.Blocks(title="Gemini Audio Chatbot") as demo:
    gr.Markdown("# Gemini Audio Chatbot")
    gr.Markdown("Talk or type your message, and the assistant will respond with text and audio.")

    # Per-session conversation state: a list of (user, assistant) tuples.
    history = gr.State([])

    with gr.Row():
        with gr.Column(scale=7):
            chat_display = gr.Markdown("No conversation history yet.")
        with gr.Column(scale=3):
            gr.Markdown("""
            ## How to use:
            1. Speak using the microphone or type your message
            2. Wait for the assistant's response
            3. The conversation history will be displayed on the left
            """)

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone"],
            type="filepath",
            label="Audio Input"
        )
    with gr.Row():
        response_text = gr.Textbox(label="Assistant's Response")
    with gr.Row():
        audio_output = gr.Audio(label="Assistant's Voice")

    with gr.Row():
        clear_btn = gr.Button("Clear Conversation")

    # A new recording triggers transcription + response, then the transcript
    # pane is refreshed from the updated history.
    audio_input.change(
        process_audio,
        inputs=[audio_input, history],
        outputs=[response_text, history, audio_output]
    ).then(
        display_history,
        inputs=[history],
        outputs=[chat_display]
    )

    # Reset state and all visible widgets in one shot.
    clear_btn.click(
        lambda: ([], "No conversation history yet.", "", None),
        outputs=[history, chat_display, response_text, audio_output]
    )

demo.launch()