# Gemini Audio Chatbot — a Gradio app backed by the Gemini API and gTTS.
import os
import gradio as gr
from google import genai
from gtts import gTTS
import tempfile
import time
# Configure the Gemini API client.
# The key is read from the `gemini_api` environment variable so it is never
# hard-coded in source; set it in your environment (or Space secrets).
GOOGLE_API_KEY = os.getenv("gemini_api") # falls back to None if the env var is unset
client = genai.Client(api_key=GOOGLE_API_KEY)
def transcribe_audio(audio_path):
    """Transcribe an audio file to text with the Gemini API.

    Args:
        audio_path: Filesystem path to the recorded audio clip.

    Returns:
        The transcription text produced by the model.
    """
    # Upload the audio so the model receives the actual waveform; passing the
    # bare path string in `contents` (as the original did) would only send the
    # path itself as text, not the audio content.
    audio_file = client.files.upload(file=audio_path)
    response = client.models.generate_content(
        model='gemini-2.0-flash',
        contents=[
            'Transcribe the input audio & return the transcription only '
            'Example - Audio file is transcribed to Hello then just return Hello',
            audio_file,
        ],
    )
    print(response.text)
    return response.text
def text_to_speech(text):
    """Convert *text* to spoken English audio and return the MP3 file path.

    Args:
        text: The text to synthesize.

    Returns:
        Path to a temporary ``.mp3`` file; the caller owns its cleanup.
    """
    # mkstemp + close instead of an open NamedTemporaryFile: on Windows the
    # still-open handle locks the file and gTTS.save() cannot write to it.
    fd, path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)
    tts = gTTS(text=text, lang='en')
    tts.save(path)
    return path
def chat_with_gemini(user_input, history):
    """Send *user_input* to Gemini and return the reply, history, and audio.

    Args:
        user_input: The user's message text.
        history: Flat list alternating ``[user, assistant, user, ...]`` turns.

    Returns:
        Tuple of ``(response_text, updated_history, audio_path)``.

    Bug fixed: the original created the chat session only when *history* was
    empty, so every follow-up turn raised ``NameError`` on ``chat`` (and the
    model never saw prior context). We now always create a session, replaying
    previous turns so the conversation stays coherent.
    """
    if not history:
        history = []
    # Rebuild the session with prior turns: even indexes are the user's
    # messages, odd indexes are the model's replies.
    prior_turns = [
        {"role": "user" if i % 2 == 0 else "model", "parts": [{"text": msg}]}
        for i, msg in enumerate(history)
    ]
    chat = client.chats.create(model="gemini-2.0-flash", history=prior_turns)
    print("History is", history)
    print("User input is ", user_input)
    # Generate the model's reply for this turn.
    response = chat.send_message(user_input)
    response_text = response.text
    print("Response text is ", response_text)
    # Record both sides of the exchange.
    history.append(user_input)
    history.append(response_text)
    # Speak the reply as well.
    audio_path = text_to_speech(response_text)
    return response_text, history, audio_path
def process_audio(audio, history):
    """Route a recorded clip through transcription and the Gemini chat.

    Returns ``(response_text, updated_history, audio_path)``; when no audio
    was captured, a placeholder message is returned and history is untouched.
    """
    # Guard clause: nothing recorded means nothing to transcribe.
    if audio is None:
        return "No audio detected", history, None
    # Speech -> text, then hand off to the shared chat pipeline.
    transcript = transcribe_audio(audio)
    return chat_with_gemini(transcript, history)
def process_text(text_input, history):
    """Route typed input through the Gemini chat pipeline.

    Returns ``(response_text, updated_history, audio_path)``; blank or
    whitespace-only input short-circuits with a placeholder message.
    """
    # Guard clause: ignore empty submissions.
    if not text_input.strip():
        return "No input detected", history, None
    return chat_with_gemini(text_input, history)
def display_history(history):
    """Render the flat ``[user, assistant, ...]`` history as readable text.

    Args:
        history: Alternating list of user and assistant messages.

    Returns:
        A formatted transcript, or a placeholder when history is empty.

    Cleanup: the original's ``if i < len(history)`` check was always true
    inside the ``range`` loop (dead code), and repeated ``+=`` string
    concatenation was replaced with a single ``"".join``.
    """
    if not history:
        return "No conversation history yet."
    parts = []
    # Walk the list in user/assistant pairs; a trailing user turn without a
    # reply yet is still shown.
    for i in range(0, len(history), 2):
        parts.append(f"You: {history[i]}\n\n")
        if i + 1 < len(history):
            parts.append(f"Assistant: {history[i + 1]}\n\n")
    return "".join(parts)
# Create the Gradio interface.
# Fixes: removed the stray trailing "|" after demo.launch() (a copy/paste
# artifact that made the file a SyntaxError), and wired up a text input —
# the UI advertised "type your message" and process_text existed, but no
# textbox was ever connected.
with gr.Blocks(title="Gemini Audio Chatbot") as demo:
    gr.Markdown("# Gemini Audio Chatbot")
    gr.Markdown("Talk or type your message, and the assistant will respond with text and audio.")
    # Conversation state: flat [user, assistant, user, assistant, ...] list.
    history = gr.State([])
    with gr.Row():
        with gr.Column(scale=7):
            # Running transcript of the conversation.
            chat_display = gr.Markdown("No conversation history yet.")
        with gr.Column(scale=3):
            # Info and instructions
            gr.Markdown("""
            ## How to use:
            1. Speak using the microphone or type your message
            2. Wait for the assistant's response
            3. The conversation history will be displayed on the left
            """)
    with gr.Row():
        # Microphone capture, delivered to handlers as a file path.
        audio_input = gr.Audio(
            sources=["microphone"],
            type="filepath",
            label="Audio Input"
        )
    with gr.Row():
        # Typed input; submits on Enter.
        text_input = gr.Textbox(label="Type your message")
    with gr.Row():
        # Assistant's latest reply as text.
        response_text = gr.Textbox(label="Assistant's Response")
    with gr.Row():
        # Assistant's latest reply spoken aloud.
        audio_output = gr.Audio(label="Assistant's Voice")
    with gr.Row():
        clear_btn = gr.Button("Clear Conversation")

    # Voice pipeline: transcribe -> respond -> refresh the transcript pane.
    audio_input.change(
        process_audio,
        inputs=[audio_input, history],
        outputs=[response_text, history, audio_output]
    ).then(
        display_history,
        inputs=[history],
        outputs=[chat_display]
    )

    # Text pipeline mirrors the voice pipeline.
    text_input.submit(
        process_text,
        inputs=[text_input, history],
        outputs=[response_text, history, audio_output]
    ).then(
        display_history,
        inputs=[history],
        outputs=[chat_display]
    )

    # Reset the state and every visible pane.
    clear_btn.click(
        lambda: ([], "No conversation history yet.", "", None),
        outputs=[history, chat_display, response_text, audio_output]
    )

demo.launch()