File size: 4,847 Bytes
5c8bbca
 
176259d
5c8bbca
 
 
6a5b9ba
5c8bbca
6dbd5d4
176259d
 
5c8bbca
6f0b49b
5c8bbca
 
 
 
 
 
 
 
 
176259d
 
fd28ea9
176259d
ec99c04
176259d
 
5c8bbca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176259d
 
 
5413e87
 
5c8bbca
 
 
5413e87
5c8bbca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1184cf6
5c8bbca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import os
import gradio as gr
from google import genai
from gtts import gTTS
import tempfile
import time

# Configure the Gemini API client.
GOOGLE_API_KEY = os.getenv("gemini_api") # Key is read from the "gemini_api" env var — export it before launching.

client = genai.Client(api_key=GOOGLE_API_KEY)



def transcribe_audio(audio_path):
    """Transcribe the audio file at *audio_path* using Gemini.

    Bug fixed: the original passed the file *path string* directly in
    ``contents``, so the model only ever saw the path as plain text and
    never received the audio. The file must be uploaded with
    ``client.files.upload`` and the returned file object passed instead.

    Args:
        audio_path: Filesystem path to the recorded audio (as produced by
            the Gradio ``Audio`` component with ``type="filepath"``).

    Returns:
        The transcription text returned by the model.
    """
    # Upload the audio so the model receives the actual waveform, not the path.
    uploaded_file = client.files.upload(file=audio_path)
    response = client.models.generate_content(
        model='gemini-2.0-flash',
        contents=['Transcribe the input audio & return the transcription only Example - Audio file is transcribed to Hello then just return Hello', uploaded_file]
    )
    print(response.text)

    return response.text

def text_to_speech(text):
    """Synthesize *text* into an MP3 via gTTS and return the file's path.

    The temp file is created with ``delete=False`` on purpose: the caller
    (the Gradio audio component) needs the file to outlive this function.
    """
    speech = gTTS(text=text, lang='en')
    handle = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    with handle as fp:
        speech.save(fp.name)
    return handle.name

# Lazily-created chat session shared across calls; the SDK chat object keeps
# the server-side conversation context between messages.
_chat_session = None

def chat_with_gemini(user_input, history):
    """Send *user_input* to Gemini and return (text, history, audio_path).

    Bug fixed: the original bound ``chat`` only inside the ``if not history``
    branch, so every call after the first raised ``NameError`` at
    ``chat.send_message(...)``. A module-level session is now reused across
    calls and re-created whenever the history has been cleared, so multi-turn
    context actually works.

    Args:
        user_input: The user's message text.
        history: Flat list alternating [user, assistant, ...]; None or empty
            starts a fresh conversation.

    Returns:
        Tuple of (assistant reply text, updated history list, path to the
        synthesized reply MP3).
    """
    global _chat_session
    if not history:
        history = []
        # Fresh conversation: start a new chat session.
        _chat_session = client.chats.create(model="gemini-2.0-flash")
    elif _chat_session is None:
        # Defensive: history exists but no live session (e.g. after reload).
        _chat_session = client.chats.create(model="gemini-2.0-flash")

    print("History is", history)
    print("User input is ", user_input)
    # Generate response within the persistent session.
    response = _chat_session.send_message(user_input)
    response_text = response.text
    print("Response text is ", response_text)

    # Update history (flat [user, assistant, ...] pairs).
    history.append(user_input)
    history.append(response_text)

    # Voice the reply so the UI can play it back.
    audio_path = text_to_speech(response_text)

    return response_text, history, audio_path

def process_audio(audio, history):
    """Transcribe a recorded clip and route it through the chat pipeline.

    Returns (response_text, updated_history, response_audio_path); when no
    audio was captured, returns a placeholder message with history untouched.
    """
    if audio is None:
        return "No audio detected", history, None

    transcript = transcribe_audio(audio)
    return chat_with_gemini(transcript, history)

def process_text(text_input, history):
    """Route typed input through the chat pipeline.

    Returns (response_text, updated_history, response_audio_path); blank or
    whitespace-only input is rejected with a placeholder message.
    """
    if not text_input.strip():
        return "No input detected", history, None

    return chat_with_gemini(text_input, history)

def display_history(history):
    """Format a flat [user, assistant, ...] history list for display.

    Fixes: the inner ``if i < len(history)`` was always true inside the
    stepped ``range`` loop (dead check), and repeated ``+=`` string
    concatenation is quadratic — replaced with a single ``join``.

    Args:
        history: Flat list alternating user and assistant messages; a
            trailing user message without a reply is handled.

    Returns:
        A displayable transcript string, or a placeholder when empty.
    """
    if not history:
        return "No conversation history yet."

    parts = []
    for i in range(0, len(history), 2):
        parts.append(f"You: {history[i]}\n\n")
        # Guard the assistant slot: the last user turn may not have a reply yet.
        if i + 1 < len(history):
            parts.append(f"Assistant: {history[i+1]}\n\n")

    return "".join(parts)

# Create the Gradio interface.
# Bug fixed: the on-screen instructions promise typed input and process_text()
# already exists, but the original UI never exposed a textbox — added one,
# wired through the existing text pipeline.
with gr.Blocks(title="Gemini Audio Chatbot") as demo:
    gr.Markdown("# Gemini Audio Chatbot")
    gr.Markdown("Talk or type your message, and the assistant will respond with text and audio.")

    # Conversation history: flat list alternating [user, assistant, ...].
    history = gr.State([])

    with gr.Row():
        with gr.Column(scale=7):
            # Chat history display
            chat_display = gr.Markdown("No conversation history yet.")

        with gr.Column(scale=3):
            # Info and instructions
            gr.Markdown("""
            ## How to use:
            1. Speak using the microphone or type your message
            2. Wait for the assistant's response
            3. The conversation history will be displayed on the left
            """)

    with gr.Row():
        # Audio input
        audio_input = gr.Audio(
            sources=["microphone"],
            type="filepath",
            label="Audio Input"
        )

    with gr.Row():
        # Text input (previously missing despite the instructions above).
        text_input = gr.Textbox(
            label="Text Input",
            placeholder="Type your message and press Enter"
        )

    with gr.Row():
        # Assistant's response
        response_text = gr.Textbox(label="Assistant's Response")

    with gr.Row():
        # Audio output
        audio_output = gr.Audio(label="Assistant's Voice")

    # Buttons
    with gr.Row():
        clear_btn = gr.Button("Clear Conversation")

    # Recording finished -> transcribe, chat, speak, then refresh transcript.
    audio_input.change(
        process_audio,
        inputs=[audio_input, history],
        outputs=[response_text, history, audio_output]
    ).then(
        display_history,
        inputs=[history],
        outputs=[chat_display]
    )

    # Enter in the textbox -> chat, speak, then refresh transcript.
    text_input.submit(
        process_text,
        inputs=[text_input, history],
        outputs=[response_text, history, audio_output]
    ).then(
        display_history,
        inputs=[history],
        outputs=[chat_display]
    )

    # Reset state and all output widgets.
    clear_btn.click(
        lambda: ([], "No conversation history yet.", "", None),
        outputs=[history, chat_display, response_text, audio_output]
    )

demo.launch()