File size: 5,105 Bytes
9203946
 
 
 
 
 
 
 
 
528c829
9203946
 
 
 
 
 
b9e169a
df33bb7
6d394e1
9203946
 
 
65f35bc
9203946
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d846897
9203946
1b68154
 
 
9203946
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b68154
 
 
9203946
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df33bb7
 
9203946
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df33bb7
 
 
 
 
 
 
 
 
9203946
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import os
import gradio as gr
from google import genai
from gtts import gTTS
import tempfile

# Configure the Gemini API
# NOTE(review): the key is read from the env var literally named "gemini_api";
# os.getenv returns None when it is unset — confirm genai.Client handles that.
GOOGLE_API_KEY = os.getenv("gemini_api")  # Ensure your API key is set
# Single shared API client used by every function below.
client = genai.Client(api_key=GOOGLE_API_KEY)
# Module-level chat session; created lazily on first call to chat_with_gemini
# so the whole conversation shares one Gemini chat context.
chat=None
def transcribe_audio(audio_path):
    """
    Send an audio file to the Gemini API and return its transcription.

    On any failure (upload or generation) the error is printed and the
    literal string "Error in transcription" is returned instead of raising,
    so the UI keeps working.
    """
    try:
        print("Audio Path is", audio_path)
        # Upload first; the returned file handle is passed along with the prompt.
        uploaded = client.files.upload(file=audio_path)
        result = client.models.generate_content(
            model='gemini-2.0-flash',
            contents=['Transcribe the input audio & return only the transcription.', uploaded],
        )
        print("Transcription Response:", result.text)
        return result.text
    except Exception as exc:
        print("Error in transcription:", str(exc))
        return "Error in transcription"

def text_to_speech(text):
    """Convert *text* to speech with gTTS and return the path to an MP3 file.

    The previous version called ``tts.save(fp.name)`` while the
    ``NamedTemporaryFile`` handle was still open; on Windows the open handle
    locks the file and the save fails. Create the path with ``mkstemp`` and
    close the descriptor before gTTS writes to it.
    """
    fd, path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)  # release the OS handle so gTTS can open the path on any platform
    tts = gTTS(text=text, lang='en')
    tts.save(path)
    return path

def chat_with_gemini(user_input, history):
    """
    Send user_input to the shared Gemini chat session.

    Returns a (response_text, updated_history, audio_path) triple, where
    audio_path points to an MP3 rendering of the reply.
    """
    global chat

    # Guard against a missing or wrongly-typed history coming from the UI.
    if history is None or not isinstance(history, list):
        history = []

    # Lazily create one chat session that is reused across calls so the
    # model keeps the conversation context.
    if chat is None:
        chat = client.chats.create(model="gemini-2.0-flash")

    print("User input:", user_input)

    reply = chat.send_message(user_input).text
    print("Response text:", reply)

    # Record the exchange as a (user, assistant) pair.
    history.append((user_input, reply))

    return reply, history, text_to_speech(reply)

def process_audio(audio, history):
    """Transcribe recorded audio and forward the text to the chatbot."""
    if audio is None:
        # Nothing recorded (e.g. the widget was cleared): keep history intact.
        return "No audio detected", history, None

    transcript = transcribe_audio(audio)
    # chat_with_gemini already returns (response_text, history, audio_path).
    return chat_with_gemini(transcript, history)

def process_text(text_input, history):
    """Process typed input and return (response_text, history, audio_path).

    Fix: the old guard called ``text_input.strip()`` unconditionally, which
    raised AttributeError when Gradio handed in ``None``. Treat None the same
    as empty/whitespace-only input.
    """
    if not text_input or not text_input.strip():
        return "No input detected", history, None

    # Get response from Gemini
    response_text, new_history, audio_path = chat_with_gemini(text_input, history)

    return response_text, new_history, audio_path

def display_history(history):
    """Render the (user, assistant) pairs as a readable transcript string."""
    if not history:
        return "No conversation history yet."

    # One "You/Assistant" stanza per exchange, separated by blank lines.
    return "\n".join(
        "You: {0}\nAssistant: {1}\n".format(pair[0], pair[1]) for pair in history
    )

# Create the Gradio interface.
# Layout: conversation transcript on the left, usage notes on the right,
# then microphone input, the assistant's text reply, its spoken reply,
# and a clear button. Wiring is done below the layout.
with gr.Blocks(title="Gemini Audio Chatbot") as demo:
    gr.Markdown("# Gemini Audio Chatbot")
    gr.Markdown("Talk or type your message, and the assistant will respond with text and audio.")
    
    # State for conversation history — a list of (user, assistant) tuples
    # that persists across interactions within one browser session.
    history = gr.State([])  # Ensuring the history persists
    
    with gr.Row():
        with gr.Column(scale=7):
            chat_display = gr.Markdown("No conversation history yet.")
        
        with gr.Column(scale=3):
            gr.Markdown("""
            ## How to use:
            1. Speak using the microphone or type your message
            2. Wait for the assistant's response
            3. The conversation history will be displayed on the left
            """)
    
    with gr.Row():
        # type="filepath" makes Gradio hand process_audio a path on disk,
        # which transcribe_audio uploads to Gemini.
        audio_input = gr.Audio(
            sources=["microphone"],
            type="filepath",
            label="Audio Input"
        )
    
    # with gr.Row():
    #     text_input = gr.Textbox(label="Type your message here")

    with gr.Row():
        response_text = gr.Textbox(label="Assistant's Response")
    
    with gr.Row():
        audio_output = gr.Audio(label="Assistant's Voice")
    
    # Buttons
    with gr.Row():
        clear_btn = gr.Button("Clear Conversation")
    
    # Audio and Text Input Handling.
    # .change fires on every audio-widget change (including clearing it to
    # None, which process_audio guards against), then the .then step
    # re-renders the transcript from the updated history state.
    audio_input.change(
        process_audio,
        inputs=[audio_input, history],
        outputs=[response_text, history, audio_output]
    ).then(
        display_history,
        inputs=[history],
        outputs=[chat_display]
    )
    
    # text_input.submit(
    #     process_text,
    #     inputs=[text_input, history],
    #     outputs=[response_text, history, audio_output]
    # ).then(
    #     display_history,
    #     inputs=[history],
    #     outputs=[chat_display]
    # )

    # Clear conversation: resets the UI state and widgets.
    # NOTE(review): this does not reset the module-level `chat` session, so
    # the Gemini model still remembers earlier turns after a "clear" —
    # confirm whether that is intended.
    clear_btn.click(
        lambda: ([], "No conversation history yet.", "", None),
        outputs=[history, chat_display, response_text, audio_output]
    )

# Blocking call: starts the Gradio server when the script is run.
demo.launch()