File size: 4,847 Bytes
5c8bbca
 
176259d
5c8bbca
 
 
6a5b9ba
5c8bbca
6dbd5d4
176259d
 
5c8bbca
6f0b49b
5c8bbca
 
 
 
 
 
 
 
 
176259d
 
fd28ea9
176259d
ec99c04
176259d
 
5c8bbca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176259d
 
 
5413e87
 
5c8bbca
 
 
5413e87
5c8bbca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1184cf6
5c8bbca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import os
import gradio as gr
from google import genai
from gtts import gTTS
import tempfile
import time

# Configure the Gemini API client.
GOOGLE_API_KEY = os.getenv("gemini_api") # Key is read from the "gemini_api" env var — export it before launching.

client = genai.Client(api_key=GOOGLE_API_KEY)



def transcribe_audio(audio_path):
    """Transcribe the audio file at *audio_path* using Gemini.

    Bug fixed: the original passed the file *path string* directly in
    ``contents``, so the model only ever saw the path as plain text and
    never received the audio. The file must be uploaded with
    ``client.files.upload`` and the returned file object passed instead.

    Args:
        audio_path: Filesystem path to the recorded audio (as produced by
            the Gradio ``Audio`` component with ``type="filepath"``).

    Returns:
        The transcription text returned by the model.
    """
    # Upload the audio so the model receives the actual waveform, not the path.
    uploaded_file = client.files.upload(file=audio_path)
    response = client.models.generate_content(
        model='gemini-2.0-flash',
        contents=['Transcribe the input audio & return the transcription only Example - Audio file is transcribed to Hello then just return Hello', uploaded_file]
    )
    print(response.text)

    return response.text

def text_to_speech(text):
    """Synthesize *text* into an MP3 via gTTS and return the file's path.

    The temp file is created with ``delete=False`` on purpose: the caller
    (the Gradio audio component) needs the file to outlive this function.
    """
    speech = gTTS(text=text, lang='en')
    handle = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    with handle as fp:
        speech.save(fp.name)
    return handle.name

# Lazily-created chat session shared across calls; the SDK chat object keeps
# the server-side conversation context between messages.
_chat_session = None

def chat_with_gemini(user_input, history):
    """Send *user_input* to Gemini and return (text, history, audio_path).

    Bug fixed: the original bound ``chat`` only inside the ``if not history``
    branch, so every call after the first raised ``NameError`` at
    ``chat.send_message(...)``. A module-level session is now reused across
    calls and re-created whenever the history has been cleared, so multi-turn
    context actually works.

    Args:
        user_input: The user's message text.
        history: Flat list alternating [user, assistant, ...]; None or empty
            starts a fresh conversation.

    Returns:
        Tuple of (assistant reply text, updated history list, path to the
        synthesized reply MP3).
    """
    global _chat_session
    if not history:
        history = []
        # Fresh conversation: start a new chat session.
        _chat_session = client.chats.create(model="gemini-2.0-flash")
    elif _chat_session is None:
        # Defensive: history exists but no live session (e.g. after reload).
        _chat_session = client.chats.create(model="gemini-2.0-flash")

    print("History is", history)
    print("User input is ", user_input)
    # Generate response within the persistent session.
    response = _chat_session.send_message(user_input)
    response_text = response.text
    print("Response text is ", response_text)

    # Update history (flat [user, assistant, ...] pairs).
    history.append(user_input)
    history.append(response_text)

    # Voice the reply so the UI can play it back.
    audio_path = text_to_speech(response_text)

    return response_text, history, audio_path

def process_audio(audio, history):
    """Transcribe a recorded clip and route it through the chat pipeline.

    Returns (response_text, updated_history, response_audio_path); when no
    audio was captured, returns a placeholder message with history untouched.
    """
    if audio is None:
        return "No audio detected", history, None

    transcript = transcribe_audio(audio)
    return chat_with_gemini(transcript, history)

def process_text(text_input, history):
    """Route typed input through the chat pipeline.

    Returns (response_text, updated_history, response_audio_path); blank or
    whitespace-only input is rejected with a placeholder message.
    """
    if not text_input.strip():
        return "No input detected", history, None

    return chat_with_gemini(text_input, history)

def display_history(history):
    """Format a flat [user, assistant, ...] history list for display.

    Fixes: the inner ``if i < len(history)`` was always true inside the
    stepped ``range`` loop (dead check), and repeated ``+=`` string
    concatenation is quadratic — replaced with a single ``join``.

    Args:
        history: Flat list alternating user and assistant messages; a
            trailing user message without a reply is handled.

    Returns:
        A displayable transcript string, or a placeholder when empty.
    """
    if not history:
        return "No conversation history yet."

    parts = []
    for i in range(0, len(history), 2):
        parts.append(f"You: {history[i]}\n\n")
        # Guard the assistant slot: the last user turn may not have a reply yet.
        if i + 1 < len(history):
            parts.append(f"Assistant: {history[i+1]}\n\n")

    return "".join(parts)

# Create the Gradio interface.
# Bug fixed: the on-screen instructions promise typed input and process_text()
# already exists, but the original UI never exposed a textbox — added one,
# wired through the existing text pipeline.
with gr.Blocks(title="Gemini Audio Chatbot") as demo:
    gr.Markdown("# Gemini Audio Chatbot")
    gr.Markdown("Talk or type your message, and the assistant will respond with text and audio.")

    # Conversation history: flat list alternating [user, assistant, ...].
    history = gr.State([])

    with gr.Row():
        with gr.Column(scale=7):
            # Chat history display
            chat_display = gr.Markdown("No conversation history yet.")

        with gr.Column(scale=3):
            # Info and instructions
            gr.Markdown("""
            ## How to use:
            1. Speak using the microphone or type your message
            2. Wait for the assistant's response
            3. The conversation history will be displayed on the left
            """)

    with gr.Row():
        # Audio input
        audio_input = gr.Audio(
            sources=["microphone"],
            type="filepath",
            label="Audio Input"
        )

    with gr.Row():
        # Text input (previously missing despite the instructions above).
        text_input = gr.Textbox(
            label="Text Input",
            placeholder="Type your message and press Enter"
        )

    with gr.Row():
        # Assistant's response
        response_text = gr.Textbox(label="Assistant's Response")

    with gr.Row():
        # Audio output
        audio_output = gr.Audio(label="Assistant's Voice")

    # Buttons
    with gr.Row():
        clear_btn = gr.Button("Clear Conversation")

    # Recording finished -> transcribe, chat, speak, then refresh transcript.
    audio_input.change(
        process_audio,
        inputs=[audio_input, history],
        outputs=[response_text, history, audio_output]
    ).then(
        display_history,
        inputs=[history],
        outputs=[chat_display]
    )

    # Enter in the textbox -> chat, speak, then refresh transcript.
    text_input.submit(
        process_text,
        inputs=[text_input, history],
        outputs=[response_text, history, audio_output]
    ).then(
        display_history,
        inputs=[history],
        outputs=[chat_display]
    )

    # Reset state and all output widgets.
    clear_btn.click(
        lambda: ([], "No conversation history yet.", "", None),
        outputs=[history, chat_display, response_text, audio_output]
    )

demo.launch()