File size: 5,105 Bytes
9203946
 
 
 
 
 
 
 
 
528c829
9203946
 
 
 
 
 
b9e169a
df33bb7
6d394e1
9203946
 
 
65f35bc
9203946
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d846897
9203946
1b68154
 
 
9203946
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b68154
 
 
9203946
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df33bb7
 
9203946
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df33bb7
 
 
 
 
 
 
 
 
9203946
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import os
import gradio as gr
from google import genai
from gtts import gTTS
import tempfile

# Configure the Gemini API
# NOTE(review): the key is read from the env var literally named "gemini_api";
# os.getenv returns None when it is unset — confirm genai.Client handles that.
GOOGLE_API_KEY = os.getenv("gemini_api")  # Ensure your API key is set
# Single shared API client used by every function below.
client = genai.Client(api_key=GOOGLE_API_KEY)
# Module-level chat session; created lazily on first call to chat_with_gemini
# so the whole conversation shares one Gemini chat context.
chat=None
def transcribe_audio(audio_path):
    """
    Send an audio file to the Gemini API and return its transcription.

    On any failure (upload or generation) the error is printed and the
    literal string "Error in transcription" is returned instead of raising,
    so the UI keeps working.
    """
    try:
        print("Audio Path is", audio_path)
        # Upload first; the returned file handle is passed along with the prompt.
        uploaded = client.files.upload(file=audio_path)
        result = client.models.generate_content(
            model='gemini-2.0-flash',
            contents=['Transcribe the input audio & return only the transcription.', uploaded],
        )
        print("Transcription Response:", result.text)
        return result.text
    except Exception as exc:
        print("Error in transcription:", str(exc))
        return "Error in transcription"

def text_to_speech(text):
    """Convert *text* to speech with gTTS and return the path to an MP3 file.

    The previous version called ``tts.save(fp.name)`` while the
    ``NamedTemporaryFile`` handle was still open; on Windows the open handle
    locks the file and the save fails. Create the path with ``mkstemp`` and
    close the descriptor before gTTS writes to it.
    """
    fd, path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)  # release the OS handle so gTTS can open the path on any platform
    tts = gTTS(text=text, lang='en')
    tts.save(path)
    return path

def chat_with_gemini(user_input, history):
    """
    Send user_input to the shared Gemini chat session.

    Returns a (response_text, updated_history, audio_path) triple, where
    audio_path points to an MP3 rendering of the reply.
    """
    global chat

    # Guard against a missing or wrongly-typed history coming from the UI.
    if history is None or not isinstance(history, list):
        history = []

    # Lazily create one chat session that is reused across calls so the
    # model keeps the conversation context.
    if chat is None:
        chat = client.chats.create(model="gemini-2.0-flash")

    print("User input:", user_input)

    reply = chat.send_message(user_input).text
    print("Response text:", reply)

    # Record the exchange as a (user, assistant) pair.
    history.append((user_input, reply))

    return reply, history, text_to_speech(reply)

def process_audio(audio, history):
    """Transcribe recorded audio and forward the text to the chatbot."""
    if audio is None:
        # Nothing recorded (e.g. the widget was cleared): keep history intact.
        return "No audio detected", history, None

    transcript = transcribe_audio(audio)
    # chat_with_gemini already returns (response_text, history, audio_path).
    return chat_with_gemini(transcript, history)

def process_text(text_input, history):
    """Process typed input and return (response_text, history, audio_path).

    Fix: the old guard called ``text_input.strip()`` unconditionally, which
    raised AttributeError when Gradio handed in ``None``. Treat None the same
    as empty/whitespace-only input.
    """
    if not text_input or not text_input.strip():
        return "No input detected", history, None

    # Get response from Gemini
    response_text, new_history, audio_path = chat_with_gemini(text_input, history)

    return response_text, new_history, audio_path

def display_history(history):
    """Render the (user, assistant) pairs as a readable transcript string."""
    if not history:
        return "No conversation history yet."

    # One "You/Assistant" stanza per exchange, separated by blank lines.
    return "\n".join(
        "You: {0}\nAssistant: {1}\n".format(pair[0], pair[1]) for pair in history
    )

# Create the Gradio interface.
# Layout: conversation transcript on the left, usage notes on the right,
# then microphone input, the assistant's text reply, its spoken reply,
# and a clear button. Wiring is done below the layout.
with gr.Blocks(title="Gemini Audio Chatbot") as demo:
    gr.Markdown("# Gemini Audio Chatbot")
    gr.Markdown("Talk or type your message, and the assistant will respond with text and audio.")
    
    # State for conversation history — a list of (user, assistant) tuples
    # that persists across interactions within one browser session.
    history = gr.State([])  # Ensuring the history persists
    
    with gr.Row():
        with gr.Column(scale=7):
            chat_display = gr.Markdown("No conversation history yet.")
        
        with gr.Column(scale=3):
            gr.Markdown("""
            ## How to use:
            1. Speak using the microphone or type your message
            2. Wait for the assistant's response
            3. The conversation history will be displayed on the left
            """)
    
    with gr.Row():
        # type="filepath" makes Gradio hand process_audio a path on disk,
        # which transcribe_audio uploads to Gemini.
        audio_input = gr.Audio(
            sources=["microphone"],
            type="filepath",
            label="Audio Input"
        )
    
    # with gr.Row():
    #     text_input = gr.Textbox(label="Type your message here")

    with gr.Row():
        response_text = gr.Textbox(label="Assistant's Response")
    
    with gr.Row():
        audio_output = gr.Audio(label="Assistant's Voice")
    
    # Buttons
    with gr.Row():
        clear_btn = gr.Button("Clear Conversation")
    
    # Audio and Text Input Handling.
    # .change fires on every audio-widget change (including clearing it to
    # None, which process_audio guards against), then the .then step
    # re-renders the transcript from the updated history state.
    audio_input.change(
        process_audio,
        inputs=[audio_input, history],
        outputs=[response_text, history, audio_output]
    ).then(
        display_history,
        inputs=[history],
        outputs=[chat_display]
    )
    
    # text_input.submit(
    #     process_text,
    #     inputs=[text_input, history],
    #     outputs=[response_text, history, audio_output]
    # ).then(
    #     display_history,
    #     inputs=[history],
    #     outputs=[chat_display]
    # )

    # Clear conversation: resets the UI state and widgets.
    # NOTE(review): this does not reset the module-level `chat` session, so
    # the Gemini model still remembers earlier turns after a "clear" —
    # confirm whether that is intended.
    clear_btn.click(
        lambda: ([], "No conversation history yet.", "", None),
        outputs=[history, chat_display, response_text, audio_output]
    )

# Blocking call: starts the Gradio server when the script is run.
demo.launch()