Update app.py
app.py CHANGED
@@ -82,13 +82,15 @@ def chatbot(input_text, image, audio, openai_api_key, reasoning_effort, model_ch
     if audio and input_mode == "Voice":
         input_text = transcribe_audio(audio, openai_api_key)

-    if input_mode == "
-
-
-
-
+    if input_mode == "Image" and image:
+        # If Image Mode is selected and image is uploaded
+        input_text = generate_response(input_text, image, openai_api_key, reasoning_effort, model_choice)
+    elif input_mode == "Text" and input_text:
+        # If Text Mode is selected
+        input_text = generate_response(input_text, None, openai_api_key, reasoning_effort, model_choice)
+
     # Append the response to the history
-    history.append((f"User: {input_text}", f"Assistant: {
+    history.append((f"User: {input_text}", f"Assistant: {input_text}"))

     return "", history

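Two things stand out in this hunk: after transcription, `input_mode` is still `"Voice"`, so neither branch calls `generate_response` for voice input, and `input_text` is reassigned to hold the model's reply, so the tuple appended to `history` carries the reply on both the user and assistant sides. A minimal sketch of the same handler with those two points addressed, assuming the file's existing `transcribe_audio` and `generate_response` helpers:

```python
# Sketch only, not part of the commit. Keeps the user's question separate
# from the model's reply, and lets transcribed voice input reach the model.
def chatbot(input_text, image, audio, openai_api_key, reasoning_effort,
            model_choice, input_mode, history):
    history = history or []

    if audio and input_mode == "Voice":
        input_text = transcribe_audio(audio, openai_api_key)

    if input_mode == "Image" and image:
        response = generate_response(input_text, image, openai_api_key,
                                     reasoning_effort, model_choice)
    elif input_text:  # Text mode, or Voice mode after transcription
        response = generate_response(input_text, None, openai_api_key,
                                     reasoning_effort, model_choice)
    else:
        return "", history  # nothing to send

    # Question on the user side, reply on the assistant side
    history.append((f"User: {input_text}", f"Assistant: {response}"))
    return "", history
```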
@@ -241,45 +243,25 @@ custom_css = """
 # Gradio interface setup
 def create_interface():
     with gr.Blocks(css=custom_css) as demo:
-        gr.Markdown("""
-        <
-
-
-        </div>
-        """)
+        gr.Markdown("""<div class="gradio-header">
+            <h1>Multimodal Chatbot (Text + Image + Voice)</h1>
+            <h3>Interact with a chatbot using text, image, or voice inputs</h3>
+        </div>""")

-        #
-
-
-
-
-
-        - You can also upload an image, and the assistant will process it and answer questions about the image.
-        - Voice input is supported: You can upload or record an audio file, and it will be transcribed to text and sent to the assistant.
-        - Enter your OpenAI API key to start interacting with the model.
-        - You can use the 'Clear History' button to remove the conversation history.
-        - "o1" is for image chat and "o3-mini" is for text chat.
-        ### Reasoning Effort:
-        The reasoning effort controls how complex or detailed the assistant's answers should be.
-        - **Low**: Provides quick, concise answers with minimal reasoning or details.
-        - **Medium**: Offers a balanced response with a reasonable level of detail and thought.
-        - **High**: Produces more detailed, analytical, or thoughtful responses, requiring deeper reasoning.
-        """)
+        # Choose input type (Text, Image, Voice)
+        input_mode = gr.Radio(
+            label="Choose Input Mode",
+            choices=["Text", "Image", "Voice"],
+            value="Text"
+        )

         with gr.Row():
             openai_api_key = gr.Textbox(label="Enter OpenAI API Key", type="password", placeholder="sk-...", interactive=True)

-
-
-
-
-            value="Text"
-        )
-
-        with gr.Row():
-            image_input = gr.Image(label="Upload an Image", type="pil")  # Image upload input
-            input_text = gr.Textbox(label="Enter Text Question", placeholder="Ask a question or provide text", lines=2)
-            audio_input = gr.Audio(label="Upload or Record Audio", type="filepath")  # Audio upload or record input (using filepath)
+        # Text, Image, and Audio Inputs will be displayed based on the chosen mode
+        input_text = gr.Textbox(label="Enter Text Question", placeholder="Ask a question or provide text", lines=2)
+        image_input = gr.Image(label="Upload an Image", type="pil")
+        audio_input = gr.Audio(label="Upload or Record Audio", type="filepath")

         with gr.Row():
             reasoning_effort = gr.Dropdown(
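Both dropdowns feed `generate_response`, which is defined elsewhere in app.py and not shown in this diff. As orientation only, a sketch of what that call could look like, assuming the dropdown value maps directly onto the `reasoning_effort` parameter that the Chat Completions API accepts for the o1 and o3-mini reasoning models:

```python
# Sketch only: the real generate_response lives elsewhere in app.py.
from openai import OpenAI

def generate_response(prompt, image, api_key, reasoning_effort, model_choice):
    client = OpenAI(api_key=api_key)
    # Image input (the "o1" image-chat path) would be attached as an image
    # content part here; omitted to keep the sketch short.
    resp = client.chat.completions.create(
        model=model_choice,                 # "o1" or "o3-mini"
        reasoning_effort=reasoning_effort,  # "low" | "medium" | "high"
        messages=[{"role": "user", "content": prompt}],
    )
    return resp.choices[0].message.content
```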
@@ -290,17 +272,28 @@ def create_interface():
             model_choice = gr.Dropdown(
                 label="Select Model",
                 choices=["o1", "o3-mini"],
-                value="o1"
+                value="o1"
             )
             submit_btn = gr.Button("Ask!", elem_id="submit-btn")
             clear_btn = gr.Button("Clear History", elem_id="clear-history")

         chat_history = gr.Chatbot()

+        # Dynamically control the input visibility based on the selected mode
+        def toggle_inputs(input_mode):
+            if input_mode == "Text":
+                return input_text, None, None
+            elif input_mode == "Image":
+                return None, image_input, None
+            else:  # Voice
+                return None, None, audio_input
+
         # Button interactions
         submit_btn.click(fn=chatbot, inputs=[input_text, image_input, audio_input, openai_api_key, reasoning_effort, model_choice, input_mode, chat_history], outputs=[input_text, chat_history])
         clear_btn.click(fn=clear_history, inputs=[], outputs=[chat_history, chat_history])

+        input_mode.change(toggle_inputs, inputs=[input_mode], outputs=[input_text, image_input, audio_input])
+
     return demo

 # Run the interface
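One caveat on the `toggle_inputs` wiring in this hunk: in Gradio, returning a component object or `None` from an event handler updates the component's value rather than its visibility, so the unused inputs are cleared but not hidden. The documented show/hide pattern returns `gr.update(visible=...)` for each output; a sketch of that variant, which would replace the inner function inside `create_interface`:

```python
# Sketch only: toggles visibility instead of clearing values.
def toggle_inputs(input_mode):
    return (
        gr.update(visible=(input_mode == "Text")),
        gr.update(visible=(input_mode == "Image")),
        gr.update(visible=(input_mode == "Voice")),
    )

# Same wiring as the commit:
input_mode.change(toggle_inputs, inputs=[input_mode],
                  outputs=[input_text, image_input, audio_input])
```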