nguyenbh committed on
Commit b8d2774 · 1 Parent(s): 5325553

Update examples

Files changed (1)
  1. app.py +325 -161
app.py CHANGED
@@ -62,33 +62,114 @@ def call_aml_endpoint(payload, url, api_key):
        logger.error(f"Error message: {error_message}")
        return {"error": error_message}

- def load_audio_from_url(url):
-     """Load audio from a URL using soundfile
    Args:
        url (str): URL of the audio file
    Returns:
-         tuple: (sample_rate, audio_data) if successful, None otherwise
-         str: file path to the temporary saved audio file
    """
    try:
        # Get the audio file from the URL
-         response = requests.get(url)
-         response.raise_for_status()  # Raise exception for bad status codes

-         # For other formats that soundfile supports directly (WAV, FLAC, etc.)
-         audio_data, sample_rate = sf.read(BytesIO(response.content))

-         # Save to a temporary file to be used by the chatbot
        file_extension = os.path.splitext(url)[1].lower()
-         if not file_extension:
-             file_extension = '.wav'  # Default to .wav if no extension
-
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=file_extension)
-         sf.write(temp_file.name, audio_data, sample_rate)

-         return (sample_rate, audio_data), temp_file.name
    except Exception as e:
-         logger.error(f"Error loading audio from URL: {e}")
        return None, None

def encode_base64_from_file(file_path):
@@ -185,6 +266,100 @@ def process_message(history, message, conversation_state):

    return history, gr.MultimodalTextbox(value=None, interactive=False), conversation_state

def bot_response(history, conversation_state):
    """Generate bot response based on conversation state."""
    if not conversation_state:
@@ -252,11 +427,57 @@ def bot_response(history, conversation_state):

    return history, conversation_state

# Create Gradio demo
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
-     title = gr.Markdown("# Azure ML Multimodal Chatbot Demo")
    description = gr.Markdown("""
-     This demo allows you to interact with a multimodal AI model through Azure ML.
    You can type messages, upload images, or record audio to communicate with the AI.
    """)
@@ -264,7 +485,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
    conversation_state = gr.State([])

    with gr.Row():
-         with gr.Column(scale=4):
            chatbot = gr.Chatbot(
                type="messages",
                avatar_images=(None, "https://upload.wikimedia.org/wikipedia/commons/d/d3/Phi-integrated-information-symbol.png",),
@@ -283,131 +504,107 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
            clear_btn = gr.ClearButton([chatbot, chat_input], value="Clear conversation")
            clear_btn.click(lambda: [], None, conversation_state)  # Also clear the conversation state
            gr.HTML("<div style='text-align: right; margin-top: 5px;'><small>Powered by Azure ML</small></div>")
-
-             # Define function to handle example submission directly
-             def handle_example_submission(text, files, history, conv_state):
-                 """
-                 Process an example submission directly including bot response
-                 This bypasses the regular chat_input.submit flow
-                 """
-                 # Create a message object similar to what would be submitted by the user
-                 message = {"text": text, "files": files if files else []}
-
-                 # Use the same processing function as normal submissions
-                 new_history, _, new_conv_state = process_message(history, message, conv_state)
-
-                 # Then immediately trigger the bot response
-                 final_history, final_conv_state = bot_response(new_history, new_conv_state)
-
-                 # Re-enable the input box
-                 chat_input.update(interactive=True)
-
-                 # Return everything needed
-                 return final_history, final_conv_state

        with gr.Column(scale=1):
            gr.Markdown("### Examples")

            with gr.Tab("Text Only"):
-                 # For text examples, just submit them directly
-                 def run_text_example(example_text, history, conv_state):
-                     # Process the example directly
-                     return handle_example_submission(example_text, [], history, conv_state)
-
                text_examples = gr.Examples(
                    examples=[
                        ["Tell me about Microsoft Azure cloud services."],
                        ["What can you help me with today?"],
                        ["Explain the difference between AI and machine learning."],
                    ],
-                     inputs=[gr.Textbox(visible=False)],
-                     outputs=[chatbot, conversation_state],
-                     fn=lambda text, h=chatbot, c=conversation_state: run_text_example(text, h, c),
-                     label="Text Examples (Click to run the example)"
                )

            with gr.Tab("Text & Audio"):
-                 # Function to handle loading both text and audio from URL and sending directly
-                 def run_audio_example(example_text, example_audio_url, history, conv_state):
-                     try:
-                         # Download and process the audio from URL
-                         print(f"Downloading audio from: {example_audio_url}")
-                         response = requests.get(example_audio_url)
-                         response.raise_for_status()
-
-                         # Save to a temporary file
-                         file_extension = os.path.splitext(example_audio_url)[1].lower()
-                         if not file_extension:
-                             file_extension = '.wav'  # Default to .wav if no extension
-
-                         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=file_extension)
-                         temp_file.write(response.content)
-                         temp_file.close()
-
-                         print(f"Saved audio to temporary file: {temp_file.name}")
-
-                         # Process the example directly
-                         return handle_example_submission(example_text, [temp_file.name], history, conv_state)
-                     except Exception as e:
-                         print(f"Error processing audio example: {e}")
-                         # If an error occurs, just add the text to history
-                         history.append({"role": "user", "content": f"{example_text} (Error loading audio: {e})"})
-                         return history, conv_state

-                 audio_examples = gr.Examples(
-                     examples=[
-                         ["Transcribe this audio clip", "https://diamondfan.github.io/audio_files/english.weekend.plan.wav"],
-                         ["What language is being spoken in this recording?", "https://www2.cs.uic.edu/~i101/SoundFiles/BabyElephantWalk60.wav"],
-                     ],
-                     inputs=[
-                         gr.Textbox(visible=False),
-                         gr.Textbox(visible=False)
-                     ],
-                     outputs=[chatbot, conversation_state],
-                     fn=lambda text, url, h=chatbot, c=conversation_state: run_audio_example(text, url, h, c),
-                     label="Audio Examples (Click to run the example)"
                )

            with gr.Tab("Text & Image"):
-                 # Function to handle loading both text and image from URL and sending directly
-                 def run_image_example(example_text, example_image_url, history, conv_state):
-                     try:
-                         # Download the image from URL
-                         print(f"Downloading image from: {example_image_url}")
-                         response = requests.get(example_image_url)
-                         response.raise_for_status()
-
-                         # Save to a temporary file
-                         file_extension = os.path.splitext(example_image_url)[1].lower()
-                         if not file_extension:
-                             file_extension = '.jpg'  # Default to .jpg if no extension
-
-                         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=file_extension)
-                         temp_file.write(response.content)
-                         temp_file.close()
-
-                         print(f"Saved image to temporary file: {temp_file.name}")
-
-                         # Process the example directly
-                         return handle_example_submission(example_text, [temp_file.name], history, conv_state)
-                     except Exception as e:
-                         print(f"Error processing image example: {e}")
-                         # If an error occurs, just add the text to history
-                         history.append({"role": "user", "content": f"{example_text} (Error loading image: {e})"})
-                         return history, conv_state

-                 image_examples = gr.Examples(
-                     examples=[
-                         ["What's in this image?", "https://storage.googleapis.com/demo-image/dog.jpg"],
-                         ["Describe this chart", "https://matplotlib.org/stable/_images/sphx_glr_bar_stacked_001.png"],
-                     ],
-                     inputs=[
-                         gr.Textbox(visible=False),
-                         gr.Textbox(visible=False)
-                     ],
-                     outputs=[chatbot, conversation_state],
-                     fn=lambda text, url, h=chatbot, c=conversation_state: run_image_example(text, url, h, c),
-                     label="Image Examples (Click to run the example)"
                )

            gr.Markdown("### Instructions")
@@ -415,6 +612,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
            - Type a question or statement
            - Upload images or audio files
            - You can combine text with media files
            - The model can analyze images and transcribe audio
            - For best results with images, use JPG or PNG files
            - For audio, use WAV, MP3, or FLAC files
@@ -425,7 +623,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
            This chatbot can:
            - Answer questions and provide explanations
            - Describe and analyze images
-             - Transcribe and analyze audio content
            - Process multiple inputs in the same message
            - Maintain context throughout the conversation
            """)
@@ -436,38 +634,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
        value={}
    )

-     def update_debug(conversation_state):
-         """Update debug output with the last payload that would be sent."""
-         if not conversation_state:
-             return {}
-
-         # Create a payload from the conversation
-         payload = {
-             "input_data": {
-                 "input_string": conversation_state
-             }
-         }
-
-         # Remove base64 data to avoid cluttering the UI
-         sanitized_payload = json.loads(json.dumps(payload))
-         for item in sanitized_payload["input_data"]["input_string"]:
-             if "content" in item and isinstance(item["content"], list):
-                 for content_item in item["content"]:
-                     if "image_url" in content_item:
-                         parts = content_item["image_url"]["url"].split(",")
-                         if len(parts) > 1:
-                             content_item["image_url"]["url"] = parts[0] + ",[BASE64_DATA_REMOVED]"
-                     if "audio_url" in content_item:
-                         parts = content_item["audio_url"]["url"].split(",")
-                         if len(parts) > 1:
-                             content_item["audio_url"]["url"] = parts[0] + ",[BASE64_DATA_REMOVED]"
-
-         return sanitized_payload
-
-     def enable_input():
-         """Re-enable the input box after bot responds."""
-         return gr.MultimodalTextbox(interactive=True)
-
    # Set up event handlers
    msg_submit = chat_input.submit(
        process_message, [chatbot, chat_input, conversation_state], [chatbot, chat_input, conversation_state], queue=False
@@ -478,10 +644,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
    )

    msg_response.then(enable_input, None, chat_input)
-     # btn_response.then(enable_input, None, chat_input)

    # Update debug info
-     # msg_response.then(update_debug, conversation_state, debug_output)
-     # btn_response.then(update_debug, conversation_state, debug_output)

demo.launch(share=True, debug=True)
 
        logger.error(f"Error message: {error_message}")
        return {"error": error_message}

+ def improved_fetch_audio_from_url(url):
+     """Improved function to fetch audio data from URL and convert to base64
    Args:
        url (str): URL of the audio file
    Returns:
+         tuple: (mime_type, base64_encoded_data) if successful, (None, None) otherwise
    """
    try:
        # Get the audio file from the URL
+         logger.info(f"Fetching audio from URL: {url}")

+         # Use a session with increased timeout
+         session = requests.Session()
+         response = session.get(url, timeout=30)
+         response.raise_for_status()

+         # Determine MIME type based on URL
        file_extension = os.path.splitext(url)[1].lower()
+         mime_type = None
+
+         if file_extension == '.wav':
+             mime_type = "audio/wav"
+         elif file_extension == '.mp3':
+             mime_type = "audio/mpeg"
+         elif file_extension == '.flac':
+             mime_type = "audio/flac"
+         elif file_extension in ['.m4a', '.aac']:
+             mime_type = "audio/aac"
+         elif file_extension == '.ogg':
+             mime_type = "audio/ogg"
+         else:
+             # Try to detect the MIME type from headers
+             content_type = response.headers.get('Content-Type', '')
+             if content_type.startswith('audio/'):
+                 mime_type = content_type
+             else:
+                 mime_type = "audio/wav"  # Default to WAV
+
+         logger.info(f"Detected MIME type: {mime_type}")
+
+         # Save content to a temporary file for debugging
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=file_extension)
+         temp_file.write(response.content)
+         temp_file.close()
+
+         logger.info(f"Saved audio to temporary file: {temp_file.name}")
+
+         # Read the file to verify it's valid
+         try:
+             # For WAV files, try to read with soundfile to verify
+             if mime_type == "audio/wav":
+                 data, samplerate = sf.read(temp_file.name)
+                 logger.info(f"Successfully read audio file: {len(data)} samples, {samplerate}Hz")
+         except Exception as e:
+             logger.warning(f"Could not verify audio with soundfile: {e}")
+             # Continue anyway, the file might still be valid
+
+         # Convert to base64
+         with open(temp_file.name, "rb") as f:
+             audio_content = f.read()
+
+         base64_audio = base64.b64encode(audio_content).decode('utf-8')
+         logger.info(f"Successfully encoded audio to base64, length: {len(base64_audio)}")
+
+         # Clean up temporary file
+         try:
+             os.unlink(temp_file.name)
+         except:
+             pass
+
+         return mime_type, base64_audio
+     except Exception as e:
+         logger.error(f"Error fetching audio from URL: {e}", exc_info=True)
+         return None, None
+
+ def fetch_image_from_url(url):
+     """Fetch image data from URL and convert to base64
+     Args:
+         url (str): URL of the image file
+     Returns:
+         tuple: (mime_type, base64_encoded_data) if successful, (None, None) otherwise
+     """
+     try:
+         # Get the image file from the URL
+         logger.info(f"Fetching image from URL: {url}")
+         response = requests.get(url)
+         response.raise_for_status()

+         # Determine MIME type based on URL
+         file_extension = os.path.splitext(url)[1].lower()
+         if file_extension in ['.jpg', '.jpeg']:
+             mime_type = "image/jpeg"
+         elif file_extension == '.png':
+             mime_type = "image/png"
+         elif file_extension == '.gif':
+             mime_type = "image/gif"
+         elif file_extension in ['.bmp', '.tiff', '.webp']:
+             mime_type = f"image/{file_extension[1:]}"
+         else:
+             mime_type = "image/jpeg"  # Default to JPEG
+
+         # Convert to base64
+         base64_image = base64.b64encode(response.content).decode('utf-8')
+
+         logger.info(f"Successfully fetched and encoded image, mime type: {mime_type}")
+         return mime_type, base64_image
    except Exception as e:
+         logger.error(f"Error fetching image from URL: {e}")
        return None, None

def encode_base64_from_file(file_path):
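Note: a minimal usage sketch of the two fetch helpers above, showing how the returned (mime_type, base64) pair gets wrapped into the data: URI content item that this app's conversation state expects (the same pattern appears in process_audio_example_direct further down). It assumes it runs in a session where improved_fetch_audio_from_url is already defined; the variable names are illustrative only.

    # Sketch: fetch one of the example clips and build the audio content item.
    mime_type, base64_audio = improved_fetch_audio_from_url(
        "https://diamondfan.github.io/audio_files/english.weekend.plan.wav"
    )
    if mime_type and base64_audio:
        audio_item = {
            "type": "audio_url",
            "audio_url": {"url": f"data:{mime_type};base64,{base64_audio}"},
        }
        print(audio_item["audio_url"]["url"][:40])  # e.g. "data:audio/wav;base64,UklGR..."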
 

    return history, gr.MultimodalTextbox(value=None, interactive=False), conversation_state

+ def process_audio_example_direct(example_text, example_audio_url, history, conversation_state):
+     """Process an audio example directly from a URL."""
+     try:
+         logger.info(f"Processing audio example with text: {example_text}, URL: {example_audio_url}")
+
+         # Initialize history and conversation_state if they're None
+         if history is None:
+             history = []
+
+         if conversation_state is None:
+             conversation_state = []
+
+         # Fetch audio and convert to base64 directly using improved function
+         mime_type, base64_audio = improved_fetch_audio_from_url(example_audio_url)
+
+         if not mime_type or not base64_audio:
+             error_msg = f"Failed to load audio from {example_audio_url}"
+             logger.error(error_msg)
+             history.append({"role": "user", "content": f"{example_text} (Audio URL: {example_audio_url})"})
+             history.append({"role": "assistant", "content": f"Error: {error_msg}"})
+             return history, conversation_state
+
+         logger.info(f"Successfully loaded audio, mime type: {mime_type}, base64 length: {len(base64_audio)}")
+
+         # Add text message to history for display
+         history.append({"role": "user", "content": example_text})
+
+         # Add to conversation state
+         content_items = [
+             {"type": "text", "text": example_text},
+             {"type": "audio_url", "audio_url": {"url": f"data:{mime_type};base64,{base64_audio}"}}
+         ]
+
+         conversation_state.append({
+             "role": "user",
+             "content": content_items
+         })
+
+         logger.info("Successfully prepared conversation state, now generating response")
+
+         # Generate bot response
+         return bot_response(history, conversation_state)
+     except Exception as e:
+         logger.error(f"Error processing audio example: {e}", exc_info=True)
+         if history is None:
+             history = []
+         history.append({"role": "user", "content": f"{example_text} (Audio URL: {example_audio_url})"})
+         history.append({"role": "assistant", "content": f"Error: {str(e)}"})
+         return history, conversation_state
+
+ def process_image_example_direct(example_text, example_image_url, history, conversation_state):
+     """Process an image example directly from a URL."""
+     try:
+         # Initialize history and conversation_state if they're None
+         if history is None:
+             history = []
+
+         if conversation_state is None:
+             conversation_state = []
+
+         # Fetch image and convert to base64 directly
+         mime_type, base64_image = fetch_image_from_url(example_image_url)
+
+         if not mime_type or not base64_image:
+             error_msg = f"Failed to load image from {example_image_url}"
+             logger.error(error_msg)
+             history.append({"role": "user", "content": f"{example_text} (Image URL: {example_image_url})"})
+             history.append({"role": "assistant", "content": f"Error: {error_msg}"})
+             return history, conversation_state
+
+         # Add text message to history for display
+         history.append({"role": "user", "content": example_text})
+
+         # Add to conversation state
+         content_items = [
+             {"type": "text", "text": example_text},
+             {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}}
+         ]
+
+         conversation_state.append({
+             "role": "user",
+             "content": content_items
+         })
+
+         # Generate bot response
+         return bot_response(history, conversation_state)
+     except Exception as e:
+         logger.error(f"Error processing image example: {e}", exc_info=True)
+         if history is None:
+             history = []
+         history.append({"role": "user", "content": f"{example_text} (Image URL: {example_image_url})"})
+         history.append({"role": "assistant", "content": f"Error: {str(e)}"})
+         return history, conversation_state
+
def bot_response(history, conversation_state):
    """Generate bot response based on conversation state."""
    if not conversation_state:
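For orientation, this is roughly the payload that bot_response is expected to hand to call_aml_endpoint once a turn like the one assembled above has been appended to conversation_state. The input_data/input_string wrapper is taken from the update_debug helper added later in this commit; the base64 placeholder below stands in for the real encoded audio.

    # Assumed request body for one user turn with text + audio.
    payload = {
        "input_data": {
            "input_string": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Transcribe this audio clip"},
                        {"type": "audio_url",
                         "audio_url": {"url": "data:audio/wav;base64,<BASE64_AUDIO>"}},
                    ],
                }
            ]
        }
    }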
 

    return history, conversation_state

+ def enable_input():
+     """Re-enable the input box after bot responds."""
+     return gr.MultimodalTextbox(interactive=True)
+
+ def update_debug(conversation_state):
+     """Update debug output with the last payload that would be sent."""
+     if not conversation_state:
+         return {}
+
+     # Create a payload from the conversation
+     payload = {
+         "input_data": {
+             "input_string": conversation_state
+         }
+     }
+
+     # Remove base64 data to avoid cluttering the UI
+     sanitized_payload = json.loads(json.dumps(payload))
+     for item in sanitized_payload["input_data"]["input_string"]:
+         if "content" in item and isinstance(item["content"], list):
+             for content_item in item["content"]:
+                 if "image_url" in content_item:
+                     parts = content_item["image_url"]["url"].split(",")
+                     if len(parts) > 1:
+                         content_item["image_url"]["url"] = parts[0] + ",[BASE64_DATA_REMOVED]"
+                 if "audio_url" in content_item:
+                     parts = content_item["audio_url"]["url"].split(",")
+                     if len(parts) > 1:
+                         content_item["audio_url"]["url"] = parts[0] + ",[BASE64_DATA_REMOVED]"
+
+     return sanitized_payload
+
+ # Add this near the beginning of your Blocks definition, before you define your components
+ css = """
+ #small-audio audio {
+     height: 20px !important;
+     width: 100px !important;
+ }
+ #small-audio .wrap {
+     max-width: 220px !important;
+ }
+ #small-audio .audio-container {
+     min-height: 0px !important;
+ }
+ """
+
# Create Gradio demo
+ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
+     title = gr.Markdown("# Phi-4-Multimodal Playground")
    description = gr.Markdown("""
+     This demo allows you to interact with the [Phi-4-Multimodal AI model](https://aka.ms/phi-4-multimodal/techreport).
    You can type messages, upload images, or record audio to communicate with the AI.
    """)
 
 
    conversation_state = gr.State([])

    with gr.Row():
+         with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                type="messages",
                avatar_images=(None, "https://upload.wikimedia.org/wikipedia/commons/d/d3/Phi-integrated-information-symbol.png",),
 
            clear_btn = gr.ClearButton([chatbot, chat_input], value="Clear conversation")
            clear_btn.click(lambda: [], None, conversation_state)  # Also clear the conversation state
            gr.HTML("<div style='text-align: right; margin-top: 5px;'><small>Powered by Azure ML</small></div>")

        with gr.Column(scale=1):
            gr.Markdown("### Examples")

            with gr.Tab("Text Only"):
                text_examples = gr.Examples(
                    examples=[
                        ["Tell me about Microsoft Azure cloud services."],
                        ["What can you help me with today?"],
                        ["Explain the difference between AI and machine learning."],
                    ],
+                     inputs=chat_input,
+                     label="Text Examples"
                )

            with gr.Tab("Text & Audio"):
+                 gr.Markdown("### Audio Examples")

+                 # Example 1
+                 gr.Markdown("**Example 1: Transcribe this audio clip**")
+                 gr.Audio("https://diamondfan.github.io/audio_files/english.weekend.plan.wav",
+                          label="Preview", elem_id="small-audio")
+
+                 example1_btn = gr.Button("Run it", size="sm")
+
+                 gr.Markdown("-----")
+
+                 # Example 2
+                 gr.Markdown("**Example 2: Translate audio transcription to English**")
+                 gr.Audio("https://diamondfan.github.io/audio_files/japanese.seattle.trip.report.wav",
+                          label="Preview", elem_id="small-audio")
+                 example2_btn = gr.Button("Run it", size="sm")
+
+                 # Define handlers for audio examples
+                 def run_audio_example1():
+                     return process_audio_example_direct(
+                         "Transcribe this audio clip",
+                         "https://diamondfan.github.io/audio_files/english.weekend.plan.wav",
+                         [], []
+                     )
+
+                 def run_audio_example2():
+                     return process_audio_example_direct(
+                         "Translate audio transcription to English",
+                         "https://diamondfan.github.io/audio_files/japanese.seattle.trip.report.wav",
+                         [], []
+                     )
+
+                 # Connect buttons to handlers
+                 example1_btn.click(
+                     run_audio_example1,
+                     inputs=[],
+                     outputs=[chatbot, conversation_state]
+                 )
+
+                 example2_btn.click(
+                     run_audio_example2,
+                     inputs=[],
+                     outputs=[chatbot, conversation_state]
                )

            with gr.Tab("Text & Image"):
+                 gr.Markdown("### Image Examples")

+                 # Example 1
+                 gr.Markdown("**Example 1: What's in this image?**")
+                 gr.Image("https://upload.wikimedia.org/wikipedia/commons/thumb/3/31/Hanoi_Temple_of_Literature.jpg/640px-Hanoi_Temple_of_Literature.jpg", label="Preview")
+                 img_example1_btn = gr.Button("Run it", size="sm")
+
+                 # Example 2
+                 gr.Markdown("**Example 2: Describe this chart**")
+                 gr.Image("https://upload.wikimedia.org/wikipedia/commons/thumb/0/0a/Places_to_visit_in_Vietnam_-_SOTC.jpg/640px-Places_to_visit_in_Vietnam_-_SOTC.jpg", label="Preview")
+                 img_example2_btn = gr.Button("Run it", size="sm")
+
+                 # Define handlers for image examples
+                 def run_image_example1():
+                     return process_image_example_direct(
+                         "What's in this image?",
+                         "https://upload.wikimedia.org/wikipedia/commons/thumb/3/31/Hanoi_Temple_of_Literature.jpg/640px-Hanoi_Temple_of_Literature.jpg",
+                         [], #chatbot.value,
+                         [], #conversation_state.value
+                     )
+
+                 def run_image_example2():
+                     return process_image_example_direct(
+                         "Describe this chart",
+                         "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0a/Places_to_visit_in_Vietnam_-_SOTC.jpg/640px-Places_to_visit_in_Vietnam_-_SOTC.jpg",
+                         [], []
+                     )
+
+                 # Connect buttons to handlers
+                 img_example1_btn.click(
+                     run_image_example1,
+                     inputs=[],
+                     outputs=[chatbot, conversation_state]
+                 )
+
+                 img_example2_btn.click(
+                     run_image_example2,
+                     inputs=[],
+                     outputs=[chatbot, conversation_state]
                )

            gr.Markdown("### Instructions")
 
            - Type a question or statement
            - Upload images or audio files
            - You can combine text with media files
+             - Support 2 modalities at the same time
            - The model can analyze images and transcribe audio
            - For best results with images, use JPG or PNG files
            - For audio, use WAV, MP3, or FLAC files
 
            This chatbot can:
            - Answer questions and provide explanations
            - Describe and analyze images
+             - Transcribe, translate, summarize, and analyze audio content
            - Process multiple inputs in the same message
            - Maintain context throughout the conversation
            """)
 
        value={}
    )

    # Set up event handlers
    msg_submit = chat_input.submit(
        process_message, [chatbot, chat_input, conversation_state], [chatbot, chat_input, conversation_state], queue=False
 
    )

    msg_response.then(enable_input, None, chat_input)

    # Update debug info
+     msg_response.then(update_debug, conversation_state, debug_output)

demo.launch(share=True, debug=True)
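As a quick sanity check of the debug wiring above, update_debug can be exercised outside Gradio. A minimal sketch with a dummy conversation state (assumes json is imported, as it already is for update_debug itself):

    # Sketch: verify that update_debug strips base64 payloads before they
    # reach the debug JSON component.
    dummy_state = [{
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,AAAA"}},
        ],
    }]
    print(json.dumps(update_debug(dummy_state), indent=2))
    # The image_url should now end with ",[BASE64_DATA_REMOVED]" instead of the raw base64.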