nguyenbh committed on
Commit
089499a
·
1 Parent(s): fd1391b

Update chat history

Browse files
Files changed (1) hide show
  1. app.py +69 -14
app.py CHANGED
@@ -20,6 +20,7 @@ logger = logging.getLogger(__name__)
20
  url = os.getenv("AZURE_ENDPOINT")
21
  api_key = os.getenv("AZURE_API_KEY")
22
 
 
23
  # Initialize MIME types
24
  mimetypes.init()
25
 
@@ -219,6 +220,22 @@ def process_message(history, message, conversation_state):
219
  if text_content:
220
  content_items.append({"type": "text", "text": text_content})
221
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
  # Process and immediately convert files to base64
223
  if message["files"] and len(message["files"]) > 0:
224
  for file_path in message["files"]:
@@ -237,6 +254,10 @@ def process_message(history, message, conversation_state):
237
  }
238
  })
239
  image_files.append(file_path)
 
 
 
 
240
  elif mime_type.startswith("audio/"):
241
  content_items.append({
242
  "type": "audio_url",
@@ -245,9 +266,19 @@ def process_message(history, message, conversation_state):
245
  }
246
  })
247
  audio_files.append(file_path)
 
 
 
 
248
 
249
  # Only proceed if we have content
250
  if content_items:
 
 
 
 
 
 
251
  # Add to Gradio chatbot history (for display)
252
  history.append({"role": "user", "content": text_content})
253
 
@@ -255,8 +286,7 @@ def process_message(history, message, conversation_state):
255
  for file_path in image_files + audio_files:
256
  history.append({"role": "user", "content": {"path": file_path}})
257
 
258
- print(f"DEBUG: history = {history}")
259
-
260
 
261
  # Add to internal conversation state (with base64 data)
262
  conversation_state.append({
@@ -278,6 +308,20 @@ def process_audio_example_direct(example_text, example_audio_url, history, conve
278
  if conversation_state is None:
279
  conversation_state = []
280
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
  # Fetch audio and convert to base64 directly using improved function
282
  mime_type, base64_audio = improved_fetch_audio_from_url(example_audio_url)
283
 
@@ -325,6 +369,20 @@ def process_image_example_direct(example_text, example_image_url, history, conve
325
 
326
  if conversation_state is None:
327
  conversation_state = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
 
329
  # Fetch image and convert to base64 directly
330
  mime_type, base64_image = fetch_image_from_url(example_image_url)
@@ -413,8 +471,6 @@ def bot_response(history, conversation_state):
413
  result = f"Error processing response: {str(e)}"
414
 
415
  # Add bot response to history
416
- if result == "None":
417
- result = "Current implementation does not support text + audio + image inputs in the same conversation. Please hit Clear conversation button."
418
  history.append({"role": "assistant", "content": result})
419
 
420
  # Add to conversation state
@@ -423,8 +479,6 @@ def bot_response(history, conversation_state):
423
  "content": [{"type": "text", "text": result}]
424
  })
425
 
426
- print(f"DEBUG: history after response: {history}")
427
-
428
  return history, conversation_state
429
 
430
  def enable_input():
@@ -491,6 +545,8 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
491
  avatar_images=(None, "https://upload.wikimedia.org/wikipedia/commons/d/d3/Phi-integrated-information-symbol.png",),
492
  height=600
493
  )
 
 
494
 
495
  with gr.Row():
496
  chat_input = gr.MultimodalTextbox(
@@ -510,7 +566,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
510
  gr.Markdown("### Audio Examples")
511
 
512
  # Example 1
513
- gr.Markdown("**Example 1: Transcribe this audio clip**")
514
  gr.Audio("https://diamondfan.github.io/audio_files/english.weekend.plan.wav",
515
  label="Preview", elem_id="small-audio")
516
 
@@ -519,7 +575,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
519
  gr.Markdown("-----")
520
 
521
  # Example 2
522
- gr.Markdown("**Example 2: Translate audio transcription to English**")
523
  gr.Audio("https://diamondfan.github.io/audio_files/japanese.seattle.trip.report.wav",
524
  label="Preview", elem_id="small-audio")
525
  example2_btn = gr.Button("Run it", size="sm")
@@ -554,27 +610,26 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
554
 
555
  with gr.Tab("Image & Text"):
556
  # Example 1
557
- gr.Markdown("**Example 1: What's in this image?**")
558
  gr.Image("https://upload.wikimedia.org/wikipedia/commons/thumb/3/31/Hanoi_Temple_of_Literature.jpg/640px-Hanoi_Temple_of_Literature.jpg", label="Preview")
559
  img_example1_btn = gr.Button("Run it", size="sm")
560
 
561
  # Example 2
562
- gr.Markdown("**Example 2: Describe this chart**")
563
  gr.Image("https://upload.wikimedia.org/wikipedia/commons/thumb/0/0a/Places_to_visit_in_Vietnam_-_SOTC.jpg/640px-Places_to_visit_in_Vietnam_-_SOTC.jpg", label="Preview")
564
  img_example2_btn = gr.Button("Run it", size="sm")
565
 
566
  # Define handlers for image examples
567
  def run_image_example1():
568
  return process_image_example_direct(
569
- "What's in this image?",
570
  "https://upload.wikimedia.org/wikipedia/commons/thumb/3/31/Hanoi_Temple_of_Literature.jpg/640px-Hanoi_Temple_of_Literature.jpg",
571
- [], #chatbot.value,
572
- [], #conversation_state.value
573
  )
574
 
575
  def run_image_example2():
576
  return process_image_example_direct(
577
- "Describe this chart",
578
  "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0a/Places_to_visit_in_Vietnam_-_SOTC.jpg/640px-Places_to_visit_in_Vietnam_-_SOTC.jpg",
579
  [], []
580
  )
 
20
  url = os.getenv("AZURE_ENDPOINT")
21
  api_key = os.getenv("AZURE_API_KEY")
22
 
23
+
24
  # Initialize MIME types
25
  mimetypes.init()
26
 
 
220
  if text_content:
221
  content_items.append({"type": "text", "text": text_content})
222
 
223
+ # Check if we need to clear history when uploading a second image or audio
224
+ should_clear_history = False
225
+
226
+ # Count existing images and audio in history
227
+ existing_images = 0
228
+ existing_audio = 0
229
+
230
+ for msg in conversation_state:
231
+ if msg["role"] == "user" and "content" in msg:
232
+ for content_item in msg["content"]:
233
+ if isinstance(content_item, dict):
234
+ if content_item.get("type") == "image_url":
235
+ existing_images += 1
236
+ elif content_item.get("type") == "audio_url":
237
+ existing_audio += 1
238
+
239
  # Process and immediately convert files to base64
240
  if message["files"] and len(message["files"]) > 0:
241
  for file_path in message["files"]:
 
254
  }
255
  })
256
  image_files.append(file_path)
257
+ # Check if this is a second image
258
+ if existing_images > 0:
259
+ should_clear_history = True
260
+ logger.info("Detected second image upload - clearing history")
261
  elif mime_type.startswith("audio/"):
262
  content_items.append({
263
  "type": "audio_url",
 
266
  }
267
  })
268
  audio_files.append(file_path)
269
+ # Check if this is a second audio
270
+ if existing_audio > 0:
271
+ should_clear_history = True
272
+ logger.info("Detected second audio upload - clearing history")
273
 
274
  # Only proceed if we have content
275
  if content_items:
276
+ # Clear history if we're uploading a second image or audio
277
+ if should_clear_history:
278
+ history = []
279
+ conversation_state = []
280
+ logger.info("History cleared due to second image/audio upload")
281
+
282
  # Add to Gradio chatbot history (for display)
283
  history.append({"role": "user", "content": text_content})
284
 
 
286
  for file_path in image_files + audio_files:
287
  history.append({"role": "user", "content": {"path": file_path}})
288
 
289
+ logger.info(f"Updated history with user message. Current conversation has {existing_images + len(image_files)} images and {existing_audio + len(audio_files)} audio files")
 
290
 
291
  # Add to internal conversation state (with base64 data)
292
  conversation_state.append({
 
308
  if conversation_state is None:
309
  conversation_state = []
310
 
311
+ # Check if we need to clear history (if there's already an audio in the conversation)
312
+ should_clear_history = False
313
+ for msg in conversation_state:
314
+ if msg["role"] == "user" and "content" in msg:
315
+ for content_item in msg["content"]:
316
+ if isinstance(content_item, dict) and content_item.get("type") == "audio_url":
317
+ should_clear_history = True
318
+ break
319
+
320
+ if should_clear_history:
321
+ history = []
322
+ conversation_state = []
323
+ logger.info("History cleared due to example with second audio")
324
+
325
  # Fetch audio and convert to base64 directly using improved function
326
  mime_type, base64_audio = improved_fetch_audio_from_url(example_audio_url)
327
 
 
369
 
370
  if conversation_state is None:
371
  conversation_state = []
372
+
373
+ # Check if we need to clear history (if there's already an image in the conversation)
374
+ should_clear_history = False
375
+ for msg in conversation_state:
376
+ if msg["role"] == "user" and "content" in msg:
377
+ for content_item in msg["content"]:
378
+ if isinstance(content_item, dict) and content_item.get("type") == "image_url":
379
+ should_clear_history = True
380
+ break
381
+
382
+ if should_clear_history:
383
+ history = []
384
+ conversation_state = []
385
+ logger.info("History cleared due to example with second image")
386
 
387
  # Fetch image and convert to base64 directly
388
  mime_type, base64_image = fetch_image_from_url(example_image_url)
 
471
  result = f"Error processing response: {str(e)}"
472
 
473
  # Add bot response to history
 
 
474
  history.append({"role": "assistant", "content": result})
475
 
476
  # Add to conversation state
 
479
  "content": [{"type": "text", "text": result}]
480
  })
481
 
 
 
482
  return history, conversation_state
483
 
484
  def enable_input():
 
545
  avatar_images=(None, "https://upload.wikimedia.org/wikipedia/commons/d/d3/Phi-integrated-information-symbol.png",),
546
  height=600
547
  )
548
+ # trash icon clear all
549
+ chatbot.clear(lambda: [], None, conversation_state)
550
 
551
  with gr.Row():
552
  chat_input = gr.MultimodalTextbox(
 
566
  gr.Markdown("### Audio Examples")
567
 
568
  # Example 1
569
+ gr.Markdown("Example 1: **Transcribe this audio clip**")
570
  gr.Audio("https://diamondfan.github.io/audio_files/english.weekend.plan.wav",
571
  label="Preview", elem_id="small-audio")
572
 
 
575
  gr.Markdown("-----")
576
 
577
  # Example 2
578
+ gr.Markdown("Example 2: **Translate audio transcription to English**")
579
  gr.Audio("https://diamondfan.github.io/audio_files/japanese.seattle.trip.report.wav",
580
  label="Preview", elem_id="small-audio")
581
  example2_btn = gr.Button("Run it", size="sm")
 
610
 
611
  with gr.Tab("Image & Text"):
612
  # Example 1
613
+ gr.Markdown("Example 1: **Write a limerick about this image**")
614
  gr.Image("https://upload.wikimedia.org/wikipedia/commons/thumb/3/31/Hanoi_Temple_of_Literature.jpg/640px-Hanoi_Temple_of_Literature.jpg", label="Preview")
615
  img_example1_btn = gr.Button("Run it", size="sm")
616
 
617
  # Example 2
618
+ gr.Markdown("Example 2: **Describe the chart in details.**")
619
  gr.Image("https://upload.wikimedia.org/wikipedia/commons/thumb/0/0a/Places_to_visit_in_Vietnam_-_SOTC.jpg/640px-Places_to_visit_in_Vietnam_-_SOTC.jpg", label="Preview")
620
  img_example2_btn = gr.Button("Run it", size="sm")
621
 
622
  # Define handlers for image examples
623
  def run_image_example1():
624
  return process_image_example_direct(
625
+ "Describe this image in details.",
626
  "https://upload.wikimedia.org/wikipedia/commons/thumb/3/31/Hanoi_Temple_of_Literature.jpg/640px-Hanoi_Temple_of_Literature.jpg",
627
+ [], []
 
628
  )
629
 
630
  def run_image_example2():
631
  return process_image_example_direct(
632
+ "Write a limerick about this image",
633
  "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0a/Places_to_visit_in_Vietnam_-_SOTC.jpg/640px-Places_to_visit_in_Vietnam_-_SOTC.jpg",
634
  [], []
635
  )