Spaces:
Runtime error
Runtime error
nguyenbh
committed on
Commit
·
089499a
1
Parent(s):
fd1391b
Update chat history
Browse files
app.py
CHANGED
|
@@ -20,6 +20,7 @@ logger = logging.getLogger(__name__)
|
|
| 20 |
url = os.getenv("AZURE_ENDPOINT")
|
| 21 |
api_key = os.getenv("AZURE_API_KEY")
|
| 22 |
|
|
|
|
| 23 |
# Initialize MIME types
|
| 24 |
mimetypes.init()
|
| 25 |
|
|
@@ -219,6 +220,22 @@ def process_message(history, message, conversation_state):
|
|
| 219 |
if text_content:
|
| 220 |
content_items.append({"type": "text", "text": text_content})
|
| 221 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
# Process and immediately convert files to base64
|
| 223 |
if message["files"] and len(message["files"]) > 0:
|
| 224 |
for file_path in message["files"]:
|
|
@@ -237,6 +254,10 @@ def process_message(history, message, conversation_state):
|
|
| 237 |
}
|
| 238 |
})
|
| 239 |
image_files.append(file_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
elif mime_type.startswith("audio/"):
|
| 241 |
content_items.append({
|
| 242 |
"type": "audio_url",
|
|
@@ -245,9 +266,19 @@ def process_message(history, message, conversation_state):
|
|
| 245 |
}
|
| 246 |
})
|
| 247 |
audio_files.append(file_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
|
| 249 |
# Only proceed if we have content
|
| 250 |
if content_items:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
# Add to Gradio chatbot history (for display)
|
| 252 |
history.append({"role": "user", "content": text_content})
|
| 253 |
|
|
@@ -255,8 +286,7 @@ def process_message(history, message, conversation_state):
|
|
| 255 |
for file_path in image_files + audio_files:
|
| 256 |
history.append({"role": "user", "content": {"path": file_path}})
|
| 257 |
|
| 258 |
-
|
| 259 |
-
|
| 260 |
|
| 261 |
# Add to internal conversation state (with base64 data)
|
| 262 |
conversation_state.append({
|
|
@@ -278,6 +308,20 @@ def process_audio_example_direct(example_text, example_audio_url, history, conve
|
|
| 278 |
if conversation_state is None:
|
| 279 |
conversation_state = []
|
| 280 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
# Fetch audio and convert to base64 directly using improved function
|
| 282 |
mime_type, base64_audio = improved_fetch_audio_from_url(example_audio_url)
|
| 283 |
|
|
@@ -325,6 +369,20 @@ def process_image_example_direct(example_text, example_image_url, history, conve
|
|
| 325 |
|
| 326 |
if conversation_state is None:
|
| 327 |
conversation_state = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
|
| 329 |
# Fetch image and convert to base64 directly
|
| 330 |
mime_type, base64_image = fetch_image_from_url(example_image_url)
|
|
@@ -413,8 +471,6 @@ def bot_response(history, conversation_state):
|
|
| 413 |
result = f"Error processing response: {str(e)}"
|
| 414 |
|
| 415 |
# Add bot response to history
|
| 416 |
-
if result == "None":
|
| 417 |
-
result = "Current implementation does not support text + audio + image inputs in the same conversation. Please hit Clear conversation button."
|
| 418 |
history.append({"role": "assistant", "content": result})
|
| 419 |
|
| 420 |
# Add to conversation state
|
|
@@ -423,8 +479,6 @@ def bot_response(history, conversation_state):
|
|
| 423 |
"content": [{"type": "text", "text": result}]
|
| 424 |
})
|
| 425 |
|
| 426 |
-
print(f"DEBUG: history after response: {history}")
|
| 427 |
-
|
| 428 |
return history, conversation_state
|
| 429 |
|
| 430 |
def enable_input():
|
|
@@ -491,6 +545,8 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
|
|
| 491 |
avatar_images=(None, "https://upload.wikimedia.org/wikipedia/commons/d/d3/Phi-integrated-information-symbol.png",),
|
| 492 |
height=600
|
| 493 |
)
|
|
|
|
|
|
|
| 494 |
|
| 495 |
with gr.Row():
|
| 496 |
chat_input = gr.MultimodalTextbox(
|
|
@@ -510,7 +566,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
|
|
| 510 |
gr.Markdown("### Audio Examples")
|
| 511 |
|
| 512 |
# Example 1
|
| 513 |
-
gr.Markdown("
|
| 514 |
gr.Audio("https://diamondfan.github.io/audio_files/english.weekend.plan.wav",
|
| 515 |
label="Preview", elem_id="small-audio")
|
| 516 |
|
|
@@ -519,7 +575,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
|
|
| 519 |
gr.Markdown("-----")
|
| 520 |
|
| 521 |
# Example 2
|
| 522 |
-
gr.Markdown("
|
| 523 |
gr.Audio("https://diamondfan.github.io/audio_files/japanese.seattle.trip.report.wav",
|
| 524 |
label="Preview", elem_id="small-audio")
|
| 525 |
example2_btn = gr.Button("Run it", size="sm")
|
|
@@ -554,27 +610,26 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
|
|
| 554 |
|
| 555 |
with gr.Tab("Image & Text"):
|
| 556 |
# Example 1
|
| 557 |
-
gr.Markdown("
|
| 558 |
gr.Image("https://upload.wikimedia.org/wikipedia/commons/thumb/3/31/Hanoi_Temple_of_Literature.jpg/640px-Hanoi_Temple_of_Literature.jpg", label="Preview")
|
| 559 |
img_example1_btn = gr.Button("Run it", size="sm")
|
| 560 |
|
| 561 |
# Example 2
|
| 562 |
-
gr.Markdown("
|
| 563 |
gr.Image("https://upload.wikimedia.org/wikipedia/commons/thumb/0/0a/Places_to_visit_in_Vietnam_-_SOTC.jpg/640px-Places_to_visit_in_Vietnam_-_SOTC.jpg", label="Preview")
|
| 564 |
img_example2_btn = gr.Button("Run it", size="sm")
|
| 565 |
|
| 566 |
# Define handlers for image examples
|
| 567 |
def run_image_example1():
|
| 568 |
return process_image_example_direct(
|
| 569 |
-
"
|
| 570 |
"https://upload.wikimedia.org/wikipedia/commons/thumb/3/31/Hanoi_Temple_of_Literature.jpg/640px-Hanoi_Temple_of_Literature.jpg",
|
| 571 |
-
[],
|
| 572 |
-
[], #conversation_state.value
|
| 573 |
)
|
| 574 |
|
| 575 |
def run_image_example2():
|
| 576 |
return process_image_example_direct(
|
| 577 |
-
"
|
| 578 |
"https://upload.wikimedia.org/wikipedia/commons/thumb/0/0a/Places_to_visit_in_Vietnam_-_SOTC.jpg/640px-Places_to_visit_in_Vietnam_-_SOTC.jpg",
|
| 579 |
[], []
|
| 580 |
)
|
|
|
|
| 20 |
url = os.getenv("AZURE_ENDPOINT")
|
| 21 |
api_key = os.getenv("AZURE_API_KEY")
|
| 22 |
|
| 23 |
+
|
| 24 |
# Initialize MIME types
|
| 25 |
mimetypes.init()
|
| 26 |
|
|
|
|
| 220 |
if text_content:
|
| 221 |
content_items.append({"type": "text", "text": text_content})
|
| 222 |
|
| 223 |
+
# Check if we need to clear history when uploading a second image or audio
|
| 224 |
+
should_clear_history = False
|
| 225 |
+
|
| 226 |
+
# Count existing images and audio in history
|
| 227 |
+
existing_images = 0
|
| 228 |
+
existing_audio = 0
|
| 229 |
+
|
| 230 |
+
for msg in conversation_state:
|
| 231 |
+
if msg["role"] == "user" and "content" in msg:
|
| 232 |
+
for content_item in msg["content"]:
|
| 233 |
+
if isinstance(content_item, dict):
|
| 234 |
+
if content_item.get("type") == "image_url":
|
| 235 |
+
existing_images += 1
|
| 236 |
+
elif content_item.get("type") == "audio_url":
|
| 237 |
+
existing_audio += 1
|
| 238 |
+
|
| 239 |
# Process and immediately convert files to base64
|
| 240 |
if message["files"] and len(message["files"]) > 0:
|
| 241 |
for file_path in message["files"]:
|
|
|
|
| 254 |
}
|
| 255 |
})
|
| 256 |
image_files.append(file_path)
|
| 257 |
+
# Check if this is a second image
|
| 258 |
+
if existing_images > 0:
|
| 259 |
+
should_clear_history = True
|
| 260 |
+
logger.info("Detected second image upload - clearing history")
|
| 261 |
elif mime_type.startswith("audio/"):
|
| 262 |
content_items.append({
|
| 263 |
"type": "audio_url",
|
|
|
|
| 266 |
}
|
| 267 |
})
|
| 268 |
audio_files.append(file_path)
|
| 269 |
+
# Check if this is a second audio
|
| 270 |
+
if existing_audio > 0:
|
| 271 |
+
should_clear_history = True
|
| 272 |
+
logger.info("Detected second audio upload - clearing history")
|
| 273 |
|
| 274 |
# Only proceed if we have content
|
| 275 |
if content_items:
|
| 276 |
+
# Clear history if we're uploading a second image or audio
|
| 277 |
+
if should_clear_history:
|
| 278 |
+
history = []
|
| 279 |
+
conversation_state = []
|
| 280 |
+
logger.info("History cleared due to second image/audio upload")
|
| 281 |
+
|
| 282 |
# Add to Gradio chatbot history (for display)
|
| 283 |
history.append({"role": "user", "content": text_content})
|
| 284 |
|
|
|
|
| 286 |
for file_path in image_files + audio_files:
|
| 287 |
history.append({"role": "user", "content": {"path": file_path}})
|
| 288 |
|
| 289 |
+
logger.info(f"Updated history with user message. Current conversation has {existing_images + len(image_files)} images and {existing_audio + len(audio_files)} audio files")
|
|
|
|
| 290 |
|
| 291 |
# Add to internal conversation state (with base64 data)
|
| 292 |
conversation_state.append({
|
|
|
|
| 308 |
if conversation_state is None:
|
| 309 |
conversation_state = []
|
| 310 |
|
| 311 |
+
# Check if we need to clear history (if there's already an audio in the conversation)
|
| 312 |
+
should_clear_history = False
|
| 313 |
+
for msg in conversation_state:
|
| 314 |
+
if msg["role"] == "user" and "content" in msg:
|
| 315 |
+
for content_item in msg["content"]:
|
| 316 |
+
if isinstance(content_item, dict) and content_item.get("type") == "audio_url":
|
| 317 |
+
should_clear_history = True
|
| 318 |
+
break
|
| 319 |
+
|
| 320 |
+
if should_clear_history:
|
| 321 |
+
history = []
|
| 322 |
+
conversation_state = []
|
| 323 |
+
logger.info("History cleared due to example with second audio")
|
| 324 |
+
|
| 325 |
# Fetch audio and convert to base64 directly using improved function
|
| 326 |
mime_type, base64_audio = improved_fetch_audio_from_url(example_audio_url)
|
| 327 |
|
|
|
|
| 369 |
|
| 370 |
if conversation_state is None:
|
| 371 |
conversation_state = []
|
| 372 |
+
|
| 373 |
+
# Check if we need to clear history (if there's already an image in the conversation)
|
| 374 |
+
should_clear_history = False
|
| 375 |
+
for msg in conversation_state:
|
| 376 |
+
if msg["role"] == "user" and "content" in msg:
|
| 377 |
+
for content_item in msg["content"]:
|
| 378 |
+
if isinstance(content_item, dict) and content_item.get("type") == "image_url":
|
| 379 |
+
should_clear_history = True
|
| 380 |
+
break
|
| 381 |
+
|
| 382 |
+
if should_clear_history:
|
| 383 |
+
history = []
|
| 384 |
+
conversation_state = []
|
| 385 |
+
logger.info("History cleared due to example with second image")
|
| 386 |
|
| 387 |
# Fetch image and convert to base64 directly
|
| 388 |
mime_type, base64_image = fetch_image_from_url(example_image_url)
|
|
|
|
| 471 |
result = f"Error processing response: {str(e)}"
|
| 472 |
|
| 473 |
# Add bot response to history
|
|
|
|
|
|
|
| 474 |
history.append({"role": "assistant", "content": result})
|
| 475 |
|
| 476 |
# Add to conversation state
|
|
|
|
| 479 |
"content": [{"type": "text", "text": result}]
|
| 480 |
})
|
| 481 |
|
|
|
|
|
|
|
| 482 |
return history, conversation_state
|
| 483 |
|
| 484 |
def enable_input():
|
|
|
|
| 545 |
avatar_images=(None, "https://upload.wikimedia.org/wikipedia/commons/d/d3/Phi-integrated-information-symbol.png",),
|
| 546 |
height=600
|
| 547 |
)
|
| 548 |
+
# trash icon clear all
|
| 549 |
+
chatbot.clear(lambda: [], None, conversation_state)
|
| 550 |
|
| 551 |
with gr.Row():
|
| 552 |
chat_input = gr.MultimodalTextbox(
|
|
|
|
| 566 |
gr.Markdown("### Audio Examples")
|
| 567 |
|
| 568 |
# Example 1
|
| 569 |
+
gr.Markdown("Example 1: **Transcribe this audio clip**")
|
| 570 |
gr.Audio("https://diamondfan.github.io/audio_files/english.weekend.plan.wav",
|
| 571 |
label="Preview", elem_id="small-audio")
|
| 572 |
|
|
|
|
| 575 |
gr.Markdown("-----")
|
| 576 |
|
| 577 |
# Example 2
|
| 578 |
+
gr.Markdown("Example 2: **Translate audio transcription to English**")
|
| 579 |
gr.Audio("https://diamondfan.github.io/audio_files/japanese.seattle.trip.report.wav",
|
| 580 |
label="Preview", elem_id="small-audio")
|
| 581 |
example2_btn = gr.Button("Run it", size="sm")
|
|
|
|
| 610 |
|
| 611 |
with gr.Tab("Image & Text"):
|
| 612 |
# Example 1
|
| 613 |
+
gr.Markdown("Example 1: **Write a limerick about this image**")
|
| 614 |
gr.Image("https://upload.wikimedia.org/wikipedia/commons/thumb/3/31/Hanoi_Temple_of_Literature.jpg/640px-Hanoi_Temple_of_Literature.jpg", label="Preview")
|
| 615 |
img_example1_btn = gr.Button("Run it", size="sm")
|
| 616 |
|
| 617 |
# Example 2
|
| 618 |
+
gr.Markdown("Example 2: **Describe the chart in details.**")
|
| 619 |
gr.Image("https://upload.wikimedia.org/wikipedia/commons/thumb/0/0a/Places_to_visit_in_Vietnam_-_SOTC.jpg/640px-Places_to_visit_in_Vietnam_-_SOTC.jpg", label="Preview")
|
| 620 |
img_example2_btn = gr.Button("Run it", size="sm")
|
| 621 |
|
| 622 |
# Define handlers for image examples
|
| 623 |
def run_image_example1():
|
| 624 |
return process_image_example_direct(
|
| 625 |
+
"Describe this image in details.",
|
| 626 |
"https://upload.wikimedia.org/wikipedia/commons/thumb/3/31/Hanoi_Temple_of_Literature.jpg/640px-Hanoi_Temple_of_Literature.jpg",
|
| 627 |
+
[], []
|
|
|
|
| 628 |
)
|
| 629 |
|
| 630 |
def run_image_example2():
|
| 631 |
return process_image_example_direct(
|
| 632 |
+
"Write a limerick about this image",
|
| 633 |
"https://upload.wikimedia.org/wikipedia/commons/thumb/0/0a/Places_to_visit_in_Vietnam_-_SOTC.jpg/640px-Places_to_visit_in_Vietnam_-_SOTC.jpg",
|
| 634 |
[], []
|
| 635 |
)
|