Spaces:

microsoft
/

phi-4-multimodal

Running

App Files Community

nguyenbh committed on Feb 27

Commit

089499a

1 Parent(s): fd1391b

Update chat history

Browse files

Files changed (1) hide show

app.py +69 -14

app.py CHANGED Viewed

@@ -20,6 +20,7 @@ logger = logging.getLogger(__name__)
 url = os.getenv("AZURE_ENDPOINT")
 api_key = os.getenv("AZURE_API_KEY")
 # Initialize MIME types
 mimetypes.init()
@@ -219,6 +220,22 @@ def process_message(history, message, conversation_state):
     if text_content:
         content_items.append({"type": "text", "text": text_content})
     # Process and immediately convert files to base64
     if message["files"] and len(message["files"]) > 0:
         for file_path in message["files"]:
@@ -237,6 +254,10 @@ def process_message(history, message, conversation_state):
                     }
                 })
                 image_files.append(file_path)
             elif mime_type.startswith("audio/"):
                 content_items.append({
                     "type": "audio_url",
@@ -245,9 +266,19 @@ def process_message(history, message, conversation_state):
                     }
                 })
                 audio_files.append(file_path)
     # Only proceed if we have content
     if content_items:
         # Add to Gradio chatbot history (for display)
         history.append({"role": "user", "content": text_content})
@@ -255,8 +286,7 @@ def process_message(history, message, conversation_state):
         for file_path in image_files + audio_files:
             history.append({"role": "user", "content": {"path": file_path}})
-        print(f"DEBUG: history = {history}")
         # Add to internal conversation state (with base64 data)
         conversation_state.append({
@@ -278,6 +308,20 @@ def process_audio_example_direct(example_text, example_audio_url, history, conve
         if conversation_state is None:
             conversation_state = []
         # Fetch audio and convert to base64 directly using improved function
         mime_type, base64_audio = improved_fetch_audio_from_url(example_audio_url)
@@ -325,6 +369,20 @@ def process_image_example_direct(example_text, example_image_url, history, conve
         if conversation_state is None:
             conversation_state = []
         # Fetch image and convert to base64 directly
         mime_type, base64_image = fetch_image_from_url(example_image_url)
@@ -413,8 +471,6 @@ def bot_response(history, conversation_state):
         result = f"Error processing response: {str(e)}"
     # Add bot response to history
-    if result == "None":
-        result = "Current implementation does not support text + audio + image inputs in the same conversation. Please hit Clear conversation button."
     history.append({"role": "assistant", "content": result})
     # Add to conversation state
@@ -423,8 +479,6 @@ def bot_response(history, conversation_state):
         "content": [{"type": "text", "text": result}]
     })
-    print(f"DEBUG: history after response: {history}")
     return history, conversation_state
 def enable_input():
@@ -491,6 +545,8 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
                 avatar_images=(None, "https://upload.wikimedia.org/wikipedia/commons/d/d3/Phi-integrated-information-symbol.png",),
                 height=600
             )
             with gr.Row():
                 chat_input = gr.MultimodalTextbox(
@@ -510,7 +566,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
                 gr.Markdown("### Audio Examples")
                 # Example 1
-                gr.Markdown("**Example 1: Transcribe this audio clip**")
                 gr.Audio("https://diamondfan.github.io/audio_files/english.weekend.plan.wav",
                          label="Preview", elem_id="small-audio")
@@ -519,7 +575,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
                 gr.Markdown("-----")
                 # Example 2
-                gr.Markdown("**Example 2: Translate audio transcription to English**")
                 gr.Audio("https://diamondfan.github.io/audio_files/japanese.seattle.trip.report.wav",
                          label="Preview", elem_id="small-audio")
                 example2_btn = gr.Button("Run it", size="sm")
@@ -554,27 +610,26 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
             with gr.Tab("Image & Text"):
                 # Example 1
-                gr.Markdown("**Example 1: What's in this image?**")
                 gr.Image("https://upload.wikimedia.org/wikipedia/commons/thumb/3/31/Hanoi_Temple_of_Literature.jpg/640px-Hanoi_Temple_of_Literature.jpg", label="Preview")
                 img_example1_btn = gr.Button("Run it", size="sm")
                 # Example 2
-                gr.Markdown("**Example 2: Describe this chart**")
                 gr.Image("https://upload.wikimedia.org/wikipedia/commons/thumb/0/0a/Places_to_visit_in_Vietnam_-_SOTC.jpg/640px-Places_to_visit_in_Vietnam_-_SOTC.jpg", label="Preview")
                 img_example2_btn = gr.Button("Run it", size="sm")
                 # Define handlers for image examples
                 def run_image_example1():
                     return process_image_example_direct(
-                        "What's in this image?",
                         "https://upload.wikimedia.org/wikipedia/commons/thumb/3/31/Hanoi_Temple_of_Literature.jpg/640px-Hanoi_Temple_of_Literature.jpg",
-                        [], #chatbot.value,
-                        [], #conversation_state.value
                     )
                 def run_image_example2():
                     return process_image_example_direct(
-                        "Describe this chart",
                         "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0a/Places_to_visit_in_Vietnam_-_SOTC.jpg/640px-Places_to_visit_in_Vietnam_-_SOTC.jpg",
                         [], []
                     )

 url = os.getenv("AZURE_ENDPOINT")
 api_key = os.getenv("AZURE_API_KEY")
 # Initialize MIME types
 mimetypes.init()
     if text_content:
         content_items.append({"type": "text", "text": text_content})
+    # Check if we need to clear history when uploading a second image or audio
+    should_clear_history = False
+    # Count image and audio items already present in conversation_state
+    existing_images = 0
+    existing_audio = 0
+    for msg in conversation_state:
+        if msg["role"] == "user" and "content" in msg:
+            for content_item in msg["content"]:
+                if isinstance(content_item, dict):
+                    if content_item.get("type") == "image_url":
+                        existing_images += 1
+                    elif content_item.get("type") == "audio_url":
+                        existing_audio += 1
     # Process and immediately convert files to base64
     if message["files"] and len(message["files"]) > 0:
         for file_path in message["files"]:
                     }
                 })
                 image_files.append(file_path)
+                # Check if this is a second image
+                if existing_images > 0:
+                    should_clear_history = True
+                    logger.info("Detected second image upload - clearing history")
             elif mime_type.startswith("audio/"):
                 content_items.append({
                     "type": "audio_url",
                     }
                 })
                 audio_files.append(file_path)
+                # Check if this is a second audio file
+                if existing_audio > 0:
+                    should_clear_history = True
+                    logger.info("Detected second audio upload - clearing history")
     # Only proceed if we have content
     if content_items:
+        # Clear history if we're uploading a second image or audio
+        if should_clear_history:
+            history = []
+            conversation_state = []
+            logger.info("History cleared due to second image/audio upload")
         # Add to Gradio chatbot history (for display)
         history.append({"role": "user", "content": text_content})
         for file_path in image_files + audio_files:
             history.append({"role": "user", "content": {"path": file_path}})
+        logger.info(f"Updated history with user message. Current conversation has {existing_images + len(image_files)} images and {existing_audio + len(audio_files)} audio files")
         # Add to internal conversation state (with base64 data)
         conversation_state.append({
         if conversation_state is None:
             conversation_state = []
+        # Check if we need to clear history (if there's already an audio clip in the conversation)
+        should_clear_history = False
+        for msg in conversation_state:
+            if msg["role"] == "user" and "content" in msg:
+                for content_item in msg["content"]:
+                    if isinstance(content_item, dict) and content_item.get("type") == "audio_url":
+                        should_clear_history = True
+                        break
+        if should_clear_history:
+            history = []
+            conversation_state = []
+            logger.info("History cleared due to example with second audio")
         # Fetch audio and convert to base64 directly using improved function
         mime_type, base64_audio = improved_fetch_audio_from_url(example_audio_url)
         if conversation_state is None:
             conversation_state = []
+        # Check if we need to clear history (if there's already an image in the conversation)
+        should_clear_history = False
+        for msg in conversation_state:
+            if msg["role"] == "user" and "content" in msg:
+                for content_item in msg["content"]:
+                    if isinstance(content_item, dict) and content_item.get("type") == "image_url":
+                        should_clear_history = True
+                        break
+        if should_clear_history:
+            history = []
+            conversation_state = []
+            logger.info("History cleared due to example with second image")
         # Fetch image and convert to base64 directly
         mime_type, base64_image = fetch_image_from_url(example_image_url)
         result = f"Error processing response: {str(e)}"
     # Add bot response to history
     history.append({"role": "assistant", "content": result})
     # Add to conversation state
         "content": [{"type": "text", "text": result}]
     })
     return history, conversation_state
 def enable_input():
                 avatar_images=(None, "https://upload.wikimedia.org/wikipedia/commons/d/d3/Phi-integrated-information-symbol.png",),
                 height=600
             )
+            # Trash icon: when the chatbot is cleared, also reset conversation_state to an empty list
+            chatbot.clear(lambda: [], None, conversation_state)
             with gr.Row():
                 chat_input = gr.MultimodalTextbox(
                 gr.Markdown("### Audio Examples")
                 # Example 1
+                gr.Markdown("Example 1: **Transcribe this audio clip**")
                 gr.Audio("https://diamondfan.github.io/audio_files/english.weekend.plan.wav",
                          label="Preview", elem_id="small-audio")
                 gr.Markdown("-----")
                 # Example 2
+                gr.Markdown("Example 2: **Translate audio transcription to English**")
                 gr.Audio("https://diamondfan.github.io/audio_files/japanese.seattle.trip.report.wav",
                          label="Preview", elem_id="small-audio")
                 example2_btn = gr.Button("Run it", size="sm")
             with gr.Tab("Image & Text"):
                 # Example 1
+                gr.Markdown("Example 1: **Write a limerick about this image**")
                 gr.Image("https://upload.wikimedia.org/wikipedia/commons/thumb/3/31/Hanoi_Temple_of_Literature.jpg/640px-Hanoi_Temple_of_Literature.jpg", label="Preview")
                 img_example1_btn = gr.Button("Run it", size="sm")
                 # Example 2
+                gr.Markdown("Example 2: **Describe the chart in details.**")
                 gr.Image("https://upload.wikimedia.org/wikipedia/commons/thumb/0/0a/Places_to_visit_in_Vietnam_-_SOTC.jpg/640px-Places_to_visit_in_Vietnam_-_SOTC.jpg", label="Preview")
                 img_example2_btn = gr.Button("Run it", size="sm")
                 # Define handlers for image examples
                 def run_image_example1():
                     return process_image_example_direct(
+                        "Describe this image in details.",
                         "https://upload.wikimedia.org/wikipedia/commons/thumb/3/31/Hanoi_Temple_of_Literature.jpg/640px-Hanoi_Temple_of_Literature.jpg",
+                        [], []
                     )
                 def run_image_example2():
                     return process_image_example_direct(
+                        "Write a limerick about this image",
                         "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0a/Places_to_visit_in_Vietnam_-_SOTC.jpg/640px-Places_to_visit_in_Vietnam_-_SOTC.jpg",
                         [], []
                     )