Spaces:

shb777
/

Granite-Vision-3.1-2B

Running on Zero

App Files Files Community

shb777 committed on Feb 14

Commit

4371bd7

1 Parent(s): dc4de1a

Fix chat history and sending image with every message

Browse files

Files changed (1) hide show

app.py +82 -20

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import spaces
 import random
 import torch
 import gradio as gr
 from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
@@ -8,27 +9,28 @@ model_id = "ibm-granite/granite-vision-3.1-2b-preview"
 processor = LlavaNextProcessor.from_pretrained(model_id, use_fast=True)
 model = LlavaNextForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
-def get_text_from_content(content):
-    texts = []
-    for item in content:
-        if item["type"] == "text":
-            texts.append(item["text"])
-        elif item["type"] == "image":
-            texts.append("<image>")
-    return " ".join(texts)
 @spaces.GPU
 def chat_inference(image, text, temperature, top_p, top_k, max_tokens, conversation):
-    if conversation is None:
-        conversation = []
     user_content = []
     if image is not None:
         if image.width > 512 or image.height > 512:
             image.thumbnail((512, 512))
         user_content.append({"type": "image", "image": image})
     if text and text.strip():
         user_content.append({"type": "text", "text": text.strip()})
     if not user_content:
         return conversation_display(conversation), conversation
@@ -37,6 +39,9 @@ def chat_inference(image, text, temperature, top_p, top_k, max_tokens, conversat
         "content": user_content
     })
     inputs = processor.apply_chat_template(
         conversation,
         add_generation_prompt=True,
@@ -59,29 +64,87 @@ def chat_inference(image, text, temperature, top_p, top_k, max_tokens, conversat
         generation_kwargs["do_sample"] = True
     output = model.generate(**inputs, **generation_kwargs)
-    assistant_response = processor.decode(output[0], skip_special_tokens=True)
     conversation.append({
         "role": "assistant",
-        "content": [{"type": "text", "text": assistant_response.strip()}]
     })
     return conversation_display(conversation), conversation
 def conversation_display(conversation):
     chat_history = []
     for msg in conversation:
         if msg["role"] == "user":
-            user_text = get_text_from_content(msg["content"])
-        elif msg["role"] == "assistant":
-            assistant_text = msg["content"][0]["text"].split("<|assistant|>")[-1].strip()
-            chat_history.append({"role": "user", "content": user_text})
-            chat_history.append({"role": "assistant", "content": assistant_text})
     return chat_history
 def clear_chat():
     return [], [], "", None
 with gr.Blocks(title="Granite Vision 3.1 2B", css="h1 { overflow: hidden; }") as demo:
     gr.Markdown("# [Granite Vision 3.1 2B](https://huggingface.co/ibm-granite/granite-vision-3.1-2b-preview)")
@@ -101,7 +164,6 @@ with gr.Blocks(title="Granite Vision 3.1 2B", css="h1 { overflow: hidden; }") as
                 send_button = gr.Button("Chat")
                 clear_button = gr.Button("Clear Chat")
     state = gr.State([])
     send_button.click(

 import spaces
 import random
 import torch
+import hashlib
 import gradio as gr
 from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
 processor = LlavaNextProcessor.from_pretrained(model_id, use_fast=True)
 model = LlavaNextForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
+SYSTEM_PROMPT = (
+    "A chat between a curious user and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the user's questions."
+)
 @spaces.GPU
 def chat_inference(image, text, temperature, top_p, top_k, max_tokens, conversation):
+    if conversation is None or conversation == []:
+        conversation = [{
+            "role": "system",
+            "content": [{"type": "text", "text": SYSTEM_PROMPT}]
+        }]
     user_content = []
     if image is not None:
         if image.width > 512 or image.height > 512:
             image.thumbnail((512, 512))
         user_content.append({"type": "image", "image": image})
     if text and text.strip():
         user_content.append({"type": "text", "text": text.strip()})
     if not user_content:
         return conversation_display(conversation), conversation
         "content": user_content
     })
+    conversation = preprocess_conversation(conversation)
+    # Generate input prompt using the chat template.
     inputs = processor.apply_chat_template(
         conversation,
         add_generation_prompt=True,
         generation_kwargs["do_sample"] = True
     output = model.generate(**inputs, **generation_kwargs)
+    raw_response = processor.decode(output[0], skip_special_tokens=True)
+    assistant_text = extract_answer(raw_response)
+    # Append the assistant's answer.
     conversation.append({
         "role": "assistant",
+        "content": [{"type": "text", "text": assistant_text}]
     })
     return conversation_display(conversation), conversation
def extract_answer(response):
    """Return the assistant's reply from a decoded generation string.

    When the text contains one or more ``<|assistant|>`` markers, only the
    part after the last marker is kept; otherwise the whole text is used.
    Surrounding whitespace is stripped in both cases.
    """
    marker = "<|assistant|>"
    answer = response.split(marker)[-1] if marker in response else response
    return answer.strip()
def compute_image_hash(image):
    """Return an MD5 hex digest identifying an image's RGB pixels and size.

    The image is converted to RGB first so the digest is computed over a
    single pixel layout. The hash is used only to detect a re-sent image,
    not for any security purpose.

    Args:
        image: An image object exposing ``convert``, ``size`` and
            ``tobytes`` (assumed to be a ``PIL.Image.Image`` -- confirm
            against the caller).

    Returns:
        A 32-character hexadecimal digest string.
    """
    rgb = image.convert("RGB")
    digest = hashlib.md5()
    # Raw pixel bytes carry no shape information: a 2x8 and a 4x4 image can
    # share the same byte stream. Fold the dimensions into the digest so such
    # images are not mistaken for duplicates.
    width, height = rgb.size
    digest.update(f"{width}x{height}:".encode("ascii"))
    digest.update(rgb.tobytes())
    return digest.hexdigest()
def _hash_or_none(image):
    """Return the image's hash, or None when the image cannot be hashed."""
    try:
        return compute_image_hash(image)
    except Exception:
        # Best-effort: an image that cannot be hashed is simply never
        # treated as a duplicate.
        return None


def _last_sent_image_hash(messages):
    """Return the hash of the most recent hashable image in user messages."""
    for msg in reversed(messages):
        if msg.get("role") != "user":
            continue
        for item in msg.get("content", []):
            if item.get("type") == "image" and item.get("image") is not None:
                digest = _hash_or_none(item["image"])
                if digest is not None:
                    return digest
    return None


def preprocess_conversation(conversation):
    """Drop a re-sent image from the latest user message.

    The most recent image found in earlier user messages is compared (by
    hash) with each image in the latest message; an image that matches is
    removed so the same picture is not fed to the model again. All other
    content items are kept in their original order.

    Args:
        conversation: List of message dicts with ``role`` and ``content``
            keys, where ``content`` is a list of ``{"type": ...}`` items.
            The last message is modified in place when its role is "user".

    Returns:
        The same ``conversation`` list. An empty conversation is returned
        unchanged instead of raising ``IndexError``.
    """
    if not conversation:
        return conversation

    latest_msg = conversation[-1]
    if latest_msg.get("role") != "user":
        return conversation

    # Only earlier messages count as "already sent".
    last_image_hash = _last_sent_image_hash(conversation[:-1])

    kept = []
    for item in latest_msg.get("content", []):
        is_image = item.get("type") == "image" and item.get("image") is not None
        # Hash the new image only when there is something to compare against.
        if is_image and last_image_hash is not None:
            if _hash_or_none(item["image"]) == last_image_hash:
                continue
        kept.append(item)
    latest_msg["content"] = kept
    return conversation
 def conversation_display(conversation):
     """Flatten the structured conversation into display messages.

     Each message's content items are rendered to text: an image item
     becomes the literal placeholder ``<image>``, a text item contributes
     its text, and any other item type is ignored. The parts are joined with
     newlines.

     Every role goes through the same rendering, so a message whose content
     is empty, or whose first item is not text, yields a string instead of
     raising ``IndexError``/``KeyError``.

     Args:
         conversation: List of message dicts with ``role`` and ``content``
             keys, where ``content`` is a list of typed items.

     Returns:
         A list of ``{"role": ..., "content": str}`` dicts, one per message,
         in the original order.
     """
     chat_history = []
     for msg in conversation:
         parts = []
         for item in msg["content"]:
             if item["type"] == "image":
                 parts.append("<image>")
             elif item["type"] == "text":
                 parts.append(item["text"])
         chat_history.append({
             "role": msg["role"],
             "content": "\n".join(parts),
         })
     return chat_history
 def clear_chat():
     """Return the reset values used to clear the chat.

     The result is a 4-tuple: two fresh empty lists, an empty string and
     ``None``. Presumably these map to the chat display, the conversation
     state, the text box and the image input -- confirm against the
     clear-button wiring, which is not visible here.
     """
     empty_display, empty_state = [], []
     return empty_display, empty_state, "", None
 with gr.Blocks(title="Granite Vision 3.1 2B", css="h1 { overflow: hidden; }") as demo:
     gr.Markdown("# [Granite Vision 3.1 2B](https://huggingface.co/ibm-granite/granite-vision-3.1-2b-preview)")
                 send_button = gr.Button("Chat")
                 clear_button = gr.Button("Clear Chat")
     state = gr.State([])
     send_button.click(