Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,55 +1,87 @@
 import gradio as gr
 from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
+from transformers.models.smolvlm.video_processing_smolvlm import load_smolvlm_video
 from transformers.image_utils import load_image
 from threading import Thread
 import re
 import time
 import torch
-import spaces
+#import spaces
 #import subprocess
 #subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
+from io import BytesIO
+from transformers.image_utils import load_image
+
 
-processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct-250M")
-model = AutoModelForVision2Seq.from_pretrained("HuggingFaceTB/SmolVLM-Instruct-250M",
-                                               torch_dtype=torch.bfloat16,
-                                               #_attn_implementation="flash_attention_2"
-                                               ).to("cuda")
+processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
+model = AutoModelForVision2Seq.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+                                               _attn_implementation="flash_attention_2",
+                                               torch_dtype=torch.bfloat16, device_map="auto")
 
-
+#@spaces.GPU
 def model_inference(
     input_dict, history
 ):
     text = input_dict["text"]
-
-
-
-
-
-
-
+    # first turn input_dict {'text': 'What', 'files': ['/tmp/gradio/0350274350a64a5737e1a5732f014aee2f28bb7344bbad5105c0d0b7e7334375/cats_2.mp4', '/tmp/gradio/2dd39f382fcf5444a1a2ac57ed6f9acafa775dd855248cf273034e8ce18aeff4/IMG_2201.JPG']}
+    # first turn history []
+    print("input_dict", input_dict)
+    print("history", history)
+    print("model.device", model.device)
+    images = []
+    # first conv turn
+    if history == []:
+        text = input_dict["text"]
+        resulting_messages = [{"role": "user", "content": [{"type": "text"}, {"type": "text", "text": text}]}]
+        for file in input_dict["files"]:
+            if file.endswith(".mp4"):
+                resulting_messages[0]["content"].append({"type": "video"})
+                frames, timestamps, duration_sec = load_smolvlm_video(
+                    file, sampling_fps=1, max_frames=64
+                )
+                print("frames", frames)
+                images.append(frames)
+            elif file.endswith(".jpg") or file.endswith(".jpeg") or file.endswith(".png"):
+                resulting_messages[0]["content"].append({"type": "image"})
+                images.append(load_image(file))
+        print("images", images)
+
+    # second turn input_dict {'text': 'what', 'files': ['/tmp/gradio/7bafdcc4722c4b9902a4936439b3bb694927abd72106a946d773a15cc1c630d7/IMG_2198.JPG']}
+    # second turn history [[('/tmp/gradio/7bafdcc4722c4b9902a4936439b3bb694927abd72106a946d773a15cc1c630d7/IMG_2198.JPG',), None],
+    #                      [('/tmp/gradio/5b105e97e4876912b4e763902144540bd3ab00d9fd4016491337ee4f4c36f320/football.mp4',), None], ['what', None]]
+
+    # later conv turn
+    elif len(history) > 0:
+        for hist in history:
+            if isinstance(hist[0], tuple):
+                if hist[0][0].endswith(".mp4"):
+                    resulting_messages.append({"role": "user", "content": [{"type": "video"}, {"type": "text", "text": hist[0][0]}]})
+                    frames, timestamps, duration_sec = load_smolvlm_video(
+                        file, sampling_fps=1, max_frames=64
+                    )
+                    images.append(frames)
+                else:
+                    resulting_messages.append({"role": "user", "content": [{"type": "image"}, {"type": "text", "text": hist[0][0]}]})
+                    images.append(load_image(hist[0][0]))
+            elif isinstance(hist[0], str):
+                resulting_messages.append({"role": "user", "content": [{"type": "text"}, {"type": "text", "text": hist[0]}]})
+            if isinstance(hist[1], str):
+                resulting_messages.append({"role": "user", "content": [{"type": "text"}, {"type": "text", "text": hist[0]}]})
 
 
+
     if text == "" and not images:
         gr.Error("Please input a query and optionally image(s).")
 
     if text == "" and images:
         gr.Error("Please input a text query along the image(s).")
 
-
-
-
-    resulting_messages = [
-        {
-            "role": "user",
-            "content": [{"type": "image"} for _ in range(len(images))] + [
-                {"type": "text", "text": text}
-            ]
-        }
-    ]
+    print("resulting_messages", resulting_messages)
     prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
-
-    inputs = 
+
+    inputs = processor(text=prompt, images=[images], padding=True, return_tensors="pt")
+    inputs = inputs.to(model.device)
     generation_args = {
         "input_ids": inputs.input_ids,
         "pixel_values": inputs.pixel_values,
@@ -90,11 +122,12 @@ examples=[
     [{"text": "What is the date in this document?", "files": ["example_images/document.jpg"]}],
     [{"text": "What is this UI about?", "files": ["example_images/s2w_example.png"]}],
 ]
-demo = gr.ChatInterface(fn=model_inference, title="
-                        description="Play with [
+demo = gr.ChatInterface(fn=model_inference, title="SmolVLM2: The Smollest Video Model Ever 📺",
+                        description="Play with [SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) in this demo. To get started, upload an image and text or try one of the examples. This demo doesn't use history for the chat, so every chat you start is a new conversation.",
                         examples=examples,
-                        textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"), stop_btn="Stop Generation", multimodal=True,
-                        cache_examples=False
+                        textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"), stop_btn="Stop Generation", multimodal=True,
+                        cache_examples=False,
+                        type="messages"
 )
 
 
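For reference, the first-turn path of the updated model_inference reduces to: build a chat message with a {"type": "video"} or {"type": "image"} placeholder, apply the chat template, run the processor on the prompt plus the decoded media, and generate. The sketch below is a minimal, standalone illustration of that flow, not the Space's own code: the load_smolvlm_video import and its sampling_fps/max_frames signature are taken verbatim from the diff above and assume a transformers build that ships them, while the video path, query text, and generation settings are placeholders.

import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.models.smolvlm.video_processing_smolvlm import load_smolvlm_video  # import as used in the diff

# Same checkpoint as the updated Space.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# One user turn: a video placeholder followed by the text query.
messages = [{"role": "user",
             "content": [{"type": "video"}, {"type": "text", "text": "Describe this clip."}]}]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

# Sample frames at 1 fps, capped at 64 frames, mirroring the Space's call.
frames, timestamps, duration_sec = load_smolvlm_video("example.mp4", sampling_fps=1, max_frames=64)

# The Space wraps all media for the prompt in one outer list (a single batch element),
# so one video's frame list becomes [[frames]] here.
inputs = processor(text=prompt, images=[[frames]], padding=True, return_tensors="pt").to(model.device)
generated_ids = model.generate(**inputs, max_new_tokens=256)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])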