merve (HF staff) committed
Commit de4762a · verified · 1 Parent(s): 18c7142

Update app.py

Files changed (1)
  1. app.py +46 -59
app.py CHANGED
@@ -1,73 +1,63 @@
  import gradio as gr
  from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
- from transformers.models.smolvlm.video_processing_smolvlm import load_smolvlm_video
- from transformers.image_utils import load_image
  from threading import Thread
  import re
  import time
  import torch
- #import spaces
+ import spaces
  #import subprocess
  #subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
  
  from io import BytesIO
- from transformers.image_utils import load_image
  
- processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
- model = AutoModelForVision2Seq.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+ processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-500M-Instruct")
+ model = AutoModelForVision2Seq.from_pretrained("HuggingFaceTB/SmolVLM2-500M-Instruct",
                                               _attn_implementation="flash_attention_2",
-                                              torch_dtype=torch.bfloat16, device_map="auto")
+                                              torch_dtype=torch.bfloat16).to("cuda:0")
  
  
- #@spaces.GPU
+ @spaces.GPU
  def model_inference(
-     input_dict, history
+     input_dict, history, max_tokens
  ):
      text = input_dict["text"]
-     # first turn input_dict {'text': 'What', 'files': ['/tmp/gradio/0350274350a64a5737e1a5732f014aee2f28bb7344bbad5105c0d0b7e7334375/cats_2.mp4', '/tmp/gradio/2dd39f382fcf5444a1a2ac57ed6f9acafa775dd855248cf273034e8ce18aeff4/IMG_2201.JPG']}
-     # first turn history []
-     print("input_dict", input_dict)
-     print("history", history)
-     print("model.device", model.device)
      images = []
      # first conv turn
      if history == []:
          text = input_dict["text"]
-         resulting_messages = [{"role": "user", "content": [{"type": "text"}, {"type": "text", "text": text}]}]
+         resulting_messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
          for file in input_dict["files"]:
              if file.endswith(".mp4"):
-                 resulting_messages[0]["content"].append({"type": "video"})
-                 frames, timestamps, duration_sec = load_smolvlm_video(
-                     file, sampling_fps=1, max_frames=64
-                 )
-                 print("frames", frames)
-                 images.append(frames)
+                 resulting_messages[0]["content"].append({"type": "video", "path": file})
+ 
              elif file.endswith(".jpg") or file.endswith(".jpeg") or file.endswith(".png"):
-                 resulting_messages[0]["content"].append({"type": "image"})
-                 images.append(load_image(file))
-         print("images", images)
- 
-     # second turn input_dict {'text': 'what', 'files': ['/tmp/gradio/7bafdcc4722c4b9902a4936439b3bb694927abd72106a946d773a15cc1c630d7/IMG_2198.JPG']}
-     # second turn history [[('/tmp/gradio/7bafdcc4722c4b9902a4936439b3bb694927abd72106a946d773a15cc1c630d7/IMG_2198.JPG',), None],
-     #                      [('/tmp/gradio/5b105e97e4876912b4e763902144540bd3ab00d9fd4016491337ee4f4c36f320/football.mp4',), None], ['what', None]]
- 
-     # later conv turn
+                 resulting_messages[0]["content"].append({"type": "image", "path": file})
+ 
      elif len(history) > 0:
-         for hist in history:
-             if isinstance(hist[0], tuple):
-                 if hist[0][0].endswith(".mp4"):
-                     resulting_messages.append({"role": "user", "content": [{"type": "video"}, {"type": "text", "text": hist[0][0]}]})
-                     frames, timestamps, duration_sec = load_smolvlm_video(
-                         file, sampling_fps=1, max_frames=64
-                     )
-                     images.append(frames)
-                 else:
-                     resulting_messages.append({"role": "user", "content": [{"type": "image"}, {"type": "text", "text": hist[0][0]}]})
-                     images.append(load_image(hist[0][0]))
-             elif isinstance(hist[0], str):
-                 resulting_messages.append({"role": "user", "content": [{"type": "text"}, {"type": "text", "text": hist[0]}]})
-             if isinstance(hist[1], str):
-                 resulting_messages.append({"role": "user", "content": [{"type": "text"}, {"type": "text", "text": hist[0]}]})
+         resulting_messages = []
+         for entry in history:
+             if entry["role"] == "user":
+                 user_content = []
+                 if isinstance(entry["content"], tuple):
+                     file_name = entry["content"][0]
+                     if file_name.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
+                         user_content.append({"type": "image", "path": file_name})
+                     elif file_name.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
+                         user_content.append({"type": "video", "path": file_name})
+                 elif isinstance(entry["content"], str):
+                     user_content.insert(0, {"type": "text", "text": entry["content"]})
+ 
+             elif entry["role"] == "assistant":
+                 resulting_messages.append({
+                     "role": "user",
+                     "content": user_content
+                 })
+                 resulting_messages.append({
+                     "role": "assistant",
+                     "content": [{"type": "text", "text": entry["content"]}]
+                 })
+                 user_content = []
+ 
  
  
  
@@ -75,26 +65,22 @@ def model_inference(
          gr.Error("Please input a query and optionally image(s).")
  
      if text == "" and images:
-         gr.Error("Please input a text query along the image(s).")
+         gr.Error("Please input a text query along the images(s).")
  
-     print("resulting_messages", resulting_messages)
-     prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
+     inputs = processor.apply_chat_template(
+         resulting_messages,
+         add_generation_prompt=True,
+         tokenize=True,
+         return_dict=True,
+         return_tensors="pt",
+     )
  
-     inputs = processor(text=prompt, images=[images], padding=True, return_tensors="pt")
      inputs = inputs.to(model.device)
-     generation_args = {
-         "input_ids": inputs.input_ids,
-         "pixel_values": inputs.pixel_values,
-         "attention_mask": inputs.attention_mask,
-         "num_return_sequences": 1,
-         "no_repeat_ngram_size": 2,
-         "max_new_tokens": 500,
-         "min_new_tokens": 10,
-     }
+ 
  
      # Generate
      streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-     generation_args = dict(inputs, streamer=streamer, max_new_tokens=500)
+     generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_tokens)
      generated_text = ""
  
      thread = Thread(target=model.generate, kwargs=generation_args)
@@ -127,6 +113,7 @@ demo = gr.ChatInterface(fn=model_inference, title="SmolVLM2: The Smollest Video
      examples=examples,
      textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"), stop_btn="Stop Generation", multimodal=True,
      cache_examples=False,
+     additional_inputs=[gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens")],
      type="messages"
      )
  
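
Editor's sketch (not part of this commit): the updated app builds chat messages in which image and video entries carry a "path", then hands them to processor.apply_chat_template, which tokenizes and returns tensors directly instead of the old load_smolvlm_video plus processor(...) two-step. A minimal standalone version of that call path, assuming a transformers release with SmolVLM2 support and a CUDA device; the prompt text and "sample.mp4" are placeholders, not files from the Space:

import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

# Same checkpoint the commit switches to.
model_id = "HuggingFaceTB/SmolVLM2-500M-Instruct"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForVision2Seq.from_pretrained(model_id, torch_dtype=torch.bfloat16).to("cuda:0")

# Message format built by the updated app: media entries carry a "path" and the
# processor samples video frames itself (no manual load_smolvlm_video call).
messages = [{
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe this clip."},  # placeholder prompt
        {"type": "video", "path": "sample.mp4"},          # placeholder local file
    ],
}]

inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device, dtype=torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=200)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])

The Space wraps the same generate call in a Thread with a TextIteratorStreamer to stream tokens into the Gradio chat; the sketch above simply decodes the full output at once.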