# SmolVLM2 / app.py
# Hugging Face Space file-viewer header (page residue, not part of the program):
#   last change by merve (HF Staff): "Update app.py", commit cf8e08c (verified), 5.21 kB
import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
from threading import Thread
import re
import time
import torch
import spaces
import subprocess
import os
import sys

# Install flash-attn at startup, before the model is loaded with
# flash_attention_2. The child process inherits the current environment and
# only adds FLASH_ATTENTION_SKIP_CUDA_BUILD; passing a bare one-entry dict as
# `env` would replace the whole environment (PATH, HOME, virtualenv vars).
# NOTE(review): the variable presumably makes the flash-attn setup skip
# compiling CUDA kernels -- confirm against the flash-attn build docs.
# `sys.executable -m pip` targets the interpreter running this app, and the
# argument list avoids going through a shell.
_flash_attn_install = subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=False,  # best effort, as before: a failed install must not abort startup here
)
if _flash_attn_install.returncode != 0:
    print(
        f"flash-attn installation exited with code {_flash_attn_install.returncode}",
        file=sys.stderr,
    )
from io import BytesIO
# Hub checkpoint used for both the processor and the model weights.
_CHECKPOINT = "HuggingFaceTB/SmolVLM2-500M-Instruct"

# Processor matching the checkpoint; used later to apply the chat template and
# as the tokenizer handed to the text streamer.
processor = AutoProcessor.from_pretrained(_CHECKPOINT)

# Model weights in bfloat16 with the flash-attention-2 implementation, moved
# to the first CUDA device.
model = AutoModelForVision2Seq.from_pretrained(
    _CHECKPOINT,
    torch_dtype=torch.bfloat16,
    _attn_implementation="flash_attention_2",
)
model = model.to("cuda:0")
# File suffixes accepted as image / video attachments (matched case-insensitively).
_IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".gif", ".bmp")
_VIDEO_EXTENSIONS = (".mp4", ".mov", ".avi", ".mkv", ".flv")


def _media_part(path):
    """Return the chat-template part for an attachment, or None if its suffix is unsupported."""
    lowered = path.lower()
    if lowered.endswith(_IMAGE_EXTENSIONS):
        return {"type": "image", "path": path}
    if lowered.endswith(_VIDEO_EXTENSIONS):
        return {"type": "video", "path": path}
    return None


def _user_message(texts, media_parts):
    """Build one user message with its text parts first, followed by its media parts."""
    content = [{"type": "text", "text": chunk} for chunk in texts]
    return {"role": "user", "content": content + media_parts}


def _build_messages(text, files, history):
    """Convert the chat history plus the current turn into chat-template messages."""
    messages = []
    pending_texts, pending_media = [], []

    for entry in history:
        role, content = entry["role"], entry["content"]
        if role == "user":
            # A single user turn can span several history entries (file entries
            # carry a tuple whose first item is the path, text entries a str),
            # so accumulate them until the assistant reply closes the turn.
            if isinstance(content, tuple) and content:
                part = _media_part(content[0])
                if part is not None:
                    pending_media.append(part)
            elif isinstance(content, str):
                pending_texts.append(content)
        elif role == "assistant":
            # Skip the user message when nothing was collected, so no message
            # with an empty content list is produced.
            if pending_texts or pending_media:
                messages.append(_user_message(pending_texts, pending_media))
            messages.append(
                {"role": "assistant", "content": [{"type": "text", "text": content}]}
            )
            pending_texts, pending_media = [], []

    # The current turn always closes the conversation. User entries not yet
    # answered by the assistant are merged into it so roles keep alternating.
    current_media = [part for part in map(_media_part, files) if part is not None]
    messages.append(
        _user_message(pending_texts + [text], pending_media + current_media)
    )
    return messages


# @spaces.GPU  (left disabled, as in the original file)
def model_inference(input_dict, history, max_tokens):
    """Stream the model's reply to the current multimodal chat message.

    Args:
        input_dict: Current message as a dict with a "text" string and a
            "files" list of file paths.
        history: Earlier turns as a list of {"role", "content"} dicts; user
            content is either a str (text) or a tuple whose first item is a
            file path.
        max_tokens: Upper bound passed to generation as ``max_new_tokens``.

    Yields:
        "..." as a placeholder first, then the reply accumulated so far after
        each streamed chunk.

    Raises:
        gr.Error: If the message has no text, with or without attachments.
    """
    text = input_dict.get("text") or ""
    files = input_dict.get("files") or []

    # Validate before doing any model work; the error must be raised (not just
    # constructed) to reach the UI.
    if not text.strip():
        if files:
            raise gr.Error("Please input a text query along the images(s).")
        raise gr.Error("Please input a query and optionally image(s).")

    resulting_messages = _build_messages(text, files, history or [])

    inputs = processor.apply_chat_template(
        resulting_messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)

    # Run generation in a background thread; the streamer hands decoded text
    # back to this generator as it is produced.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_args = dict(inputs, streamer=streamer, max_new_tokens=int(max_tokens))
    thread = Thread(target=model.generate, kwargs=generation_args)
    thread.start()

    yield "..."

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer

    thread.join()
# One-click examples for the chat UI. Each example is a single-element row
# holding the multimodal message (prompt text plus attached file paths).
examples = [
    [{"text": prompt, "files": list(paths)}]
    for prompt, paths in (
        ("Can you describe this image?", ("example_images/newyork.jpg",)),
        ("Can you describe this image?", ("example_images/dogs.jpg",)),
        (
            "Where do the severe droughts happen according to this diagram?",
            ("example_images/examples_weather_events.png",),
        ),
        (
            "What art era do these artpieces belong to?",
            ("example_images/rococo.jpg", "example_images/rococo_1.jpg"),
        ),
        ("Describe this image.", ("example_images/campeones.jpg",)),
        ("What does this say?", ("example_images/math.jpg",)),
        ("What is the date in this document?", ("example_images/document.jpg",)),
        ("What is this UI about?", ("example_images/s2w_example.png",)),
    )
]
# Chat UI: a multimodal textbox (images and .mp4 files) wired to
# model_inference, plus a slider supplying its max_tokens argument.
# The description names the checkpoint this app actually loads
# (SmolVLM2-500M-Instruct) and reflects that earlier turns are sent to the model.
demo = gr.ChatInterface(
    fn=model_inference,
    title="SmolVLM2: The Smollest Video Model Ever 📺",
    description=(
        "Play with [SmolVLM2-500M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Instruct) in this demo. "
        "To get started, upload an image and text or try one of the examples. "
        "Earlier turns of the conversation are passed back to the model as context."
    ),
    examples=examples,
    textbox=gr.MultimodalTextbox(
        label="Query Input",
        file_types=["image", ".mp4"],
        file_count="multiple",
    ),
    stop_btn="Stop Generation",
    multimodal=True,
    cache_examples=False,
    additional_inputs=[
        gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens"),
    ],
    type="messages",
)

demo.launch(debug=True)