Spaces:
Running
on
Zero
Running
on
Zero
File size: 6,146 Bytes
e352103 36be50d e352103 cf8e08c e352103 18c7142 7d72183 18c7142 de4762a e352103 7d72183 e352103 de4762a 36be50d 18c7142 7d72183 18c7142 7d72183 18c7142 de4762a 7d72183 de4762a 7d72183 de4762a 7d72183 de4762a 7d72183 36be50d 18c7142 e352103 de4762a 7d72183 de4762a 18c7142 de4762a e352103 fe4fa5b de4762a 36be50d 6d64276 36be50d d91c73b 7869234 1f3cc30 7d72183 1f3cc30 c1d3aca 7d72183 36be50d 18c7142 7d72183 36be50d 18c7142 de4762a 18c7142 36be50d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
from threading import Thread
import re
import time
import torch
import spaces
import subprocess
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
from io import BytesIO
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
model = AutoModelForVision2Seq.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct",
_attn_implementation="flash_attention_2",
torch_dtype=torch.bfloat16).to("cuda:0")
@spaces.GPU
def model_inference(
input_dict, history, max_tokens
):
text = input_dict["text"]
images = []
user_content = []
media_queue = []
if history == []:
for file in input_dict["files"]:
if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
media_queue.append({"type": "image", "path": file})
elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
media_queue.append({"type": "video", "path": file})
text = input_dict.get("text", "")
parts = re.split(r'(<image>|<video>)', text)
for part in parts:
if part == "<image>" and media_queue:
user_content.append(media_queue.pop(0))
elif part == "<video>" and media_queue:
user_content.append(media_queue.pop(0))
elif part.strip():
user_content.append({"type": "text", "text": part.strip()})
resulting_messages = [{"role": "user", "content": user_content}]
elif len(history) > 0:
resulting_messages = []
user_content = []
media_queue = []
for hist in history:
if hist["role"] == "user" and isinstance(hist["content"], tuple):
file_name = hist["content"][0]
if file_name.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
media_queue.append({"type": "image", "path": file_name})
elif file_name.endswith(".mp4"):
media_queue.append({"type": "video", "path": file_name})
for hist in history:
if hist["role"] == "user" and isinstance(hist["content"], str):
text = hist["content"]
parts = re.split(r'(<image>|<video>)', text)
for part in parts:
if part == "<image>" and media_queue:
user_content.append(media_queue.pop(0))
elif part == "<video>" and media_queue:
user_content.append(media_queue.pop(0))
elif part.strip():
user_content.append({"type": "text", "text": part.strip()})
elif hist["role"] == "assistant":
resulting_messages.append({
"role": "user",
"content": user_content
})
resulting_messages.append({
"role": "assistant",
"content": [{"type": "text", "text": hist["content"]}]
})
user_content = []
if text == "" and not images:
gr.Error("Please input a query and optionally image(s).")
if text == "" and images:
gr.Error("Please input a text query along the images(s).")
print("resulting_messages", resulting_messages)
inputs = processor.apply_chat_template(
resulting_messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
)
inputs = inputs.to(model.device)
# Generate
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_tokens)
generated_text = ""
thread = Thread(target=model.generate, kwargs=generation_args)
thread.start()
yield "..."
buffer = ""
for new_text in streamer:
buffer += new_text
generated_text_without_prompt = buffer#[len(ext_buffer):]
time.sleep(0.01)
yield buffer
examples=[
[{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
[{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
[{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
[{"text": "What art era this artpiece <image> and this artpiece <image> belong to?", "files": ["example_images/rococo.jpg", "example_images/rococo_1.jpg"]}],
[{"text": "Describe this image.", "files": ["example_images/campeones.jpg"]}],
[{"text": "What does this say?", "files": ["example_images/math.jpg"]}],
[{"text": "What is the date in this document?", "files": ["example_images/document.jpg"]}],
[{"text": "What is this UI about?", "files": ["example_images/s2w_example.png"]}],
[{"text": "What is happening in the video?", "files": ["barcamadrichighlights.mp4"]}],
]
demo = gr.ChatInterface(fn=model_inference, title="SmolVLM2: The Smollest Video Model Ever 📺",
description="Play with [SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) in this demo. To get started, upload an image and text or try one of the examples. To see how to interleave images, check the multiple image example.",
examples=examples,
textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"), stop_btn="Stop Generation", multimodal=True,
cache_examples=False,
additional_inputs=[gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens")],
type="messages"
)
demo.launch(debug=True)
|