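"""Gradio demo for SmolVLM2-2.2B-Instruct: chat about interleaved images and short videos, with streamed replies."""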
import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
from threading import Thread
import re
import time
import torch
import spaces
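# flash-attn is installed at runtime (skipping the CUDA build step), a common pattern on Hugging Face Spaces.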
import subprocess
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

from io import BytesIO

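# Load the SmolVLM2 processor and model; FlashAttention-2 with bfloat16 keeps memory use and latency low.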
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
model = AutoModelForVision2Seq.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct", 
                                               _attn_implementation="flash_attention_2",
                                               torch_dtype=torch.bfloat16).to("cuda:0")


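# Chat callback: rebuilds the multimodal conversation from Gradio's message history and streams the reply.
# @spaces.GPU requests a GPU for the duration of the call on ZeroGPU Spaces.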
@spaces.GPU
def model_inference(input_dict, history, max_tokens):
    text = input_dict["text"]
    images = []
    user_content = []
    media_queue = []
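    # First turn: sort the uploaded files into an ordered media queue.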
    if history == []:
        for file in input_dict["files"]:
            if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
                media_queue.append({"type": "image", "path": file})
            elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
                media_queue.append({"type": "video", "path": file})

        text = input_dict.get("text", "")
        parts = re.split(r'(<image>|<video>)', text)  

        # Splice queued media into the prompt wherever an <image>/<video> placeholder appears.
        for part in parts:
            if part == "<image>" and media_queue:
                user_content.append(media_queue.pop(0))
            elif part == "<video>" and media_queue:
                user_content.append(media_queue.pop(0))
            elif part.strip():
                user_content.append({"type": "text", "text": part.strip()})

        # If the prompt has no placeholders, attach any remaining uploads after the text.
        user_content.extend(media_queue)

        resulting_messages = [{"role": "user", "content": user_content}]

    elif len(history) > 0:
        resulting_messages = []
        user_content = []
        media_queue = []
        # First pass over history: queue every file the user uploaded in earlier turns.
        for hist in history:
            if hist["role"] == "user" and isinstance(hist["content"], tuple):
                file_name = hist["content"][0]
                if file_name.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
                    media_queue.append({"type": "image", "path": file_name})
                elif file_name.endswith(".mp4"):
                    media_queue.append({"type": "video", "path": file_name})


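        # Second pass over history: rebuild alternating user/assistant messages, attaching the queued media.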
        for hist in history:
            if hist["role"] == "user" and isinstance(hist["content"], str): 
                text = hist["content"]
                parts = re.split(r'(<image>|<video>)', text)  
                
                for part in parts:
                    if part == "<image>" and media_queue:
                        user_content.append(media_queue.pop(0)) 
                    elif part == "<video>" and media_queue:
                        user_content.append(media_queue.pop(0))  
                    elif part.strip(): 
                        user_content.append({"type": "text", "text": part.strip()})
            
            elif hist["role"] == "assistant":
                resulting_messages.append({
                    "role": "user",
                    "content": user_content
                })
                resulting_messages.append({
                    "role": "assistant",
                    "content": [{"type": "text", "text": hist["content"]}]
                })
                user_content = []

        # Append the current turn: queue the new uploads, then interleave them with the new text.
        for file in input_dict.get("files", []):
            if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
                media_queue.append({"type": "image", "path": file})
            elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
                media_queue.append({"type": "video", "path": file})

        parts = re.split(r'(<image>|<video>)', input_dict.get("text", ""))
        for part in parts:
            if part in ("<image>", "<video>") and media_queue:
                user_content.append(media_queue.pop(0))
            elif part.strip():
                user_content.append({"type": "text", "text": part.strip()})
        user_content.extend(media_queue)

        resulting_messages.append({"role": "user", "content": user_content})

    if text == "" and not images:
        raise gr.Error("Please input a query and optionally image(s).")

    if text == "" and images:
        raise gr.Error("Please input a text query along with the image(s).")
    print("resulting_messages", resulting_messages)
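    # Apply the chat template to get tokenized text plus pixel values, ready for generate().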
    inputs = processor.apply_chat_template(
        resulting_messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )

    # Match the model's device and dtype (the model was loaded in bfloat16).
    inputs = inputs.to(model.device, dtype=torch.bfloat16)
    

    # Stream generation: run model.generate in a background thread and read tokens as they arrive.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_tokens)

    thread = Thread(target=model.generate, kwargs=generation_args)
    thread.start()

    yield "..."
    buffer = ""

    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)  # brief pause so the UI updates smoothly
        yield buffer


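# Example prompts; the rococo example shows how <image> placeholders interleave multiple images with text.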
examples = [
    [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
    [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
    [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
    [{"text": "What art era do this artpiece <image> and this artpiece <image> belong to?", "files": ["example_images/rococo.jpg", "example_images/rococo_1.jpg"]}],
    [{"text": "Describe this image.", "files": ["example_images/campeones.jpg"]}],
    [{"text": "What does this say?", "files": ["example_images/math.jpg"]}],
    [{"text": "What is the date in this document?", "files": ["example_images/document.jpg"]}],
    [{"text": "What is this UI about?", "files": ["example_images/s2w_example.png"]}],
    [{"text": "What is happening in the video?", "files": ["barcamadrichighlights.mp4"]}],
]
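# Multimodal ChatInterface: streaming chat UI with a stop button and a max-new-tokens slider.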
demo = gr.ChatInterface(
    fn=model_inference,
    title="SmolVLM2: The Smollest Video Model Ever 📺",
    description="Play with [SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) in this demo. To get started, upload an image and text or try one of the examples. To see how to interleave images, check the multiple image example.",
    examples=examples,
    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"),
    stop_btn="Stop Generation",
    multimodal=True,
    cache_examples=False,
    additional_inputs=[gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens")],
    type="messages",
)

demo.launch(debug=True)