merve HF Staff committed
Commit 7d72183 · verified · 1 Parent(s): 843b784

Update app.py

Files changed (1)
app.py +54 -31
app.py CHANGED
@@ -10,55 +10,76 @@ subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENT
 
 from io import BytesIO
 
-processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-500M-Instruct")
-model = AutoModelForVision2Seq.from_pretrained("HuggingFaceTB/SmolVLM2-500M-Instruct",
+processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
+model = AutoModelForVision2Seq.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct",
     _attn_implementation="flash_attention_2",
     torch_dtype=torch.bfloat16).to("cuda:0")
 
 
-#@spaces.GPU
+@spaces.GPU
 def model_inference(
     input_dict, history, max_tokens
 ):
     text = input_dict["text"]
     images = []
-    # first conv turn
+    user_content = []
+    media_queue = []
     if history == []:
-        text = input_dict["text"]
-        resulting_messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
         for file in input_dict["files"]:
-            if file.endswith(".mp4"):
-                resulting_messages[0]["content"].append({"type": "video", "path": file})
-
-            elif file.endswith(".jpg") or file.endswith(".jpeg") or file.endswith(".png"):
-                resulting_messages[0]["content"].append({"type": "image", "path": file})
-
+            if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
+                media_queue.append({"type": "image", "path": file})
+            elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
+                media_queue.append({"type": "video", "path": file})
+
+        text = input_dict.get("text", "")
+        parts = re.split(r'(<image>|<video>)', text)
+
+        for part in parts:
+            if part == "<image>" and media_queue:
+                user_content.append(media_queue.pop(0))
+            elif part == "<video>" and media_queue:
+                user_content.append(media_queue.pop(0))
+            elif part.strip():
+                user_content.append({"type": "text", "text": part.strip()})
+
+        resulting_messages = [{"role": "user", "content": user_content}]
+
     elif len(history) > 0:
         resulting_messages = []
-        for entry in history:
-            if entry["role"] == "user":
-                user_content = []
-                if isinstance(entry["content"], tuple):
-                    file_name = entry["content"][0]
-                    if file_name.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
-                        user_content.append({"type": "image", "path": file_name})
-                    elif file_name.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
-                        user_content.append({"type": "video", "path": file_name})
-                elif isinstance(entry["content"], str):
-                    user_content.insert(0, {"type": "text", "text": entry["content"]})
+        user_content = []
+        media_queue = []
+        for hist in history:
+            if hist["role"] == "user" and isinstance(hist["content"], tuple):
+                file_name = hist["content"][0]
+                if file_name.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
+                    media_queue.append({"type": "image", "path": file_name})
+                elif file_name.endswith(".mp4"):
+                    media_queue.append({"type": "video", "path": file_name})
+
+
+        for hist in history:
+            if hist["role"] == "user" and isinstance(hist["content"], str):
+                text = hist["content"]
+                parts = re.split(r'(<image>|<video>)', text)
+
+                for part in parts:
+                    if part == "<image>" and media_queue:
+                        user_content.append(media_queue.pop(0))
+                    elif part == "<video>" and media_queue:
+                        user_content.append(media_queue.pop(0))
+                    elif part.strip():
+                        user_content.append({"type": "text", "text": part.strip()})
 
-            elif entry["role"] == "assistant":
+            elif hist["role"] == "assistant":
                 resulting_messages.append({
                     "role": "user",
                     "content": user_content
                 })
                 resulting_messages.append({
                     "role": "assistant",
-                    "content": [{"type": "text", "text": entry["content"]}]
+                    "content": [{"type": "text", "text": hist["content"]}]
                 })
-                user_content = []
-
-
+                user_content = []
 
 
     if text == "" and not images:
@@ -66,7 +87,7 @@ def model_inference(
 
     if text == "" and images:
         gr.Error("Please input a text query along the images(s).")
-
+    print("resulting_messages", resulting_messages)
     inputs = processor.apply_chat_template(
         resulting_messages,
         add_generation_prompt=True,
@@ -102,14 +123,16 @@ examples=[
     [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
     [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
     [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
-    [{"text": "What art era do these artpieces belong to?", "files": ["example_images/rococo.jpg", "example_images/rococo_1.jpg"]}],
+    [{"text": "What art era this artpiece <image> and this artpiece <image> belong to?", "files": ["example_images/rococo.jpg", "example_images/rococo_1.jpg"]}],
     [{"text": "Describe this image.", "files": ["example_images/campeones.jpg"]}],
     [{"text": "What does this say?", "files": ["example_images/math.jpg"]}],
    [{"text": "What is the date in this document?", "files": ["example_images/document.jpg"]}],
     [{"text": "What is this UI about?", "files": ["example_images/s2w_example.png"]}],
+    [{"text": "What is happening in the video?", "files": ["barcamadrichighlights.mpg"]}],
+
 ]
 demo = gr.ChatInterface(fn=model_inference, title="SmolVLM2: The Smollest Video Model Ever 📺",
-    description="Play with [SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) in this demo. To get started, upload an image and text or try one of the examples. This demo doesn't use history for the chat, so every chat you start is a new conversation.",
+    description="Play with [SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) in this demo. To get started, upload an image and text or try one of the examples. To see how to interleave images, check the multiple image example.",
     examples=examples,
     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"), stop_btn="Stop Generation", multimodal=True,
     cache_examples=False,
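
The core of this commit is the <image>/<video> placeholder handling added to model_inference. The snippet below is a minimal standalone sketch of that interleaving logic, not part of the commit itself: build_user_content is an illustrative helper name, and the sample text and file paths are taken from the updated examples list. It can be run without the model or the Space to inspect the message structure that gets passed to processor.apply_chat_template.

import re

def build_user_content(text, files):
    # Sketch of the logic this commit adds: queue the uploaded files, then walk
    # the text and replace each <image>/<video> placeholder with the next file.
    media_queue = []
    for path in files:
        if path.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
            media_queue.append({"type": "image", "path": path})
        elif path.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
            media_queue.append({"type": "video", "path": path})

    user_content = []
    # The capturing group keeps the <image>/<video> tokens in the split result.
    for part in re.split(r'(<image>|<video>)', text):
        if part in ("<image>", "<video>") and media_queue:
            user_content.append(media_queue.pop(0))
        elif part.strip():
            user_content.append({"type": "text", "text": part.strip()})
    return [{"role": "user", "content": user_content}]

messages = build_user_content(
    "What art era this artpiece <image> and this artpiece <image> belong to?",
    ["example_images/rococo.jpg", "example_images/rococo_1.jpg"],
)
print(messages)
# A single user turn whose content alternates text and image chunks, in the
# shape that app.py feeds to processor.apply_chat_template(..., add_generation_prompt=True).

Placeholders are consumed in upload order, so the first <image> in the text maps to the first uploaded file, the second to the next, and so on.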