Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -25,23 +25,29 @@ def model_inference(
     user_content = []
     media_queue = []
     if history == []:
-
+        text = input_dict["text"].strip()
+
+        for file in input_dict.get("files", []):
             if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
                 media_queue.append({"type": "image", "path": file})
             elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
                 media_queue.append({"type": "video", "path": file})
 
-        text
-
+        if "<image>" in text or "<video>" in text:
+            parts = re.split(r'(<image>|<video>)', text)
+            for part in parts:
+                if part == "<image>" and media_queue:
+                    user_content.append(media_queue.pop(0))
+                elif part == "<video>" and media_queue:
+                    user_content.append(media_queue.pop(0))
+                elif part.strip():
+                    user_content.append({"type": "text", "text": part.strip()})
+        else:
+            user_content.append({"type": "text", "text": text})
+
+        for media in media_queue:
+            user_content.append(media)
 
-        for part in parts:
-            if part == "<image>" and media_queue:
-                user_content.append(media_queue.pop(0))
-            elif part == "<video>" and media_queue:
-                user_content.append(media_queue.pop(0))
-            elif part.strip():
-                user_content.append({"type": "text", "text": part.strip()})
-
         resulting_messages = [{"role": "user", "content": user_content}]
 
     elif len(history) > 0:
@@ -51,7 +57,7 @@ def model_inference(
         for hist in history:
             if hist["role"] == "user" and isinstance(hist["content"], tuple):
                 file_name = hist["content"][0]
-                if file_name.endswith((".png", ".jpg", ".jpeg"
+                if file_name.endswith((".png", ".jpg", ".jpeg")):
                     media_queue.append({"type": "image", "path": file_name})
                 elif file_name.endswith(".mp4"):
                     media_queue.append({"type": "video", "path": file_name})
@@ -120,19 +126,15 @@ def model_inference(
 
 
 examples=[
-      [{"text": "
-      [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
-      [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
+      [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
       [{"text": "What art era this artpiece <image> and this artpiece <image> belong to?", "files": ["example_images/rococo.jpg", "example_images/rococo_1.jpg"]}],
       [{"text": "Describe this image.", "files": ["example_images/campeones.jpg"]}],
       [{"text": "What does this say?", "files": ["example_images/math.jpg"]}],
       [{"text": "What is the date in this document?", "files": ["example_images/document.jpg"]}],
-      [{"text": "What is
-      [{"text": "What is happening in the video?", "files": ["example_images/barcamadridhighlights.mp4"]}],
-
+      [{"text": "What is happening in the video?", "files": ["example_images/barcamadridhighlights.mp4"]}],
 ]
 demo = gr.ChatInterface(fn=model_inference, title="SmolVLM2: The Smollest Video Model Ever 📺",
-                description="Play with [SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) in this demo. To get started, upload an image and text or try one of the examples.
+                description="Play with [SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) in this demo. To get started, upload an image and text or try one of the examples. This demo doesn't use history for the chat, so every chat you start is a new conversation.",
                 examples=examples,
                 textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"), stop_btn="Stop Generation", multimodal=True,
                 cache_examples=False,
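
For reference, here is a small standalone sketch of the placeholder handling this commit introduces: the user text is split on <image>/<video> markers with re.split, each marker is swapped for the next queued media item, and any media not referenced by a marker is appended after the text. The sample_text and sample_files values below are made-up inputs for illustration, not part of the Space.

import re

# Hypothetical inputs, for illustration only.
sample_text = "What art era does <image> belong to, compared to <image>?"
sample_files = ["example_images/rococo.jpg", "example_images/rococo_1.jpg"]

# Build the media queue the same way the updated app.py does for image uploads.
media_queue = [{"type": "image", "path": f} for f in sample_files]

user_content = []
if "<image>" in sample_text or "<video>" in sample_text:
    # The capturing group keeps the markers in the split result, so each one
    # can be replaced by the next media item in upload order.
    parts = re.split(r"(<image>|<video>)", sample_text)
    for part in parts:
        if part in ("<image>", "<video>") and media_queue:
            user_content.append(media_queue.pop(0))
        elif part.strip():
            user_content.append({"type": "text", "text": part.strip()})
else:
    user_content.append({"type": "text", "text": sample_text})

# Uploads that were never referenced by a marker still reach the model.
user_content.extend(media_queue)

resulting_messages = [{"role": "user", "content": user_content}]

The else branch and the trailing flush of media_queue are what keep prompts without placeholders, or with more uploads than placeholders, producing a complete content list.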