Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -24,7 +24,7 @@ from transformers.image_utils import load_image
|
|
24 |
MAX_MAX_NEW_TOKENS = 2048
|
25 |
DEFAULT_MAX_NEW_TOKENS = 1024
|
26 |
# Increase or disable input truncation to avoid token mismatches
|
27 |
-
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "
|
28 |
|
29 |
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
30 |
|
@@ -34,7 +34,7 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
|
34 |
MODEL_ID,
|
35 |
trust_remote_code=True,
|
36 |
torch_dtype=torch.float16
|
37 |
-
).to(
|
38 |
|
39 |
def downsample_video(video_path):
|
40 |
"""
|
@@ -80,15 +80,14 @@ def generate_image(text: str, image: Image.Image,
|
|
80 |
]
|
81 |
}]
|
82 |
prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
83 |
-
# Use max-length padding and enable truncation
|
84 |
inputs = processor(
|
85 |
text=[prompt_full],
|
86 |
images=[image],
|
87 |
return_tensors="pt",
|
88 |
-
padding="max_length",
|
89 |
-
truncation=True,
|
90 |
max_length=MAX_INPUT_TOKEN_LENGTH
|
91 |
-
).to(
|
92 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
93 |
generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
|
94 |
thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
|
@@ -120,20 +119,19 @@ def generate_video(text: str, video_path: str,
|
|
120 |
{"role": "user", "content": [{"type": "text", "text": text}]}
|
121 |
]
|
122 |
# Append each frame with its timestamp.
|
123 |
-
for
|
|
|
124 |
messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
|
125 |
messages[1]["content"].append({"type": "image", "image": image})
|
126 |
-
|
127 |
-
# Enable truncation in template application
|
128 |
inputs = processor.apply_chat_template(
|
129 |
messages,
|
130 |
tokenize=True,
|
131 |
add_generation_prompt=True,
|
132 |
return_dict=True,
|
133 |
return_tensors="pt",
|
134 |
-
truncation=True,
|
135 |
max_length=MAX_INPUT_TOKEN_LENGTH
|
136 |
-
).to(
|
137 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
138 |
generation_kwargs = {
|
139 |
**inputs,
|
@@ -165,6 +163,7 @@ video_examples = [
|
|
165 |
["Identify the main actions in the video", "videos/2.mp4"]
|
166 |
]
|
167 |
|
|
|
168 |
css = """
|
169 |
.submit-btn {
|
170 |
background-color: #2980b9 !important;
|
|
|
24 |
MAX_MAX_NEW_TOKENS = 2048
|
25 |
DEFAULT_MAX_NEW_TOKENS = 1024
|
26 |
# Increase or disable input truncation to avoid token mismatches
|
27 |
+
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "8192"))
|
28 |
|
29 |
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
30 |
|
|
|
34 |
MODEL_ID,
|
35 |
trust_remote_code=True,
|
36 |
torch_dtype=torch.float16
|
37 |
+
).to("cuda").eval()
|
38 |
|
39 |
def downsample_video(video_path):
|
40 |
"""
|
|
|
80 |
]
|
81 |
}]
|
82 |
prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
|
|
83 |
inputs = processor(
|
84 |
text=[prompt_full],
|
85 |
images=[image],
|
86 |
return_tensors="pt",
|
87 |
+
padding=True,
|
88 |
+
truncation=False,
|
89 |
max_length=MAX_INPUT_TOKEN_LENGTH
|
90 |
+
).to("cuda")
|
91 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
92 |
generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
|
93 |
thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
|
|
|
119 |
{"role": "user", "content": [{"type": "text", "text": text}]}
|
120 |
]
|
121 |
# Append each frame with its timestamp.
|
122 |
+
for frame in frames:
|
123 |
+
image, timestamp = frame
|
124 |
messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
|
125 |
messages[1]["content"].append({"type": "image", "image": image})
|
|
|
|
|
126 |
inputs = processor.apply_chat_template(
|
127 |
messages,
|
128 |
tokenize=True,
|
129 |
add_generation_prompt=True,
|
130 |
return_dict=True,
|
131 |
return_tensors="pt",
|
132 |
+
truncation=False,
|
133 |
max_length=MAX_INPUT_TOKEN_LENGTH
|
134 |
+
).to("cuda")
|
135 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
136 |
generation_kwargs = {
|
137 |
**inputs,
|
|
|
163 |
["Identify the main actions in the video", "videos/2.mp4"]
|
164 |
]
|
165 |
|
166 |
+
|
167 |
css = """
|
168 |
.submit-btn {
|
169 |
background-color: #2980b9 !important;
|