prithivMLmods committed on
Commit
83c1dff
·
verified ·
1 Parent(s): 2a06976

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -11
app.py CHANGED
@@ -24,7 +24,7 @@ from transformers.image_utils import load_image
24
  MAX_MAX_NEW_TOKENS = 2048
25
  DEFAULT_MAX_NEW_TOKENS = 1024
26
  # Increase or disable input truncation to avoid token mismatches
27
- MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
28
 
29
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
30
 
@@ -34,7 +34,7 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
34
  MODEL_ID,
35
  trust_remote_code=True,
36
  torch_dtype=torch.float16
37
- ).to(device).eval()
38
 
39
  def downsample_video(video_path):
40
  """
@@ -80,15 +80,14 @@ def generate_image(text: str, image: Image.Image,
80
  ]
81
  }]
82
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
83
- # Use max-length padding and enable truncation
84
  inputs = processor(
85
  text=[prompt_full],
86
  images=[image],
87
  return_tensors="pt",
88
- padding="max_length",
89
- truncation=True,
90
  max_length=MAX_INPUT_TOKEN_LENGTH
91
- ).to(device)
92
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
93
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
94
  thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
@@ -120,20 +119,19 @@ def generate_video(text: str, video_path: str,
120
  {"role": "user", "content": [{"type": "text", "text": text}]}
121
  ]
122
  # Append each frame with its timestamp.
123
- for image, timestamp in frames:
 
124
  messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
125
  messages[1]["content"].append({"type": "image", "image": image})
126
-
127
- # Enable truncation in template application
128
  inputs = processor.apply_chat_template(
129
  messages,
130
  tokenize=True,
131
  add_generation_prompt=True,
132
  return_dict=True,
133
  return_tensors="pt",
134
- truncation=True,
135
  max_length=MAX_INPUT_TOKEN_LENGTH
136
- ).to(device)
137
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
138
  generation_kwargs = {
139
  **inputs,
@@ -165,6 +163,7 @@ video_examples = [
165
  ["Identify the main actions in the video", "videos/2.mp4"]
166
  ]
167
 
 
168
  css = """
169
  .submit-btn {
170
  background-color: #2980b9 !important;
 
24
  MAX_MAX_NEW_TOKENS = 2048
25
  DEFAULT_MAX_NEW_TOKENS = 1024
26
  # Increase or disable input truncation to avoid token mismatches
27
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "8192"))
28
 
29
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
30
 
 
34
  MODEL_ID,
35
  trust_remote_code=True,
36
  torch_dtype=torch.float16
37
+ ).to("cuda").eval()
38
 
39
  def downsample_video(video_path):
40
  """
 
80
  ]
81
  }]
82
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
83
  inputs = processor(
84
  text=[prompt_full],
85
  images=[image],
86
  return_tensors="pt",
87
+ padding=True,
88
+ truncation=False,
89
  max_length=MAX_INPUT_TOKEN_LENGTH
90
+ ).to("cuda")
91
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
92
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
93
  thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
 
119
  {"role": "user", "content": [{"type": "text", "text": text}]}
120
  ]
121
  # Append each frame with its timestamp.
122
+ for frame in frames:
123
+ image, timestamp = frame
124
  messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
125
  messages[1]["content"].append({"type": "image", "image": image})
 
 
126
  inputs = processor.apply_chat_template(
127
  messages,
128
  tokenize=True,
129
  add_generation_prompt=True,
130
  return_dict=True,
131
  return_tensors="pt",
132
+ truncation=False,
133
  max_length=MAX_INPUT_TOKEN_LENGTH
134
+ ).to("cuda")
135
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
136
  generation_kwargs = {
137
  **inputs,
 
163
  ["Identify the main actions in the video", "videos/2.mp4"]
164
  ]
165
 
166
+
167
  css = """
168
  .submit-btn {
169
  background-color: #2980b9 !important;