prithivMLmods committed
Commit 2a06976 · verified · 1 Parent(s): 02de0e1

Update app.py

Files changed (1)
  1. app.py +9 -7
app.py CHANGED
@@ -24,7 +24,7 @@ from transformers.image_utils import load_image
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 # Increase or disable input truncation to avoid token mismatches
-MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "8192"))
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

@@ -34,7 +34,7 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
     trust_remote_code=True,
     torch_dtype=torch.float16
-).to("cuda").eval()
+).to(device).eval()

 def downsample_video(video_path):
     """
@@ -80,14 +80,15 @@ def generate_image(text: str, image: Image.Image,
         ]
     }]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # Use max-length padding and enable truncation
     inputs = processor(
         text=[prompt_full],
         images=[image],
         return_tensors="pt",
-        padding='max_length',
+        padding="max_length",
         truncation=True,
         max_length=MAX_INPUT_TOKEN_LENGTH
-    ).to("cuda")
+    ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
@@ -119,10 +120,11 @@ def generate_video(text: str, video_path: str,
         {"role": "user", "content": [{"type": "text", "text": text}]}
     ]
     # Append each frame with its timestamp.
-    for frame in frames:
-        image, timestamp = frame
+    for image, timestamp in frames:
         messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
         messages[1]["content"].append({"type": "image", "image": image})
+
+    # Enable truncation in template application
     inputs = processor.apply_chat_template(
         messages,
         tokenize=True,
@@ -131,7 +133,7 @@ def generate_video(text: str, video_path: str,
         return_tensors="pt",
         truncation=True,
         max_length=MAX_INPUT_TOKEN_LENGTH
-    ).to("cuda")
+    ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
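Read together, the two recurring changes in this diff are (1) moving the model and the inputs to a single resolved device instead of hard-coding "cuda", and (2) padding and truncating the prompt against MAX_INPUT_TOKEN_LENGTH. The sketch below shows how those pieces fit in a generate_image-style function. It is a minimal reconstruction, not the full app.py: the MODEL_ID value, the AutoProcessor setup, the message layout, and the final streaming loop are assumptions, while the names device, model_m, processor, MAX_INPUT_TOKEN_LENGTH and the shapes of the processor and generate calls come from the diff above.

import os
from threading import Thread

import torch
from PIL import Image
from transformers import (
    AutoProcessor,
    Qwen2_5_VLForConditionalGeneration,
    TextIteratorStreamer,
)

MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

# Resolve the device once and reuse it for both the model and the inputs,
# so the same code path works on CUDA and CPU-only hosts.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"  # placeholder; app.py defines its own MODEL_ID
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to(device).eval()


def generate_image(text: str, image: Image.Image, max_new_tokens: int = 1024):
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": text},
        ],
    }]
    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Pad to a fixed length and truncate over-long prompts, the behaviour
    # this commit enables, so text and image token counts stay aligned.
    inputs = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=MAX_INPUT_TOKEN_LENGTH,
    ).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
    Thread(target=model_m.generate, kwargs=generation_kwargs).start()
    for new_text in streamer:
        yield new_text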
 
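The other behavioural fix is in generate_video: the frame loop now unpacks (image, timestamp) pairs directly in the for statement. A small self-contained sketch of that message-building step, assuming downsample_video returns a list of (PIL.Image, timestamp) tuples; the synthetic frames and the system-message text here are illustrative, while the loop body mirrors the diff.

from PIL import Image

def build_video_messages(text, frames):
    """Build the chat message list from (PIL.Image, timestamp) pairs,
    using the tuple-unpacking loop this commit introduces."""
    messages = [
        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
        {"role": "user", "content": [{"type": "text", "text": text}]},
    ]
    # Append each frame with its timestamp.
    for image, timestamp in frames:
        messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
        messages[1]["content"].append({"type": "image", "image": image})
    return messages

# Synthetic stand-in for downsample_video(video_path): 4 frames, 0.5 s apart.
frames = [(Image.new("RGB", (64, 64)), round(i * 0.5, 2)) for i in range(4)]
msgs = build_video_messages("Describe the clip.", frames)
print(len(msgs[1]["content"]))  # 1 text prompt + 2 entries per frame -> 9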