prithivMLmods committed
Commit 2a06976 · verified · 1 Parent(s): 02de0e1

Update app.py

Files changed (1)
  1. app.py +9 -7
app.py CHANGED
@@ -24,7 +24,7 @@ from transformers.image_utils import load_image
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 # Increase or disable input truncation to avoid token mismatches
-MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "8192"))
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

@@ -34,7 +34,7 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
     trust_remote_code=True,
     torch_dtype=torch.float16
-).to("cuda").eval()
+).to(device).eval()

 def downsample_video(video_path):
     """
@@ -80,14 +80,15 @@ def generate_image(text: str, image: Image.Image,
         ]
     }]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # Use max-length padding and enable truncation
     inputs = processor(
         text=[prompt_full],
         images=[image],
         return_tensors="pt",
-        padding='max_length',
+        padding="max_length",
         truncation=True,
         max_length=MAX_INPUT_TOKEN_LENGTH
-    ).to("cuda")
+    ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
@@ -119,10 +120,11 @@ def generate_video(text: str, video_path: str,
         {"role": "user", "content": [{"type": "text", "text": text}]}
     ]
     # Append each frame with its timestamp.
-    for frame in frames:
-        image, timestamp = frame
+    for image, timestamp in frames:
         messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
         messages[1]["content"].append({"type": "image", "image": image})
+
+    # Enable truncation in template application
     inputs = processor.apply_chat_template(
         messages,
         tokenize=True,
@@ -131,7 +133,7 @@ def generate_video(text: str, video_path: str,
         return_tensors="pt",
         truncation=True,
         max_length=MAX_INPUT_TOKEN_LENGTH
-    ).to("cuda")
+    ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
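Read together, the two recurring changes in this diff are (1) moving the model and the inputs to a single resolved device instead of hard-coding "cuda", and (2) padding and truncating the prompt against MAX_INPUT_TOKEN_LENGTH. The sketch below shows how those pieces fit in a generate_image-style function. It is a minimal reconstruction, not the full app.py: the MODEL_ID value, the AutoProcessor setup, the message layout, and the final streaming loop are assumptions, while the names device, model_m, processor, MAX_INPUT_TOKEN_LENGTH and the shapes of the processor and generate calls come from the diff above.

import os
from threading import Thread

import torch
from PIL import Image
from transformers import (
    AutoProcessor,
    Qwen2_5_VLForConditionalGeneration,
    TextIteratorStreamer,
)

MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

# Resolve the device once and reuse it for both the model and the inputs,
# so the same code path works on CUDA and CPU-only hosts.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"  # placeholder; app.py defines its own MODEL_ID
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to(device).eval()


def generate_image(text: str, image: Image.Image, max_new_tokens: int = 1024):
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": text},
        ],
    }]
    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Pad to a fixed length and truncate over-long prompts, the behaviour
    # this commit enables, so text and image token counts stay aligned.
    inputs = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=MAX_INPUT_TOKEN_LENGTH,
    ).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
    Thread(target=model_m.generate, kwargs=generation_kwargs).start()
    for new_text in streamer:
        yield new_text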
 
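The other behavioural fix is in generate_video: the frame loop now unpacks (image, timestamp) pairs directly in the for statement. A small self-contained sketch of that message-building step, assuming downsample_video returns a list of (PIL.Image, timestamp) tuples; the synthetic frames and the system-message text here are illustrative, while the loop body mirrors the diff.

from PIL import Image

def build_video_messages(text, frames):
    """Build the chat message list from (PIL.Image, timestamp) pairs,
    using the tuple-unpacking loop this commit introduces."""
    messages = [
        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
        {"role": "user", "content": [{"type": "text", "text": text}]},
    ]
    # Append each frame with its timestamp.
    for image, timestamp in frames:
        messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
        messages[1]["content"].append({"type": "image", "image": image})
    return messages

# Synthetic stand-in for downsample_video(video_path): 4 frames, 0.5 s apart.
frames = [(Image.new("RGB", (64, 64)), round(i * 0.5, 2)) for i in range(4)]
msgs = build_video_messages("Describe the clip.", frames)
print(len(msgs[1]["content"]))  # 1 text prompt + 2 entries per frame -> 9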