prithivMLmods committed on
Commit
1d278fb
·
verified ·
1 Parent(s): 929bc6a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -3
app.py CHANGED
@@ -19,7 +19,7 @@ from PIL import Image
19
  import fitz
20
 
21
  from transformers import (
22
- Qwen2_5_VLForConditionalGeneration,
23
  AutoModelForVision2Seq,
24
  AutoModelForImageTextToText,
25
  AutoModel,
@@ -73,14 +73,14 @@ SUBFOLDER = "Recognition"
73
  processor_g = AutoProcessor.from_pretrained(
74
  MODEL_ID_G, trust_remote_code=True, subfolder=SUBFOLDER
75
  )
76
- model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
77
  MODEL_ID_G, trust_remote_code=True, subfolder=SUBFOLDER, torch_dtype=torch.float16
78
  ).to(device).eval()
79
 
80
  MODEL_ID_I = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
81
  processor_i = AutoProcessor.from_pretrained(MODEL_ID_I, trust_remote_code=True)
82
  model_i = AutoModelForImageTextToText.from_pretrained(
83
- MODEL_ID_I, trust_remote_code=True, torch_dtype=torch.float16, _attn_implementation="flash_attention_2"
84
  ).to(device).eval()
85
 
86
 
@@ -186,6 +186,8 @@ def process_document_stream(
186
  messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt_input}]}]
187
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
188
  inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
 
 
189
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
190
 
191
  generation_kwargs = {
 
19
  import fitz
20
 
21
  from transformers import (
22
+ Qwen2VLForConditionalGeneration,
23
  AutoModelForVision2Seq,
24
  AutoModelForImageTextToText,
25
  AutoModel,
 
73
  processor_g = AutoProcessor.from_pretrained(
74
  MODEL_ID_G, trust_remote_code=True, subfolder=SUBFOLDER
75
  )
76
+ model_g = Qwen2VLForConditionalGeneration.from_pretrained(
77
  MODEL_ID_G, trust_remote_code=True, subfolder=SUBFOLDER, torch_dtype=torch.float16
78
  ).to(device).eval()
79
 
80
  MODEL_ID_I = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
81
  processor_i = AutoProcessor.from_pretrained(MODEL_ID_I, trust_remote_code=True)
82
  model_i = AutoModelForImageTextToText.from_pretrained(
83
+ MODEL_ID_I, trust_remote_code=True, torch_dtype=torch.bfloat16, _attn_implementation="flash_attention_2"
84
  ).to(device).eval()
85
 
86
 
 
186
  messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt_input}]}]
187
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
188
  inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
189
+ # Convert floating point tensors to the model's dtype
190
+ inputs = {k: v.to(dtype=model.dtype) if isinstance(v, torch.Tensor) and v.dtype.is_floating_point else v for k, v in inputs.items()}
191
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
192
 
193
  generation_kwargs = {