alexnasa committed on
Commit a2bb6b2 · verified · 1 Parent(s): 6846357

Update app.py

Files changed (1)
  1. app.py +66 -0
app.py CHANGED
@@ -9,6 +9,9 @@ import numpy as np
 from PIL import Image
 
 import torch
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+ from qwen_vl_utils import process_vision_info
+
 
 print(f'torch version:{torch.__version__}')
 
@@ -47,6 +50,69 @@ from torchvision import transforms
 from models.controlnet import ControlNetModel
 from models.unet_2d_condition import UNet2DConditionModel
 
+ VLM_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
+
+ vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     VLM_NAME,
+     torch_dtype="auto",
+     device_map="auto"  # immediately dispatches layers onto available GPUs
+ )
+ vlm_processor = AutoProcessor.from_pretrained(VLM_NAME)
+
+ def _generate_vlm_prompt(
+     vlm_model: Qwen2_5_VLForConditionalGeneration,
+     vlm_processor: AutoProcessor,
+     process_vision_info,
+     pil_image: Image.Image,
+     device: str = "cuda"
+ ) -> str:
+     """
+     Given a single PIL.Image input:
+       - pil_image: the image to caption at this step.
+     Runs the Qwen2.5-VL model on the image and returns the generated
+     caption as a prompt string.
+     """
+
+     message_text = (
+         "Give a detailed description of this image as a caption."
+     )
+
+     messages = [
+         {"role": "system", "content": message_text},
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "image": pil_image},
+             ],
+         },
+     ]
+
+     text = vlm_processor.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True
+     )
+     image_inputs, video_inputs = process_vision_info(messages)
+
+     inputs = vlm_processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
+         padding=True,
+         return_tensors="pt",
+     ).to(device)
+
+     generated = vlm_model.generate(**inputs, max_new_tokens=128)
+     trimmed = [
+         out_ids[len(in_ids):]
+         for in_ids, out_ids in zip(inputs.input_ids, generated)
+     ]
+     out_text = vlm_processor.batch_decode(
+         trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+     )[0]
+
+     return out_text.strip()
+
 tensor_transforms = transforms.Compose([
     transforms.ToTensor(),
 ])
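
For context, a minimal usage sketch of the helper added in this commit, assuming the module-level vlm_model and vlm_processor defined above are already loaded; the call site, image path, and variable names are illustrative and not part of the diff:

# Hypothetical call site (not part of this commit); everything except the
# helper and the module-level globals is a placeholder.
from PIL import Image

image = Image.open("input.png").convert("RGB")  # placeholder input image

caption = _generate_vlm_prompt(
    vlm_model=vlm_model,
    vlm_processor=vlm_processor,
    process_vision_info=process_vision_info,
    pil_image=image,
    device="cuda",
)
print(caption)  # generated caption, usable as a text prompt downstream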