vykanand commited on
Commit
5ca9963
·
verified ·
1 Parent(s): 5a6e90f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -19
app.py CHANGED
@@ -1,27 +1,20 @@
1
- from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 
2
  from qwen_vl_utils import process_vision_info
3
 
4
- # default: Load the model on the available device(s)
 
 
 
5
  model = Qwen2VLForConditionalGeneration.from_pretrained(
6
  "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"
7
  )
 
8
 
9
- # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
10
- # model = Qwen2VLForConditionalGeneration.from_pretrained(
11
- # "Qwen/Qwen2-VL-2B-Instruct",
12
- # torch_dtype=torch.bfloat16,
13
- # attn_implementation="flash_attention_2",
14
- # device_map="auto",
15
- # )
16
-
17
- # default processer
18
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
19
 
20
- # The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
21
- # min_pixels = 256*28*28
22
- # max_pixels = 1280*28*28
23
- # processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
24
-
25
  messages = [
26
  {
27
  "role": "user",
@@ -47,14 +40,20 @@ inputs = processor(
47
  padding=True,
48
  return_tensors="pt",
49
  )
50
- inputs = inputs.to("cuda")
51
 
52
  # Inference: Generation of the output
53
  generated_ids = model.generate(**inputs, max_new_tokens=128)
 
 
54
  generated_ids_trimmed = [
55
- out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
56
  ]
 
 
57
  output_text = processor.batch_decode(
58
  generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
59
  )
60
- print(output_text)
 
 
 
1
+ import torch
2
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
3
  from qwen_vl_utils import process_vision_info
4
 
5
+ # Check if CUDA is available and set the device accordingly
6
+ device = "cuda" if torch.cuda.is_available() else "cpu"
7
+
8
+ # Load the model on the available device
9
  model = Qwen2VLForConditionalGeneration.from_pretrained(
10
  "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"
11
  )
12
+ model = model.to(device)
13
 
14
+ # Default processor
 
 
 
 
 
 
 
 
15
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
16
 
17
+ # Prepare input messages
 
 
 
 
18
  messages = [
19
  {
20
  "role": "user",
 
40
  padding=True,
41
  return_tensors="pt",
42
  )
43
+ inputs = inputs.to(device) # Move inputs to the same device as the model
44
 
45
  # Inference: Generation of the output
46
  generated_ids = model.generate(**inputs, max_new_tokens=128)
47
+
48
+ # Trim the output tokens
49
  generated_ids_trimmed = [
50
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
51
  ]
52
+
53
+ # Decode the generated text
54
  output_text = processor.batch_decode(
55
  generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
56
  )
57
+
58
+ # Print the output
59
+ print(output_text)