vykanand committed on
Commit
2f4647f
·
verified ·
1 Parent(s): 6f17c34

Update app.py

Files changed (1)
  1. app.py +20 -54
app.py CHANGED
@@ -1,71 +1,37 @@
import torch
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from PIL import Image
import requests
- from io import BytesIO  # Importing BytesIO from the io module
- from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
- from qwen_vl_utils import process_vision_info
-
- # Check if CUDA is available and set the device accordingly
- device = "cuda" if torch.cuda.is_available() else "cpu"
+ from io import BytesIO

- # Load the model on the available device
- model = Qwen2VLForConditionalGeneration.from_pretrained(
-     "Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto"
- )
- model = model.to(device)
+ # Initialize the model and processor
+ model_name = "Qwen/Qwen2-VL-2B-Instruct"
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

- # Default processor
- processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+ model = Qwen2VLForConditionalGeneration.from_pretrained(model_name).to(device)
+ processor = AutoProcessor.from_pretrained(model_name)

- # Resize the image to a smaller resolution (e.g., 512x512)
+ # Load the image from URL
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
response = requests.get(image_url)
- img = Image.open(BytesIO(response.content))  # Using BytesIO to handle image from the byte stream
+ img = Image.open(BytesIO(response.content))

- # Resize the image
- img_resized = img.resize((512, 512))  # Resize the image to 512x512
- image_inputs = processor(images=img_resized, return_tensors="pt").to(device)
+ # Ensure the image is resized and processed correctly
+ img_resized = img.resize((224, 224))  # Resize as needed (adjust based on model requirements)

- # Prepare the text input
- messages = [
-     {
-         "role": "user",
-         "content": [
-             {
-                 "type": "image",
-                 "image": img_resized,
-             },
-             {"type": "text", "text": "Describe this image."},
-         ],
-     }
- ]
+ # Create a prompt or text input
+ text_input = "Describe this image."

- # Preparation for inference
- text = processor.apply_chat_template(
-     messages, tokenize=False, add_generation_prompt=True
- )
- image_inputs, video_inputs = process_vision_info(messages)
+ # Process the image and the text input
inputs = processor(
-     text=[text],
-     images=image_inputs,
-     videos=video_inputs,
-     padding=True,
+     images=img_resized,
+     text=text_input,
    return_tensors="pt",
- )
- inputs = inputs.to(device)  # Move inputs to the same device as the model
+ ).to(device)

- # Inference: Generation of the output
+ # Inference
generated_ids = model.generate(**inputs, max_new_tokens=128)

- # Trim the output tokens
- generated_ids_trimmed = [
-     out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
- ]
-
- # Decode the generated text
- output_text = processor.batch_decode(
-     generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
- )
-
- # Print the output
+ # Decode the output
+ output_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
print(output_text)
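
A note on the simplified call: the Qwen2-VL processor generally expects the text to contain an image placeholder token, which `processor.apply_chat_template` inserts from a messages list (the path the removed code took via `qwen_vl_utils`). If passing a bare string alongside the image misbehaves, a minimal sketch that keeps the new script's structure but restores the chat-template step might look like the following. This is illustrative only, not part of the commit.

# Illustrative sketch (not from this commit): build the prompt via the chat
# template so the image placeholder token is inserted into the text.
import torch
import requests
from io import BytesIO
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

model_name = "Qwen/Qwen2-VL-2B-Instruct"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Qwen2VLForConditionalGeneration.from_pretrained(model_name).to(device)
processor = AutoProcessor.from_pretrained(model_name)

image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
img = Image.open(BytesIO(requests.get(image_url).content))

# The chat template turns this message list into a prompt string containing
# the image placeholder the processor pairs with the PIL image below.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
prompt = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
inputs = processor(text=[prompt], images=[img], return_tensors="pt").to(device)

generated_ids = model.generate(**inputs, max_new_tokens=128)
output_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
print(output_text)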