programmnix-askui committed on
Commit
03ca516
·
1 Parent(s): 63f8ed1

Test deepseek

Files changed (2)
  1. app.py +75 -0
  2. requirements.txt +2 -1
app.py CHANGED
@@ -8,6 +8,11 @@ from PIL import Image, ImageDraw
 from io import BytesIO
 import re
 
+from deepseek_vl.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
+from deepseek_vl.utils.io import load_pil_images
+from transformers import AutoModelForCausalLM
+
+
 
 models = {
     "OS-Copilot/OS-Atlas-Base-7B": Qwen2VLForConditionalGeneration.from_pretrained("OS-Copilot/OS-Atlas-Base-7B", torch_dtype="auto", device_map="auto"),
@@ -49,8 +54,78 @@ def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scal
     return rescaled_boxes
 
 
+def deepseek():
+
+    # specify the path to the model
+    model_path = "deepseek-ai/deepseek-vl2-small"
+    vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(model_path)
+    tokenizer = vl_chat_processor.tokenizer
+
+    vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
+    vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
+
+    ## single image conversation example
+    conversation = [
+        {
+            "role": "<|User|>",
+            "content": "<image>\n<|ref|>The giraffe at the back.<|/ref|>.",
+            "images": ["./images/visual_grounding.jpeg"],
+        },
+        {"role": "<|Assistant|>", "content": ""},
+    ]
+
+    ## multiple images (or in-context learning) conversation example
+    # conversation = [
+    #     {
+    #         "role": "User",
+    #         "content": "<image_placeholder>A dog wearing nothing in the foreground, "
+    #                    "<image_placeholder>a dog wearing a santa hat, "
+    #                    "<image_placeholder>a dog wearing a wizard outfit, and "
+    #                    "<image_placeholder>what's the dog wearing?",
+    #         "images": [
+    #             "images/dog_a.png",
+    #             "images/dog_b.png",
+    #             "images/dog_c.png",
+    #             "images/dog_d.png",
+    #         ],
+    #     },
+    #     {"role": "Assistant", "content": ""}
+    # ]
+
+    # load images and prepare for inputs
+    pil_images = load_pil_images(conversation)
+    prepare_inputs = vl_chat_processor(
+        conversations=conversation,
+        images=pil_images,
+        force_batchify=True,
+        system_prompt=""
+    ).to(vl_gpt.device)
+
+    # run image encoder to get the image embeddings
+    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
+
+    # run the model to get the response
+    outputs = vl_gpt.language_model.generate(
+        inputs_embeds=inputs_embeds,
+        attention_mask=prepare_inputs.attention_mask,
+        pad_token_id=tokenizer.eos_token_id,
+        bos_token_id=tokenizer.bos_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        max_new_tokens=512,
+        do_sample=False,
+        use_cache=True
+    )
+
+    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
+    print(f"{prepare_inputs['sft_format'][0]}", answer)
+
 @spaces.GPU
 def run_example(image, text_input, model_id="OS-Copilot/OS-Atlas-Base-7B"):
+
+    deepseek()
+
+
+def run_example_old(image, text_input, model_id="OS-Copilot/OS-Atlas-Base-7B"):
     model = models[model_id].eval()
     processor = processors[model_id]
     prompt = f"In this UI screenshot, what is the position of the element corresponding to the command \"{text_input}\" (with bbox)?"
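
Note on the change above: the new run_example ignores its image and text_input arguments and simply calls deepseek(), which reloads the model on every invocation and grounds a hard-coded giraffe prompt against a local test image. Below is a minimal sketch (not part of this commit) of how the same DeepSeek-VL2 calls could be driven by the uploaded PIL image and the user's command instead; passing the image directly via images=[image] and interpolating text_input into the <|ref|> prompt are assumptions, not code from this change.

def deepseek_grounding(image, text_input):
    # Assumed variant of deepseek(): same API calls as in the diff above, but
    # driven by the Gradio inputs instead of the hard-coded giraffe example.
    # (In real code the model/processor would be loaded once at module level,
    # like the existing `models` dict, not on every call.)
    model_path = "deepseek-ai/deepseek-vl2-small"
    vl_chat_processor = DeepseekVLV2Processor.from_pretrained(model_path)
    tokenizer = vl_chat_processor.tokenizer
    vl_gpt = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
    vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

    # Ground the user's command; the "images" path list is only used by
    # load_pil_images, so here the in-memory PIL image is passed directly (assumption).
    conversation = [
        {
            "role": "<|User|>",
            "content": f"<image>\n<|ref|>{text_input}<|/ref|>.",
        },
        {"role": "<|Assistant|>", "content": ""},
    ]

    prepare_inputs = vl_chat_processor(
        conversations=conversation,
        images=[image],  # uploaded PIL.Image from the Gradio UI (assumption)
        force_batchify=True,
        system_prompt=""
    ).to(vl_gpt.device)

    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
    outputs = vl_gpt.language_model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False,
        use_cache=True
    )
    return tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)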
requirements.txt CHANGED
@@ -5,4 +5,5 @@ torch
 torchvision
 transformers
 accelerate==0.30.0
-qwen-vl-utils
+qwen-vl-utils
+deepseek_vl