programmnix-askui committed on
Commit
af0979f
·
1 Parent(s): 1953e40

Add prefilling

Browse files
Files changed (1) hide show
  1. app.py +44 -28
app.py CHANGED
@@ -85,35 +85,51 @@ def deepseek(image, text_input, model_id):
85
  system_prompt=""
86
  ).to(vl_gpt.device)
87
 
88
- # run image encoder to get the image embeddings
89
- inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
90
-
91
- # run the model to get the response
92
- outputs = vl_gpt.language.generate(
93
- inputs_embeds=inputs_embeds,
94
- attention_mask=prepare_inputs.attention_mask,
95
- pad_token_id=tokenizer.eos_token_id,
96
- bos_token_id=tokenizer.bos_token_id,
97
- eos_token_id=tokenizer.eos_token_id,
98
- max_new_tokens=512,
99
- do_sample=False,
100
- use_cache=True
101
- )
102
-
103
- answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=False)
104
-
105
- print(f"{prepare_inputs['sft_format'][0]}", answer)
106
- det_pattern = r"<\|det\|>\[\[(.+)]]<\|\/det\|>"
107
-
108
- det_match = re.search(det_pattern, answer)
109
- if det_match is None:
110
- return text_input, [], image
111
 
112
- det_content = det_match.group(1)
113
- bbox = [int(v.strip()) for v in det_content.split(",")]
114
-
115
- scaled_boxes = rescale_bounding_boxes([bbox], image.width, image.height)
116
- return answer, scaled_boxes, draw_bounding_boxes(image, scaled_boxes)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
 
119
  @spaces.GPU
 
85
  system_prompt=""
86
  ).to(vl_gpt.device)
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
+ with torch.no_grad():
90
+
91
+ # run image encoder to get the image embeddings
92
+ inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
93
+
94
+ inputs_embeds, past_key_values = vl_gpt.incremental_prefilling(
95
+ input_ids=prepare_inputs.input_ids,
96
+ images=prepare_inputs.images,
97
+ images_seq_mask=prepare_inputs.images_seq_mask,
98
+ images_spatial_crop=prepare_inputs.images_spatial_crop,
99
+ attention_mask=prepare_inputs.attention_mask,
100
+ chunk_size=512 # prefilling size
101
+ )
102
+
103
+ # run the model to get the response
104
+ outputs = vl_gpt.generate(
105
+ inputs_embeds=inputs_embeds,
106
+ input_ids=prepare_inputs.input_ids,
107
+ images=prepare_inputs.images,
108
+ images_seq_mask=prepare_inputs.images_seq_mask,
109
+ images_spatial_crop=prepare_inputs.images_spatial_crop,
110
+ attention_mask=prepare_inputs.attention_mask,
111
+ past_key_values=past_key_values,
112
+ pad_token_id=tokenizer.eos_token_id,
113
+ bos_token_id=tokenizer.bos_token_id,
114
+ eos_token_id=tokenizer.eos_token_id,
115
+ max_new_tokens=512,
116
+ do_sample=False,
117
+ use_cache=True,
118
+ )
119
+
120
+ answer = tokenizer.decode(outputs[0][len(prepare_inputs.input_ids[0]):].cpu().tolist(), skip_special_tokens=False)
121
+ print(f"{prepare_inputs['sft_format'][0]}", answer)
122
+ det_pattern = r"<\|det\|>\[\[(.+)]]<\|\/det\|>"
123
+
124
+ det_match = re.search(det_pattern, answer)
125
+ if det_match is None:
126
+ return text_input, [], image
127
+
128
+ det_content = det_match.group(1)
129
+ bbox = [int(v.strip()) for v in det_content.split(",")]
130
+
131
+ scaled_boxes = rescale_bounding_boxes([bbox], image.width, image.height)
132
+ return answer, scaled_boxes, draw_bounding_boxes(image, scaled_boxes)
133
 
134
 
135
  @spaces.GPU