abiyyufahri committed
Commit 2ba99d9 · verified · 1 parent: 2ee69d3

Update app.py

Files changed (1)
  1. app.py +6 -49
app.py CHANGED
@@ -168,7 +168,6 @@ def extract_coordinates(text):
 
 def cpu_inference(conversation, model, tokenizer, processor):
     try:
-        # Apply chat template
         prompt = processor.apply_chat_template(
             conversation,
             tokenize=False,
@@ -176,58 +175,26 @@ def cpu_inference(conversation, model, tokenizer, processor):
         )
 
         image = conversation[1]["content"][0]["image"]
-
-        # Process inputs with explicit padding and proper tensor handling
         inputs = processor(
             text=[prompt],
             images=[image],
             return_tensors="pt",
-            padding=True,  # Ensure padding is enabled
+            padding=True,
             truncation=True,
-            max_length=2048  # Increased max length for vision-language models
+            max_length=512
         )
-
-        # Debug logging
-        logger.info(f"Input tensor shapes: {[(k, v.shape if hasattr(v, 'shape') else type(v)) for k, v in inputs.items()]}")
-
-        # Ensure all tensors are properly formatted
-        for key, value in inputs.items():
-            if isinstance(value, torch.Tensor):
-                logger.info(f"{key} shape: {value.shape}, dtype: {value.dtype}")
-
-        # Set pad token if not already set
-        if tokenizer.pad_token_id is None:
-            if tokenizer.eos_token_id is not None:
-                tokenizer.pad_token_id = tokenizer.eos_token_id
-            else:
-                tokenizer.pad_token_id = 0
-
-        # Generate with proper attention mask handling
+
         with torch.no_grad():
-            # Ensure attention mask is present
-            if 'attention_mask' not in inputs and 'input_ids' in inputs:
-                inputs['attention_mask'] = torch.ones_like(inputs['input_ids'])
-
             outputs = model.generate(
                 **inputs,
                 max_new_tokens=256,
                 do_sample=True,
                 temperature=0.3,
                 top_p=0.8,
-                pad_token_id=tokenizer.pad_token_id,
-                eos_token_id=tokenizer.eos_token_id,
-                use_cache=True,
-                # Add these parameters for better stability
-                repetition_penalty=1.1,
-                length_penalty=1.0
+                pad_token_id=tokenizer.eos_token_id or tokenizer.pad_token_id or 0
             )
 
-        # Handle batch dimension properly
-        if outputs.dim() > 1:
-            generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
-        else:
-            generated_ids = outputs[inputs["input_ids"].shape[1]:]
-
+        generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
         response = tokenizer.decode(generated_ids, skip_special_tokens=True)
         coordinates = extract_coordinates(response)
 
@@ -239,10 +206,6 @@ def cpu_inference(conversation, model, tokenizer, processor):
 
     except Exception as e:
         logger.error(f"Inference error: {e}")
-        logger.error(f"Error type: {type(e).__name__}")
-        import traceback
-        logger.error(f"Full traceback: {traceback.format_exc()}")
-
        return {
            "topk_points": [(0.5, 0.5)],
            "response": f"Error during inference: {str(e)}",
@@ -279,7 +242,6 @@ async def predict_click_base64(data: Base64Request):
 
    try:
        pil_image = Image.open(BytesIO(image_data)).convert("RGB")
-        logger.info(f"Image loaded successfully: {pil_image.size}")
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid image format: {e}")
 
@@ -349,10 +311,5 @@ async def debug_info():
        "processor_type": type(processor).__name__ if processor else None,
        "model_type": type(model).__name__ if model else None,
        "available_qwen_classes": available_classes,
-        "transformers_version": transformers.__version__,
-        "tokenizer_info": {
-            "pad_token_id": tokenizer.pad_token_id if tokenizer else None,
-            "eos_token_id": tokenizer.eos_token_id if tokenizer else None,
-            "vocab_size": tokenizer.vocab_size if tokenizer else None
-        } if tokenizer else None
+        "transformers_version": transformers.__version__
    }
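
Net effect of the hunks above: cpu_inference drops the manual pad-token setup, attention-mask patching, debug logging, and batch-dimension checks, and caps max_length at 512. Below is a minimal sketch of how the function reads after this commit. It assumes logger and extract_coordinates are defined elsewhere in app.py, assumes add_generation_prompt=True on the context line the diff does not show, and the success-path return keys mirror the error branch, which is an assumption; treat it as a reconstruction, not the exact file contents.

import torch

def cpu_inference(conversation, model, tokenizer, processor):
    try:
        # Render the conversation to a prompt string; add_generation_prompt
        # is assumed, only tokenize=False appears in the hunks.
        prompt = processor.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True,
        )

        image = conversation[1]["content"][0]["image"]
        inputs = processor(
            text=[prompt],
            images=[image],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                do_sample=True,
                temperature=0.3,
                top_p=0.8,
                pad_token_id=tokenizer.eos_token_id or tokenizer.pad_token_id or 0,
            )

        # Decode only the newly generated tail, skipping the prompt tokens.
        generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
        response = tokenizer.decode(generated_ids, skip_special_tokens=True)
        coordinates = extract_coordinates(response)
        # Success payload not shown in this commit's hunks; keys assumed
        # to mirror the error branch below.
        return {"topk_points": coordinates, "response": response}

    except Exception as e:
        logger.error(f"Inference error: {e}")
        return {
            "topk_points": [(0.5, 0.5)],
            "response": f"Error during inference: {str(e)}",
        }

Replacing the old pad-token bookkeeping with the inline fallback tokenizer.eos_token_id or tokenizer.pad_token_id or 0 gives generate() an explicit pad_token_id without mutating the tokenizer.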
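For orientation, a call into the helper might look like the following. Only the access pattern conversation[1]["content"][0]["image"] is confirmed by the diff; the roles, text fields, file name, and Qwen-VL style message layout are illustrative assumptions, and model, tokenizer, and processor are whatever app.py loads at startup.

from PIL import Image

pil_image = Image.open("screenshot.png").convert("RGB")  # hypothetical input image

# The second message must carry the image, since the function reads
# conversation[1]["content"][0]["image"].
conversation = [
    {"role": "system", "content": [{"type": "text", "text": "You locate UI elements."}]},
    {
        "role": "user",
        "content": [
            {"type": "image", "image": pil_image},
            {"type": "text", "text": "Where is the submit button?"},
        ],
    },
]

result = cpu_inference(conversation, model, tokenizer, processor)
print(result["topk_points"], result["response"])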