abiyyufahri committed · Commit d5d8986 · verified · 1 Parent(s): d2a35a3

Update app.py

Files changed (1)
  1. app.py +70 -8
app.py CHANGED
@@ -38,6 +38,12 @@ async def load_model():
             trust_remote_code=True
         )
 
+        # Configure padding for processor
+        if hasattr(processor, 'tokenizer'):
+            processor.tokenizer.padding_side = "left"  # Important for Qwen2-VL
+            if processor.tokenizer.pad_token is None:
+                processor.tokenizer.pad_token = processor.tokenizer.eos_token
+
         model = Qwen2VLForConditionalGeneration.from_pretrained(
             model_name,
             torch_dtype=torch.float32,
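Note: left padding matters here because Qwen2-VL is decoder-only, so generated tokens are appended at the right end of the sequence; with right padding, pad tokens would sit between the prompt and the generation and corrupt the output. A minimal standalone sketch of the same setup (the checkpoint name is illustrative, not necessarily the one this app loads):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")  # assumed checkpoint
    tokenizer.padding_side = "left"                # generation appends on the right
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as PAD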
@@ -60,6 +66,12 @@ async def load_model():
             trust_remote_code=True
         )
 
+        # Configure padding for processor
+        if hasattr(processor, 'tokenizer'):
+            processor.tokenizer.padding_side = "left"
+            if processor.tokenizer.pad_token is None:
+                processor.tokenizer.pad_token = processor.tokenizer.eos_token
+
         model = AutoModel.from_pretrained(
             model_name,
             torch_dtype=torch.float32,
@@ -94,6 +106,12 @@ async def load_model():
             trust_remote_code=True
         )
 
+        # Configure padding
+        if hasattr(processor, 'tokenizer'):
+            processor.tokenizer.padding_side = "left"
+            if processor.tokenizer.pad_token is None:
+                processor.tokenizer.pad_token = processor.tokenizer.eos_token
+
         model = ModelClass.from_pretrained(
             model_name,
             config=config,
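The same five-line padding setup now appears in all three load branches; a possible follow-up (a sketch, not part of this commit, and the helper name is made up) is to factor it into one function:

    def _configure_padding(processor):
        # Shared left-padding setup for every load path.
        if hasattr(processor, 'tokenizer'):
            processor.tokenizer.padding_side = "left"
            if processor.tokenizer.pad_token is None:
                processor.tokenizer.pad_token = processor.tokenizer.eos_token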
@@ -168,6 +186,7 @@ def extract_coordinates(text):
 
 def cpu_inference(conversation, model, tokenizer, processor):
     try:
+        # Apply chat template
         prompt = processor.apply_chat_template(
             conversation,
             tokenize=False,
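For context, apply_chat_template flattens the conversation list into a single prompt string with the model's chat markers. A hedged sketch of the expected input shape (the checkpoint, system prompt, and add_generation_prompt flag are assumptions; the app's actual values are not visible in this diff):

    from PIL import Image
    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")  # assumed checkpoint
    pil_image = Image.new("RGB", (1280, 720))  # stand-in screenshot

    conversation = [
        {"role": "system", "content": "You locate UI elements in screenshots."},
        {"role": "user", "content": [
            {"type": "image", "image": pil_image},
            {"type": "text", "text": "Click the OK button"},
        ]},
    ]
    prompt = processor.apply_chat_template(
        conversation,
        tokenize=False,              # return a string rather than token ids
        add_generation_prompt=True,  # assumed: append the assistant turn marker
    )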
@@ -175,14 +194,28 @@ def cpu_inference(conversation, model, tokenizer, processor):
         )
 
         image = conversation[1]["content"][0]["image"]
+
+        # FIXED: process inputs with correct padding
         inputs = processor(
-            text=[prompt],
-            images=[image],
+            text=[prompt],    # wrap in a list for batch processing
+            images=[image],   # wrap in a list for batch processing
             return_tensors="pt",
-            padding=True,
+            padding=True,     # enable padding
             truncation=True,
             max_length=512
         )
+
+        # FIXED: make sure all tensors have a consistent batch dimension
+        for key, value in inputs.items():
+            if isinstance(value, torch.Tensor):
+                logger.debug(f"Input {key} shape: {value.shape}")
+
+        # FIXED: set pad_token_id if it is not set yet
+        pad_token_id = tokenizer.pad_token_id
+        if pad_token_id is None:
+            pad_token_id = tokenizer.eos_token_id
+        if pad_token_id is None:
+            pad_token_id = 0  # fallback
 
         with torch.no_grad():
             outputs = model.generate(
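One reason the explicit is-None chain added above is safer than the old `or` chain removed in the next hunk: a token id of 0 is falsy in Python, so `tokenizer.eos_token_id or ...` would silently discard a legitimate id. A two-line demonstration:

    eos_token_id = 0                      # a perfectly valid token id
    print(eos_token_id or 42)             # -> 42: the 'or' chain drops the 0
    print(eos_token_id if eos_token_id is not None else 42)  # -> 0: kept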
@@ -191,21 +224,28 @@ def cpu_inference(conversation, model, tokenizer, processor):
                 do_sample=True,
                 temperature=0.3,
                 top_p=0.8,
-                pad_token_id=tokenizer.eos_token_id or tokenizer.pad_token_id or 0
+                pad_token_id=pad_token_id,
+                attention_mask=inputs.get('attention_mask', None)  # FIXED: explicit attention mask
             )
 
-        generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
+        # FIXED: extract generated tokens correctly
+        input_length = inputs["input_ids"].shape[1]
+        generated_ids = outputs[0][input_length:]
         response = tokenizer.decode(generated_ids, skip_special_tokens=True)
         coordinates = extract_coordinates(response)
 
         return {
             "topk_points": coordinates,
-            "response": response,
+            "response": response.strip(),
             "success": True
         }
 
     except Exception as e:
         logger.error(f"Inference error: {e}")
+        # FIXED: more detailed error logging
+        import traceback
+        logger.error(f"Full traceback: {traceback.format_exc()}")
+
         return {
             "topk_points": [(0.5, 0.5)],
             "response": f"Error during inference: {str(e)}",
@@ -242,9 +282,12 @@ async def predict_click_base64(data: Base64Request):
 
     try:
         pil_image = Image.open(BytesIO(image_data)).convert("RGB")
+        # FIXED: log image dimensions for debugging
+        logger.debug(f"Image dimensions: {pil_image.size}")
     except Exception as e:
         raise HTTPException(status_code=400, detail=f"Invalid image format: {e}")
 
+    # FIXED: improved conversation structure
     conversation = [
         {
             "role": "system",
@@ -272,6 +315,10 @@ async def predict_click_base64(data: Base64Request):
 
     # Run inference
     pred = cpu_inference(conversation, model, tokenizer, processor)
+
+    if not pred["success"]:
+        logger.warning(f"Inference failed: {pred['response']}")
+
     px, py = pred["topk_points"][0]
 
     return JSONResponse(content={
@@ -285,6 +332,8 @@ async def predict_click_base64(data: Base64Request):
         raise
     except Exception as e:
         logger.error(f"Prediction error: {e}")
+        import traceback
+        logger.error(f"Full traceback: {traceback.format_exc()}")
         raise HTTPException(
             status_code=500,
             detail=f"Internal server error: {str(e)}"
@@ -306,10 +355,23 @@ async def debug_info():
     import transformers
     available_classes = [attr for attr in dir(transformers) if 'Qwen' in attr or 'VL' in attr]
 
-    return {
+    debug_info = {
         "model_loaded": model_loaded,
         "processor_type": type(processor).__name__ if processor else None,
         "model_type": type(model).__name__ if model else None,
         "available_qwen_classes": available_classes,
         "transformers_version": transformers.__version__
     }
+
+    # FIXED: add tokenizer info for debugging
+    if processor and hasattr(processor, 'tokenizer'):
+        debug_info.update({
+            "tokenizer_type": type(processor.tokenizer).__name__,
+            "pad_token": processor.tokenizer.pad_token,
+            "pad_token_id": processor.tokenizer.pad_token_id,
+            "eos_token": processor.tokenizer.eos_token,
+            "eos_token_id": processor.tokenizer.eos_token_id,
+            "padding_side": processor.tokenizer.padding_side
+        })
+
+    return debug_info
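With the new tokenizer fields, a padding misconfiguration shows up directly in the debug payload. A quick check (route path and port assumed, as neither is visible in this diff):

    import requests

    info = requests.get("http://localhost:7860/debug").json()  # path assumed
    # After this commit, a healthy setup should report:
    #   info["padding_side"] == "left" and info["pad_token"] is not None
    print(info.get("padding_side"), info.get("pad_token"))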