abiyyufahri committed
Commit 2ba99d9 · verified · 1 parent: 2ee69d3

Update app.py

Files changed (1)
  1. app.py +6 -49
app.py CHANGED
@@ -168,7 +168,6 @@ def extract_coordinates(text):
 
 def cpu_inference(conversation, model, tokenizer, processor):
     try:
-        # Apply chat template
         prompt = processor.apply_chat_template(
             conversation,
             tokenize=False,
@@ -176,58 +175,26 @@ def cpu_inference(conversation, model, tokenizer, processor):
         )
 
         image = conversation[1]["content"][0]["image"]
-
-        # Process inputs with explicit padding and proper tensor handling
         inputs = processor(
             text=[prompt],
             images=[image],
             return_tensors="pt",
-            padding=True,  # Ensure padding is enabled
+            padding=True,
             truncation=True,
-            max_length=2048  # Increased max length for vision-language models
+            max_length=512
         )
-
-        # Debug logging
-        logger.info(f"Input tensor shapes: {[(k, v.shape if hasattr(v, 'shape') else type(v)) for k, v in inputs.items()]}")
-
-        # Ensure all tensors are properly formatted
-        for key, value in inputs.items():
-            if isinstance(value, torch.Tensor):
-                logger.info(f"{key} shape: {value.shape}, dtype: {value.dtype}")
-
-        # Set pad token if not already set
-        if tokenizer.pad_token_id is None:
-            if tokenizer.eos_token_id is not None:
-                tokenizer.pad_token_id = tokenizer.eos_token_id
-            else:
-                tokenizer.pad_token_id = 0
-
-        # Generate with proper attention mask handling
+
         with torch.no_grad():
-            # Ensure attention mask is present
-            if 'attention_mask' not in inputs and 'input_ids' in inputs:
-                inputs['attention_mask'] = torch.ones_like(inputs['input_ids'])
-
             outputs = model.generate(
                 **inputs,
                 max_new_tokens=256,
                 do_sample=True,
                 temperature=0.3,
                 top_p=0.8,
-                pad_token_id=tokenizer.pad_token_id,
-                eos_token_id=tokenizer.eos_token_id,
-                use_cache=True,
-                # Add these parameters for better stability
-                repetition_penalty=1.1,
-                length_penalty=1.0
+                pad_token_id=tokenizer.eos_token_id or tokenizer.pad_token_id or 0
             )
 
-        # Handle batch dimension properly
-        if outputs.dim() > 1:
-            generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
-        else:
-            generated_ids = outputs[inputs["input_ids"].shape[1]:]
-
+        generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
         response = tokenizer.decode(generated_ids, skip_special_tokens=True)
         coordinates = extract_coordinates(response)
 
@@ -239,10 +206,6 @@ def cpu_inference(conversation, model, tokenizer, processor):
 
     except Exception as e:
         logger.error(f"Inference error: {e}")
-        logger.error(f"Error type: {type(e).__name__}")
-        import traceback
-        logger.error(f"Full traceback: {traceback.format_exc()}")
-
        return {
            "topk_points": [(0.5, 0.5)],
            "response": f"Error during inference: {str(e)}",
@@ -279,7 +242,6 @@ async def predict_click_base64(data: Base64Request):
 
    try:
        pil_image = Image.open(BytesIO(image_data)).convert("RGB")
-        logger.info(f"Image loaded successfully: {pil_image.size}")
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid image format: {e}")
 
@@ -349,10 +311,5 @@ async def debug_info():
        "processor_type": type(processor).__name__ if processor else None,
        "model_type": type(model).__name__ if model else None,
        "available_qwen_classes": available_classes,
-        "transformers_version": transformers.__version__,
-        "tokenizer_info": {
-            "pad_token_id": tokenizer.pad_token_id if tokenizer else None,
-            "eos_token_id": tokenizer.eos_token_id if tokenizer else None,
-            "vocab_size": tokenizer.vocab_size if tokenizer else None
-        } if tokenizer else None
+        "transformers_version": transformers.__version__
    }
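
Net effect of the hunks above: cpu_inference drops the manual pad-token setup, attention-mask patching, debug logging, and batch-dimension checks, and caps max_length at 512. Below is a minimal sketch of how the function reads after this commit. It assumes logger and extract_coordinates are defined elsewhere in app.py, assumes add_generation_prompt=True on the context line the diff does not show, and the success-path return keys mirror the error branch, which is an assumption; treat it as a reconstruction, not the exact file contents.

import torch

def cpu_inference(conversation, model, tokenizer, processor):
    try:
        # Render the conversation to a prompt string; add_generation_prompt
        # is assumed, only tokenize=False appears in the hunks.
        prompt = processor.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True,
        )

        image = conversation[1]["content"][0]["image"]
        inputs = processor(
            text=[prompt],
            images=[image],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                do_sample=True,
                temperature=0.3,
                top_p=0.8,
                pad_token_id=tokenizer.eos_token_id or tokenizer.pad_token_id or 0,
            )

        # Decode only the newly generated tail, skipping the prompt tokens.
        generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
        response = tokenizer.decode(generated_ids, skip_special_tokens=True)
        coordinates = extract_coordinates(response)
        # Success payload not shown in this commit's hunks; keys assumed
        # to mirror the error branch below.
        return {"topk_points": coordinates, "response": response}

    except Exception as e:
        logger.error(f"Inference error: {e}")
        return {
            "topk_points": [(0.5, 0.5)],
            "response": f"Error during inference: {str(e)}",
        }

Replacing the old pad-token bookkeeping with the inline fallback tokenizer.eos_token_id or tokenizer.pad_token_id or 0 gives generate() an explicit pad_token_id without mutating the tokenizer.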
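For orientation, a call into the helper might look like the following. Only the access pattern conversation[1]["content"][0]["image"] is confirmed by the diff; the roles, text fields, file name, and Qwen-VL style message layout are illustrative assumptions, and model, tokenizer, and processor are whatever app.py loads at startup.

from PIL import Image

pil_image = Image.open("screenshot.png").convert("RGB")  # hypothetical input image

# The second message must carry the image, since the function reads
# conversation[1]["content"][0]["image"].
conversation = [
    {"role": "system", "content": [{"type": "text", "text": "You locate UI elements."}]},
    {
        "role": "user",
        "content": [
            {"type": "image", "image": pil_image},
            {"type": "text", "text": "Where is the submit button?"},
        ],
    },
]

result = cpu_inference(conversation, model, tokenizer, processor)
print(result["topk_points"], result["response"])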