abiyyufahri committed on
Commit
0b96209
·
1 Parent(s): e670b79

Install error fix attempt 6

Browse files
Files changed (3) hide show
  1. Dockerfile +12 -13
  2. app.py +113 -37
  3. requirements.txt +5 -4
Dockerfile CHANGED
@@ -1,30 +1,25 @@
1
- FROM nvidia/cuda:12.1-devel-ubuntu22.04
2
 
3
- # Install Python 3.10
4
  RUN apt-get update && apt-get install -y --no-install-recommends \
5
- python3.10 python3.10-dev python3-pip python3.10-venv \
6
  git gcc g++ libglib2.0-0 libsm6 libxext6 libxrender-dev \
7
  build-essential curl && \
8
  rm -rf /var/lib/apt/lists/*
9
 
10
- # Create symbolic links for python
11
- RUN ln -s /usr/bin/python3.10 /usr/bin/python && \
12
- ln -s /usr/bin/python3.10 /usr/bin/python3
13
-
14
  RUN useradd -m -u 1000 user
15
  USER user
16
  ENV PATH="/home/user/.local/bin:$PATH"
17
 
18
  WORKDIR /app
 
19
 
20
- # Install dependencies step by step untuk menghindari konflik
21
  RUN pip install --upgrade pip && \
22
  pip install --no-cache-dir packaging ninja wheel setuptools numpy
23
 
24
- # Install PyTorch dengan CUDA support
25
- RUN pip install --no-cache-dir torch==2.2.2 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
26
 
27
- # Install dependencies lain sebelum GUI-Actor
28
  RUN pip install --no-cache-dir \
29
  transformers \
30
  datasets \
@@ -35,8 +30,12 @@ RUN pip install --no-cache-dir \
35
  fastapi \
36
  "uvicorn[standard]"
37
 
38
- # Install GUI-Actor package terakhir (includes flash-attn)
39
- RUN pip install --no-cache-dir "git+https://github.com/microsoft/GUI-Actor.git"
 
 
 
 
40
 
41
  COPY --chown=user . .
42
 
 
1
+ FROM python:3.10-slim
2
 
 
3
  RUN apt-get update && apt-get install -y --no-install-recommends \
 
4
  git gcc g++ libglib2.0-0 libsm6 libxext6 libxrender-dev \
5
  build-essential curl && \
6
  rm -rf /var/lib/apt/lists/*
7
 
 
 
 
 
8
  RUN useradd -m -u 1000 user
9
  USER user
10
  ENV PATH="/home/user/.local/bin:$PATH"
11
 
12
  WORKDIR /app
13
+ COPY --chown=user requirements.txt ./
14
 
15
+ # Install dependencies step by step
16
  RUN pip install --upgrade pip && \
17
  pip install --no-cache-dir packaging ninja wheel setuptools numpy
18
 
19
+ # Install PyTorch CPU version
20
+ RUN pip install --no-cache-dir torch==2.2.2+cpu torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
21
 
22
+ # Install core dependencies
23
  RUN pip install --no-cache-dir \
24
  transformers \
25
  datasets \
 
30
  fastapi \
31
  "uvicorn[standard]"
32
 
33
+ # Install GUI-Actor dependencies manually (skip flash-attn)
34
+ RUN pip install --no-cache-dir \
35
+ pre-commit \
36
+ liger-kernel==0.5.2 \
37
+ opencv-python-headless \
38
+ deepspeed==0.16.0
39
 
40
  COPY --chown=user . .
41
 
app.py CHANGED
@@ -5,36 +5,119 @@ from PIL import Image
5
  from io import BytesIO
6
  import base64
7
  import torch
8
-
9
- # Import sesuai dokumentasi GUI-Actor
10
- from qwen_vl_utils import process_vision_info
11
- from transformers import Qwen2VLProcessor
12
- from gui_actor.constants import chat_template
13
- from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
14
- from gui_actor.inference import inference
15
 
16
  app = FastAPI()
17
 
18
- # Load model sesuai dokumentasi
19
- model_name_or_path = "microsoft/GUI-Actor-2B-Qwen2-VL"
20
- data_processor = Qwen2VLProcessor.from_pretrained(model_name_or_path)
21
- tokenizer = data_processor.tokenizer
 
 
 
 
 
 
22
 
23
- # Modifikasi untuk CPU atau GPU
24
- device = "cuda" if torch.cuda.is_available() else "cpu"
25
- torch_dtype = torch.bfloat16 if device == "cuda" else torch.float32
26
 
27
- model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
28
- model_name_or_path,
29
- torch_dtype=torch_dtype,
30
- device_map=device if device == "cuda" else None,
31
- attn_implementation="flash_attention_2" if device == "cuda" else None
 
 
32
  ).eval()
33
 
34
  class Base64Request(BaseModel):
35
  image_base64: str
36
  instruction: str
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  @app.post("/click/base64")
39
  async def predict_click_base64(data: Base64Request):
40
  try:
@@ -48,7 +131,7 @@ async def predict_click_base64(data: Base64Request):
48
  "content": [
49
  {
50
  "type": "text",
51
- "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
52
  }
53
  ]
54
  },
@@ -67,30 +150,24 @@ async def predict_click_base64(data: Base64Request):
67
  },
68
  ]
69
 
70
- # Inference menggunakan fungsi dari GUI-Actor
71
- pred = inference(
72
- conversation,
73
- model,
74
- tokenizer,
75
- data_processor,
76
- use_placeholder=True,
77
- topk=3
78
- )
79
-
80
  px, py = pred["topk_points"][0]
81
 
82
  return JSONResponse(content={
83
  "x": round(px, 4),
84
  "y": round(py, 4),
85
- "all_points": [[round(x, 4), round(y, 4)] for x, y in pred["topk_points"]],
86
- "success": True
87
  })
88
 
89
  except Exception as e:
90
  return JSONResponse(
91
  content={
92
  "error": str(e),
93
- "success": False
 
 
94
  },
95
  status_code=500
96
  )
@@ -99,12 +176,11 @@ async def predict_click_base64(data: Base64Request):
99
  async def health_check():
100
  return {
101
  "status": "healthy",
102
- "model": model_name_or_path,
103
- "device": device,
104
- "torch_dtype": str(torch_dtype)
105
  }
106
 
107
- # Endpoint tambahan untuk testing dengan form data
108
  @app.post("/click/form")
109
  async def predict_click_form(
110
  image_base64: str = Form(...),
 
5
  from io import BytesIO
6
  import base64
7
  import torch
8
+ import re
9
+ from transformers import AutoModelForCausalLM, AutoProcessor
 
 
 
 
 
10
 
11
  app = FastAPI()
12
 
13
+ # Load model for CPU
14
+ model_name = "microsoft/GUI-Actor-2B-Qwen2-VL"
15
+
16
+ # Load processor
17
+ try:
18
+ processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
19
+ except Exception as e:
20
+ print(f"Failed to load AutoProcessor: {e}")
21
+ from transformers import Qwen2VLProcessor
22
+ processor = Qwen2VLProcessor.from_pretrained(model_name)
23
 
24
+ tokenizer = processor.tokenizer
 
 
25
 
26
+ # Load model with CPU support
27
+ model = AutoModelForCausalLM.from_pretrained(
28
+ model_name,
29
+ torch_dtype=torch.float32, # float32 untuk CPU
30
+ device_map=None, # CPU only
31
+ trust_remote_code=True, # untuk custom model
32
+ attn_implementation=None # skip flash attention
33
  ).eval()
34
 
35
  class Base64Request(BaseModel):
36
  image_base64: str
37
  instruction: str
38
 
39
# Coordinate formats recognised in model output, tried in order.  The explicit
# forms come first; the bare "x, y" form is last because it matches any text
# the others match and would otherwise shadow them.
_COORDINATE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'click\s*\(\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\)',   # click(x, y)
        r'\[\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\]',           # [x, y]
        r'point:\s*\(\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\)',  # point: (x, y)
        r'(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)',                     # x, y
    )
)


def extract_coordinates(text, screen_width=1920, screen_height=1080):
    """Extract one normalized click point from model output text.

    The text is lower-cased and searched with each known coordinate format;
    the first format that matches wins, and its first occurrence is used.

    If either number is greater than 1 the pair is treated as pixel
    coordinates and BOTH components are divided by the assumed screen size
    (previously a component equal to 0 or 1 was left unscaled, so pixel
    ``(1, 540)`` was reported as the right edge of the screen).  The result
    is clamped to at most 1.0 so that pixels beyond the assumed resolution
    still yield a valid normalized point.

    Args:
        text: Raw text produced by the model.  ``None`` or an empty string
            yields the default point.
        screen_width: Width in pixels used to normalize pixel coordinates.
        screen_height: Height in pixels used to normalize pixel coordinates.

    Returns:
        A single-element list ``[(x, y)]`` with both values in ``[0, 1]``.
        Falls back to the screen center ``[(0.5, 0.5)]`` when no coordinate
        pair is found.
    """
    default_point = [(0.5, 0.5)]
    if not text:
        return default_point

    lowered = text.lower()
    for pattern in _COORDINATE_PATTERNS:
        match = pattern.search(lowered)
        if match is None:
            continue

        x, y = float(match.group(1)), float(match.group(2))
        if x > 1 or y > 1:
            # Pixel coordinates: scale the whole pair, not just the
            # components that happen to exceed 1.
            x /= screen_width
            y /= screen_height
        return [(min(x, 1.0), min(y, 1.0))]

    return default_point
67
+
68
def cpu_inference(conversation, model, tokenizer, processor):
    """Run text generation on CPU and turn the reply into click points.

    This path does not use the GUI-Actor inference helpers: it renders the
    conversation with the processor's chat template, generates a reply with
    ``model.generate`` and parses coordinates out of the decoded text.

    Args:
        conversation: Chat messages; the image is read from
            ``conversation[1]["content"][0]["image"]``.
        model: Model object exposing ``generate``.
        tokenizer: Tokenizer used for ``eos_token_id`` and decoding.
        processor: Processor used for the chat template and input encoding.

    Returns:
        A dict with ``topk_points`` (list of ``(x, y)`` tuples), ``response``
        (decoded reply, or an error description) and ``success``.  Any
        exception is caught and reported as the center point with
        ``success`` set to ``False``.
    """
    try:
        prompt = processor.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True,
        )

        # The screenshot is expected in the first content item of the
        # second message.
        screenshot = conversation[1]["content"][0]["image"]

        inputs = processor(text=[prompt], images=[screenshot], return_tensors="pt")

        generation_kwargs = {
            "max_new_tokens": 256,
            "do_sample": True,
            "temperature": 0.3,
            "top_p": 0.8,
            "pad_token_id": tokenizer.eos_token_id,
        }
        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_kwargs)

        # Drop the prompt tokens so only the newly generated part is decoded.
        prompt_length = inputs["input_ids"].shape[1]
        response = tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True,
        )

        return {
            "topk_points": extract_coordinates(response),
            "response": response,
            "success": True,
        }

    except Exception as e:
        failure = {
            "topk_points": [(0.5, 0.5)],
            "response": f"Error during inference: {str(e)}",
            "success": False,
        }
        return failure
120
+
121
  @app.post("/click/base64")
122
  async def predict_click_base64(data: Base64Request):
123
  try:
 
131
  "content": [
132
  {
133
  "type": "text",
134
+ "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task. Please provide the click coordinates.",
135
  }
136
  ]
137
  },
 
150
  },
151
  ]
152
 
153
+ # Run inference
154
+ pred = cpu_inference(conversation, model, tokenizer, processor)
 
 
 
 
 
 
 
 
155
  px, py = pred["topk_points"][0]
156
 
157
  return JSONResponse(content={
158
  "x": round(px, 4),
159
  "y": round(py, 4),
160
+ "response": pred["response"],
161
+ "success": pred["success"]
162
  })
163
 
164
  except Exception as e:
165
  return JSONResponse(
166
  content={
167
  "error": str(e),
168
+ "success": False,
169
+ "x": 0.5,
170
+ "y": 0.5
171
  },
172
  status_code=500
173
  )
 
176
  async def health_check():
177
  return {
178
  "status": "healthy",
179
+ "model": model_name,
180
+ "device": "cpu",
181
+ "torch_dtype": "float32"
182
  }
183
 
 
184
  @app.post("/click/form")
185
  async def predict_click_form(
186
  image_base64: str = Form(...),
requirements.txt CHANGED
@@ -2,13 +2,14 @@ packaging
2
  ninja
3
  fastapi
4
  uvicorn[standard]
5
- transformers
6
  datasets
7
  Pillow
8
- torch==2.2.2
9
  torchvision
10
  torchaudio
 
11
  accelerate
12
  scipy
13
- qwen-vl-utils
14
- git+https://github.com/microsoft/GUI-Actor.git
 
2
  ninja
3
  fastapi
4
  uvicorn[standard]
5
+ transformers>=4.37.0
6
  datasets
7
  Pillow
8
+ torch==2.2.2+cpu
9
  torchvision
10
  torchaudio
11
+ --index-url https://download.pytorch.org/whl/cpu
12
  accelerate
13
  scipy
14
+ numpy
15
+ qwen-vl-utils