abiyyufahri committed
Commit 55b2cb1 · 1 Parent(s): 2cf117f

Install error fix

Files changed (3):
  1. Dockerfile +9 -5
  2. app.py +15 -58
  3. requirements.txt +1 -1
Dockerfile CHANGED
@@ -1,16 +1,20 @@
-FROM python:3.9
+FROM python:3.9-slim
 
-RUN apt-get update && apt-get install -y git && \
-    useradd -m -u 1000 user
+# Install dependencies
+RUN apt-get update && apt-get install -y git curl && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Create non-root user
+RUN useradd -m -u 1000 user
 USER user
 ENV PATH="/home/user/.local/bin:$PATH"
 
 WORKDIR /app
 
-COPY --chown=user requirements.txt .
+COPY --chown=user requirements.txt ./
 RUN pip install --no-cache-dir --upgrade pip && \
     pip install --no-cache-dir -r requirements.txt
 
 COPY --chown=user . .
 
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,9 +1,10 @@
-from fastapi import FastAPI, UploadFile, Form
+from fastapi import FastAPI, Form
 from fastapi.responses import JSONResponse
+from pydantic import BaseModel
 from PIL import Image
 from io import BytesIO
-import torch
 import base64
+import torch
 
 from transformers import Qwen2VLProcessor
 from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
@@ -17,66 +18,22 @@ processor = Qwen2VLProcessor.from_pretrained(model_name)
 tokenizer = processor.tokenizer
 model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
     model_name,
-    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
-    device_map="auto",
-    attn_implementation="flash_attention_2" if torch.cuda.is_available() else None,
+    torch_dtype=torch.float32,  # use float32 for CPU
+    device_map=None,  # don't map to cuda
+    attn_implementation=None,
 ).eval()
 
 
-@app.post("/click_base64")
-async def predict_click_base64(
-    image_base64: str = Form(...),
-    instruction: str = Form(...)
-):
-    # Decode base64 image
-    try:
-        if "," in image_base64:
-            image_base64 = image_base64.split(",")[1]
-        image_data = base64.b64decode(image_base64)
-        pil_image = Image.open(BytesIO(image_data)).convert("RGB")
-    except Exception as e:
-        return JSONResponse(status_code=400, content={"error": f"Invalid image format: {str(e)}"})
-
-    # Prepare conversation
-    conversation = [
-        {
-            "role": "system",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
-                }
-            ]
-        },
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image",
-                    "image": pil_image,
-                },
-                {
-                    "type": "text",
-                    "text": instruction,
-                },
-            ],
-        },
-    ]
-
-    # Inference
-    try:
-        pred = inference(conversation, model, tokenizer, processor, use_placeholder=True, topk=3)
-        px, py = pred["topk_points"][0]
-        return JSONResponse(content={"x": round(px, 4), "y": round(py, 4)})
-    except Exception as e:
-        return JSONResponse(status_code=500, content={"error": f"Inference failed: {str(e)}"})
+class Base64Request(BaseModel):
+    image_base64: str
+    instruction: str
 
 
-@app.post("/click")
-async def predict_click(image: UploadFile, instruction: str = Form(...)):
-    # Load image
-    contents = await image.read()
-    pil_image = Image.open(BytesIO(contents)).convert("RGB")
+@app.post("/click/base64")
+async def predict_click_base64(data: Base64Request):
+    # Decode base64 to image
+    image_data = base64.b64decode(data.image_base64.split(",")[-1])
+    pil_image = Image.open(BytesIO(image_data)).convert("RGB")
 
     conversation = [
         {
@@ -97,7 +54,7 @@ async def predict_click(image: UploadFile, instruction: str = Form(...)):
             },
             {
                 "type": "text",
-                "text": instruction,
+                "text": data.instruction,
            },
         ],
     },
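The route moves from /click_base64 to /click/base64, the multipart /click endpoint is dropped, and the handler now takes a JSON body validated by the Base64Request model instead of form fields, so callers must change accordingly. A minimal client sketch, assuming the server runs locally on port 7860, that the requests library is available, and that the response shape ({"x": ..., "y": ...}) is unchanged from the previous handler; the file path and instruction are illustrative:

# Hypothetical client for the reworked endpoint.
import base64
import requests

with open("screenshot.png", "rb") as f:
    encoded = base64.b64encode(f.read()).decode("ascii")

resp = requests.post(
    "http://localhost:7860/click/base64",
    # JSON body matching the Base64Request model (not multipart form fields)
    json={"image_base64": encoded, "instruction": "Click the search box"},
)
print(resp.json())  # e.g. {"x": 0.1234, "y": 0.5678}

Note that split(",")[-1] strips a data-URL prefix (e.g. data:image/png;base64,) when present while still accepting a bare base64 string.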
requirements.txt CHANGED
@@ -1,7 +1,7 @@
 fastapi
 uvicorn[standard]
 transformers
-torch
+torch==2.1.2
 datasets
 Pillow
 accelerate
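Pinning torch to an exact release is presumably the "install error fix" named in the commit message: left unpinned, pip resolves whatever torch version is newest at build time, which can pull a wheel that conflicts with the rest of the stack, while the pin makes the image build reproducible.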