banao-tech committed on
Commit
f6e9d77
·
verified ·
1 Parent(s): aab71b7

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +16 -0
  2. fly.toml +21 -0
  3. main.py +132 -0
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: the OmniParser Hugging Face Space. main.py imports `utils` and
# loads files under `weights/`, neither of which is copied in below, so they
# are presumably provided by this image (TODO confirm).
FROM registry.hf.space/microsoft-omniparser:latest

# The package installation below needs root privileges.
USER root

# - /tmp is made world-writable (with the sticky bit) before it is used.
# - apt-get is used rather than apt, whose command-line interface is not
#   intended for use in scripts.
# - The NVIDIA keyring package registers the CUDA repository that cuDNN and
#   cuBLAS are then installed from.
# - The downloaded .deb and the apt package lists are removed in the same
#   layer so they do not add to the image size.
RUN chmod 1777 /tmp \
    && apt-get update -q \
    && apt-get install -y ca-certificates wget libgl1 \
    && wget -qO /tmp/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \
    && dpkg -i /tmp/cuda-keyring.deb \
    && apt-get update -q \
    && apt-get install -y --no-install-recommends libcudnn8 libcublas-12-2 \
    && rm -f /tmp/cuda-keyring.deb \
    && rm -rf /var/lib/apt/lists/*

# The requirement is quoted so the shell can never glob-expand "[all]".
RUN pip install --no-cache-dir "fastapi[all]"

COPY main.py main.py

# Running main.py once executes its module-level model loading at build time.
# NOTE(review): presumably this pre-fetches the Florence-2 processor files into
# the image and makes a broken setup fail the build early - confirm intent.
RUN python main.py

# Port 7860 must match `internal_port` in fly.toml.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
fly.toml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# See https://fly.io/docs/reference/configuration/ for information about how to use this file.

app = 'omni-parser-app'
primary_region = 'ord'
swap_size_mb = 32768 # 32gb

[build]

[http_service]
  # Must match the port uvicorn listens on in the Dockerfile CMD.
  internal_port = 7860
  force_https = true
  # 'stop' instead of 'suspend': Fly documents Machine suspend as unavailable
  # for Machines that have a GPU, swap configured, or more than 2 GB of
  # memory, and this app uses all three.
  # NOTE(review): re-check the current Fly suspend limitations before changing.
  auto_stop_machines = 'stop'
  auto_start_machines = true
  # With 0 minimum machines the app scales to zero when idle; the first
  # request after that pays the machine start and model load time.
  min_machines_running = 0
  processes = ['app']

[[vm]]
  memory = '32gb'
  cpu_kind = 'performance'
  cpus = 4
  gpu_kind = 'l40s'
main.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile, HTTPException
2
+ from fastapi.responses import JSONResponse
3
+ from pydantic import BaseModel
4
+ from typing import Optional
5
+ import base64
6
+ import io
7
+ from PIL import Image
8
+ import torch
9
+ import numpy as np
10
+ import os
11
+
12
+ # Existing imports
13
+ import numpy as np
14
+ import torch
15
+ from PIL import Image
16
+ import io
17
+
18
+ from utils import (
19
+ check_ocr_box,
20
+ get_yolo_model,
21
+ get_caption_model_processor,
22
+ get_som_labeled_img,
23
+ )
24
+ import torch
25
+
26
+ # yolo_model = get_yolo_model(model_path='/data/icon_detect/best.pt')
27
+ # caption_model_processor = get_caption_model_processor(model_name="florence2", model_name_or_path="/data/icon_caption_florence")
28
+
29
+ from ultralytics import YOLO
30
+
31
+ # if not os.path.exists("/data/icon_detect"):
32
+ # os.makedirs("/data/icon_detect")
33
+
34
from transformers import AutoProcessor, AutoModelForCausalLM

# Choose the device explicitly instead of probing with a bare ``except``: the
# old probe treated every failure (missing weights, typos, interrupts) as
# "no GPU" and then repeated the same load a second time.
# NOTE: a GPU that is reported as available but then fails while loading now
# raises instead of silently dropping to the CPU.
_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Half precision is only used on the GPU. On the CPU (for example while the
# image is being built without a GPU attached) full precision is loaded,
# since float16 support on CPU is limited in PyTorch.
_CAPTION_DTYPE = torch.float16 if _DEVICE == "cuda" else torch.float32

# Icon detection model.
yolo_model = YOLO("weights/icon_detect/best.pt").to(_DEVICE)

# The processor comes from the upstream Florence-2 base checkpoint, while the
# caption model weights are read from the local ``weights`` directory.
processor = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-base", trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
    "weights/icon_caption_florence",
    torch_dtype=_CAPTION_DTYPE,
    trust_remote_code=True,
).to(_DEVICE)

# Bundle handed to ``get_som_labeled_img`` as ``caption_model_processor``.
caption_model_processor = {"processor": processor, "model": model}
print("finish loading model!!!")

app = FastAPI()
61
+
62
+
63
# Response body of the image-processing endpoint. Every field is a plain
# string. (Documented with comments rather than a docstring so the generated
# API schema stays exactly as it was.)
class ProcessResponse(BaseModel):
    # Annotated image, PNG bytes encoded as base64 text.
    image: str
    # Parsed content entries joined into one newline-separated string.
    parsed_content_list: str
    # ``str()`` rendering of the label coordinates returned by the labeller.
    label_coordinates: str
67
+
68
+
69
def process(
    image_input: Image.Image, box_threshold: float, iou_threshold: float
) -> ProcessResponse:
    """Run OCR and icon labelling on an image and package the result.

    ``check_ocr_box`` and ``get_som_labeled_img`` are called with a file path,
    so the image is first written to disk. A fresh temporary directory is used
    for every call: nothing is shared between calls, no pre-existing ``imgs``
    directory is required, and the file is removed afterwards.

    Args:
        image_input: Image to analyse.
        box_threshold: Forwarded to ``get_som_labeled_img`` as ``BOX_TRESHOLD``.
        iou_threshold: Forwarded to ``get_som_labeled_img`` as ``iou_threshold``.

    Returns:
        A ``ProcessResponse`` holding the labelled image as base64-encoded
        PNG, the parsed content entries joined with newlines, and the string
        form of the label coordinates.
    """
    import tempfile

    # Drawing parameters are scaled by the image width relative to 3200 px,
    # with a floor of 1 for the integer-valued settings.
    box_overlay_ratio = image_input.size[0] / 3200
    draw_bbox_config = {
        "text_scale": 0.8 * box_overlay_ratio,
        "text_thickness": max(int(2 * box_overlay_ratio), 1),
        "text_padding": max(int(3 * box_overlay_ratio), 1),
        "thickness": max(int(3 * box_overlay_ratio), 1),
    }

    with tempfile.TemporaryDirectory() as work_dir:
        image_save_path = os.path.join(work_dir, "saved_image_demo.png")
        image_input.save(image_save_path)

        # The second element of the result (goal-filtering flag) is not used.
        ocr_bbox_rslt, _ = check_ocr_box(
            image_save_path,
            display_img=False,
            output_bb_format="xyxy",
            goal_filtering=None,
            easyocr_args={"paragraph": False, "text_threshold": 0.9},
            use_paddleocr=True,
        )
        text, ocr_bbox = ocr_bbox_rslt

        dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
            image_save_path,
            yolo_model,
            BOX_TRESHOLD=box_threshold,
            output_coord_in_ratio=True,
            ocr_bbox=ocr_bbox,
            draw_bbox_config=draw_bbox_config,
            caption_model_processor=caption_model_processor,
            ocr_text=text,
            iou_threshold=iou_threshold,
        )

    print("finish processing")

    # The labelled image arrives as base64 text; it is decoded and re-saved so
    # the response is PNG regardless of the encoding it arrived in.
    labelled_image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
    buffered = io.BytesIO()
    labelled_image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")

    # str() each entry so that non-string entries cannot make join() raise.
    parsed_content_list_str = "\n".join(str(entry) for entry in parsed_content_list)

    return ProcessResponse(
        image=img_str,
        parsed_content_list=parsed_content_list_str,
        label_coordinates=str(label_coordinates),
    )
117
+
118
+
119
@app.post("/process_image", response_model=ProcessResponse)
async def process_image(
    image_file: UploadFile = File(...),
    box_threshold: float = 0.05,
    iou_threshold: float = 0.1,
):
    """Accept an uploaded image and return its parsed, labelled form.

    Args:
        image_file: Uploaded file; it must be readable as an image.
        box_threshold: Forwarded unchanged to ``process``.
        iou_threshold: Forwarded unchanged to ``process``.

    Returns:
        The ``ProcessResponse`` produced by ``process``.

    Raises:
        HTTPException: With status 400 when the upload cannot be read or
            decoded as an image.
    """
    try:
        contents = await image_file.read()
        # Normalise to RGB so later stages always receive the same mode.
        image_input = Image.open(io.BytesIO(contents)).convert("RGB")
    except Exception as exc:
        # Chain the cause so the real decoding error stays visible in
        # tracebacks and logs; the client still only sees the generic message.
        raise HTTPException(status_code=400, detail="Invalid image file") from exc

    # NOTE(review): ``process`` is synchronous and is called directly from this
    # coroutine, so the event loop is busy until it returns and requests are
    # handled one at a time per worker.
    return process(image_input, box_threshold, iou_threshold)