from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from typing import Optional
import base64
import io
import os

import numpy as np
import torch
from PIL import Image
from utils import (
check_ocr_box,
get_yolo_model,
get_caption_model_processor,
get_som_labeled_img,
)
# Load the icon-detection (YOLO) model and the Florence-2 icon-caption model through
# the project's helpers in utils.py; process() below relies on both objects.
device = "cuda" if torch.cuda.is_available() else "cpu"
yolo_model = get_yolo_model(model_path="best.pt").to(device)
caption_model_processor = get_caption_model_processor(
    model_name="florence2", model_name_or_path="icon_caption_florence"
)
print("Finished loading models.")
app = FastAPI()
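# Response payload: the annotated screenshot as a base64-encoded PNG, plus the parsed
# content list and label coordinates serialized as strings.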
class ProcessResponse(BaseModel):
image: str # Base64 encoded image
parsed_content_list: str
label_coordinates: str
def process(
image_input: Image.Image, box_threshold: float, iou_threshold: float
) -> ProcessResponse:
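    """Run OCR, icon detection, and captioning on the image and build the API response."""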
    # check_ocr_box and get_som_labeled_img expect a file path, so persist the upload first.
    image_save_path = "imgs/saved_image_demo.png"
    os.makedirs(os.path.dirname(image_save_path), exist_ok=True)
    image_input.save(image_save_path)
image = Image.open(image_save_path)
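    # Scale annotation text size and box thickness relative to a 3200 px reference width.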
box_overlay_ratio = image.size[0] / 3200
draw_bbox_config = {
"text_scale": 0.8 * box_overlay_ratio,
"text_thickness": max(int(2 * box_overlay_ratio), 1),
"text_padding": max(int(3 * box_overlay_ratio), 1),
"thickness": max(int(3 * box_overlay_ratio), 1),
}
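    # OCR pass: detect text and its bounding boxes (PaddleOCR is enabled via use_paddleocr=True);
    # these boxes are merged with the icon detections below.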
ocr_bbox_rslt, is_goal_filtered = check_ocr_box(
image_save_path,
display_img=False,
output_bb_format="xyxy",
goal_filtering=None,
easyocr_args={"paragraph": False, "text_threshold": 0.9},
use_paddleocr=True,
)
text, ocr_bbox = ocr_bbox_rslt
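    # Detection + captioning pass: YOLO finds interactable elements, the caption model
    # labels them, and the result is drawn onto the image (Set-of-Mark annotation).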
dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
image_save_path,
yolo_model,
BOX_TRESHOLD=box_threshold,
output_coord_in_ratio=True,
ocr_bbox=ocr_bbox,
draw_bbox_config=draw_bbox_config,
caption_model_processor=caption_model_processor,
ocr_text=text,
iou_threshold=iou_threshold,
)
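    # get_som_labeled_img returns the annotated image as a base64-encoded string.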
image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
print("finish processing")
parsed_content_list_str = "\n".join(parsed_content_list)
# Encode image to base64
buffered = io.BytesIO()
image.save(buffered, format="PNG")
img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
return ProcessResponse(
image=img_str,
        parsed_content_list=parsed_content_list_str,
label_coordinates=str(label_coordinates),
)
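# POST /process_image: accept an uploaded screenshot plus optional detection thresholds
# and return the annotated image together with the parsed content.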
@app.post("/process_image", response_model=ProcessResponse)
async def process_image(
image_file: UploadFile = File(...),
box_threshold: float = 0.05,
iou_threshold: float = 0.1,
):
try:
contents = await image_file.read()
image_input = Image.open(io.BytesIO(contents)).convert("RGB")
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid image file: {e}")
response = process(image_input, box_threshold, iou_threshold)
return response
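
# Minimal local-run sketch (an assumption, not part of the original file): requires
# uvicorn to be installed and uses port 7860, the usual Hugging Face Spaces port.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)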