Spaces:

banao-tech
/

OmniPar

Sleeping

App Files Files Community

banao-tech committed on Dec 31, 2024

Commit

f36b296

verified ·

1 Parent(s): 141751d

Update main.py

Browse files

Files changed (1) hide show

main.py +197 -114

main.py CHANGED Viewed

@@ -1,132 +1,215 @@
 from fastapi import FastAPI, File, UploadFile, HTTPException
 from fastapi.responses import JSONResponse
-from pydantic import BaseModel
-from typing import Optional
 import base64
 import io
-from PIL import Image
-import torch
-import numpy as np
 import os
-# Existing imports
-import numpy as np
 import torch
 from PIL import Image
-import io
-from utils import (
-    check_ocr_box,
-    get_yolo_model,
-    get_caption_model_processor,
-    get_som_labeled_img,
-)
-import torch
-# yolo_model = get_yolo_model(model_path='/data/icon_detect/best.pt')
-# caption_model_processor = get_caption_model_processor(model_name="florence2", model_name_or_path="/data/icon_caption_florence")
 from ultralytics import YOLO
-# if not os.path.exists("/data/icon_detect"):
-#     os.makedirs("/data/icon_detect")
-try:
-    yolo_model = YOLO("weights/icon_detect/best.pt").to("cpu")
-except:
-    yolo_model = YOLO("weights/icon_detect/best.pt")
 from transformers import AutoProcessor, AutoModelForCausalLM
-processor = AutoProcessor.from_pretrained(
-    "microsoft/Florence-2-base", trust_remote_code=True
 )
-try:
-    model = AutoModelForCausalLM.from_pretrained(
-        "banao-tech/OmniParse",
-        torch_dtype=torch.float16,
-        trust_remote_code=True,
-    ).to("cpu")
-except:
-    model = AutoModelForCausalLM.from_pretrained(
-        "banao-tech/OmniParse",
-        torch_dtype=torch.float16,
-        trust_remote_code=True,
-    )
-caption_model_processor = {"processor": processor, "model": model}
-print("finish loading model!!!")
-app = FastAPI()
-class ProcessResponse(BaseModel):
-    image: str  # Base64 encoded image
-    parsed_content_list: str
-    label_coordinates: str
-def process(
-    image_input: Image.Image, box_threshold: float, iou_threshold: float
-) -> ProcessResponse:
-    image_save_path = "imgs/saved_image_demo.png"
-    image_input.save(image_save_path)
-    image = Image.open(image_save_path)
-    box_overlay_ratio = image.size[0] / 3200
-    draw_bbox_config = {
-        "text_scale": 0.8 * box_overlay_ratio,
-        "text_thickness": max(int(2 * box_overlay_ratio), 1),
-        "text_padding": max(int(3 * box_overlay_ratio), 1),
-        "thickness": max(int(3 * box_overlay_ratio), 1),
-    }
-    ocr_bbox_rslt, is_goal_filtered = check_ocr_box(
-        image_save_path,
-        display_img=False,
-        output_bb_format="xyxy",
-        goal_filtering=None,
-        easyocr_args={"paragraph": False, "text_threshold": 0.9},
-        use_paddleocr=True,
-    )
-    text, ocr_bbox = ocr_bbox_rslt
-    dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
-        image_save_path,
-        yolo_model,
-        BOX_TRESHOLD=box_threshold,
-        output_coord_in_ratio=True,
-        ocr_bbox=ocr_bbox,
-        draw_bbox_config=draw_bbox_config,
-        caption_model_processor=caption_model_processor,
-        ocr_text=text,
-        iou_threshold=iou_threshold,
-    )
-    image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
-    print("finish processing")
-    parsed_content_list_str = "\n".join(parsed_content_list)
-    # Encode image to base64
-    buffered = io.BytesIO()
-    image.save(buffered, format="PNG")
-    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
-    return ProcessResponse(
-        image=img_str,
-        parsed_content_list=str(parsed_content_list_str),
-        label_coordinates=str(label_coordinates),
-    )
-@app.post("/process_image", response_model=ProcessResponse)
 async def process_image(
     image_file: UploadFile = File(...),
-    box_threshold: float = 0.05,
-    iou_threshold: float = 0.1,
 ):
     try:
         contents = await image_file.read()
-        image_input = Image.open(io.BytesIO(contents)).convert("RGB")
     except Exception as e:
-        raise HTTPException(status_code=400, detail="Invalid image file")
-    response = process(image_input, box_threshold, iou_threshold)
-    return response

 import base64
 import io
 import os
+from pathlib import Path
+from typing import List, Dict, Tuple, Optional
+
+import numpy as np
 import torch
 from fastapi import FastAPI, File, UploadFile, HTTPException
+from fastapi import Query
 from fastapi.responses import JSONResponse
 from PIL import Image
+from pydantic import BaseModel, Field
 from transformers import AutoProcessor, AutoModelForCausalLM
 from ultralytics import YOLO
+# Type definitions
+class ProcessResponse(BaseModel):
+    """Response body of the ``/process_image`` endpoint.
+    All three fields are required (``Field(...)``) and are plain strings.
+    """
+    # Labelled image, base64 encoded.
+    image: str = Field(..., description="Base64 encoded processed image")
+    # Parsed content entries; the response builder joins them with newlines.
+    parsed_content_list: str = Field(..., description="List of parsed content")
+    # ``str()`` of the coordinates mapping produced by the labelling step.
+    label_coordinates: str = Field(..., description="Coordinates of detected labels")
+class ModelManager:
+    """Hold the detection and captioning models used by the API.
+    ``yolo_model``, ``processor`` and ``model`` start as ``None`` and are
+    populated by :meth:`load_models`. ``device`` is ``"cuda"`` when
+    ``torch.cuda.is_available()`` is true and ``"cpu"`` otherwise.
+    """
+    def __init__(self):
+        self.yolo_model = None
+        self.processor = None
+        self.model = None
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+    def load_models(self):
+        """Initialize all required models.
+        Returns:
+            True when every model loaded. False when anything failed; the
+            error is printed and the attributes keep the values they had
+            before the call.
+        """
+        # Half precision is only requested on GPU. float16 support on CPU is
+        # incomplete in PyTorch, so CPU inference falls back to float32.
+        dtype = torch.float16 if self.device == "cuda" else torch.float32
+        try:
+            # Load YOLO model
+            weights_path = Path("weights/icon_detect/best.pt")
+            if not weights_path.exists():
+                raise FileNotFoundError(f"YOLO weights not found at {weights_path}")
+            yolo_model = YOLO(str(weights_path)).to(self.device)
+            # Load processor and model
+            processor = AutoProcessor.from_pretrained(
+                "microsoft/Florence-2-base",
+                trust_remote_code=True
+            )
+            model = AutoModelForCausalLM.from_pretrained(
+                "banao-tech/OmniParse",
+                torch_dtype=dtype,
+                trust_remote_code=True
+            ).to(self.device)
+        except Exception as e:
+            # Top-level boundary: report the failure and signal it through the
+            # return value, which the caller turns into a startup error.
+            print(f"Error loading models: {str(e)}")
+            return False
+        # Publish only after everything succeeded so a failed load never
+        # leaves the manager half-initialised.
+        self.yolo_model = yolo_model
+        self.processor = processor
+        self.model = model
+        return True
+class ImageProcessor:
+    """Run the save -> OCR -> labelling -> response pipeline for one image.
+    The OCR and labelling steps are placeholders that return empty results;
+    only the surrounding plumbing is implemented.
+    """
+    def __init__(self, model_manager: ModelManager):
+        """Keep the model manager and create the ``temp`` scratch directory."""
+        self.model_manager = model_manager
+        self.temp_dir = Path("temp")
+        self.temp_dir.mkdir(exist_ok=True)
+    async def process_image(
+        self,
+        image: Image.Image,
+        box_threshold: float = 0.05,
+        iou_threshold: float = 0.1
+    ) -> ProcessResponse:
+        """Process the input image and return results.
+        Parameters:
+        - image: Image to analyse; it is written to a scratch PNG first
+        - box_threshold: Forwarded to the labelling step
+        - iou_threshold: Forwarded to the labelling step
+        Returns:
+        - ProcessResponse built from the labelling results
+        Raises:
+        - HTTPException (500) when any pipeline step fails
+        """
+        import uuid
+        # A unique name per call: a single shared file would let concurrent
+        # requests overwrite or delete each other's input.
+        temp_image_path = self.temp_dir / f"temp_image_{uuid.uuid4().hex}.png"
+        try:
+            # Save temporary image
+            image.save(temp_image_path)
+            # Calculate overlay ratio (image width relative to 3200 px)
+            box_overlay_ratio = image.size[0] / 3200
+            draw_config = self._get_draw_config(box_overlay_ratio)
+            # Process image
+            ocr_results = self._perform_ocr(temp_image_path)
+            labeled_results = self._get_labeled_image(
+                temp_image_path,
+                ocr_results,
+                box_threshold,
+                iou_threshold,
+                draw_config
+            )
+            # Create response
+            return self._create_response(labeled_results)
+        except HTTPException:
+            # Already an HTTP error; do not re-wrap it as a generic 500.
+            raise
+        except Exception as e:
+            raise HTTPException(
+                status_code=500,
+                detail=f"Image processing failed: {str(e)}"
+            ) from e
+        finally:
+            # Cleanup runs on success and on failure so scratch files never pile up.
+            temp_image_path.unlink(missing_ok=True)
+    def _get_draw_config(self, ratio: float) -> Dict:
+        """Generate drawing configuration based on image ratio.
+        The integer settings are floored at 1 so small images still get
+        visible boxes and text.
+        """
+        return {
+            "text_scale": 0.8 * ratio,
+            "text_thickness": max(int(2 * ratio), 1),
+            "text_padding": max(int(3 * ratio), 1),
+            "thickness": max(int(3 * ratio), 1),
+        }
+    def _perform_ocr(self, image_path: Path) -> Tuple[List[str], List]:
+        """Perform OCR on the image.
+        Placeholder: always returns two empty lists (texts, boxes).
+        """
+        # TODO: implement actual OCR logic
+        return [], []
+    def _get_labeled_image(
+        self,
+        image_path: Path,
+        ocr_results: Tuple[List[str], List],
+        box_threshold: float,
+        iou_threshold: float,
+        draw_config: Dict
+    ) -> Tuple[str, Dict, List[str]]:
+        """Get labeled image with detected objects.
+        Placeholder: always returns an empty image string, an empty
+        coordinates dict and an empty content list.
+        """
+        # TODO: implement actual labeling logic
+        return "", {}, []
+    def _create_response(
+        self,
+        labeled_results: Tuple[str, Dict, List[str]]
+    ) -> ProcessResponse:
+        """Create API response from processing results.
+        The content list is joined with newlines and the coordinates are
+        serialised with ``str()``.
+        """
+        labeled_image, coordinates, content_list = labeled_results
+        return ProcessResponse(
+            image=labeled_image,
+            parsed_content_list="\n".join(content_list),
+            label_coordinates=str(coordinates)
+        )
+# Initialize FastAPI app (title/description/version are passed as app metadata)
+app = FastAPI(
+    title="Image Processing API",
+    description="API for processing and analyzing images",
+    version="1.0.0"
 )
+# Initialize model manager and image processor
+# Module-level singletons used by the request handler. No models are loaded
+# here (that happens in the startup hook); constructing ImageProcessor does
+# create the "temp" directory as a side effect.
+model_manager = ModelManager()
+image_processor = ImageProcessor(model_manager)
+@app.on_event("startup")
+async def startup_event():
+    """Load the models when the app starts; abort startup if loading fails."""
+    models_ready = model_manager.load_models()
+    if models_ready:
+        return
+    raise RuntimeError("Failed to load required models")
+@app.post(
+    "/process_image",
+    response_model=ProcessResponse,
+    summary="Process an uploaded image",
+    response_description="Processed image results"
+)
 async def process_image(
     image_file: UploadFile = File(...),
+    box_threshold: float = Query(0.05, ge=0, le=1),
+    iou_threshold: float = Query(0.1, ge=0, le=1)
 ):
+    """
+    Process an uploaded image file and return the results.
+    Parameters:
+    - image_file: The image file to process
+    - box_threshold: Threshold for box detection (0-1), query parameter
+    - iou_threshold: IOU threshold for overlap detection (0-1), query parameter
+    Returns:
+    - ProcessResponse containing the processed image and results
+    Raises:
+    - HTTPException 400 when the upload is not a readable image
+    - HTTPException 500 on any other failure
+    """
     try:
+        # Validate file type. content_type may be missing on the upload, so
+        # treat None as "not an image" instead of failing on .startswith.
+        content_type = image_file.content_type or ""
+        if not content_type.startswith('image/'):
+            raise HTTPException(
+                status_code=400,
+                detail="File must be an image"
+            )
+        # Read and validate image
         contents = await image_file.read()
+        try:
+            image = Image.open(io.BytesIO(contents)).convert("RGB")
+        except Exception as e:
+            raise HTTPException(
+                status_code=400,
+                detail="Invalid image format"
+            ) from e
+        # Process image
+        return await image_processor.process_image(
+            image,
+            box_threshold,
+            iou_threshold
+        )
+    except HTTPException:
+        # Deliberate HTTP errors (400s above, 500s from the processor) pass through.
+        raise
     except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Internal server error: {str(e)}"
+        ) from e
+if __name__ == "__main__":
+    # Direct-run entry point: serve the app on all interfaces, port 8000.
+    # uvicorn is imported here so it is only needed when run as a script.
+    from uvicorn import run as serve
+    serve(app, host="0.0.0.0", port=8000)