qwen2.5-VL-api

Running

App Files Files Community

bla commited on Feb 26

Commit

40415b8

verified ·

1 Parent(s): cf4cb00

Update main.py

Browse files

Files changed (1) hide show

main.py +108 -59

main.py CHANGED Viewed

@@ -1,47 +1,47 @@
 import base64
 import logging
 import time
 from io import BytesIO
 import torch
-from fastapi import Body, FastAPI, File, HTTPException, Query, UploadFile
 from PIL import Image
 from pydantic import BaseModel
 from qwen_vl_utils import process_vision_info
-from transformers import (
-    AutoProcessor,
-    Qwen2_5_VLForConditionalGeneration,
-    Qwen2VLForConditionalGeneration,
-)
 app = FastAPI()
-# Define request model
 class PredictRequest(BaseModel):
     image_base64: list[str]
     prompt: str
-# checkpoint = "Qwen/Qwen2-VL-2B-Instruct"
-# min_pixels = 256 * 28 * 28
-# max_pixels = 1280 * 28 * 28
-# processor = AutoProcessor.from_pretrained(
-#     checkpoint, min_pixels=min_pixels, max_pixels=max_pixels
-# )
-# model = Qwen2VLForConditionalGeneration.from_pretrained(
-#     checkpoint,
-#     torch_dtype=torch.bfloat16,
-#     device_map="auto",
-#     # attn_implementation="flash_attention_2",
-# )
-# checkpoint = "Qwen/Qwen2.5-VL-7B-Instruct"
 checkpoint = "Qwen/Qwen2.5-VL-3B-Instruct-AWQ"
 min_pixels = 256 * 28 * 28
 max_pixels = 1280 * 28 * 28
 processor = AutoProcessor.from_pretrained(
     checkpoint, min_pixels=min_pixels, max_pixels=max_pixels
 )
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     checkpoint,
     torch_dtype="auto",
@@ -49,12 +49,10 @@ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     # attn_implementation="flash_attention_2",
 )
 @app.get("/")
 def read_root():
     return {"message": "API is live. Use the /predict endpoint."}
 def encode_image(image_data: BytesIO, max_size=(800, 800), quality=85):
     """
     Converts an image from file data to a Base64-encoded string with optimized size.
@@ -69,7 +67,6 @@ def encode_image(image_data: BytesIO, max_size=(800, 800), quality=85):
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error encoding image: {e}")
 @app.post("/encode-image/")
 async def upload_and_encode_image(file: UploadFile = File(...)):
     """
@@ -82,46 +79,43 @@ async def upload_and_encode_image(file: UploadFile = File(...)):
     except Exception as e:
         raise HTTPException(status_code=400, detail=f"Invalid file: {e}")
 @app.post("/predict")
-def predict(data: PredictRequest):
     """
-    Generates a description for an image using the Qwen-2-VL model.
-    Args:
-        data (PredictRequest): The request containing encoded images and a prompt.
-    Returns:
-        dict: The generated description of the image(s).
     """
     logging.warning("Calling /predict endpoint...")
-    # Ensure image_base64 is a list (even if a single image is provided)
-    image_list = (
-        data.image_base64
-        if isinstance(data.image_base64, list)
-        else [data.image_base64]
-    )
-    # Create the input message structure with multiple images
     messages = [
         {
             "role": "user",
             "content": [
                 {"type": "image", "image": f"data:image;base64,{image}"}
                 for image in image_list
-            ]
-            + [{"type": "text", "text": data.prompt}],
         }
     ]
-    logging.info("Processing inputs...", len(image_list))
-    # Prepare inputs for the model
-    text = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
     image_inputs, video_inputs = process_vision_info(messages)
     inputs = processor(
         text=[text],
@@ -132,21 +126,76 @@ def predict(data: PredictRequest):
     ).to(model.device)
     logging.warning("Starting generation...")
     start_time = time.time()
-    # Generate the output
     generated_ids = model.generate(**inputs, max_new_tokens=2056)
     generated_ids_trimmed = [
-        out_ids[len(in_ids) :]
-        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
     ]
     output_text = processor.batch_decode(
-        generated_ids_trimmed,
-        skip_special_tokens=True,
-        clean_up_tokenization_spaces=False,
     )
-    logging.warning(f"Generation completed in {time.time() - start_time:.2f}s.")
-    return {"response": output_text[0] if output_text else "No description generated."}

+#!/usr/bin/env python3
+# -- coding: utf-8 --
 import base64
+import json
 import logging
+import os
 import time
+import uuid
 from io import BytesIO
 import torch
+from fastapi import FastAPI, HTTPException, UploadFile, File
+from fastapi.staticfiles import StaticFiles
 from PIL import Image
 from pydantic import BaseModel
 from qwen_vl_utils import process_vision_info
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+# Create the temporary folder if it doesn't exist.
+TEMP_DIR = "temp"
+os.makedirs(TEMP_DIR, exist_ok=True)
 app = FastAPI()
+# Mount the temporary folder so annotated images can be served at /temp/<filename>
+app.mount("/temp", StaticFiles(directory=TEMP_DIR), name="temp")
+# Define the request model
 class PredictRequest(BaseModel):
     image_base64: list[str]
     prompt: str
+# Use the desired checkpoint: Qwen/Qwen2.5-VL-3B-Instruct-AWQ
 checkpoint = "Qwen/Qwen2.5-VL-3B-Instruct-AWQ"
 min_pixels = 256 * 28 * 28
 max_pixels = 1280 * 28 * 28
+# Load the processor with the image resolution settings
 processor = AutoProcessor.from_pretrained(
     checkpoint, min_pixels=min_pixels, max_pixels=max_pixels
 )
+# Load the Qwen2.5-VL model.
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     checkpoint,
     torch_dtype="auto",
     # attn_implementation="flash_attention_2",
 )
 @app.get("/")
 def read_root():
     return {"message": "API is live. Use the /predict endpoint."}
 def encode_image(image_data: BytesIO, max_size=(800, 800), quality=85):
     """
     Converts an image from file data to a Base64-encoded string with optimized size.
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error encoding image: {e}")
 @app.post("/encode-image/")
 async def upload_and_encode_image(file: UploadFile = File(...)):
     """
     except Exception as e:
         raise HTTPException(status_code=400, detail=f"Invalid file: {e}")
 @app.post("/predict")
+def predict(data: PredictRequest, annotate: bool = False):
     """
+    Generates a description (e.g. bounding boxes with labels) for image(s) using Qwen2.5-VL-3B-Instruct-AWQ.
+    If 'annotate' is True (as a query parameter), the first image is annotated with the predicted bounding boxes,
+    stored in a temporary folder, and its URL is returned.
+    Request:
+        - image_base64: List of base64-encoded images.
+        - prompt: A prompt string.
+    Response (JSON):
+        {
+            "response": <text generated by Qwen2.5-VL>,
+            "annotated_image_url": "/temp/<filename>"   # only if annotate=True
+        }
     """
     logging.warning("Calling /predict endpoint...")
+    # Ensure image_base64 is a list.
+    image_list = data.image_base64 if isinstance(data.image_base64, list) else [data.image_base64]
+    # Create input messages: include all images and then the prompt.
     messages = [
         {
             "role": "user",
             "content": [
                 {"type": "image", "image": f"data:image;base64,{image}"}
                 for image in image_list
+            ] + [{"type": "text", "text": data.prompt}],
         }
     ]
+    logging.info("Processing inputs... Number of images: %d", len(image_list))
+    # Prepare inputs for the model using the processor's chat interface.
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     image_inputs, video_inputs = process_vision_info(messages)
     inputs = processor(
         text=[text],
     ).to(model.device)
     logging.warning("Starting generation...")
     start_time = time.time()
+    # Generate output using the model.
     generated_ids = model.generate(**inputs, max_new_tokens=2056)
     generated_ids_trimmed = [
+        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
     ]
     output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )
+    generation_time = time.time() - start_time
+    logging.warning("Generation completed in %.2fs.", generation_time)
+    # The generated output text is expected to be JSON (e.g., list of detections).
+    result_text = output_text[0] if output_text else "No description generated."
+    response_data = {"response": result_text}
+    if annotate:
+        # Decode the first image for annotation.
+        try:
+            img_str = image_list[0]
+            # If the image string contains a data URI prefix, remove it.
+            if img_str.startswith("data:image"):
+                img_str = img_str.split(",")[1]
+            img_data = base64.b64decode(img_str)
+            image = Image.open(BytesIO(img_data))
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=f"Error decoding image for annotation: {e}")
+        # Determine image dimensions (width, height)
+        input_wh = image.size
+        resolution_wh = input_wh  # Assuming no resolution change
+        # Parse the detection result from the model output.
+        try:
+            detection_result = json.loads(result_text)
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=f"Error parsing detection result: {e}")
+        # Use the supervision library to create detections and annotate the image.
+        try:
+            import supervision as sv
+            detections = sv.Detections.from_vlm(
+                vlm=sv.VLM.QWEN_2_5_VL,
+                result=detection_result,
+                input_wh=input_wh,
+                resolution_wh=resolution_wh
+            )
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=f"Error creating detections: {e}")
+        try:
+            box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
+            label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
+            annotated_image = image.copy()
+            annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)
+            annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=f"Error annotating image: {e}")
+        # Save the annotated image in the temporary folder.
+        try:
+            filename = f"{uuid.uuid4()}.jpg"
+            filepath = os.path.join(TEMP_DIR, filename)
+            annotated_image.save(filepath, format="JPEG")
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=f"Error saving annotated image: {e}")
+        # Add the annotated image URL to the response.
+        response_data["annotated_image_url"] = f"/temp/{filename}"
+    return response_data