Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -19,48 +19,42 @@ load_dotenv()
|
|
19 |
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
|
20 |
|
21 |
|
22 |
-
def
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
if not isinstance(image, Image.Image):
|
25 |
-
raise ValueError("Input must be a PIL Image")
|
26 |
-
|
27 |
-
width, height = image.size
|
28 |
-
if width > max_width or height > max_height:
|
29 |
-
aspect_ratio = width / height
|
30 |
-
if aspect_ratio > 1:
|
31 |
-
new_width = max_width
|
32 |
-
new_height = int(new_width / aspect_ratio)
|
33 |
-
else:
|
34 |
-
new_height = max_height
|
35 |
-
new_width = int(new_height * aspect_ratio)
|
36 |
-
image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
|
37 |
|
38 |
buffered = io.BytesIO()
|
39 |
-
|
40 |
-
image.save(buffered, format="JPEG", quality=quality)
|
41 |
-
buffered.seek(0)
|
42 |
return base64.b64encode(buffered.getvalue()).decode('utf-8')
|
43 |
|
|
|
44 |
def analyze_image(image):
|
45 |
client = OpenAI(api_key=OPENAI_API_KEY)
|
|
|
46 |
|
47 |
-
#
|
48 |
-
|
49 |
-
|
50 |
-
# Build the list-of-dicts prompt
|
51 |
-
prompt_dict = [
|
52 |
{
|
53 |
"type": "text",
|
54 |
-
"text": """Your task is to determine if the image is surprising or not.
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
{
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
}
|
63 |
-
"""
|
64 |
},
|
65 |
{
|
66 |
"type": "image_url",
|
@@ -70,27 +64,29 @@ def analyze_image(image):
|
|
70 |
}
|
71 |
]
|
72 |
|
73 |
-
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
-
# Send request
|
77 |
response = client.chat.completions.create(
|
78 |
-
model="gpt-4o-mini",
|
79 |
-
messages=
|
80 |
-
{
|
81 |
-
"role": "user",
|
82 |
-
"content": json_prompt
|
83 |
-
}
|
84 |
-
],
|
85 |
max_tokens=100,
|
86 |
temperature=0.1,
|
87 |
-
response_format={
|
|
|
|
|
88 |
)
|
89 |
|
90 |
return response.choices[0].message.content
|
91 |
|
92 |
|
93 |
-
|
94 |
def show_mask(mask, ax, random_color=False):
|
95 |
if random_color:
|
96 |
color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
|
@@ -114,7 +110,7 @@ def process_image_detection(image, target_label, surprise_rating):
|
|
114 |
original_size = image.size
|
115 |
|
116 |
# Calculate relative font size based on image dimensions
|
117 |
-
base_fontsize = min(original_size) / 40 # Adjust this divisor
|
118 |
|
119 |
owlv2_processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16")
|
120 |
owlv2_model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16").to(device)
|
@@ -137,6 +133,7 @@ def process_image_detection(image, target_label, surprise_rating):
|
|
137 |
|
138 |
ax = plt.Axes(fig, [0., 0., 1., 1.])
|
139 |
fig.add_axes(ax)
|
|
|
140 |
plt.imshow(image)
|
141 |
|
142 |
scores = results["scores"]
|
@@ -165,7 +162,7 @@ def process_image_detection(image, target_label, surprise_rating):
|
|
165 |
mask = masks[0].numpy() if isinstance(masks[0], torch.Tensor) else masks[0]
|
166 |
show_mask(mask, ax=ax)
|
167 |
|
168 |
-
# Draw rectangle
|
169 |
rect = patches.Rectangle(
|
170 |
(box[0], box[1]),
|
171 |
box[2] - box[0],
|
@@ -176,7 +173,7 @@ def process_image_detection(image, target_label, surprise_rating):
|
|
176 |
)
|
177 |
ax.add_patch(rect)
|
178 |
|
179 |
-
#
|
180 |
plt.text(
|
181 |
box[0], box[1] - base_fontsize,
|
182 |
f'{max_score:.2f}',
|
@@ -186,7 +183,7 @@ def process_image_detection(image, target_label, surprise_rating):
|
|
186 |
bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=2)
|
187 |
)
|
188 |
|
189 |
-
#
|
190 |
plt.text(
|
191 |
box[2] + base_fontsize / 2, box[1],
|
192 |
f'Unexpected (Rating: {surprise_rating}/5)\n{target_label}',
|
@@ -199,20 +196,17 @@ def process_image_detection(image, target_label, surprise_rating):
|
|
199 |
|
200 |
plt.axis('off')
|
201 |
|
202 |
-
# Save figure to buffer
|
203 |
buf = io.BytesIO()
|
204 |
-
plt.savefig(
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
metadata={'dpi': original_dpi}
|
211 |
-
)
|
212 |
buf.seek(0)
|
213 |
plt.close()
|
214 |
|
215 |
-
#
|
216 |
output_image = Image.open(buf)
|
217 |
output_image = output_image.resize(original_size, Image.Resampling.LANCZOS)
|
218 |
|
@@ -233,17 +227,16 @@ def process_and_analyze(image):
|
|
233 |
try:
|
234 |
# Handle different input types
|
235 |
if isinstance(image, tuple):
|
236 |
-
image = image[0]
|
237 |
if isinstance(image, np.ndarray):
|
238 |
image = Image.fromarray(image)
|
239 |
if not isinstance(image, Image.Image):
|
240 |
raise ValueError("Invalid image format")
|
241 |
|
242 |
-
# Analyze image
|
243 |
gpt_response = analyze_image(image)
|
244 |
response_data = json.loads(gpt_response)
|
245 |
|
246 |
-
# If surprising, try to detect the element
|
247 |
if response_data["label"].lower() == "surprising" and response_data["element"].lower() != "na":
|
248 |
result_buf = process_image_detection(image, response_data["element"], response_data["rating"])
|
249 |
result_image = Image.open(result_buf)
|
@@ -254,7 +247,6 @@ def process_and_analyze(image):
|
|
254 |
)
|
255 |
return result_image, analysis_text
|
256 |
else:
|
257 |
-
# If not surprising or element=NA
|
258 |
return image, "Not Surprising"
|
259 |
|
260 |
except Exception as e:
|
|
|
19 |
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
|
20 |
|
21 |
|
22 |
+
def encode_image_to_base64(image):
    """Encode an image as a base64 PNG string.

    Accepts a PIL Image, a numpy array, or a tuple whose first element is
    one of those (Gradio sometimes delivers images wrapped in a tuple).

    Raises:
        ValueError: if the input cannot be normalized to a PIL Image.
    """
    # Unwrap Gradio-style (image, ...) tuples before any type checks.
    pil_image = image[0] if isinstance(image, tuple) else image

    # Promote raw numpy arrays to PIL Images.
    if isinstance(pil_image, np.ndarray):
        pil_image = Image.fromarray(pil_image)

    # Anything else we cannot serialize — reject explicitly.
    if not isinstance(pil_image, Image.Image):
        raise ValueError("Input must be a PIL Image, numpy array, or tuple containing an image")

    # PNG is lossless and supports alpha, so no mode conversion is needed.
    buffer = io.BytesIO()
    pil_image.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode('utf-8')
|
38 |
|
39 |
+
|
40 |
def analyze_image(image):
|
41 |
client = OpenAI(api_key=OPENAI_API_KEY)
|
42 |
+
base64_image = encode_image_to_base64(image)
|
43 |
|
44 |
+
# --- MINIMAL FIX START ---
|
45 |
+
# We build a Python list of dicts, then JSON-encode it:
|
46 |
+
prompt_list = [
|
|
|
|
|
47 |
{
|
48 |
"type": "text",
|
49 |
+
"text": """Your task is to determine if the image is surprising or not surprising.
|
50 |
+
if the image is surprising, determine which element, figure or object in the image is making the image surprising and write it only in one sentence with no more then 6 words, otherwise, write 'NA'.
|
51 |
+
Also rate how surprising the image is on a scale of 1-5, where 1 is not surprising at all and 5 is highly surprising.
|
52 |
+
Provide the response as a JSON with the following structure:
|
53 |
{
|
54 |
+
"label": "[surprising OR not surprising]",
|
55 |
+
"element": "[element]",
|
56 |
+
"rating": [1-5]
|
57 |
+
}"""
|
|
|
58 |
},
|
59 |
{
|
60 |
"type": "image_url",
|
|
|
64 |
}
|
65 |
]
|
66 |
|
67 |
+
prompt_json = json.dumps(prompt_list)
|
68 |
+
|
69 |
+
messages = [
|
70 |
+
{
|
71 |
+
"role": "user",
|
72 |
+
"content": prompt_json # content must be a single string
|
73 |
+
}
|
74 |
+
]
|
75 |
+
# --- MINIMAL FIX END ---
|
76 |
|
|
|
77 |
response = client.chat.completions.create(
|
78 |
+
model="gpt-4o-mini", # or whichever model you have access to
|
79 |
+
messages=messages,
|
|
|
|
|
|
|
|
|
|
|
80 |
max_tokens=100,
|
81 |
temperature=0.1,
|
82 |
+
response_format={
|
83 |
+
"type": "json_object"
|
84 |
+
}
|
85 |
)
|
86 |
|
87 |
return response.choices[0].message.content
|
88 |
|
89 |
|
|
|
90 |
def show_mask(mask, ax, random_color=False):
|
91 |
if random_color:
|
92 |
color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
|
|
|
110 |
original_size = image.size
|
111 |
|
112 |
# Calculate relative font size based on image dimensions
|
113 |
+
base_fontsize = min(original_size) / 40 # Adjust this divisor to change overall font size
|
114 |
|
115 |
owlv2_processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16")
|
116 |
owlv2_model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16").to(device)
|
|
|
133 |
|
134 |
ax = plt.Axes(fig, [0., 0., 1., 1.])
|
135 |
fig.add_axes(ax)
|
136 |
+
|
137 |
plt.imshow(image)
|
138 |
|
139 |
scores = results["scores"]
|
|
|
162 |
mask = masks[0].numpy() if isinstance(masks[0], torch.Tensor) else masks[0]
|
163 |
show_mask(mask, ax=ax)
|
164 |
|
165 |
+
# Draw rectangle with increased line width
|
166 |
rect = patches.Rectangle(
|
167 |
(box[0], box[1]),
|
168 |
box[2] - box[0],
|
|
|
173 |
)
|
174 |
ax.add_patch(rect)
|
175 |
|
176 |
+
# Add confidence score with improved visibility
|
177 |
plt.text(
|
178 |
box[0], box[1] - base_fontsize,
|
179 |
f'{max_score:.2f}',
|
|
|
183 |
bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=2)
|
184 |
)
|
185 |
|
186 |
+
# Add label and rating with improved visibility
|
187 |
plt.text(
|
188 |
box[2] + base_fontsize / 2, box[1],
|
189 |
f'Unexpected (Rating: {surprise_rating}/5)\n{target_label}',
|
|
|
196 |
|
197 |
plt.axis('off')
|
198 |
|
|
|
199 |
buf = io.BytesIO()
|
200 |
+
plt.savefig(buf,
|
201 |
+
format='png',
|
202 |
+
dpi=dpi,
|
203 |
+
bbox_inches='tight',
|
204 |
+
pad_inches=0,
|
205 |
+
metadata={'dpi': original_dpi})
|
|
|
|
|
206 |
buf.seek(0)
|
207 |
plt.close()
|
208 |
|
209 |
+
# Process final image
|
210 |
output_image = Image.open(buf)
|
211 |
output_image = output_image.resize(original_size, Image.Resampling.LANCZOS)
|
212 |
|
|
|
227 |
try:
|
228 |
# Handle different input types
|
229 |
if isinstance(image, tuple):
|
230 |
+
image = image[0] # Take the first element if it's a tuple
|
231 |
if isinstance(image, np.ndarray):
|
232 |
image = Image.fromarray(image)
|
233 |
if not isinstance(image, Image.Image):
|
234 |
raise ValueError("Invalid image format")
|
235 |
|
236 |
+
# Analyze image
|
237 |
gpt_response = analyze_image(image)
|
238 |
response_data = json.loads(gpt_response)
|
239 |
|
|
|
240 |
if response_data["label"].lower() == "surprising" and response_data["element"].lower() != "na":
|
241 |
result_buf = process_image_detection(image, response_data["element"], response_data["rating"])
|
242 |
result_image = Image.open(result_buf)
|
|
|
247 |
)
|
248 |
return result_image, analysis_text
|
249 |
else:
|
|
|
250 |
return image, "Not Surprising"
|
251 |
|
252 |
except Exception as e:
|