Spaces:

reab5555
/

AI-Image-Anomaly-Detection

Running

App Files Files Community

reab5555 committed on Jan 7

Commit

a27e6f2

verified ·

1 Parent(s): 6ab9894

Update app.py

Browse files

Files changed (1) hide show

app.py +181 -36

app.py CHANGED Viewed

@@ -1,10 +1,9 @@
 import torch
 from PIL import Image
 import requests
-import openai
 from transformers import (Owlv2Processor, Owlv2ForObjectDetection,
-                          AutoProcessor, AutoModelForMaskGeneration,
-                          BlipProcessor, BlipForConditionalGeneration)
 import matplotlib.pyplot as plt
 import matplotlib.patches as patches
 import base64
@@ -18,50 +17,198 @@ from dotenv import load_dotenv
 # Load environment variables
 load_dotenv()
 OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
-openai.api_key = OPENAI_API_KEY
-def generate_image_caption(image):
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    processor = BlipProcessor.from_pretrained('Salesforce/blip-image-captioning-base')
-    model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-base').to(device)
-    inputs = processor(image, return_tensors='pt').to(device)
-    out = model.generate(**inputs)
-    caption = processor.decode(out[0], skip_special_tokens=True)
-    return caption
-def analyze_caption(caption):
-    messages = [
-        {
-            "role": "user",
-            "content": f"""Your task is to determine if the following image description is surprising or not surprising.
-Description: "{caption}"
-If the description is surprising, determine which element, figure, or object is making it surprising and write it only in one sentence with no more than 6 words; otherwise, write 'NA'.
-Also, rate how surprising the image is on a scale of 1-5, where 1 is not surprising at all and 5 is highly surprising.
-Provide the response as a JSON with the following structure:
-{{
-    "label": "[surprising OR not surprising]",
-    "element": "[element]",
-    "rating": [1-5]
-}}
-"""
         }
     ]
-    response = openai.ChatCompletion.create(
-        model="gpt-4",
         messages=messages,
         max_tokens=100,
-        temperature=0.1
     )
     return response.choices[0].message.content
-# The rest of your functions (process_image_detection, show_mask, etc.) remain the same
 def process_and_analyze(image):
     if image is None:
@@ -79,11 +226,8 @@ def process_and_analyze(image):
         if not isinstance(image, Image.Image):
             raise ValueError("Invalid image format")
-        # Generate caption
-        caption = generate_image_caption(image)
-        # Analyze caption
-        gpt_response = analyze_caption(caption)
         response_data = json.loads(gpt_response)
         if response_data["label"].lower() == "surprising" and response_data["element"].lower() != "na":
@@ -97,6 +241,7 @@ def process_and_analyze(image):
     except Exception as e:
         return None, f"Error processing image: {str(e)}"
 # Create Gradio interface
 def create_interface():
     with gr.Blocks() as demo:

 import torch
 from PIL import Image
 import requests
+from openai import OpenAI
 from transformers import (Owlv2Processor, Owlv2ForObjectDetection,
+                          AutoProcessor, AutoModelForMaskGeneration)
 import matplotlib.pyplot as plt
 import matplotlib.patches as patches
 import base64
 # Load environment variables
 load_dotenv()
 OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
def encode_image_to_base64(image):
    """Serialize an image to PNG and return the bytes as a base64 string.

    Args:
        image: A PIL image, a numpy array accepted by ``Image.fromarray``,
            or a tuple whose first element is one of those (Gradio sometimes
            passes images wrapped in a tuple).

    Returns:
        str: Base64 text of the PNG-encoded image.

    Raises:
        ValueError: If ``image`` is an empty tuple or does not resolve to a
            PIL image.
    """
    # If image is a tuple (as sometimes provided by Gradio), take the first element
    if isinstance(image, tuple):
        if not image:
            # Report an empty tuple the same way as any other unusable input
            # instead of leaking a bare IndexError.
            raise ValueError("Input must be a PIL Image, numpy array, or tuple containing an image")
        image = image[0]

    # If image is a numpy array, convert to PIL Image
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

    # Ensure image is in PIL Image format
    if not isinstance(image, Image.Image):
        raise ValueError("Input must be a PIL Image, numpy array, or tuple containing an image")

    buffered = io.BytesIO()
    try:
        image.save(buffered, format="PNG")
    except OSError:
        # The PNG writer rejects some image modes (e.g. CMYK, common in JPEG
        # uploads). Retry once as RGB with a fresh buffer so no partial data
        # from the failed attempt is kept; a second failure propagates.
        buffered = io.BytesIO()
        image.convert("RGB").save(buffered, format="PNG")

    return base64.b64encode(buffered.getvalue()).decode('utf-8')
def analyze_image(image):
    """Ask the OpenAI chat model whether the image is surprising.

    The image is PNG-encoded via ``encode_image_to_base64`` and sent inline
    as a data URL together with a fixed instruction prompt. The request asks
    for a JSON object response.

    Args:
        image: Any input accepted by ``encode_image_to_base64``.

    Returns:
        The content of the first choice's message, as returned by the API
        (requested to be a JSON object with "label", "element" and "rating").
    """
    client = OpenAI(api_key=OPENAI_API_KEY)
    base64_image = encode_image_to_base64(image)

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": """Your task is to determine if the image is surprising or not surprising.
                    if the image is surprising, determine which element, figure or object in the image is making the image surprising and write it only in one sentence with no more then 6 words, otherwise, write 'NA'.
                    Also rate how surprising the image is on a scale of 1-5, where 1 is not surprising at all and 5 is highly surprising.
                    Provide the response as a JSON with the following structure:
                    {
                        "label": "[surprising OR not surprising]",
                        "element": "[element]",
                        "rating": [1-5]
                    }"""
                },
                {
                    "type": "image_url",
                    "image_url": {
                        # encode_image_to_base64 always writes PNG bytes, so
                        # the declared media type must be image/png.
                        "url": f"data:image/png;base64,{base64_image}"
                    }
                }
            ]
        }
    ]

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        max_tokens=100,
        temperature=0.1,
        response_format={
            "type": "json_object"
        }
    )

    return response.choices[0].message.content
def show_mask(mask, ax, random_color=False):
    """Draw a segmentation mask on ``ax`` as a semi-transparent RGBA overlay.

    Args:
        mask: Array-like mask; pixels with values greater than zero are
            painted. A 4-dimensional input is reduced to ``mask[0, 0]``.
        ax: Object exposing ``imshow`` (a matplotlib axes in this file).
        random_color: When true, use a random RGB colour with alpha 0.6
            instead of the default half-transparent red.
    """
    rgba = (
        np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
        if random_color
        else np.array([1.0, 0.0, 0.0, 0.5])
    )

    # Only the first mask of the first entry is drawn for 4-D input.
    plane = mask[0, 0] if len(mask.shape) == 4 else mask

    # Start fully transparent, then colour just the masked pixels.
    overlay = np.zeros((*plane.shape, 4), dtype=np.float32)
    overlay[plane > 0] = rgba
    ax.imshow(overlay)
def process_image_detection(image, target_label, surprise_rating, *, score_threshold=0.2):
    """Find ``target_label`` in ``image`` and return an annotated PNG buffer.

    OWLv2 is queried with ``target_label``. If its highest-scoring box beats
    ``score_threshold``, SAM segments that box and the image is annotated
    with the mask, a red rectangle, the detection score and the surprise
    rating. Otherwise the image is rendered without annotations.

    Args:
        image: PIL image to analyse.
        target_label: Text query passed to the detector and printed next to
            the box.
        surprise_rating: Rating shown in the annotation as ``<rating>/5``.
        score_threshold: Minimum detection score required to annotate the
            best box. Defaults to 0.2.

    Returns:
        io.BytesIO: Buffer rewound to position 0, holding a PNG resized to
        the original pixel size and saved with the original DPI.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Get original image DPI and size
    original_dpi = image.info.get('dpi', (72, 72))
    original_size = image.size

    # Calculate relative font size based on image dimensions
    base_fontsize = min(original_size) / 40  # Adjust this divisor to change overall font size

    owlv2_processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16")
    owlv2_model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16").to(device)
    sam_processor = AutoProcessor.from_pretrained("facebook/sam-vit-base")
    sam_model = AutoModelForMaskGeneration.from_pretrained("facebook/sam-vit-base").to(device)

    inputs = owlv2_processor(text=[target_label], images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = owlv2_model(**inputs)

    # PIL reports (width, height); the post-processor is given (height, width).
    target_sizes = torch.tensor([image.size[::-1]]).to(device)
    results = owlv2_processor.post_process_object_detection(outputs, target_sizes=target_sizes)[0]

    dpi = 300  # Increased DPI for better text rendering
    figsize = (original_size[0] / dpi, original_size[1] / dpi)
    fig = plt.figure(figsize=figsize, dpi=dpi)
    try:
        # Draw on this figure/axes explicitly rather than through pyplot's
        # "current figure" state, so another request's figure is never touched.
        ax = plt.Axes(fig, [0., 0., 1., 1.])
        fig.add_axes(ax)
        ax.imshow(image)

        scores = results["scores"]
        if len(scores) > 0:
            max_score_idx = scores.argmax().item()
            max_score = scores[max_score_idx].item()

            if max_score > score_threshold:
                box = results["boxes"][max_score_idx].cpu().numpy()

                sam_inputs = sam_processor(
                    image,
                    input_boxes=[[[box[0], box[1], box[2], box[3]]]],
                    return_tensors="pt"
                ).to(device)

                with torch.no_grad():
                    sam_outputs = sam_model(**sam_inputs)

                masks = sam_processor.image_processor.post_process_masks(
                    sam_outputs.pred_masks.cpu(),
                    sam_inputs["original_sizes"].cpu(),
                    sam_inputs["reshaped_input_sizes"].cpu()
                )

                mask = masks[0].numpy() if isinstance(masks[0], torch.Tensor) else masks[0]
                show_mask(mask, ax=ax)

                # Draw rectangle with increased line width
                rect = patches.Rectangle(
                    (box[0], box[1]),
                    box[2] - box[0],
                    box[3] - box[1],
                    linewidth=max(2, min(original_size) / 500),  # Scale line width with image size
                    edgecolor='red',
                    facecolor='none'
                )
                ax.add_patch(rect)

                # Add confidence score with improved visibility
                ax.text(
                    box[0], box[1] - base_fontsize,
                    f'{max_score:.2f}',
                    color='red',
                    fontsize=base_fontsize,
                    fontweight='bold',
                    bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=2)
                )

                # Add label and rating with improved visibility
                ax.text(
                    box[2] + base_fontsize / 2, box[1],
                    f'Unexpected (Rating: {surprise_rating}/5)\n{target_label}',
                    color='red',
                    fontsize=base_fontsize,
                    fontweight='bold',
                    bbox=dict(facecolor='white', alpha=0.7, edgecolor='none', pad=2),
                    verticalalignment='bottom'
                )

        ax.axis('off')

        # Save with high DPI. No PNG metadata is passed here: the DPI tuple
        # is not a text value, and the final re-save below is what records
        # the original DPI in the returned image.
        buf = io.BytesIO()
        fig.savefig(buf,
                    format='png',
                    dpi=dpi,
                    bbox_inches='tight',
                    pad_inches=0)
        buf.seek(0)
    finally:
        # Always release the figure, including when detection or
        # segmentation raises part-way through.
        plt.close(fig)

    # Process final image
    output_image = Image.open(buf)
    output_image = output_image.resize(original_size, Image.Resampling.LANCZOS)

    final_buf = io.BytesIO()
    output_image.save(final_buf, format='PNG', dpi=original_dpi)
    final_buf.seek(0)

    return final_buf
 def process_and_analyze(image):
     if image is None:
         if not isinstance(image, Image.Image):
             raise ValueError("Invalid image format")
+        # Analyze image
+        gpt_response = analyze_image(image)
         response_data = json.loads(gpt_response)
         if response_data["label"].lower() == "surprising" and response_data["element"].lower() != "na":
     except Exception as e:
         return None, f"Error processing image: {str(e)}"
 # Create Gradio interface
 def create_interface():
     with gr.Blocks() as demo: