import base64
import io
import json
import os

import gradio as gr
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import openai
import requests
import torch
from dotenv import load_dotenv
from PIL import Image
from transformers import (Owlv2Processor, Owlv2ForObjectDetection,
                          AutoProcessor, AutoModelForMaskGeneration,
                          BlipProcessor, BlipForConditionalGeneration)
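
# Pipeline overview: BLIP captions the uploaded image, GPT-4 judges whether the
# caption describes something surprising, and OWLv2 + SAM (imported above)
# localize and highlight the surprising element before the result is returned
# to the Gradio UI.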

# Load the OpenAI key from a .env file (expects an OPENAI_API_KEY entry).
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
openai.api_key = OPENAI_API_KEY


def generate_image_caption(image):
    """Caption the image with BLIP.

    Note: the processor and model are reloaded on every call; cache them at
    module level if throughput matters.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    processor = BlipProcessor.from_pretrained('Salesforce/blip-image-captioning-base')
    model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-base').to(device)

    inputs = processor(image, return_tensors='pt').to(device)
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption


def analyze_caption(caption):
    """Ask GPT-4 whether the caption describes a surprising image."""
    messages = [
        {
            "role": "user",
            "content": f"""Your task is to determine if the following image description is surprising or not surprising.

Description: "{caption}"

If the description is surprising, determine which element, figure, or object is making it surprising and write it in only one sentence of no more than 6 words; otherwise, write 'NA'.

Also, rate how surprising the image is on a scale of 1-5, where 1 is not surprising at all and 5 is highly surprising.

Provide the response as JSON with the following structure:
{{
    "label": "[surprising OR not surprising]",
    "element": "[element]",
    "rating": [1-5]
}}
"""
        }
    ]

    # openai.ChatCompletion.create is the pre-1.0 SDK interface; with
    # openai>=1.0 use client.chat.completions.create instead.
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages,
        max_tokens=100,
        temperature=0.1
    )

    return response.choices[0].message.content

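
# NOTE: `process_image_detection` is called by `process_and_analyze` below but
# was missing from this file. The sketch here is a hypothetical reconstruction
# inferred from the imports above: OWLv2 does open-vocabulary detection of the
# surprising element, SAM refines the best box into a mask, and matplotlib
# renders the overlay into an in-memory PNG. The checkpoints
# ('google/owlv2-base-patch16-ensemble', 'facebook/sam-vit-base'), the 0.1
# detection threshold, and the drawing details are assumptions, not the
# original implementation.
def process_image_detection(image, element, rating):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Open-vocabulary detection: find boxes matching the text query.
    owl_processor = Owlv2Processor.from_pretrained('google/owlv2-base-patch16-ensemble')
    owl_model = Owlv2ForObjectDetection.from_pretrained('google/owlv2-base-patch16-ensemble').to(device)
    inputs = owl_processor(text=[[element]], images=image, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = owl_model(**inputs)
    target_sizes = torch.tensor([image.size[::-1]], device=device)  # (height, width)
    results = owl_processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=0.1)[0]

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(image)
    ax.axis('off')

    if len(results['boxes']) > 0:
        # Keep only the highest-scoring detection for the queried element.
        best = results['scores'].argmax().item()
        box = [float(v) for v in results['boxes'][best]]
        x0, y0, x1, y1 = box

        # Refine the box into a segmentation mask with SAM.
        sam_processor = AutoProcessor.from_pretrained('facebook/sam-vit-base')
        sam_model = AutoModelForMaskGeneration.from_pretrained('facebook/sam-vit-base').to(device)
        sam_inputs = sam_processor(image, input_boxes=[[box]], return_tensors='pt').to(device)
        with torch.no_grad():
            sam_outputs = sam_model(pixel_values=sam_inputs['pixel_values'],
                                    input_boxes=sam_inputs['input_boxes'])
        masks = sam_processor.image_processor.post_process_masks(
            sam_outputs.pred_masks.cpu(),
            sam_inputs['original_sizes'].cpu(),
            sam_inputs['reshaped_input_sizes'].cpu())
        mask = masks[0][0, 0].numpy()
        ax.imshow(np.ma.masked_where(~mask, mask), alpha=0.4, cmap='autumn')

        # Draw the bounding box and annotate it with the surprise rating.
        ax.add_patch(patches.Rectangle((x0, y0), x1 - x0, y1 - y0,
                                       linewidth=2, edgecolor='red', facecolor='none'))
        ax.text(x0, max(y0 - 10, 0), f'{element} (surprise: {rating}/5)',
                color='red', fontsize=12,
                bbox=dict(facecolor='white', alpha=0.7, edgecolor='none'))

    # Serialize the figure to a PNG buffer so the caller can reopen it with PIL.
    buf = io.BytesIO()
    fig.savefig(buf, format='png', bbox_inches='tight')
    plt.close(fig)
    buf.seek(0)
    return buf
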

def process_and_analyze(image):
    if image is None:
        return None, "Please upload an image first."

    if OPENAI_API_KEY is None:
        return None, "OpenAI API key not found in environment variables."

    try:
        # Normalize the Gradio input to a PIL image.
        if isinstance(image, tuple):
            image = image[0]
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        if not isinstance(image, Image.Image):
            raise ValueError("Invalid image format")

        # Caption the image, then ask GPT-4 to judge the caption.
        caption = generate_image_caption(image)
        gpt_response = analyze_caption(caption)
        response_data = json.loads(gpt_response)

        if response_data["label"].lower() == "surprising" and response_data["element"].lower() != "na":
            # Highlight the surprising element and report the rating.
            result_buf = process_image_detection(image, response_data["element"], response_data["rating"])
            result_image = Image.open(result_buf)
            analysis_text = (f"Label: {response_data['label']}\n"
                             f"Element: {response_data['element']}\n"
                             f"Rating: {response_data['rating']}/5")
            return result_image, analysis_text
        else:
            return image, "Not Surprising"

    except Exception as e:
        return None, f"Error processing image: {str(e)}"


def create_interface():
    with gr.Blocks() as demo:
        gr.Markdown("# Image Surprise Analysis")

        with gr.Row():
            with gr.Column():
                input_image = gr.Image(label="Upload Image")
                analyze_btn = gr.Button("Analyze Image")

            with gr.Column():
                output_image = gr.Image(label="Processed Image")
                output_text = gr.Textbox(label="Analysis Results")

        analyze_btn.click(
            fn=process_and_analyze,
            inputs=[input_image],
            outputs=[output_image, output_text]
        )

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()