|
import os
|
|
from typing import Literal
|
|
import spaces
|
|
import gradio as gr
|
|
import modelscope_studio.components.antd as antd
|
|
import modelscope_studio.components.antdx as antdx
|
|
import modelscope_studio.components.base as ms
|
|
from transformers import pipeline, AutoImageProcessor, SwinForImageClassification, Swinv2ForImageClassification, AutoFeatureExtractor, AutoModelForImageClassification
|
|
from torchvision import transforms
|
|
import torch
|
|
from PIL import Image
|
|
import numpy as np
|
|
import io
|
|
import logging
|
|
from utils.utils import softmax, augment_image, convert_pil_to_bytes
|
|
from utils.gradient import gradient_processing
|
|
from utils.minmax import preprocess as minmax_preprocess
|
|
from utils.ela import genELA as ELA
|
|
from utils.wavelet import wavelet_blocking_noise_estimation
|
|
|
|
|
|
logging.basicConfig(level=logging.INFO)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
|
|
|
header_style = {
|
|
"textAlign": 'center',
|
|
"color": '#fff',
|
|
"height": 64,
|
|
"paddingInline": 48,
|
|
"lineHeight": '64px',
|
|
"backgroundColor": '#4096ff',
|
|
}
|
|
|
|
content_style = {
|
|
"textAlign": 'center',
|
|
"minHeight": 120,
|
|
"lineHeight": '120px',
|
|
"color": '#fff',
|
|
"backgroundColor": '#0958d9',
|
|
}
|
|
|
|
sider_style = {
|
|
"textAlign": 'center',
|
|
"lineHeight": '120px',
|
|
"color": '#fff',
|
|
"backgroundColor": '#1677ff',
|
|
}
|
|
|
|
footer_style = {
|
|
"textAlign": 'center',
|
|
"color": '#fff',
|
|
"backgroundColor": '#4096ff',
|
|
}
|
|
|
|
layout_style = {
|
|
"borderRadius": 8,
|
|
"overflow": 'hidden',
|
|
"width": 'calc(100% - 8px)',
|
|
"maxWidth": 'calc(100% - 8px)',
|
|
}
|
|
|
|
MODEL_PATHS = {
|
|
"model_1": "haywoodsloan/ai-image-detector-deploy",
|
|
"model_2": "Heem2/AI-vs-Real-Image-Detection",
|
|
"model_3": "Organika/sdxl-detector",
|
|
"model_4": "cmckinle/sdxl-flux-detector_v1.1",
|
|
"model_5": "prithivMLmods/Deep-Fake-Detector-v2-Model",
|
|
"model_5b": "prithivMLmods/Deepfake-Detection-Exp-02-22",
|
|
"model_6": "ideepankarsharma2003/AI_ImageClassification_MidjourneyV6_SDXL",
|
|
"model_7": "date3k2/vit-real-fake-classification-v4"
|
|
}
|
|
|
|
CLASS_NAMES = {
|
|
"model_1": ['artificial', 'real'],
|
|
"model_2": ['AI Image', 'Real Image'],
|
|
"model_3": ['AI', 'Real'],
|
|
"model_4": ['AI', 'Real'],
|
|
"model_5": ['Realism', 'Deepfake'],
|
|
"model_5b": ['Real', 'Deepfake'],
|
|
"model_6": ['ai_gen', 'human'],
|
|
"model_7": ['Fake', 'Real'],
|
|
|
|
}
|
|
|
|
|
|
def load_models():
|
|
image_processor_1 = AutoImageProcessor.from_pretrained(MODEL_PATHS["model_1"], use_fast=True)
|
|
model_1 = Swinv2ForImageClassification.from_pretrained(MODEL_PATHS["model_1"])
|
|
model_1 = model_1.to(device)
|
|
clf_1 = pipeline(model=model_1, task="image-classification", image_processor=image_processor_1, device=device)
|
|
|
|
clf_2 = pipeline("image-classification", model=MODEL_PATHS["model_2"], device=device)
|
|
|
|
feature_extractor_3 = AutoFeatureExtractor.from_pretrained(MODEL_PATHS["model_3"], device=device)
|
|
model_3 = AutoModelForImageClassification.from_pretrained(MODEL_PATHS["model_3"]).to(device)
|
|
|
|
feature_extractor_4 = AutoFeatureExtractor.from_pretrained(MODEL_PATHS["model_4"], device=device)
|
|
model_4 = AutoModelForImageClassification.from_pretrained(MODEL_PATHS["model_4"]).to(device)
|
|
|
|
clf_5 = pipeline("image-classification", model=MODEL_PATHS["model_5"], device=device)
|
|
clf_5b = pipeline("image-classification", model=MODEL_PATHS["model_5b"], device=device)
|
|
|
|
image_processor_6 = AutoImageProcessor.from_pretrained(MODEL_PATHS["model_6"], use_fast=True)
|
|
model_6 = SwinForImageClassification.from_pretrained(MODEL_PATHS["model_6"]).to(device)
|
|
clf_6 = pipeline(model=model_6, task="image-classification", image_processor=image_processor_6, device=device)
|
|
|
|
image_processor_7 = AutoImageProcessor.from_pretrained(MODEL_PATHS["model_7"], use_fast=True)
|
|
model_7 = AutoModelForImageClassification.from_pretrained(MODEL_PATHS["model_7"]).to(device)
|
|
clf_7 = pipeline(model=model_7, task="image-classification", image_processor=image_processor_7, device=device)
|
|
|
|
return clf_1, clf_2, feature_extractor_3, model_3, feature_extractor_4, model_4, clf_5, clf_5b, clf_6, model_7, clf_7
|
|
|
|
clf_1, clf_2, feature_extractor_3, model_3, feature_extractor_4, model_4, clf_5, clf_5b, clf_6, model_7, clf_7 = load_models()
|
|
|
|
@spaces.GPU(duration=10)
|
|
def predict_with_model(img_pil, clf, class_names, confidence_threshold, model_name, model_id, feature_extractor=None):
|
|
try:
|
|
if feature_extractor:
|
|
inputs = feature_extractor(img_pil, return_tensors="pt").to(device)
|
|
with torch.no_grad():
|
|
outputs = clf(**inputs)
|
|
logits = outputs.logits
|
|
probabilities = softmax(logits.cpu().numpy()[0])
|
|
result = {class_names[i]: probabilities[i] for i in range(len(class_names))}
|
|
else:
|
|
prediction = clf(img_pil)
|
|
result = {pred['label']: pred['score'] for pred in prediction}
|
|
|
|
result_output = [model_id, model_name, result.get(class_names[1], 0.0), result.get(class_names[0], 0.0)]
|
|
logger.info(result_output)
|
|
for class_name in class_names:
|
|
if class_name not in result:
|
|
result[class_name] = 0.0
|
|
if result[class_names[0]] >= confidence_threshold:
|
|
label = f"AI, Confidence: {result[class_names[0]]:.4f}"
|
|
result_output.append('AI')
|
|
elif result[class_names[1]] >= confidence_threshold:
|
|
label = f"Real, Confidence: {result[class_names[1]]:.4f}"
|
|
result_output.append('REAL')
|
|
else:
|
|
label = "Uncertain Classification"
|
|
result_output.append('UNCERTAIN')
|
|
except Exception as e:
|
|
label = f"Error: {str(e)}"
|
|
result_output = [model_id, model_name, 0.0, 0.0, 'ERROR']
|
|
return label, result_output
|
|
|
|
@spaces.GPU(duration=10)
|
|
def predict_image(img, confidence_threshold):
|
|
if not isinstance(img, Image.Image):
|
|
raise ValueError(f"Expected a PIL Image, but got {type(img)}")
|
|
if img.mode != 'RGB':
|
|
img_pil = img.convert('RGB')
|
|
else:
|
|
img_pil = img
|
|
img_pil = transforms.Resize((256, 256))(img_pil)
|
|
img_pilvits = transforms.Resize((224, 224))(img_pil)
|
|
|
|
label_1, result_1output = predict_with_model(img_pil, clf_1, CLASS_NAMES["model_1"], confidence_threshold, "SwinV2-base", 1)
|
|
label_2, result_2output = predict_with_model(img_pilvits, clf_2, CLASS_NAMES["model_2"], confidence_threshold, "ViT-base Classifier", 2)
|
|
label_3, result_3output = predict_with_model(img_pil, model_3, CLASS_NAMES["model_3"], confidence_threshold, "SDXL-Trained", 3, feature_extractor_3)
|
|
label_4, result_4output = predict_with_model(img_pil, model_4, CLASS_NAMES["model_4"], confidence_threshold, "SDXL + FLUX", 4, feature_extractor_4)
|
|
label_5, result_5output = predict_with_model(img_pilvits, clf_5, CLASS_NAMES["model_5"], confidence_threshold, "ViT-base Newcomer", 5)
|
|
label_5b, result_5boutput = predict_with_model(img_pilvits, clf_5b, CLASS_NAMES["model_5b"], confidence_threshold, "ViT-base Newcomer", 6)
|
|
label_6, result_6output = predict_with_model(img_pilvits, clf_6, CLASS_NAMES["model_6"], confidence_threshold, "Swin Midjourney/SDXL", 7)
|
|
label_7, result_7output = predict_with_model(img_pilvits, clf_7, CLASS_NAMES["model_7"], confidence_threshold, "Vit", 7)
|
|
|
|
combined_results = {
|
|
"SwinV2/detect": label_1,
|
|
"ViT/AI-vs-Real": label_2,
|
|
"Swin/SDXL": label_3,
|
|
"Swin/SDXL-FLUX": label_4,
|
|
"prithivMLmods": label_5,
|
|
"prithivMLmods-2-22": label_5b,
|
|
"SwinMidSDXL": label_6,
|
|
"Vit": label_7
|
|
}
|
|
print(combined_results)
|
|
|
|
combined_outputs = [result_1output, result_2output, result_3output, result_4output, result_5output, result_5boutput, result_6output, result_7output]
|
|
return img_pil, combined_outputs
|
|
|
|
|
|
def generate_results_html(results):
|
|
def get_header_color(label):
|
|
if label == 'AI':
|
|
return 'bg-red-500 text-red-700', 'bg-red-400', 'bg-red-100', 'bg-red-700 text-red-700', 'bg-red-200'
|
|
elif label == 'REAL':
|
|
return 'bg-green-500 text-green-700', 'bg-green-400', 'bg-green-100', 'bg-green-700 text-green-700', 'bg-green-200'
|
|
elif label == 'UNCERTAIN':
|
|
return 'bg-yellow-500 text-yellow-700 bg-yellow-100', 'bg-yellow-400', 'bg-yellow-100', 'bg-yellow-700 text-yellow-700', 'bg-yellow-200'
|
|
elif label == 'MAINTENANCE':
|
|
return 'bg-blue-500 text-blue-700', 'bg-blue-400', 'bg-blue-100', 'bg-blue-700 text-blue-700', 'bg-blue-200'
|
|
else:
|
|
return 'bg-gray-300 text-gray-700', 'bg-gray-400', 'bg-gray-100', 'bg-gray-700 text-gray-700', 'bg-gray-200'
|
|
|
|
def generate_tile_html(index, result, model_name, contributor, model_path):
|
|
label = result[-1]
|
|
header_colors = get_header_color(label)
|
|
real_conf = result[2]
|
|
ai_conf = result[3]
|
|
return f"""
|
|
<div
|
|
class="flex flex-col bg-gray-800 rounded-sm p-4 m-1 border border-gray-800 shadow-xs transition hover:shadow-lg dark:shadow-gray-700/25">
|
|
<div
|
|
class="-m-4 h-24 {header_colors[0]} rounded-sm rounded-b-none transition border group-hover:border-gray-100 group-hover:shadow-lg group-hover:{header_colors[4]}">
|
|
<span class="text-gray-300 font-mono tracking-widest p-4 pb-3 block text-xs text-center">MODEL {index + 1}:</span>
|
|
<span
|
|
class="flex w-30 mx-auto tracking-wide items-center justify-center rounded-full {header_colors[2]} px-1 py-0.5 {header_colors[3]}"
|
|
>
|
|
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="3" stroke="currentColor" class="w-4 h-4 mr-2 -ml-3 group-hover:animate group-hover:animate-pulse">
|
|
{'<path stroke-linecap="round" stroke-linejoin="round" d="M9 12.75 11.25 15 15 9.75M21 12a9 9 0 1 1-18 0 9 9 0 0 1 18 0Z" />' if label == 'REAL' else '<path stroke-linecap="round" stroke-linejoin="round" d="m9.75 9.75 4.5 4.5m0-4.5-4.5 4.5M21 12a9 9 0 1 1-18 0 9 9 0 0 1 18 0Z" />'}
|
|
</svg>
|
|
<p class="text-base whitespace-nowrap leading-normal font-bold text-center self-center align-middle py-px">{label}</p>
|
|
</span>
|
|
</div>
|
|
<div>
|
|
<div class="mt-4 relative -mx-4 bg-gray-800">
|
|
<div class="w-full bg-gray-400 rounded-none h-8">
|
|
<div class="inline-flex whitespace-nowrap bg-green-400 h-full rounded-none" style="width: {real_conf * 100:.2f}%;">
|
|
<p class="p-2 px-4 text-xs self-center align-middle">Conf:
|
|
<span class="ml-1 font-medium font-mono">{real_conf:.4f}</span>
|
|
</p>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
<div class="relative -mx-4 bg-gray-800">
|
|
<div class="w-full bg-gray-400 rounded-none h-8">
|
|
<div class="inline-flex whitespace-nowrap bg-red-400 h-full rounded-none" style="width: {ai_conf * 100:.2f}%;">
|
|
<p class="p-2 px-4 text-xs self-center align-middle">Conf:
|
|
<span class="ml-1 font-medium font-mono">{ai_conf:.4f}</span>
|
|
</p>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
<div class="flex flex-col items-start">
|
|
<h4 class="mt-4 text-sm font-semibold tracking-wide">{model_name}</h4>
|
|
<div class="text-xs font-mono">Real: {real_conf:.4f}, AI: {ai_conf:.4f}</div>
|
|
<div class="card-footer">
|
|
<a href="https://huggingface.co/{model_path}" target="_blank" class="mt-2 text-xs text-nowrap nowrap" style="font-size:0.66rem !important;">by @{contributor}</a>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
"""
|
|
|
|
html_content = f"""
|
|
<link href="https://unpkg.com/[email protected]/dist/tailwind.min.css" rel="stylesheet">
|
|
<div class="container mx-auto">
|
|
<div class="grid xl:grid-cols-4 md:grid-cols-4 grid-cols-1 gap-1">
|
|
{generate_tile_html(0, results[0], "SwinV2 Based", "haywoodsloan", MODEL_PATHS["model_1"])}
|
|
{generate_tile_html(1, results[1], "ViT Based", "Heem2", MODEL_PATHS["model_2"])}
|
|
{generate_tile_html(2, results[2], "SDXL Dataset", "Organika", MODEL_PATHS["model_3"])}
|
|
{generate_tile_html(3, results[3], "SDXL + FLUX", "cmckinle", MODEL_PATHS["model_4"])}
|
|
{generate_tile_html(4, results[4], "Vit Based", "prithivMLmods", MODEL_PATHS["model_5"])}
|
|
{generate_tile_html(5, results[5], "Vit Based, Newer Dataset", "prithivMLmods", MODEL_PATHS["model_5b"])}
|
|
{generate_tile_html(6, results[6], "Swin, Midj + SDXL", "ideepankarsharma2003", MODEL_PATHS["model_6"])}
|
|
{generate_tile_html(7, results[7], "ViT", "temp", MODEL_PATHS["model_7"])}
|
|
</div>
|
|
</div>
|
|
"""
|
|
return html_content
|
|
|
|
|
|
def predict_image_with_html(img, confidence_threshold, augment_methods, rotate_degrees, noise_level, sharpen_strength):
|
|
if augment_methods:
|
|
img_pil, _ = augment_image(img, augment_methods, rotate_degrees, noise_level, sharpen_strength)
|
|
else:
|
|
img_pil = img
|
|
img_pil, results = predict_image(img_pil, confidence_threshold)
|
|
img_np = np.array(img_pil)
|
|
img_np_og = np.array(img)
|
|
|
|
gradient_image = gradient_processing(img_np)
|
|
minmax_image = minmax_preprocess(img_np)
|
|
|
|
|
|
ela1 = ELA(img_np_og, quality=75, scale=50, contrast=20, linear=False, grayscale=True)
|
|
|
|
|
|
ela2 = ELA(img_np_og, quality=75, scale=75, contrast=25, linear=False, grayscale=True)
|
|
ela3 = ELA(img_np_og, quality=75, scale=75, contrast=25, linear=False, grayscale=False)
|
|
|
|
forensics_images = [img_pil, ela1, ela2, ela3, gradient_image, minmax_image]
|
|
|
|
html_content = generate_results_html(results)
|
|
return img_pil, forensics_images, html_content
|
|
|
|
with gr.Blocks(css="#post-gallery { overflow: hidden !important;} .grid-wrap{ overflow-y: hidden !important;} .ms-gr-ant-welcome-icon{ height:unset !important;} .tabs{margin-top:10px;}") as iface:
|
|
with ms.Application() as app:
|
|
with antd.ConfigProvider():
|
|
antdx.Welcome(
|
|
icon=
|
|
"https://cdn-avatars.huggingface.co/v1/production/uploads/639daf827270667011153fbc/WpeSFhuB81DY-1TjNUmV_.png",
|
|
title="Welcome to Project OpenSight",
|
|
description=
|
|
"The OpenSight aims to be an open-source SOTA generated image detection model. This HF Space is not only an introduction but a educational playground for the public to evaluate and challenge current open source models. **Space will be upgraded shortly; inference on all 6 models should take about 1.2~ seconds.** "
|
|
)
|
|
with gr.Tab("๐ Detection Models Eval / Playground"):
|
|
gr.Markdown("# Open Source Detection Models Found on the Hub\n\n - **Space will be upgraded shortly;** inference on all 6 models should take about 1.2~ seconds once we're back on CUDA.\n - The **Community Forensics** mother of all detection models is now available for inference, head to the middle tab above this.\n - Lots of exciting things coming up, stay tuned!")
|
|
|
|
with gr.Row():
|
|
with gr.Column(scale=1):
|
|
image_input = gr.Image(label="Upload Image to Analyze", sources=['upload', 'webcam'], type='pil')
|
|
with gr.Accordion("Settings (Optional)", open=False, elem_id="settings_accordion"):
|
|
augment_checkboxgroup = gr.CheckboxGroup(["rotate", "add_noise", "sharpen"], label="Augmentation Methods")
|
|
rotate_slider = gr.Slider(0, 45, value=2, step=1, label="Rotate Degrees", visible=False)
|
|
noise_slider = gr.Slider(0, 50, value=4, step=1, label="Noise Level", visible=False)
|
|
sharpen_slider = gr.Slider(0, 50, value=11, step=1, label="Sharpen Strength", visible=False)
|
|
confidence_slider = gr.Slider(0.0, 1.0, value=0.75, step=0.05, label="Confidence Threshold")
|
|
inputs = [image_input, confidence_slider, augment_checkboxgroup, rotate_slider, noise_slider, sharpen_slider]
|
|
predict_button = gr.Button("Predict")
|
|
augment_button = gr.Button("Augment & Predict")
|
|
image_output = gr.Image(label="Processed Image", visible=False)
|
|
|
|
|
|
with gr.Column(scale=2):
|
|
|
|
results_html = gr.HTML(label="Model Predictions")
|
|
forensics_gallery = gr.Gallery(label="Post Processed Images", visible=True, columns=[4], rows=[2], container=False, height="auto", object_fit="contain", elem_id="post-gallery")
|
|
|
|
outputs = [image_output, forensics_gallery, results_html]
|
|
|
|
|
|
augment_checkboxgroup.change(lambda methods: gr.update(visible="rotate" in methods), inputs=[augment_checkboxgroup], outputs=[rotate_slider])
|
|
augment_checkboxgroup.change(lambda methods: gr.update(visible="add_noise" in methods), inputs=[augment_checkboxgroup], outputs=[noise_slider])
|
|
augment_checkboxgroup.change(lambda methods: gr.update(visible="sharpen" in methods), inputs=[augment_checkboxgroup], outputs=[sharpen_slider])
|
|
|
|
predict_button.click(
|
|
fn=predict_image_with_html,
|
|
inputs=inputs,
|
|
outputs=outputs
|
|
)
|
|
augment_button.click(
|
|
fn=predict_image_with_html,
|
|
inputs=[
|
|
image_input,
|
|
confidence_slider,
|
|
gr.CheckboxGroup(["rotate", "add_noise", "sharpen"], value=["rotate", "add_noise", "sharpen"], visible=False),
|
|
rotate_slider,
|
|
noise_slider,
|
|
sharpen_slider
|
|
],
|
|
outputs=outputs
|
|
)
|
|
predict_button.click(
|
|
fn=None,
|
|
js="() => {document.getElementById('project_accordion').open = false;}",
|
|
inputs=[],
|
|
outputs=[]
|
|
)
|
|
with gr.Tab("๐ Community Forensics Preview"):
|
|
temp_space = gr.load("aiwithoutborders-xyz/OpenSight-Community-Forensics-Preview", src="spaces")
|
|
|
|
with gr.Tab("๐ฅ Leaderboard"):
|
|
gr.Markdown("# AI Generated / Deepfake Detection Models Leaderboard: Soonโข")
|
|
|
|
|
|
|
|
iface.launch() |