import os
import inspect

import cv2
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import gradio as gr
import onnxruntime as rt
import pytorch_lightning as pl
import safetensors.torch
from transformers import pipeline
from PIL import Image


# Aesthetic Shadow: image-classification pipeline whose "hq" label score is used below.
pipe_shadow = pipeline(
    "image-classification",
    model="NeoChen1024/aesthetic-shadow-v2-backup",
    device=0 if torch.cuda.is_available() else -1,
)


def score_aesthetic_shadow(image: Image.Image) -> float:
    """Returns the 'hq' score from the aesthetic-shadow model."""
    result = pipe_shadow(image)
    # For a single image the pipeline returns a flat list of {label, score} dicts;
    # unwrap one level if a list of images was passed and a nested list came back.
    predictions = result[0] if result and isinstance(result[0], list) else result
    for pred in predictions:
        if pred["label"] == "hq":
            return round(pred["score"], 2)
    return 0.0


class MLP(pl.LightningModule):
    """MLP head that maps CLIP image embeddings to a single aesthetic score."""

    def __init__(self, input_size, batch_norm=True):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 2048),
            nn.ReLU(),
            nn.BatchNorm1d(2048) if batch_norm else nn.Identity(),
            nn.Dropout(0.3),
            nn.Linear(2048, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512) if batch_norm else nn.Identity(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256) if batch_norm else nn.Identity(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128) if batch_norm else nn.Identity(),
            nn.Dropout(0.1),
            nn.Linear(128, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )

    def forward(self, x):
        return self.layers(x)


def normalized(a: torch.Tensor, order=2, dim=-1):
    l2 = a.norm(order, dim, keepdim=True)
    l2[l2 == 0] = 1
    return a / l2


def load_clip_models(name: str = "ViT-L/14", device: str = "cuda"):
    import open_clip
    # open_clip expects hyphenated model names and needs `pretrained` to load real weights.
    model2, _preprocess_train, preprocess_val = open_clip.create_model_and_transforms(
        name.replace("/", "-"), pretrained="openai", device=device
    )
    return model2, preprocess_val


def load_model(model_path: str, input_size=768, device: str = "cuda", dtype=None):
    """Builds the scorer MLP and loads weights from a .safetensors or torch checkpoint."""
    model = MLP(input_size=input_size)
    if model_path.endswith(".safetensors"):
        state_dict = safetensors.torch.load_file(model_path, device=device)
    else:
        state_dict = torch.load(model_path, map_location=device, weights_only=False)
    model.load_state_dict(state_dict)
    model.to(device)
    if dtype:
        model = model.to(dtype=dtype)
    return model


def encode_images(images, model2, preprocess, device="cuda"):
    """Encodes one or more PIL images into L2-normalized CLIP embeddings."""
    if isinstance(images, Image.Image):
        images = [images]
    image_tensors = [preprocess(img).unsqueeze(0) for img in images]
    image_batch = torch.cat(image_tensors).to(device)
    image_features = model2.encode_image(image_batch)
    im_emb_arr = normalized(image_features).cpu().float()
    return im_emb_arr


class WaifuScorer:
    """Scores images on a 0-10 scale with a CLIP ViT-L/14 encoder and an MLP head."""

    def __init__(self, model_path=None, device="cuda", cache_dir=None, verbose=False):
        self.verbose = verbose
        if model_path is None:
            model_path = "Eugeoter/waifu-scorer-v4-beta/model.safetensors"
        if not os.path.isfile(model_path):
            from huggingface_hub import hf_hub_download
            model_path = hf_hub_download("Eugeoter/waifu-scorer-v4-beta", "model.safetensors", cache_dir=cache_dir)
        print(f"Loading pretrained WaifuScorer model from {model_path}")
        self.mlp = load_model(model_path, input_size=768, device=device)
        self.model2, self.preprocess = load_clip_models("ViT-L/14", device=device)
        self.device = device
        self.mlp.eval()

    @torch.no_grad()
    def __call__(self, images):
        if isinstance(images, Image.Image):
            images = [images]
        n = len(images)
        if n == 1:
            # Duplicate a lone image so the batch always has at least two samples.
            images = images * 2
        images_encoded = encode_images(images, self.model2, self.preprocess, device=self.device).to(
            self.device, dtype=torch.float32
        )
        predictions = self.mlp(images_encoded)
        scores = predictions.clamp(0, 10).cpu().numpy().reshape(-1).tolist()
        scores = scores[:n]  # Drop the duplicate score if the input was padded.
        return scores[0] if len(scores) == 1 else scores


waifu_scorer_instance = WaifuScorer(device="cuda" if torch.cuda.is_available() else "cpu")


def score_waifu(image: Image.Image) -> float:
    """Scores an image using the WaifuScorer model (range 0-10)."""
    score = waifu_scorer_instance(image)
    if isinstance(score, list):
        return round(score[0], 2)
    return round(score, 2)


class AestheticPredictor:
    """Wrapper around aesthetic-predictor-v2-5 (SigLIP-based aesthetic scorer)."""

    def __init__(self):
        from aesthetic_predictor_v2_5 import convert_v2_5_from_siglip

        self.model, self.preprocessor = convert_v2_5_from_siglip(
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
        if torch.cuda.is_available():
            self.model = self.model.to(torch.bfloat16).cuda()

    def inference(self, image: Image.Image) -> float:
        pixel_values = self.preprocessor(images=image.convert("RGB"), return_tensors="pt").pixel_values
        if torch.cuda.is_available():
            pixel_values = pixel_values.to(torch.bfloat16).cuda()
        with torch.inference_mode():
            score = self.model(pixel_values).logits.squeeze().float().cpu().numpy()
        return float(score)


aesthetic_predictor_instance = AestheticPredictor()


def score_aesthetic_predictor(image: Image.Image) -> float:
    """Returns the aesthetic score from aesthetic-predictor-v2-5 (usually between 1 and 10)."""
    score = aesthetic_predictor_instance.inference(image)
    return round(float(score), 2)


# cafeai classifiers: overall aesthetic quality, art style, and waifu content.
pipe_cafe_aesthetic = pipeline(
    "image-classification",
    "cafeai/cafe_aesthetic",
    device=0 if torch.cuda.is_available() else -1,
)
pipe_cafe_style = pipeline(
    "image-classification",
    "cafeai/cafe_style",
    device=0 if torch.cuda.is_available() else -1,
)
pipe_cafe_waifu = pipeline(
    "image-classification",
    "cafeai/cafe_waifu",
    device=0 if torch.cuda.is_available() else -1,
)


def score_cafe(image: Image.Image):
    """Returns a tuple of (cafe aesthetic score, cafe style dict, cafe waifu dict)."""
    result_aesthetic = pipe_cafe_aesthetic(image, top_k=2)
    score_aesthetic = {d["label"]: d["score"] for d in result_aesthetic}
    result_style = pipe_cafe_style(image, top_k=5)
    score_style = {d["label"]: d["score"] for d in result_style}
    result_waifu = pipe_cafe_waifu(image, top_k=5)
    score_waifu_dict = {d["label"]: d["score"] for d in result_waifu}

    # Report the probability of the "aesthetic" label rather than whichever label happens to rank first.
    top_aesthetic = score_aesthetic.get("aesthetic", next(iter(score_aesthetic.values()), None))
    return top_aesthetic, score_style, score_waifu_dict


# skytnt/anime-aesthetic: an ONNX model run on the CPU execution provider.
model_path_anime = None
try:
    from huggingface_hub import hf_hub_download
    model_path_anime = hf_hub_download(repo_id="skytnt/anime-aesthetic", filename="model.onnx")
except Exception as e:
    print("Error downloading anime aesthetic model:", e)

if model_path_anime:
    model_anime = rt.InferenceSession(model_path_anime, providers=["CPUExecutionProvider"])
else:
    model_anime = None


def score_anime_aesthetic(image: Image.Image) -> float:
    """Returns the aesthetic score from the anime-aesthetic model."""
    if model_anime is None:
        return 0.0
    # Letterbox the image onto a 768x768 float32 canvas, preserving aspect ratio.
    img = np.array(image.convert("RGB")).astype(np.float32) / 255.0
    s = 768
    h, w = img.shape[:2]
    if h > w:
        new_h, new_w = s, int(s * w / h)
    else:
        new_h, new_w = int(s * h / w), s
    resized = cv2.resize(img, (new_w, new_h))
    ph, pw = s - new_h, s - new_w
    img_input = np.zeros((s, s, 3), dtype=np.float32)
    img_input[ph // 2:ph // 2 + new_h, pw // 2:pw // 2 + new_w] = resized
    img_input = np.transpose(img_input, (2, 0, 1))[np.newaxis, :]
    pred = model_anime.run(None, {"img": img_input})[0].item()
    return round(pred, 2)


def evaluate_images(files):
    """
    Computes scores from every model for each uploaded image.

    Returns:
        - A pandas DataFrame with one row per image and one column per score.
        - A list of PIL images (previews) for the gallery.
    """
    results = []
    previews = []
    for f in files or []:
        # Gradio file inputs arrive as paths or as file-like objects with a .name attribute.
        path = f if isinstance(f, str) else f.name
        filename = os.path.basename(path)
        img = Image.open(path).convert("RGB")

        try:
            score_shadow = score_aesthetic_shadow(img)
        except Exception:
            score_shadow = None
        try:
            score_waifu_val = score_waifu(img)
        except Exception:
            score_waifu_val = None
        try:
            score_ap = score_aesthetic_predictor(img)
        except Exception:
            score_ap = None
        try:
            cafe_aesthetic, _, _ = score_cafe(img)
        except Exception:
            cafe_aesthetic = None
        try:
            score_anime = score_anime_aesthetic(img)
        except Exception:
            score_anime = None

        results.append({
            "Filename": filename,
            "Aesthetic Shadow": score_shadow,
            "Waifu Scorer": score_waifu_val,
            "Aesthetic Predictor": score_ap,
            "Cafe Aesthetic": cafe_aesthetic,
            "Anime Aesthetic": score_anime,
        })
        previews.append(img)
    df = pd.DataFrame(results)
    return df, previews


with gr.Blocks(title="Ultimate Image Aesthetic Evaluator") as demo:
    gr.Markdown(
        """
# Ultimate Image Aesthetic Evaluator
Upload multiple images to evaluate their aesthetic scores using various models.
The table below shows the scores from:
- **Aesthetic Shadow**
- **Waifu Scorer**
- **Aesthetic Predictor V2.5**
- **Cafe Aesthetic**
- **Anime Aesthetic**
"""
    )
    with gr.Row():
        with gr.Column():
            # gr.Image cannot take multiple uploads, so accept files and open them in evaluate_images.
            input_images = gr.File(
                label="Upload Images",
                file_count="multiple",
                file_types=["image"],
            )
            evaluate_button = gr.Button("Evaluate Images")
        with gr.Column():
            output_table = gr.Dataframe(
                headers=["Filename", "Aesthetic Shadow", "Waifu Scorer", "Aesthetic Predictor", "Cafe Aesthetic", "Anime Aesthetic"],
                label="Evaluation Results",
            )
            output_gallery = gr.Gallery(label="Image Previews")
    evaluate_button.click(fn=evaluate_images, inputs=input_images, outputs=[output_table, output_gallery])

demo.queue().launch()