"""Gradio app that scores images with several aesthetic-quality models.

Models are loaded once at import time (this downloads weights on first run):
Aesthetic-Shadow, Waifu-Scorer, Aesthetic Predictor V2.5, the three "cafe"
classifiers and the ONNX anime-aesthetic model.
"""

import inspect
import os

import cv2
import gradio as gr
import numpy as np
import onnxruntime as rt
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn
from PIL import Image
from transformers import pipeline

# Device index for transformers pipelines: first GPU when CUDA is available,
# otherwise CPU (-1).
_PIPELINE_DEVICE = 0 if torch.cuda.is_available() else -1

# Column order of the results table; shared by evaluate_images and the UI.
RESULT_COLUMNS = [
    "Filename",
    "Aesthetic Shadow",
    "Waifu Scorer",
    "Aesthetic Predictor",
    "Cafe Aesthetic",
    "Anime Aesthetic",
]

# =============================================================================
# Aesthetic-Shadow (using Hugging Face transformers pipeline)
# =============================================================================
pipe_shadow = pipeline(
    "image-classification",
    model="NeoChen1024/aesthetic-shadow-v2-backup",
    device=_PIPELINE_DEVICE,
)


def score_aesthetic_shadow(image: Image.Image) -> float:
    """Return the 'hq' score from the aesthetic-shadow model, rounded to 2 places.

    Returns 0.0 when the pipeline output contains no 'hq' label.
    """
    predictions = pipe_shadow(image)
    # For a single image the pipeline returns a flat list of
    # {"label", "score"} dicts; for a list of images it returns one such list
    # per image. Accept both so we never iterate over a dict's keys.
    if predictions and isinstance(predictions[0], list):
        predictions = predictions[0]
    for pred in predictions:
        if pred["label"] == "hq":
            return round(pred["score"], 2)
    return 0.0


# =============================================================================
# Waifu-Scorer (including all necessary utility functions and model definition)
# =============================================================================
class MLP(pl.LightningModule):
    """Regression head mapping an image embedding to a single score.

    The layer order (including the Identity placeholders used when
    ``batch_norm`` is False) must not change: checkpoint keys are derived from
    each layer's index inside ``nn.Sequential``.
    """

    def __init__(self, input_size, batch_norm=True):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 2048),
            nn.ReLU(),
            nn.BatchNorm1d(2048) if batch_norm else nn.Identity(),
            nn.Dropout(0.3),
            nn.Linear(2048, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512) if batch_norm else nn.Identity(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256) if batch_norm else nn.Identity(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128) if batch_norm else nn.Identity(),
            nn.Dropout(0.1),
            nn.Linear(128, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )

    def forward(self, x):
        """Run the embedding batch ``x`` through the MLP."""
        return self.layers(x)


def normalized(a: torch.Tensor, order=2, dim=-1):
    """Return ``a`` divided by its ``order``-norm along ``dim``.

    Zero norms are replaced by 1 so all-zero vectors stay zero instead of
    producing NaNs.
    """
    l2 = a.norm(order, dim, keepdim=True)
    l2[l2 == 0] = 1
    return a / l2


def load_clip_models(name: str = "ViT-L/14", device='cuda'):
    """Load the CLIP model ``name`` on ``device``; return (model, preprocess)."""
    import clip  # imported lazily, as in the original code

    model2, preprocess = clip.load(name, device=device)
    return model2, preprocess


def load_model(model_path: str, input_size=768, device: str = 'cuda', dtype=None):
    """Build an ``MLP`` and load its weights from ``model_path``.

    Args:
        model_path: Path to a checkpoint holding the MLP state dict.
        input_size: Width of the input embedding.
        device: Device the weights are mapped and moved to.
        dtype: Optional dtype to cast the model to.
    """
    model = MLP(input_size=input_size)
    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # from sources you trust.
    state = torch.load(model_path, map_location=device)
    model.load_state_dict(state)
    model.to(device)
    if dtype is not None:
        model = model.to(dtype=dtype)
    return model


def encode_images(images, model2, preprocess, device='cuda'):
    """Encode one image or a list of images into normalized CLIP embeddings.

    Returns a float32 CPU tensor with one row per input image.
    """
    if isinstance(images, Image.Image):
        images = [images]
    image_tensors = [preprocess(img).unsqueeze(0) for img in images]
    image_batch = torch.cat(image_tensors).to(device)
    image_features = model2.encode_image(image_batch)
    return normalized(image_features).cpu().float()


class WaifuScorer:
    """CLIP ViT-L/14 encoder plus MLP head producing scores clamped to 0-10."""

    def __init__(self, model_path=None, device='cuda', cache_dir=None, verbose=False):
        self.verbose = verbose
        if model_path is None:
            # Use default repo path – if the model file is not present
            # locally, it will be downloaded.
            model_path = "Eugeoter/waifu-scorer-v4-beta/model.pth"
        if not os.path.isfile(model_path):
            from huggingface_hub import hf_hub_download

            model_path = hf_hub_download(
                "Eugeoter/waifu-scorer-v4-beta", "model.pth", cache_dir=cache_dir
            )
        print(f"Loading pretrained WaifuScorer model from {model_path}")
        self.mlp = load_model(model_path, input_size=768, device=device)
        self.model2, self.preprocess = load_clip_models("ViT-L/14", device=device)
        self.device = device
        self.mlp.eval()

    @torch.no_grad()
    def __call__(self, images):
        """Score one image or a list of images.

        Returns a float for a single image, otherwise a list of floats with
        one entry per input image (an empty list for empty input).
        """
        if isinstance(images, Image.Image):
            images = [images]
        n = len(images)
        if n == 0:
            return []
        if n == 1:
            # Duplicate the single image so the MLP always sees a batch of at
            # least two (kept from the original implementation).
            images = images * 2
        images_encoded = encode_images(
            images, self.model2, self.preprocess, device=self.device
        ).to(self.device, dtype=torch.float32)
        predictions = self.mlp(images_encoded)
        scores = predictions.clamp(0, 10).cpu().numpy().reshape(-1).tolist()
        # Drop the padding duplicate so callers get exactly one score per image.
        scores = scores[:n]
        return scores[0] if n == 1 else scores


# Instantiate a global waifu scorer instance
waifu_scorer_instance = WaifuScorer(device='cuda' if torch.cuda.is_available() else 'cpu')


def score_waifu(image: Image.Image) -> float:
    """Score an image with the WaifuScorer model (range 0-10), rounded to 2 places."""
    score = waifu_scorer_instance(image)
    if isinstance(score, list):
        return round(score[0], 2)
    return round(score, 2)


# =============================================================================
# Aesthetic Predictor V2.5
# =============================================================================
class AestheticPredictor:
    """Wrapper around the SigLIP-based aesthetic-predictor-v2-5 model."""

    def __init__(self):
        from aesthetic_predictor_v2_5 import convert_v2_5_from_siglip

        # Load model and preprocessor
        self.model, self.preprocessor = convert_v2_5_from_siglip(
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
        self._use_cuda = torch.cuda.is_available()
        if self._use_cuda:
            self.model = self.model.to(torch.bfloat16).cuda()

    def inference(self, image: Image.Image) -> float:
        """Return the model's raw aesthetic score for ``image`` as a float."""
        pixel_values = self.preprocessor(
            images=image.convert("RGB"), return_tensors="pt"
        ).pixel_values
        if self._use_cuda:
            # Match the dtype/device the model was moved to in __init__.
            pixel_values = pixel_values.to(torch.bfloat16).cuda()
        with torch.inference_mode():
            score = self.model(pixel_values).logits.squeeze().float().cpu().numpy()
        return float(score)


# Instantiate a global aesthetic predictor
aesthetic_predictor_instance = AestheticPredictor()


def score_aesthetic_predictor(image: Image.Image) -> float:
    """Return the aesthetic-predictor-v2-5 score (usually between 1 and 10)."""
    score = aesthetic_predictor_instance.inference(image)
    return round(float(score), 2)


# =============================================================================
# Cafe Aesthetic / Style / Waifu scoring using separate pipelines
# =============================================================================
pipe_cafe_aesthetic = pipeline(
    "image-classification",
    "cafeai/cafe_aesthetic",
    device=_PIPELINE_DEVICE,
)
pipe_cafe_style = pipeline(
    "image-classification",
    "cafeai/cafe_style",
    device=_PIPELINE_DEVICE,
)
pipe_cafe_waifu = pipeline(
    "image-classification",
    "cafeai/cafe_waifu",
    device=_PIPELINE_DEVICE,
)


def _label_scores(predictions):
    """Turn a list of {"label", "score"} predictions into a label -> score dict."""
    return {d["label"]: d["score"] for d in predictions}


def score_cafe(image: Image.Image):
    """Return a tuple of (cafe aesthetic score, style scores, waifu scores).

    The first element is a float (or None when the classifier returned
    nothing); the other two are label -> score dicts.
    """
    score_aesthetic = _label_scores(pipe_cafe_aesthetic(image, top_k=2))
    score_style = _label_scores(pipe_cafe_style(image, top_k=5))
    score_waifu_dict = _label_scores(pipe_cafe_waifu(image, top_k=5))

    # Report the probability of the positive class rather than whichever
    # label happens to rank first; otherwise a confident "not aesthetic"
    # prediction would be shown as a high aesthetic score.
    # NOTE(review): assumes the positive label is named "aesthetic" - confirm
    # against the model card. If it is absent we fall back to the top score.
    if "aesthetic" in score_aesthetic:
        top_aesthetic = score_aesthetic["aesthetic"]
    elif score_aesthetic:
        top_aesthetic = next(iter(score_aesthetic.values()))
    else:
        top_aesthetic = None
    return top_aesthetic, score_style, score_waifu_dict


# =============================================================================
# Anime Aesthetic Predict using ONNX Runtime
# =============================================================================
# Download the model (only once)
model_path_anime = None
try:
    from huggingface_hub import hf_hub_download

    model_path_anime = hf_hub_download(repo_id="skytnt/anime-aesthetic", filename="model.onnx")
except Exception as e:
    print("Error downloading anime aesthetic model:", e)

if model_path_anime:
    model_anime = rt.InferenceSession(model_path_anime, providers=['CPUExecutionProvider'])
else:
    model_anime = None


def score_anime_aesthetic(image: Image.Image) -> float:
    """Return the anime-aesthetic score for ``image``, rounded to 2 places.

    Returns 0.0 when the ONNX model could not be downloaded.
    """
    if model_anime is None:
        return 0.0

    # Force three channels: RGBA or grayscale input would not fit the
    # (s, s, 3) canvas below.
    img = np.asarray(image.convert("RGB"), dtype=np.float32) / 255.0
    s = 768
    h, w = img.shape[:2]
    # Scale the longer side to s, keeping the aspect ratio; clamp to at least
    # one pixel so extreme aspect ratios do not request a zero-sized resize.
    if h > w:
        new_h, new_w = s, max(1, int(s * w / h))
    else:
        new_h, new_w = max(1, int(s * h / w)), s
    resized = cv2.resize(img, (new_w, new_h))

    # Centre the resized image on a black square canvas.
    ph, pw = s - new_h, s - new_w
    img_input = np.zeros((s, s, 3), dtype=np.float32)
    img_input[ph // 2:ph // 2 + new_h, pw // 2:pw // 2 + new_w] = resized

    # HWC -> CHW, then add the batch dimension.
    img_input = np.transpose(img_input, (2, 0, 1))[np.newaxis, :]
    pred = model_anime.run(None, {"img": img_input})[0].item()
    return round(pred, 2)


# =============================================================================
# Main Evaluation Function: Process a list of images and return a results table and gallery preview
# =============================================================================
def _as_image_list(images):
    """Normalize the UI payload (None, one image, or an iterable) to a list."""
    if images is None:
        return []
    if isinstance(images, Image.Image):
        return [images]
    return [img for img in images if img is not None]


def _safe_score(label, scorer, image):
    """Call ``scorer(image)``; on failure report it and return None."""
    try:
        return scorer(image)
    except Exception as exc:
        # Best effort: one failing model must not hide the other scores, but
        # the failure should still be visible in the logs.
        print(f"{label} scoring failed: {exc}")
        return None


def evaluate_images(images):
    """
    For each uploaded image, compute scores from multiple models.

    ``images`` may be None, a single PIL image, or an iterable of PIL images.
    A score is None when the corresponding model raised an error.

    Returns:
      - A Pandas DataFrame with rows for each image and columns for each score.
      - A list of images (previews) for display.
    """
    previews = _as_image_list(images)
    results = []
    for idx, img in enumerate(previews):
        results.append({
            "Filename": f"Image {idx+1}",
            "Aesthetic Shadow": _safe_score("Aesthetic Shadow", score_aesthetic_shadow, img),
            "Waifu Scorer": _safe_score("Waifu Scorer", score_waifu, img),
            "Aesthetic Predictor": _safe_score("Aesthetic Predictor", score_aesthetic_predictor, img),
            "Cafe Aesthetic": _safe_score("Cafe Aesthetic", lambda im: score_cafe(im)[0], img),
            "Anime Aesthetic": _safe_score("Anime Aesthetic", score_anime_aesthetic, img),
        })
    # Passing the columns keeps the table headers even when there are no rows.
    df = pd.DataFrame(results, columns=RESULT_COLUMNS)
    return df, previews


# =============================================================================
# Gradio Interface
# =============================================================================
with gr.Blocks(title="Ultimate Image Aesthetic Evaluator") as demo:
    gr.Markdown(
        """
        # Ultimate Image Aesthetic Evaluator
        Upload multiple images to evaluate their aesthetic scores using various models.
        The table below shows the scores from:
        - **Aesthetic Shadow**
        - **Waifu Scorer**
        - **Aesthetic Predictor V2.5**
        - **Cafe Aesthetic**
        - **Anime Aesthetic**
        """
    )
    with gr.Row():
        with gr.Column():
            # NOTE(review): `multiple=True` does not appear to be a documented
            # gr.Image parameter, so this component likely yields one image at
            # a time - confirm against the installed Gradio version.
            # evaluate_images accepts a single image as well as a list.
            input_images = gr.Image(
                label="Upload Images",
                type="pil",
                tool="editor",
                source="upload",
                image_mode="RGB",
                interactive=True,
                multiple=True
            )
            evaluate_button = gr.Button("Evaluate Images")
        with gr.Column():
            output_table = gr.Dataframe(
                headers=["Filename", "Aesthetic Shadow", "Waifu Scorer", "Aesthetic Predictor", "Cafe Aesthetic", "Anime Aesthetic"],
                label="Evaluation Results"
            )
            output_gallery = gr.Gallery(label="Image Previews").style(grid=[2], height="auto")
    evaluate_button.click(fn=evaluate_images, inputs=input_images, outputs=[output_table, output_gallery])

demo.queue().launch()