Spaces:

VOIDER
/

CIET

Runtime error

File size: 12,562 Bytes

import os
import cv2
import torch
import gradio as gr
import numpy as np
import pandas as pd
import onnxruntime as rt
import pytorch_lightning as pl
import torch.nn as nn
from transformers import pipeline
from PIL import Image
import inspect
import safetensors.torch

# =============================================================================
# Aesthetic-Shadow (using Hugging Face transformers pipeline)
# =============================================================================
# Build the aesthetic-shadow classifier once at import time.
# The device index is 0 when CUDA is available and -1 (CPU) otherwise.
pipe_shadow = pipeline(
    task="image-classification",
    model="NeoChen1024/aesthetic-shadow-v2-backup",
    device=-1 if not torch.cuda.is_available() else 0,
)

def score_aesthetic_shadow(image: Image.Image) -> float:
    """Return the 'hq' score from the aesthetic-shadow model.

    Args:
        image: The PIL image to classify.

    Returns:
        The score of the ``'hq'`` label rounded to two decimals, or ``0.0``
        when the classifier output contains no ``'hq'`` label.
    """
    result = pipe_shadow(image)
    # For a single image the pipeline returns a flat list of
    # {"label", "score"} dicts; for a batch it returns one such list per
    # image.  Accept both so that we never iterate over a dict's keys.
    if result and isinstance(result[0], list):
        predictions = result[0]
    else:
        predictions = result
    for pred in predictions:
        if pred['label'] == 'hq':
            return round(pred['score'], 2)
    return 0.0

# =============================================================================
# Waifu-Scorer (including all necessary utility functions and model definition)
# =============================================================================
class MLP(pl.LightningModule):
    """Feed-forward regression head mapping an embedding to a single value.

    The network is four hidden stages of Linear -> ReLU -> (BatchNorm1d or
    Identity) -> Dropout, followed by Linear(128, 32) -> ReLU -> Linear(32, 1).
    The position of every module inside ``self.layers`` is kept stable because
    state-dict keys of an ``nn.Sequential`` are derived from those positions.
    """

    def __init__(self, input_size, batch_norm=True):
        """Build the layer stack.

        Args:
            input_size: Width of the input feature vector.
            batch_norm: When false, every BatchNorm1d slot is filled with
                ``nn.Identity`` instead.
        """
        super().__init__()
        # (output width, dropout probability) for each hidden stage.
        hidden_stages = ((2048, 0.3), (512, 0.3), (256, 0.2), (128, 0.1))

        stack = []
        width_in = input_size
        for width_out, drop_p in hidden_stages:
            stack.append(nn.Linear(width_in, width_out))
            stack.append(nn.ReLU())
            stack.append(nn.BatchNorm1d(width_out) if batch_norm else nn.Identity())
            stack.append(nn.Dropout(drop_p))
            width_in = width_out

        # Output head: no normalisation or dropout.
        stack.append(nn.Linear(width_in, 32))
        stack.append(nn.ReLU())
        stack.append(nn.Linear(32, 1))

        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        return self.layers(x)

def normalized(a: torch.Tensor, order=2, dim=-1):
    """Scale ``a`` to unit norm along ``dim``.

    Args:
        a: Tensor to normalise.
        order: Order of the norm (2 by default).
        dim: Dimension along which the norm is taken.

    Returns:
        ``a`` divided by its norm; slices whose norm is exactly zero are
        divided by 1 and therefore returned unchanged.
    """
    magnitude = a.norm(order, dim, keepdim=True)
    # Swap zero norms for 1 so the division below cannot produce NaN/inf.
    safe_magnitude = magnitude.masked_fill(magnitude == 0, 1)
    return a / safe_magnitude

def load_clip_models(name: str = "ViT-L/14", device='cuda', pretrained: str = "openai"):
    """Load a CLIP image encoder and its inference-time preprocessing.

    Args:
        name: Model architecture name passed to open_clip.
        device: Device the model is created on.
        pretrained: Tag of the pretrained weights to load.  Without a
            pretrained tag open_clip only builds the architecture with
            randomly initialised weights, which makes every downstream score
            meaningless, so OpenAI weights are requested by default.

    Returns:
        A ``(model, preprocess)`` tuple where ``preprocess`` is the
        validation transform returned by open_clip.
    """
    import open_clip
    model2, _preprocess_train, preprocess_val = open_clip.create_model_and_transforms(
        name, pretrained=pretrained, device=device
    )
    # The encoder is only used for inference in this file.
    model2.eval()
    return model2, preprocess_val

def load_model(model_path: str, input_size=768, device: str = 'cuda', dtype=None):
    """Create an ``MLP`` and populate it with weights read from ``model_path``.

    Args:
        model_path: Weight file; a ``.safetensors`` suffix selects the
            safetensors loader, anything else goes through ``torch.load``.
        input_size: Input width handed to the ``MLP`` constructor.
        device: Device the weights are loaded onto and the model is moved to.
        dtype: Optional dtype to cast the model to after loading.

    Returns:
        The loaded model on ``device`` (cast to ``dtype`` when one is given).
    """
    net = MLP(input_size=input_size)

    uses_safetensors = model_path.endswith(".safetensors")
    if not uses_safetensors:
        # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
        # objects; only point this at checkpoint files you trust.
        weights = torch.load(model_path, map_location=device, weights_only=False)
    else:
        weights = safetensors.torch.load_file(model_path, device=device)

    net.load_state_dict(weights)
    net.to(device)
    return net.to(dtype=dtype) if dtype else net

def encode_images(images, model2, preprocess, device='cuda'):
    """Encode one or more PIL images into normalised embeddings.

    Args:
        images: A single PIL image or an iterable of PIL images.
        model2: Model exposing ``encode_image``.
        preprocess: Callable turning a PIL image into a tensor.
        device: Device the image batch is moved to before encoding.

    Returns:
        A float32 CPU tensor with one unit-norm embedding per input image.
    """
    if isinstance(images, Image.Image):
        images = [images]
    # Give every preprocessed image a leading batch axis, then join them.
    batch = torch.cat([preprocess(picture)[None] for picture in images])
    features = model2.encode_image(batch.to(device))
    return normalized(features).cpu().float()

class WaifuScorer:
    """Score images with a CLIP image encoder followed by an MLP head.

    Calling an instance with a single PIL image, or with a sequence holding
    exactly one image, returns one float.  A sequence of several images
    returns a list of floats and an empty sequence returns an empty list.
    Scores are clamped to the range 0-10.
    """

    def __init__(self, model_path=None, device='cuda', cache_dir=None, verbose=False):
        """Load the MLP weights and the CLIP encoder.

        Args:
            model_path: Local weight file.  When it is ``None`` or does not
                point to an existing file, ``model.safetensors`` is fetched
                from the ``Eugeoter/waifu-scorer-v4-beta`` hub repository.
            device: Device both models are placed on.
            cache_dir: Optional cache directory for the hub download.
            verbose: Stored on the instance; not otherwise used here.
        """
        self.verbose = verbose
        if model_path is None:
            # Use default repo path – if the model file is not present locally, it will be downloaded.
            model_path = "Eugeoter/waifu-scorer-v4-beta/model.safetensors"
        if not os.path.isfile(model_path):
            from huggingface_hub import hf_hub_download
            model_path = hf_hub_download("Eugeoter/waifu-scorer-v4-beta", "model.safetensors", cache_dir=cache_dir)
        print(f"Loading pretrained WaifuScorer model from {model_path}")
        self.mlp = load_model(model_path, input_size=768, device=device)
        self.model2, self.preprocess = load_clip_models("ViT-L/14", device=device)
        self.device = device
        self.mlp.eval()

    @torch.no_grad()
    def __call__(self, images):
        """Return the score(s) for ``images``; see the class docstring."""
        if isinstance(images, Image.Image):
            images = [images]
        images = list(images)
        n = len(images)
        if n == 0:
            # Nothing to encode; concatenating an empty batch would raise.
            return []
        if n == 1:
            images = images * 2  # duplicate single image for batch norm consistency
        images_encoded = encode_images(images, self.model2, self.preprocess, device=self.device).to(self.device, dtype=torch.float32)
        predictions = self.mlp(images_encoded)
        scores = predictions.clamp(0, 10).cpu().numpy().reshape(-1).tolist()
        # Decide on the ORIGINAL image count: after the duplication above the
        # score list has two entries for a single image, so checking its
        # length would leak the padding entry to the caller.
        return scores[0] if n == 1 else scores[:n]

# Shared WaifuScorer, built once at import time on CUDA when it is available
# and on the CPU otherwise.
waifu_scorer_instance = WaifuScorer(
    device='cpu' if not torch.cuda.is_available() else 'cuda',
)

def score_waifu(image: Image.Image) -> float:
    """Score ``image`` with the shared WaifuScorer and round to two decimals.

    The scorer may hand back either a bare number or a list of numbers; in the
    list case only the first entry is used.
    """
    raw = waifu_scorer_instance(image)
    value = raw[0] if isinstance(raw, list) else raw
    return round(value, 2)

# =============================================================================
# Aesthetic Predictor V2.5
# =============================================================================
class AestheticPredictor:
    """Wrapper around the model returned by ``convert_v2_5_from_siglip``."""

    def __init__(self):
        """Load the model and its preprocessor; use bfloat16 on CUDA if available."""
        from aesthetic_predictor_v2_5 import convert_v2_5_from_siglip

        loaded = convert_v2_5_from_siglip(
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
        self.model, self.preprocessor = loaded
        if torch.cuda.is_available():
            self.model = self.model.to(torch.bfloat16).cuda()

    def inference(self, image: Image.Image) -> float:
        """Score a single image.

        The image is converted to RGB, preprocessed into ``pixel_values`` and
        passed through the model under ``torch.inference_mode``.

        Returns:
            The squeezed logits as a float32 NumPy value (not a built-in
            ``float``, despite the annotation); callers convert as needed.
        """
        rgb_image = image.convert("RGB")
        encoded = self.preprocessor(images=rgb_image, return_tensors="pt")
        pixel_values = encoded.pixel_values
        if torch.cuda.is_available():
            # Match the dtype and device the model was moved to in __init__.
            pixel_values = pixel_values.to(torch.bfloat16).cuda()
        with torch.inference_mode():
            logits = self.model(pixel_values).logits
            score = logits.squeeze().float().cpu().numpy()
        return score

# Instantiate a global aesthetic predictor once at import time so that every
# call to score_aesthetic_predictor reuses the same loaded model.
aesthetic_predictor_instance = AestheticPredictor()

def score_aesthetic_predictor(image: Image.Image) -> float:
    """Score ``image`` with the shared aesthetic-predictor-v2-5 instance.

    The raw inference result is converted to a built-in float and rounded to
    two decimals.
    """
    raw = aesthetic_predictor_instance.inference(image)
    as_float = float(raw)
    return round(as_float, 2)

# =============================================================================
# Cafe Aesthetic / Style / Waifu scoring using separate pipelines
# =============================================================================
# Three independent cafeai image classifiers, each built once at import time.
# The device index is 0 when CUDA is available and -1 (CPU) otherwise.
pipe_cafe_aesthetic = pipeline(
    task="image-classification",
    model="cafeai/cafe_aesthetic",
    device=-1 if not torch.cuda.is_available() else 0,
)
pipe_cafe_style = pipeline(
    task="image-classification",
    model="cafeai/cafe_style",
    device=-1 if not torch.cuda.is_available() else 0,
)
pipe_cafe_waifu = pipeline(
    task="image-classification",
    model="cafeai/cafe_waifu",
    device=-1 if not torch.cuda.is_available() else 0,
)

def score_cafe(image: Image.Image):
    """Run the three cafe classifiers on ``image``.

    Returns:
        A tuple ``(aesthetic, style, waifu)`` where ``aesthetic`` is a single
        score (or ``None`` when the classifier returned nothing) and
        ``style`` / ``waifu`` are ``{label: score}`` dicts of the top five
        predictions.
    """
    def as_label_scores(predictions):
        return {d["label"]: d["score"] for d in predictions}

    score_aesthetic = as_label_scores(pipe_cafe_aesthetic(image, top_k=2))
    score_style = as_label_scores(pipe_cafe_style(image, top_k=5))
    score_waifu_dict = as_label_scores(pipe_cafe_waifu(image, top_k=5))

    # Report the probability of the positive "aesthetic" label itself.  Taking
    # whichever label ranks first would report a confident "not_aesthetic"
    # prediction as a high aesthetic score.
    if "aesthetic" in score_aesthetic:
        top_aesthetic = score_aesthetic["aesthetic"]
    else:
        # Unexpected label set: keep the previous behaviour (top-ranked score).
        top_aesthetic = next(iter(score_aesthetic.values()), None)
    return top_aesthetic, score_style, score_waifu_dict

# =============================================================================
# Anime Aesthetic Predict using ONNX Runtime
# =============================================================================
# Fetch the anime-aesthetic ONNX file at import time.  Any failure is reported
# on stdout and leaves both the path and the session set to None, so the app
# can still start without this model.
try:
    from huggingface_hub import hf_hub_download
    model_path_anime = hf_hub_download(repo_id="skytnt/anime-aesthetic", filename="model.onnx")
except Exception as download_error:
    print("Error downloading anime aesthetic model:", download_error)
    model_path_anime = None

# The ONNX session is created on the CPU execution provider only.
model_anime = (
    rt.InferenceSession(model_path_anime, providers=['CPUExecutionProvider'])
    if model_path_anime
    else None
)

def score_anime_aesthetic(image: Image.Image) -> float:
    """Return the aesthetic score from the anime-aesthetic ONNX model.

    The image is converted to RGB, scaled to [0, 1], resized so that its
    longer side is 768 pixels, centred on a black 768x768 canvas and fed to
    the model as a ``(1, 3, 768, 768)`` float32 array under the input name
    ``"img"``.

    Args:
        image: The PIL image to score (any mode).

    Returns:
        The prediction rounded to two decimals, or ``0.0`` when the model
        could not be loaded.
    """
    if model_anime is None:
        # No session available: skip the preprocessing work entirely.
        return 0.0

    # Force three channels; RGBA, palette or greyscale input would not fit
    # into the 3-channel canvas below.
    img = np.array(image.convert("RGB")).astype(np.float32) / 255.0
    s = 768
    h, w = img.shape[:2]
    # Keep the aspect ratio; clamp to 1 px so that extremely elongated images
    # never produce a zero-sized resize target.
    if h > w:
        new_h, new_w = s, max(1, int(s * w / h))
    else:
        new_h, new_w = max(1, int(s * h / w)), s
    resized = cv2.resize(img, (new_w, new_h))
    ph, pw = s - new_h, s - new_w
    img_input = np.zeros((s, s, 3), dtype=np.float32)
    img_input[ph//2:ph//2+new_h, pw//2:pw//2+new_w] = resized
    # HWC -> CHW, then add the batch axis.
    img_input = np.transpose(img_input, (2, 0, 1))
    img_input = img_input[np.newaxis, :]
    pred = model_anime.run(None, {"img": img_input})[0].item()
    return round(pred, 2)

# =============================================================================
# Main Evaluation Function: Process a list of images and return a results table and gallery preview
# =============================================================================
def _try_score(scorer, image, model_name, filename):
    """Run one scorer, reporting a failure and returning None if it raises."""
    try:
        return scorer(image)
    except Exception as exc:
        # Best effort: one failing model must not discard the other scores,
        # but the failure is reported instead of being silently dropped.
        print(f"{model_name} failed for {filename}: {exc!r}")
        return None


def evaluate_images(images):
    """
    For each uploaded image, compute scores from multiple models.

    Args:
        images: A list of PIL images.  ``None`` is treated as an empty list
            and a single PIL image is treated as a one-element list.

    Returns:
      - A Pandas DataFrame with rows for each image and columns for each score.
        A score is ``None`` when the corresponding model failed on that image.
      - A list of images (previews) for display.
    """
    columns = [
        "Filename",
        "Aesthetic Shadow",
        "Waifu Scorer",
        "Aesthetic Predictor",
        "Cafe Aesthetic",
        "Anime Aesthetic",
    ]

    if images is None:
        images = []
    elif isinstance(images, Image.Image):
        # Iterating a bare PIL image would raise; wrap it instead.
        images = [images]

    results = []
    previews = []
    for idx, img in enumerate(images):
        filename = f"Image {idx+1}"
        results.append({
            "Filename": filename,
            "Aesthetic Shadow": _try_score(score_aesthetic_shadow, img, "Aesthetic Shadow", filename),
            "Waifu Scorer": _try_score(score_waifu, img, "Waifu Scorer", filename),
            "Aesthetic Predictor": _try_score(score_aesthetic_predictor, img, "Aesthetic Predictor", filename),
            # score_cafe returns (aesthetic, style, waifu); only the first is tabulated.
            "Cafe Aesthetic": _try_score(lambda im: score_cafe(im)[0], img, "Cafe Aesthetic", filename),
            "Anime Aesthetic": _try_score(score_anime_aesthetic, img, "Anime Aesthetic", filename),
        })
        previews.append(img)
    # Passing the columns explicitly keeps the header present for empty input.
    df = pd.DataFrame(results, columns=columns)
    return df, previews

# =============================================================================
# Gradio Interface
# =============================================================================
def _evaluate_uploads(files):
    """Open the uploaded files as RGB PIL images and evaluate them.

    Each upload may be a plain file path or an object exposing the path as
    ``.name``.  Files that cannot be opened as images are reported and skipped.
    """
    images = []
    for item in files or []:
        path = getattr(item, "name", item)
        try:
            with Image.open(path) as handle:
                images.append(handle.convert("RGB"))
        except (OSError, ValueError) as exc:
            print(f"Skipping unreadable upload {path!r}: {exc!r}")
    return evaluate_images(images)


with gr.Blocks(title="Ultimate Image Aesthetic Evaluator") as demo:
    gr.Markdown(
        """
        # Ultimate Image Aesthetic Evaluator
        Upload multiple images to evaluate their aesthetic scores using various models.
        The table below shows the scores from:
        - **Aesthetic Shadow**
        - **Waifu Scorer**
        - **Aesthetic Predictor V2.5**
        - **Cafe Aesthetic**
        - **Anime Aesthetic**
        """
    )
    with gr.Row():
        with gr.Column():
            # gr.Image handles exactly one image and has no "multiple" option,
            # so a multi-file upload component is used instead.
            input_images = gr.File(
                label="Upload Images",
                file_count="multiple",
                file_types=["image"],
            )
            evaluate_button = gr.Button("Evaluate Images")
        with gr.Column():
            output_table = gr.Dataframe(
                headers=["Filename", "Aesthetic Shadow", "Waifu Scorer", "Aesthetic Predictor", "Cafe Aesthetic", "Anime Aesthetic"],
                label="Evaluation Results"
            )
            # Layout options are passed to the constructor; the legacy
            # .style() helper is not relied upon.
            output_gallery = gr.Gallery(label="Image Previews", columns=2, height="auto")
    evaluate_button.click(fn=_evaluate_uploads, inputs=input_images, outputs=[output_table, output_gallery])

# Enable request queuing on the app, then start the Gradio server.
demo.queue().launch()