cyber-tagger

Sleeping

App Files Files Community

cyber-tagger / app.py

CyberWaifu

Try to add the image preprocessing adapted from the original code.

19b8151 verified 5 months ago

raw

history blame

8.84 kB

	import gradio as gr
	import onnxruntime as ort
	import numpy as np
	from PIL import Image
	import json
	from huggingface_hub import hf_hub_download
	import torchvision.transforms as transforms

	# Constants
	MODEL_REPO = "AngelBottomless/camie-tagger-onnxruntime"
	MODEL_FILE = "camie_tagger_initial.onnx"
	META_FILE = "metadata.json"
	# Initial value of the UI threshold slider; the slider value is what reaches
	# get_tags as the fallback threshold.
	DEFAULT_THRESHOLD = 0.35

	# Download model and metadata from Hugging Face Hub (cached under the
	# current working directory).
	model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, cache_dir=".")
	meta_path = hf_hub_download(repo_id=MODEL_REPO, filename=META_FILE, cache_dir=".")

	# Initialize ONNX Runtime session (CPU only) and load metadata
	session = ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])
	with open(meta_path, "r", encoding="utf-8") as f:
	    metadata = json.load(f)

	def escape_tag(tag: str) -> str:
	    """Rewrite a raw tag for display in the Markdown output.

	    Underscores are replaced with spaces (not escaped), and every opening or
	    closing parenthesis is prefixed with two backslash characters.
	    NOTE(review): presumably the doubled backslash is meant to render as one
	    literal backslash in Markdown - confirm against the rendered output.

	    Args:
	        tag: Tag name as stored in the model metadata.

	    Returns:
	        The rewritten tag.
	    """
	    # One pass over the string; equivalent to the chained str.replace calls
	    # because no replacement text contains a character that a later
	    # replacement would touch.
	    substitutions = str.maketrans({"_": " ", "(": r"\\(", ")": r"\\)"})
	    return tag.translate(substitutions)

	def preprocess_image(pil_image: Image.Image, image_size: int = 512) -> np.ndarray:
	    """Letterbox an image onto a square black canvas and convert it to an array.

	    The image is converted to RGB when necessary, resized with the LANCZOS
	    filter so that its longer side equals ``image_size`` (aspect ratio kept),
	    centred on a black ``image_size`` x ``image_size`` canvas and passed
	    through torchvision's ``ToTensor``. No normalization is applied.

	    Args:
	        pil_image: Source image.
	        image_size: Side length in pixels of the square canvas.

	    Returns:
	        The ``ToTensor`` output converted to a NumPy array. No batch axis is
	        added here.
	    """
	    img = pil_image

	    # Convert every non-RGB mode (RGBA, palette, grayscale, CMYK, ...) up front
	    # so resizing and pasting always operate on a three-channel image.
	    if img.mode != "RGB":
	        img = img.convert("RGB")

	    width, height = img.size
	    aspect_ratio = width / height

	    # Scale the longer side to image_size. The shorter side is clamped to at
	    # least one pixel: for extreme aspect ratios the truncation would yield 0,
	    # which Image.resize rejects.
	    if aspect_ratio > 1:
	        new_width = image_size
	        new_height = max(1, int(new_width / aspect_ratio))
	    else:
	        new_height = image_size
	        new_width = max(1, int(new_height * aspect_ratio))

	    img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)

	    # Centre the resized image on a black square canvas (letterboxing).
	    canvas = Image.new("RGB", (image_size, image_size), (0, 0, 0))
	    offset = ((image_size - new_width) // 2, (image_size - new_height) // 2)
	    canvas.paste(img, offset)

	    # ToTensor only (no normalization); hand the result back as NumPy.
	    return transforms.ToTensor()(canvas).numpy()


	def run_inference(pil_image: Image.Image) -> np.ndarray:
	    """Preprocess the image and run the ONNX model on it.

	    Args:
	        pil_image: Image to tag.

	    Returns:
	        The first element along axis 0 of the model's second output (the
	        refined logits); the first output (initial logits) is ignored.
	    """
	    input_tensor = preprocess_image(pil_image)
	    model_input = session.get_inputs()[0]

	    # preprocess_image yields a single image without a batch axis. When the
	    # model declares an input of exactly one rank more, prepend the batch
	    # axis; the [0] below then selects that single batch entry.
	    if input_tensor.ndim + 1 == len(model_input.shape):
	        input_tensor = input_tensor[np.newaxis, ...]

	    # Only refined_logits are used (initial_logits is ignored)
	    _, refined_logits = session.run(None, {model_input.name: input_tensor})
	    return refined_logits[0]

	def get_tags(refined_logits: np.ndarray, metadata: dict, default_threshold: float):
	    """Turn logits into per-category tag predictions.

	    Logits are mapped to probabilities with the logistic sigmoid. Each index
	    is resolved to a tag through ``metadata["idx_to_tag"]`` (keyed by the
	    index as a string) and to a category through
	    ``metadata["tag_to_category"]`` (``"unknown"`` when the tag is absent).
	    A tag is kept when its probability reaches the category's entry in
	    ``metadata["category_thresholds"]``; ``default_threshold`` is used only
	    for categories that have no entry there.

	    Args:
	        refined_logits: One logit per tag index.
	        metadata: Mapping holding ``idx_to_tag`` and, optionally,
	            ``tag_to_category`` and ``category_thresholds``.
	        default_threshold: Fallback probability threshold.

	    Returns:
	        A tuple ``(results_by_cat, prompt_tags_by_cat, all_artist_tags)``:
	        results_by_cat maps each category to the ``(tag, probability)`` pairs
	        that reached its threshold; prompt_tags_by_cat holds the same pairs
	        restricted to the ``"character"`` and ``"general"`` categories;
	        all_artist_tags lists every ``"artist"`` tag with its probability,
	        regardless of threshold.

	    Raises:
	        KeyError: If an index has no entry in ``metadata["idx_to_tag"]``.
	    """
	    # Very negative logits overflow np.exp to inf; the quotient is still the
	    # correct 0.0, so only the overflow warning is silenced.
	    with np.errstate(over="ignore"):
	        probs = 1 / (1 + np.exp(-refined_logits))

	    idx_to_tag = metadata["idx_to_tag"]
	    tag_to_category = metadata.get("tag_to_category", {})
	    category_thresholds = metadata.get("category_thresholds", {})

	    results_by_cat = {}
	    # For prompt style, only include character and general tags (artists handled separately)
	    prompt_tags_by_cat = {"character": [], "general": []}
	    all_artist_tags = []

	    for idx, raw_prob in enumerate(probs):
	        prob = float(raw_prob)  # convert the NumPy scalar once per tag
	        tag = idx_to_tag[str(idx)]
	        cat = tag_to_category.get(tag, "unknown")
	        thresh = category_thresholds.get(cat, default_threshold)
	        if cat == "artist":
	            all_artist_tags.append((tag, prob))
	        if prob >= thresh:
	            results_by_cat.setdefault(cat, []).append((tag, prob))
	            if cat in prompt_tags_by_cat:
	                prompt_tags_by_cat[cat].append((tag, prob))
	    return results_by_cat, prompt_tags_by_cat, all_artist_tags

	def format_prompt_tags(prompt_tags_by_cat: dict, all_artist_tags: list) -> str:
	    """Format the tags for prompt-style output.

	    The single most probable artist tag comes first (taken from
	    ``all_artist_tags``, regardless of threshold), followed by the character
	    tags and then the general tags, each group ordered by descending
	    probability. The inputs are not modified.

	    Args:
	        prompt_tags_by_cat: Maps ``"character"`` and ``"general"`` to lists of
	            ``(tag, probability)`` pairs.
	        all_artist_tags: ``(tag, probability)`` pairs for every artist tag.

	    Returns:
	        A comma-separated string of escaped tags, or ``"No tags predicted."``
	        when there is nothing to show.
	    """
	    prompt_tags = []

	    # Always select the best artist tag from all_artist_tags, regardless of threshold.
	    if all_artist_tags:
	        best_artist = max(all_artist_tags, key=lambda item: item[1])
	        best_artist_tag = escape_tag(best_artist[0])
	        if best_artist_tag:
	            prompt_tags.append(best_artist_tag)

	    # sorted() returns new lists, so the caller's lists keep their order.
	    for cat in ("character", "general"):
	        ranked = sorted(
	            prompt_tags_by_cat.get(cat, []),
	            key=lambda item: item[1],
	            reverse=True,
	        )
	        prompt_tags.extend(escape_tag(tag) for tag, _ in ranked)

	    return ", ".join(prompt_tags) if prompt_tags else "No tags predicted."

	def format_detailed_output(results_by_cat: dict, all_artist_tags: list) -> str:
	    """Format the tags for detailed output.

	    Categories appear in the iteration order of ``results_by_cat``; within a
	    category, tags are ordered by descending probability. When no artist tag
	    reached its threshold, the most probable entry of ``all_artist_tags`` is
	    appended as an ``"artist"`` category. The inputs are not modified.

	    Args:
	        results_by_cat: Maps each category to its ``(tag, probability)`` pairs.
	        all_artist_tags: ``(tag, probability)`` pairs for every artist tag.

	    Returns:
	        A Markdown-formatted string listing tags by category, or a fixed
	        message when ``results_by_cat`` is empty.
	    """
	    if not results_by_cat:
	        return "No tags predicted for this image."

	    # Work on sorted copies so the caller's dict and lists stay untouched.
	    ranked_by_cat = {
	        cat: sorted(tag_list, key=lambda item: item[1], reverse=True)
	        for cat, tag_list in results_by_cat.items()
	    }

	    # Include an artist tag even if below threshold
	    if "artist" not in ranked_by_cat and all_artist_tags:
	        best_artist_tag, best_artist_prob = max(all_artist_tags, key=lambda item: item[1])
	        ranked_by_cat["artist"] = [(best_artist_tag, best_artist_prob)]

	    lines = ["Predicted Tags by Category: \n"]
	    for cat, tag_list in ranked_by_cat.items():
	        lines.append(f"Category: {cat} – {len(tag_list)} tags")
	        lines.extend(f"- {escape_tag(tag)} (Prob: {prob:.3f})" for tag, prob in tag_list)
	        lines.append("")  # blank line between categories
	    return "\n".join(lines)

	def tag_image(pil_image: Image.Image, output_format: str, threshold: float) -> str:
	    """Run inference on the image and return tags in the chosen output format.

	    Args:
	        pil_image: Uploaded image, or ``None`` when nothing was uploaded.
	        output_format: ``"Prompt-style Tags"`` selects the comma-separated
	            prompt string; any other value selects the detailed listing.
	        threshold: Slider value, passed to ``get_tags`` as its
	            ``default_threshold``. It therefore applies only to categories
	            without their own entry in the metadata's category thresholds.

	    Returns:
	        The formatted tag text, or a request to upload an image.
	    """
	    if pil_image is None:
	        return "Please upload an image."

	    refined_logits = run_inference(pil_image)
	    results_by_cat, prompt_tags_by_cat, all_artist_tags = get_tags(
	        refined_logits, metadata, default_threshold=threshold
	    )

	    if output_format == "Prompt-style Tags":
	        return format_prompt_tags(prompt_tags_by_cat, all_artist_tags)
	    return format_detailed_output(results_by_cat, all_artist_tags)

	# Build the Gradio Blocks UI
	demo = gr.Blocks(theme="gradio/soft")

	with demo:
	    gr.Markdown(
	        "# 🏷️ Camie Tagger – Anime Image Tagging\n"
	        "This demo uses an ONNX model of Camie Tagger to label anime illustrations with tags. "
	        "Upload an image, adjust the threshold, and click Tag Image to see predictions."
	    )
	    gr.Markdown(
	        "(Note: In prompt-style output, only the top artist tag is displayed along with all character and general tags.)"
	    )
	    with gr.Row():
	        # Left column: inputs and the trigger button.
	        with gr.Column():
	            image_in = gr.Image(type="pil", label="Input Image")
	            format_choice = gr.Radio(
	                choices=["Prompt-style Tags", "Detailed Output"],
	                value="Prompt-style Tags",
	                label="Output Format"
	            )
	            # Slider to modify the default threshold value used in inference.
	            threshold_slider = gr.Slider(
	                minimum=0.0,
	                maximum=1.0,
	                step=0.05,
	                value=DEFAULT_THRESHOLD,
	                label="Threshold"
	            )
	            tag_button = gr.Button("🔍 Tag Image")
	        # Right column: rendered result.
	        with gr.Column():
	            output_box = gr.Markdown("")  # Markdown output for formatted results

	    # Pass the threshold_slider value into the tag_image function
	    tag_button.click(fn=tag_image, inputs=[image_in, format_choice, threshold_slider], outputs=output_box)

	    gr.Markdown(
	        "----\n"
	        "Model: [Camie Tagger ONNX](https://huggingface.co/AngelBottomless/camie-tagger-onnxruntime) • "
	        "Base Model: Camais03/camie-tagger (61% F1 on 70k tags) • "
	        "ONNX Runtime: for efficient CPU inference • "
	        "Demo built with Gradio Blocks."
	    )

	# Start the web server only when the file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()