Spaces:

XAI
/

Cleaning-ImageNet-Hard

Paused

App Files Files Community

Cleaning-ImageNet-Hard / app.py

taesiri

backup

fbfb369 about 2 years ago

raw

history blame

10.2 kB

	import csv
	import json
	import os
	import pickle
	import random
	import string
	import sys
	import time
	from glob import glob

	import datasets
	import gdown
	import gradio as gr
	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd
	import torch
	import torchvision
	from huggingface_hub import HfApi, login, snapshot_download
	from PIL import Image

	# Authenticate against the Hugging Face Hub with the token taken from the
	# "SessionToken" environment variable (None when the variable is unset).
	session_token = os.environ.get("SessionToken")
	login(token=session_token)

	# Raise the csv module's per-field size cap to sys.maxsize.
	csv.field_size_limit(sys.maxsize)

	# Seed NumPy from the wall clock so sampled batches differ between restarts.
	np.random.seed(int(time.time()))

	# NOTE(review): pickle.load can execute arbitrary code from the file. This is
	# assumed to be a trusted file bundled with the app -- never point it at
	# user-supplied data.
	with open("./imagenet_hard_nearest_indices.pkl", "rb") as f:
	    knn_results = pickle.load(f)

	with open("imagenet-labels.json") as f:
	    wnid_to_label = json.load(f)

	with open("id_to_label.json", "r") as f:
	    id_to_labels = json.load(f)

	imagenet_training_samples_path = "imagenet_traning_samples"

	# Each non-blank line of ex2.txt contributes the integer found before its first
	# dot (e.g. "123.JPEG" -> 123). The file is closed deterministically and
	# whitespace-only lines (such as a stray "\r") are skipped instead of crashing.
	with open("./ex2.txt", "r") as f:
	    _stems = [line.split(".")[0].strip() for line in f]
	bad_items = [int(stem) for stem in _stems if stem]

	# Upper bound on how many images are served per review batch.
	NUMBER_OF_IMAGES = 100  # len(bad_items)

	# Fetch the review-sample archive (checked against its md5 by gdown).
	gdown.cached_download(
	    url="https://huggingface.co/datasets/taesiri/imagenet_hard_review_samples/resolve/main/data.zip",
	    path="./data.zip",
	    quiet=False,
	    md5="8666a9b361f6eea79878be6c09701def",
	)

	# Unpack the archive only when at least one of the expected folders is absent.
	_required_dirs = ("./imagenet_traning_samples", "./knn_cache_for_imagenet_hard")
	if not all(os.path.exists(folder) for folder in _required_dirs):
	    torchvision.datasets.utils.extract_archive(
	        from_path="data.zip",
	        to_path="./",
	        remove_finished=False,
	    )

	# Validation split of the ImageNet-Hard dataset, indexed by integer position.
	imagenet_hard = datasets.load_dataset("taesiri/imagenet-hard", split="validation")


	def update_snapshot(username):
	    """Return the stored review decisions that belong to ``username``.

	    Downloads every ``*.json`` file of the ``taesiri/imagenet_hard_review_data``
	    dataset repo, reads the ``id``, ``user_id``, ``time`` and ``decision``
	    fields of each file, and keeps the rows whose ``user_id`` equals
	    ``username``.

	    Args:
	        username: User identifier to filter the decisions by.

	    Returns:
	        A pandas DataFrame with columns ``id``, ``user_id``, ``time`` and
	        ``decision`` (empty when the user has no stored decisions).

	    Raises:
	        KeyError: If a downloaded JSON file lacks one of the four fields.
	    """
	    output_dir = snapshot_download(
	        repo_id="taesiri/imagenet_hard_review_data",
	        allow_patterns="*.json",
	        repo_type="dataset",
	    )

	    columns = ["id", "user_id", "time", "decision"]
	    rows = []
	    for path in glob(f"{output_dir}/*.json"):
	        with open(path) as f:
	            record = json.load(f)
	        rows.append([record[column] for column in columns])

	    df = pd.DataFrame(rows, columns=columns)
	    return df[df["user_id"] == username]


	def generate_dataset(username):
	    """Build the next batch of images that ``username`` has not reviewed yet.

	    The candidate pool is ``bad_items`` minus the ids the user already
	    submitted a decision for. At most ``NUMBER_OF_IMAGES`` candidates are
	    served; when more are available a random subset is drawn without
	    replacement.

	    Args:
	        username: User identifier used to look up earlier decisions.

	    Returns:
	        A list of dicts with keys ``id``, ``image``, ``correct_label`` and
	        ``original_id``; an empty list when nothing is left to review.
	    """
	    global NUMBER_OF_IMAGES
	    df = update_snapshot(username)

	    remaining = list(set(bad_items) - set(df.id))
	    if not remaining:
	        return []

	    if len(remaining) < NUMBER_OF_IMAGES:
	        # NOTE(review): this shrinks the module-level batch size for every
	        # session, not just this user. It is kept because update_app reads
	        # NUMBER_OF_IMAGES to detect the last image of a batch.
	        NUMBER_OF_IMAGES = len(remaining)
	        chosen = remaining
	    else:
	        chosen = np.random.choice(remaining, NUMBER_OF_IMAGES, replace=False)

	    data = []
	    for raw_index in chosen:
	        # Normalise NumPy integers to plain ints so ids have one type.
	        index = int(raw_index)
	        # Fetch the dataset row once instead of once per field.
	        row = imagenet_hard[index]
	        data.append(
	            {
	                "id": index,
	                "image": row["image"],
	                "correct_label": row["english_label"],
	                "original_id": index,
	            }
	        )
	    return data


	def string_to_image(text):
	    """Render ``text`` as a Matplotlib figure on a plain white background.

	    Underscores become spaces, the text is lower-cased, and every ``", "``
	    separator becomes a line break, so a comma-separated label list is shown
	    one label per line.

	    Args:
	        text: The string to render.

	    Returns:
	        A ``matplotlib.figure.Figure`` containing the centred text, with ticks
	        and spines hidden.
	    """
	    # Imported here so the block stays self-contained; matplotlib is already a
	    # dependency of this file.
	    from matplotlib.figure import Figure

	    text = text.replace("_", " ").lower().replace(", ", "\n")
	    # All-ones RGB array, drawn as a white backdrop stretched over the axes.
	    img = np.ones((220, 75, 3))

	    # Build the figure without pyplot: pyplot keeps a reference to every figure
	    # it creates until it is explicitly closed, which accumulates memory in a
	    # long-running server that renders one figure per request.
	    fig = Figure(figsize=(6, 2.25))
	    ax = fig.subplots()
	    ax.imshow(img, extent=[0, 1, 0, 1])
	    ax.text(0.5, 0.75, text, fontsize=18, ha="center", va="center")
	    ax.set_xticks([])
	    ax.set_yticks([])
	    ax.set_xticklabels([])
	    ax.set_yticklabels([])
	    for spine in ax.spines.values():
	        spine.set_visible(False)

	    return fig


	# Every extracted sample image, plus a lookup from the integer that starts the
	# file name (the text before the first "." and "_") to the file's path.
	all_samples = glob("./imagenet_traning_samples/*.JPEG")
	qid_to_sample = dict(
	    (int(path.rsplit("/", 1)[-1].split(".", 1)[0].split("_", 1)[0]), path)
	    for path in all_samples
	)

	# user-e3z5b


	def get_training_samples(qid):
	    """Return the sample-image paths for the labels of dataset item ``qid``.

	    Args:
	        qid: Position of the item in ``imagenet_hard`` (converted with ``int``).

	    Returns:
	        A list with one path from ``qid_to_sample`` per entry of the item's
	        ``label`` field.

	    Raises:
	        KeyError: If a label has no entry in ``qid_to_sample``.
	    """
	    item = imagenet_hard[int(qid)]
	    return [qid_to_sample[label_id] for label_id in item["label"]]


	def load_sample(data, current_index):
	    """Return the query image and its labels for one entry of ``data``.

	    Args:
	        data: Sequence of sample dicts holding at least the keys ``image`` and
	            ``correct_label``.
	        current_index: Position of the wanted entry in ``data``.

	    Returns:
	        A ``(image, labels)`` tuple taken from the selected entry.
	    """
	    sample = data[current_index]
	    return sample["image"], sample["correct_label"]


	def preprocessing(data, current_index, history, username):
	    """Load a fresh review batch for ``username`` and show its first image.

	    Args:
	        data: Previous batch; ignored and replaced by a newly generated one.
	        current_index: Previous position; ignored and reset.
	        history: Passed through unchanged.
	        username: User identifier the batch is generated for.

	    Returns:
	        A 6-tuple ``(query image, label plot, current index, history, data,
	        training sample images)``. When nothing is left to review, a
	        placeholder image and message are returned, the index is -1 and the
	        training samples are ``None``.
	    """
	    data = generate_dataset(username)

	    if not data:
	        fake_plot = string_to_image("No more images to review")
	        empty_image = Image.new("RGB", (224, 224))
	        # Return -1 rather than the stale incoming index: update_app treats -1
	        # as "nothing loaded", so later button clicks cannot index into the
	        # empty batch.
	        return empty_image, fake_plot, -1, history, data, None

	    current_index = 0
	    qimage, labels = load_sample(data, current_index)
	    image_id = data[current_index]["id"]
	    training_samples_image = [
	        Image.open(path).convert("RGB") for path in get_training_samples(image_id)
	    ]

	    # labels is a list of labels; join it into one string for rendering
	    label_plot = string_to_image(", ".join(labels))

	    return qimage, label_plot, current_index, history, data, training_samples_image


	def _submit_decision(image_id, username, decision):
	    """Upload one review decision as a JSON file to the review dataset repo."""
	    time_stamp = int(time.time())
	    record = {
	        "id": int(image_id),
	        "user_id": username,
	        "time": time_stamp,
	        "decision": decision,
	    }

	    # The username is typed by the visitor; keep only filename-safe characters
	    # so it cannot inject path separators into the repo path. The unmodified
	    # username is still stored inside the JSON record.
	    safe_user = "".join(
	        ch if ch.isalnum() or ch in "._-" else "_" for ch in str(username)
	    )
	    filename = f"results_{safe_user}_{time_stamp}.json"

	    # Upload straight from memory: no temporary file to create, clean up, or
	    # leave behind when the upload fails.
	    HfApi().upload_file(
	        path_or_fileobj=json.dumps(record).encode("utf-8"),
	        path_in_repo=filename,
	        repo_id="taesiri/imagenet_hard_review_data",
	        repo_type="dataset",
	    )


	def update_app(decision, data, current_index, history, username):
	    """Record the decision for the current image and advance to the next one.

	    Args:
	        decision: Value supplied by the clicked button component.
	        data: Current batch, as produced by ``generate_dataset``.
	        current_index: Position of the image being judged; -1 when no batch is
	            loaded.
	        history: Passed through unchanged.
	        username: User identifier stored with the decision.

	    Returns:
	        A 6-tuple ``(query image, label plot, current index, history, data,
	        training sample images)``. After the last image of the batch a
	        placeholder image and a thank-you message are returned and the index
	        is reset to -1. When no image is loaded, the visible components are
	        left untouched and the state values are passed through.
	    """
	    # Use the size of this session's batch rather than the module-level
	    # NUMBER_OF_IMAGES, which is shared (and mutated) across sessions.
	    total = len(data)
	    if not 0 <= current_index < total:
	        # Nothing to judge: keep the UI as it is instead of returning None for
	        # a six-output handler.
	        return gr.update(), gr.update(), current_index, history, data, gr.update()

	    _submit_decision(data[current_index]["id"], username, decision)

	    if current_index == total - 1:
	        fake_plot = string_to_image("Thank you for your time!")
	        empty_image = Image.new("RGB", (224, 224))
	        # Reset to -1 so further clicks do not re-submit the last decision.
	        return empty_image, fake_plot, -1, history, data, None

	    # Load the next image
	    current_index += 1
	    qimage, labels = load_sample(data, current_index)
	    image_id = data[current_index]["id"]
	    training_samples_image = [
	        Image.open(path).convert("RGB") for path in get_training_samples(image_id)
	    ]

	    # labels is a list of labels; join it into one string for rendering
	    label_plot = string_to_image(", ".join(labels))

	    return qimage, label_plot, current_index, history, data, training_samples_image


	# Custom CSS handed to gr.Blocks: lets the elements with ids query_image,
	# nn_gallery and sample_gallery size their height automatically.
	newcss = """
	#query_image{
	height: auto !important;
	}

	#nn_gallery {
	height: auto !important;
	}

	#sample_gallery {
	height: auto !important;
	}
	"""

	def _random_username():
	    """Return a default username of the form ``user-`` plus 5 random characters."""
	    suffix = "".join(
	        random.choice(string.ascii_lowercase + string.digits) for _ in range(5)
	    )
	    return f"user-{suffix}"


	# NOTE(review): the source this was recovered from had lost its indentation, so
	# the nesting of rows and columns below is a reconstruction -- confirm it
	# against the intended layout.
	with gr.Blocks(css=newcss) as demo:
	    # Per-session state: the current batch, the position in it (-1 = nothing
	    # loaded), and a history value that the handlers pass through.
	    data_gr = gr.State({})
	    current_index = gr.State(-1)
	    history = gr.State({})

	    gr.Markdown("# Cleaning ImageNet-Hard!")

	    with gr.Row():
	        username = gr.Textbox(label="Username", value=_random_username())
	        prepare_btn = gr.Button(value="Load Samples")

	    with gr.Column():
	        with gr.Row():
	            accept_btn = gr.Button(value="Accept")
	            maybe_btn = gr.Button(value="Not Sure!")
	            reject_btn = gr.Button(value="Reject")
	        with gr.Row():
	            query_image = gr.Image(type="pil", label="Query", elem_id="query_image")
	            with gr.Column():
	                label_plot = gr.Plot(
	                    label="Is this a correct label for this image?", type="fig"
	                )
	                training_samples = gr.Gallery(
	                    type="pil", label="Training samples", elem_id="sample_gallery"
	                )

	    # Every handler refreshes the same six components, in this order.
	    view_outputs = [
	        query_image,
	        label_plot,
	        current_index,
	        history,
	        data_gr,
	        training_samples,
	    ]

	    # Each decision button passes itself as the first input, which update_app
	    # receives as its `decision` argument.
	    for decision_btn in (accept_btn, maybe_btn, reject_btn):
	        decision_btn.click(
	            update_app,
	            inputs=[decision_btn, data_gr, current_index, history, username],
	            outputs=view_outputs,
	        )

	    prepare_btn.click(
	        preprocessing,
	        inputs=[data_gr, current_index, history, username],
	        outputs=view_outputs,
	    )

	    # The Textbox default above is computed once, when the app is built, so
	    # every visitor would otherwise start with the same username. Regenerate
	    # it on each page load.
	    demo.load(_random_username, inputs=None, outputs=username)

	demo.launch()