Spaces:

HaohuaLv
/

one-shot_object_detection

Running

App Files Files

xet

Community

one-shot_object_detection / app.py

HaohuaLv

Upload app.py

d1083aa almost 2 years ago

raw

history blame

8.4 kB

	import gradio as gr
	from PIL import Image, ImageDraw
	import torch
	from transformers import OwlViTProcessor, OwlViTForObjectDetection, OwlViTModel, OwlViTImageProcessor
	from transformers.image_transforms import center_to_corners_format
	from transformers.models.owlvit.modeling_owlvit import box_iou
	from functools import partial

	# from utils import iou

	# Single source of truth for the pretrained checkpoint, so the processor and
	# the detection model are always loaded from the same repository.
	_OWLVIT_CHECKPOINT = "google/owlvit-base-patch32"

	processor = OwlViTProcessor.from_pretrained(_OWLVIT_CHECKPOINT)
	model = OwlViTForObjectDetection.from_pretrained(_OWLVIT_CHECKPOINT)

	from transformers.models.owlvit.modeling_owlvit import OwlViTImageGuidedObjectDetectionOutput, OwlViTClassPredictionHead





	def classpredictionhead_box_forward(
	    self,
	    image_embeds,
	    query_indice,
	    query_mask,
	):
	    """Score every image patch against the embedding of one selected patch.

	    Unlike a forward pass that receives query embeddings from the caller,
	    this variant reads the query from the projected patch embeddings of the
	    image itself, at position ``query_indice``.

	    Args:
	        self: Object exposing the callables ``dense0``, ``logit_shift``,
	            ``logit_scale`` and ``elu``.
	        image_embeds: Patch features fed to ``dense0``, ``logit_shift`` and
	            ``logit_scale``. After projection they are indexed as
	            ``[batch, patch]``; only batch element 0 supplies the query.
	        query_indice: Index of the patch whose embedding becomes the query.
	        query_mask: Optional mask. Where it equals 0 the logit is replaced
	            by -1e6. Pass ``None`` to skip masking.

	    Returns:
	        Tuple ``(pred_logits, image_class_embeds)``: the shifted and scaled
	        similarity logits, and the normalised projected patch embeddings.
	    """
	    image_class_embeds = self.dense0(image_embeds)

	    # L2-normalise the projected embeddings; the epsilon guards against a
	    # division by zero for an all-zero embedding.
	    image_class_embeds = image_class_embeds / (
	        torch.linalg.norm(image_class_embeds, dim=-1, keepdim=True) + 1e-6
	    )

	    # The query is sliced out of the already-normalised embeddings, so it
	    # needs no second normalisation. Two leading singleton axes are added
	    # so that it lines up with the einsum pattern below.
	    query_embeds = image_class_embeds[0, query_indice].unsqueeze(0).unsqueeze(0)

	    # Dot product between every patch embedding and the query embedding.
	    pred_logits = torch.einsum("...pd,...qd->...pq", image_class_embeds, query_embeds)

	    # Apply the learned per-patch shift and (strictly positive) scale.
	    logit_shift = self.logit_shift(image_embeds)
	    logit_scale = self.elu(self.logit_scale(image_embeds)) + 1
	    pred_logits = (pred_logits + logit_shift) * logit_scale

	    if query_mask is not None:
	        if query_mask.ndim > 1:
	            query_mask = torch.unsqueeze(query_mask, dim=-2)

	        # Masked positions receive a large negative logit; the replacement
	        # is done in float64 and the result cast back to float32.
	        pred_logits = pred_logits.to(torch.float64)
	        pred_logits = torch.where(query_mask == 0, -1e6, pred_logits)
	        pred_logits = pred_logits.to(torch.float32)

	    return (pred_logits, image_class_embeds)



	def class_predictor(
	    self,
	    image_feats,
	    query_indice=None,
	    query_mask=None,
	):
	    """Forward the patch features to the class head's box-guided scorer.

	    Args:
	        self: Object whose ``class_head`` exposes
	            ``classpredictionhead_box_forward``.
	        image_feats: Patch features handed on unchanged.
	        query_indice: Index of the query patch, handed on unchanged.
	        query_mask: Optional mask, handed on unchanged.

	    Returns:
	        Tuple ``(pred_logits, image_class_embeds)`` exactly as produced by
	        the class head.
	    """
	    head_forward = self.class_head.classpredictionhead_box_forward
	    logits, patch_embeds = head_forward(image_feats, query_indice, query_mask)
	    return logits, patch_embeds








	def get_max_iou_indice(target_pred_boxes, query_box, target_sizes):
	    """Return the index of the predicted box that best overlaps ``query_box``.

	    Args:
	        target_pred_boxes: Predicted boxes in centre format; they are
	            converted to corner format and multiplied by the image size.
	        query_box: Reference box(es) passed as the second argument to
	            ``box_iou``.
	        target_sizes: Tensor whose rows unbind into ``(height, width)``.

	    Returns:
	        The result of ``argmax()`` over the whole IoU matrix, i.e. an index
	        into the flattened matrix.
	        NOTE(review): this equals a box index only for a single image
	        (``squeeze(0)``) and a single query box -- confirm with callers.
	    """
	    heights, widths = target_sizes.unbind(1)
	    # One (w, h, w, h) multiplier per image, broadcast across its boxes.
	    pixel_scale = torch.stack([widths, heights, widths, heights], dim=1)
	    corner_boxes = center_to_corners_format(target_pred_boxes) * pixel_scale[:, None, :]

	    overlaps, _unused = box_iou(corner_boxes.squeeze(0), query_box)
	    return overlaps.argmax()


	def box_guided_detection(
	    self: OwlViTForObjectDetection,
	    pixel_values,
	    query_box=None,
	    target_sizes=None,
	    output_attentions=None,
	    output_hidden_states=None,
	    return_dict=None,
	):
	    """Detect objects similar to a region of the same image.

	    The image is embedded once. The predicted box with the highest IoU
	    against ``query_box`` is located, and the class embedding of that patch
	    is used as the query when scoring every patch.

	    Args:
	        self: The detection model providing ``config``, ``image_embedder``,
	            ``box_predictor`` and ``class_predictor``.
	        pixel_values: Preprocessed image tensor passed to ``image_embedder``.
	        query_box: Reference box, forwarded to ``get_max_iou_indice``.
	            Required despite the ``None`` default.
	        target_sizes: Image size tensor, forwarded to ``get_max_iou_indice``.
	            Required despite the ``None`` default.
	        output_attentions: Overrides ``config.output_attentions`` when set.
	        output_hidden_states: Overrides ``config.output_hidden_states`` when
	            set.
	        return_dict: Overrides ``config.return_dict`` when set.

	    Returns:
	        An ``OwlViTImageGuidedObjectDetectionOutput`` when ``return_dict`` is
	        truthy, otherwise a tuple of the non-``None`` values
	        ``(feature_map, target_pred_boxes, pred_logits, class_embeds,
	        vision_outputs.to_tuple())``.

	    Raises:
	        ValueError: If ``query_box`` or ``target_sizes`` is not supplied.
	    """
	    # Both values are needed to pick the query patch. Fail before running
	    # the vision tower rather than with an AttributeError in the IoU helper.
	    if query_box is None or target_sizes is None:
	        raise ValueError("box_guided_detection requires both `query_box` and `target_sizes`.")

	    if output_attentions is None:
	        output_attentions = self.config.output_attentions
	    if output_hidden_states is None:
	        output_hidden_states = self.config.output_hidden_states
	    if return_dict is None:
	        return_dict = self.config.return_dict

	    # Compute the feature map of the input image.
	    feature_map, vision_outputs = self.image_embedder(
	        pixel_values=pixel_values,
	        output_attentions=output_attentions,
	        output_hidden_states=output_hidden_states,
	    )

	    # Flatten the two patch-grid axes into one sequence axis. Height and
	    # width are kept as separate names so neither overwrites the other.
	    batch_size, num_patches_height, num_patches_width, hidden_dim = feature_map.shape
	    image_feats = torch.reshape(
	        feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim)
	    )

	    # Predict object boxes.
	    target_pred_boxes = self.box_predictor(image_feats, feature_map)

	    # Index of the predicted box with the highest IoU against the query box;
	    # its embedding serves as the query.
	    query_indice = get_max_iou_indice(target_pred_boxes, query_box, target_sizes)

	    # Score every patch against the selected query embedding.
	    (pred_logits, class_embeds) = self.class_predictor(image_feats=image_feats, query_indice=query_indice)

	    if not return_dict:
	        output = (
	            feature_map,
	            target_pred_boxes,
	            pred_logits,
	            class_embeds,
	            vision_outputs.to_tuple(),
	        )
	        return tuple(x for x in output if x is not None)

	    return OwlViTImageGuidedObjectDetectionOutput(
	        image_embeds=feature_map,
	        target_pred_boxes=target_pred_boxes,
	        logits=pred_logits,
	        class_embeds=class_embeds,
	        text_model_output=None,
	        vision_model_output=vision_outputs,
	    )


	# Attach the box-guided functions defined above to the loaded model instance.
	# Each `partial` pre-binds the object that plays the role of `self`:
	# the model for the first two, the model's class head for the third.
	model.box_guided_detection = partial(box_guided_detection, model)
	model.class_predictor = partial(class_predictor, model)
	model.class_head.classpredictionhead_box_forward = partial(classpredictionhead_box_forward, model.class_head)


	# Most recent raw detection output: written by one_shot_detect and read again
	# by threshold_change. Stays None until the first detection has run.
	outputs = None
	def prepare_embedds(xmin, ymin, xmax, ymax, image):
	    """Build the annotation payload that previews the manual query box.

	    Args:
	        xmin, ymin, xmax, ymax: Box coordinates; each is truncated with
	            ``int``.
	        image: The image to annotate, or ``None`` when none is loaded.

	    Returns:
	        ``(image, [(box, "manual")])``, or ``None`` when ``image`` is
	        ``None`` so that nothing is drawn.
	    """
	    # Without a base image there is nothing to draw the box on.
	    if image is None:
	        return None

	    box = (int(xmin), int(ymin), int(xmax), int(ymax))
	    return (image, [(box, "manual")])

	def manul_box_change(xmin, ymin, xmax, ymax, image):
	    """Rebuild the manual-box preview after a coordinate has changed.

	    Args:
	        xmin, ymin, xmax, ymax: Box coordinates; each is truncated with
	            ``int``.
	        image: The image to annotate, or ``None`` when none is loaded.

	    Returns:
	        ``(image, [(box, "manual")])``, or ``None`` when ``image`` is
	        ``None`` so that nothing is drawn.
	    """
	    # Without a base image there is nothing to draw the box on.
	    if image is None:
	        return None

	    box = (int(xmin), int(ymin), int(xmax), int(ymax))
	    return (image, [(box, "manual")])

	def threshold_change(xmin, ymin, xmax, ymax, image, threshold, nms):
	    """Re-filter the cached detection output with new threshold / NMS values.

	    The model is not re-run; the module-level ``outputs`` is post-processed
	    again with the given settings.

	    Args:
	        xmin, ymin, xmax, ymax: Manual box coordinates; each is truncated
	            with ``int``.
	        image: Image being annotated; ``image.size`` reversed is used as the
	            target size. May be ``None``.
	        threshold: Score threshold forwarded to the post-processor.
	        nms: NMS threshold forwarded to the post-processor.

	    Returns:
	        Tuple ``(annotation, count)``. ``annotation`` is
	        ``(image, labels)`` where ``labels`` holds ``(box, score)`` pairs
	        followed by the manual box, or ``None`` when there is no image.
	        ``count`` is the number of detections minus one, never below zero.
	        NOTE(review): the "minus one" presumably leaves out the query
	        object's own detection -- confirm.
	    """
	    global outputs

	    # Nothing can be annotated before an image has been provided.
	    if image is None:
	        return None, 0

	    manul_box = (int(xmin), int(ymin), int(xmax), int(ymax))

	    # No detection has been run yet: show only the manual box.
	    if outputs is None:
	        return (image, [(manul_box, "manual")]), 0

	    target_sizes = torch.Tensor([image.size[::-1]])
	    results = processor.post_process_image_guided_detection(
	        outputs=outputs, threshold=threshold, nms_threshold=nms, target_sizes=target_sizes
	    )

	    boxes = results[0]["boxes"].type(torch.int64).tolist()
	    scores = results[0]["scores"].tolist()
	    labels = list(zip(boxes, scores))
	    labels.append((manul_box, "manual"))

	    # Clamp so that an empty detection list reports 0 instead of -1.
	    cnt = max(len(boxes) - 1, 0)

	    return (image, labels), cnt

	def one_shot_detect(xmin, ymin, xmax, ymax, image, threshold, nms):
	    """Run box-guided detection on ``image`` and cache the raw output.

	    Args:
	        xmin, ymin, xmax, ymax: Manual query box coordinates; each is
	            truncated with ``int``.
	        image: Image to search; it is converted to RGB for the processor and
	            ``image.size`` reversed is used as the target size. May be
	            ``None``.
	        threshold: Score threshold forwarded to the post-processor.
	        nms: NMS threshold forwarded to the post-processor.

	    Returns:
	        Tuple ``(annotation, count)``. ``annotation`` is
	        ``(image, labels)`` where ``labels`` holds ``(box, score)`` pairs
	        followed by the manual box, or ``None`` when there is no image.
	        ``count`` is the number of detections minus one, never below zero.
	        NOTE(review): the "minus one" presumably leaves out the query
	        object's own detection -- confirm.

	    Side effects:
	        Stores the model output in the module-level ``outputs``.
	    """
	    global outputs

	    # Nothing can be detected before an image has been provided.
	    if image is None:
	        return None, 0

	    manul_box = (int(xmin), int(ymin), int(xmax), int(ymax))

	    target_sizes = torch.Tensor([image.size[::-1]])
	    inputs = processor(images=image.convert("RGB"), return_tensors="pt")

	    # Inference only: disabling gradient tracking reduces memory use, which
	    # matters because the result is kept in a module-level variable.
	    with torch.no_grad():
	        outputs = model.box_guided_detection(
	            **inputs, query_box=torch.Tensor([manul_box]), target_sizes=target_sizes
	        )

	    results = processor.post_process_image_guided_detection(
	        outputs=outputs, threshold=threshold, nms_threshold=nms, target_sizes=target_sizes
	    )

	    boxes = results[0]["boxes"].type(torch.int64).tolist()
	    scores = results[0]["scores"].tolist()
	    labels = list(zip(boxes, scores))
	    labels.append((manul_box, "manual"))

	    # Clamp so that an empty detection list reports 0 instead of -1.
	    cnt = max(len(boxes) - 1, 0)

	    return (image, labels), cnt


	# Build the Gradio interface: an input image with threshold / NMS controls and
	# a read-only detection count, the annotated result, four number fields for
	# the manual query box, and a button that starts detection.
	# NOTE(review): nested indentation is missing in this copy of the file; the
	# layout nesting must be taken from the original source.
	with gr.Blocks() as demo:
	with gr.Row():
	with gr.Column():
	image = gr.Image(type="pil")
	threshold = gr.Number(0.95, label="threshold", step=0.01)
	nms = gr.Number(0.3, label="nms", step=0.01)
	# Output-only field that shows the count returned by the handlers.
	cnt = gr.Number(0, label="count", interactive=False)
	with gr.Column():
	annotatedimage = gr.AnnotatedImage()
	with gr.Row():
	# Initial coordinates of the manual query box.
	xmin = gr.Number(8, label="xmin")
	ymin = gr.Number(198, label="ymin")
	xmax = gr.Number(100, label="xmax")
	ymax = gr.Number(428, label="ymax")
	button = gr.Button(variant="primary")

	# Editing any coordinate redraws the manual box preview.
	xmin.change(manul_box_change, [xmin, ymin, xmax, ymax, image], [annotatedimage])
	ymin.change(manul_box_change, [xmin, ymin, xmax, ymax, image], [annotatedimage])
	xmax.change(manul_box_change, [xmin, ymin, xmax, ymax, image], [annotatedimage])
	ymax.change(manul_box_change, [xmin, ymin, xmax, ymax, image], [annotatedimage])
	# Changing the threshold or NMS value re-filters the cached detection output
	# and refreshes both the annotation and the count.
	threshold.change(threshold_change, [xmin, ymin, xmax, ymax, image, threshold, nms], [annotatedimage, cnt])
	nms.change(threshold_change, [xmin, ymin, xmax, ymax, image, threshold, nms], [annotatedimage, cnt])
	# A newly uploaded image immediately shows the manual box preview.
	image.upload(prepare_embedds, [xmin, ymin, xmax, ymax, image], [annotatedimage])
	# The button runs the detector and updates the annotation and the count.
	button.click(one_shot_detect, [xmin, ymin, xmax, ymax, image, threshold, nms], [annotatedimage, cnt])



	# Start the web server on a fixed port.
	demo.launch(server_port=7861)