import cv2
import gradio as gr
import numpy as np
import supervision as sv
import torch
from PIL import Image
from transformers import (
    RTDetrForObjectDetection,
    RTDetrImageProcessor,
    VitPoseForPoseEstimation,
    VitPoseImageProcessor,
    pipeline,
)
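
# Pipeline overview:
#   1. RT-DETR detects person bounding boxes.
#   2. ViTPose estimates 17 COCO keypoints for each detected person.
#   3. Depth Anything V2 produces a relative depth map of the scene.
#   4. MaskFormer (trained on ADE20K) assigns a semantic class to every pixel.
# A hand (wrist keypoint) is considered to be touching an object when a nearby
# non-person pixel is close to it in depth.
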
KEYPOINT_LABEL_MAP = {
    0: "Nose",
    1: "L_Eye",
    2: "R_Eye",
    3: "L_Ear",
    4: "R_Ear",
    5: "L_Shoulder",
    6: "R_Shoulder",
    7: "L_Elbow",
    8: "R_Elbow",
    9: "L_Wrist",
    10: "R_Wrist",
    11: "L_Hip",
    12: "R_Hip",
    13: "L_Knee",
    14: "R_Knee",
    15: "L_Ankle",
    16: "R_Ankle",
}
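# The 17 keypoints above follow the standard COCO ordering that ViTPose
# predicts. The wrist keypoints (indices 9 and 10) are used further down as a
# proxy for the hands.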


class InteractionDetector:
    def __init__(self):
        self.person_detector = None
        self.person_processor = None
        self.pose_model = None
        self.pose_processor = None
        self.depth_model = None
        self.segmentation_model = None
        # Maximum depth difference (in the 0-255 range of the normalized depth
        # map) between a wrist and a nearby pixel for the hand to count as
        # "touching" it
        self.interaction_threshold = 2
        self.load_models()

    def load_models(self):
        """Load all required models."""
        # Person detection model
        self.person_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
        self.person_detector = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
        # Pose estimation model
        self.pose_processor = VitPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple")
        self.pose_model = VitPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple")
        # Depth estimation model
        self.depth_model = pipeline("depth-estimation", model="depth-anything/Depth-Anything-V2-Small-hf")
        # Semantic segmentation model
        self.segmentation_model = pipeline("image-segmentation", model="facebook/maskformer-swin-base-ade")
        self.segmentation_id2label = self.segmentation_model.model.config.id2label
        self.segmentation_label2id = {v: k for k, v in self.segmentation_id2label.items()}
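
    # Note: all of the models above load on CPU by default, which is what the
    # rest of this file assumes. Moving them to a GPU (an optional change, not
    # done here) would also require moving the model inputs to the same device.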

    def get_nearest_pixel_class(self, joint, depth_map, segmentation_map):
        """
        Find the non-person pixel closest in depth to a given joint coordinate.

        Args:
            joint: (x, y) coordinates of the joint
            depth_map: Depth map (uint8, 0-255)
            segmentation_map: Semantic segmentation map of ADE20K class ids

        Returns:
            tuple: class id of the nearest pixel, depth distance to that pixel
        """
        PERSON_ID = 12  # "person" class id in ADE20K

        # Manhattan distance of every pixel from the joint, used to restrict
        # the search to a local window around the hand
        rows, cols = np.indices(depth_map.shape)
        dist_coord = np.abs(rows - joint[1]) + np.abs(cols - joint[0])

        # Depth difference to the joint; cast to a signed type first so the
        # uint8 subtraction cannot wrap around
        depth_dist = np.abs(depth_map.astype(np.int16) - int(depth_map[joint[1], joint[0]]))

        # Exclude person pixels and anything farther than 50 px from the joint
        depth_dist[(segmentation_map == PERSON_ID) | (dist_coord > 50)] = 255

        min_dist = np.unravel_index(np.argmin(depth_dist), depth_dist.shape)
        return int(segmentation_map[min_dist]), float(depth_dist[min_dist])

    def detect_persons(self, image: Image.Image):
        """Detect persons in the image."""
        inputs = self.person_processor(images=image, return_tensors="pt")
        with torch.no_grad():
            outputs = self.person_detector(**inputs)
        results = self.person_processor.post_process_object_detection(
            outputs,
            target_sizes=torch.tensor([(image.height, image.width)]),
            threshold=0.3,
        )
        # Label 0 is "person" for this COCO-trained RT-DETR checkpoint
        boxes = results[0]["boxes"][results[0]["labels"] == 0]
        scores = results[0]["scores"][results[0]["labels"] == 0]
        return boxes.cpu().numpy(), scores.cpu().numpy()

    def detect_keypoints(self, image: Image.Image):
        """Detect pose keypoints for every detected person."""
        boxes, scores = self.detect_persons(image)
        # VitPoseImageProcessor expects boxes in COCO (x, y, w, h) format,
        # while RT-DETR post-processing returns (x1, y1, x2, y2) corners
        coco_boxes = boxes.copy()
        coco_boxes[:, 2] -= coco_boxes[:, 0]
        coco_boxes[:, 3] -= coco_boxes[:, 1]
        pixel_values = self.pose_processor(image, boxes=[coco_boxes], return_tensors="pt").pixel_values
        with torch.no_grad():
            outputs = self.pose_model(pixel_values)
        pose_results = self.pose_processor.post_process_pose_estimation(outputs, boxes=[coco_boxes])[0]
        return pose_results, boxes, scores

    def estimate_depth(self, image: Image.Image):
        """Estimate a relative depth map for the image (uint8 array, 0-255)."""
        with torch.no_grad():
            depth_map = np.array(self.depth_model(image)["depth"])
        return depth_map

    def segment_image(self, image: Image.Image):
        """Run semantic segmentation and return a per-pixel map of ADE20K class ids."""
        with torch.no_grad():
            segmentation_map = self.segmentation_model(image)
        result = np.zeros(np.array(image).shape[:2], dtype=np.uint8)
        print("Found", [segment["label"] for segment in segmentation_map])
        # Paint larger masks first so that smaller, overlapping objects are not
        # wiped out by the bigger regions behind them
        for cls_item in sorted(segmentation_map, key=lambda segment: np.sum(segment["mask"]), reverse=True):
            result[np.array(cls_item["mask"]) > 0] = self.segmentation_label2id[cls_item["label"]]
        return result

    def detect_wall_interaction(self, image: Image.Image):
        """Detect whether each person's hands are touching a nearby object (e.g. a wall)."""
        # Gather all the intermediate outputs
        pose_results, boxes, scores = self.detect_keypoints(image)
        depth_map = self.estimate_depth(image)
        segmentation_map = self.segment_image(image)

        interactions = []
        for person_idx, pose_result in enumerate(pose_results):
            # The wrist keypoints stand in for the hands (see KEYPOINT_LABEL_MAP)
            right_hand = pose_result["keypoints"][10].numpy().astype(int)
            left_hand = pose_result["keypoints"][9].numpy().astype(int)

            # Find the nearest non-person pixel (in depth) around each hand
            right_cls, r_distance = self.get_nearest_pixel_class(right_hand[:2], depth_map, segmentation_map)
            left_cls, l_distance = self.get_nearest_pixel_class(left_hand[:2], depth_map, segmentation_map)

            # A hand is "touching" when that pixel is within the depth threshold
            right_touching = r_distance < self.interaction_threshold
            left_touching = l_distance < self.interaction_threshold

            interactions.append({
                "person_id": person_idx,
                "right_hand_touching_object": self.segmentation_id2label[right_cls],
                "left_hand_touching_object": self.segmentation_id2label[left_cls],
                "right_hand_touching": right_touching,
                "left_hand_touching": left_touching,
                "right_hand_distance": r_distance,
                "left_hand_distance": l_distance,
            })
        return interactions, pose_results, segmentation_map, depth_map

    def visualize_results(self, image: Image.Image, interactions, pose_results):
        """Visualize detection results."""
        # Create the base visualization from the original image
        vis_image = np.array(image).copy()

        # Draw the pose skeletons
        edge_annotator = sv.EdgeAnnotator(color=sv.Color.GREEN, thickness=2)
        key_points = sv.KeyPoints(
            xy=torch.cat([pose_result["keypoints"].unsqueeze(0) for pose_result in pose_results]).cpu().numpy()
        )
        vis_image = edge_annotator.annotate(scene=vis_image, key_points=key_points)

        # Add interaction indicators: a filled circle on each touching hand
        for interaction in interactions:
            person_id = interaction["person_id"]
            pose_result = pose_results[person_id]
            if interaction["right_hand_touching"]:
                cv2.circle(vis_image,
                           tuple(map(int, pose_result["keypoints"][10][:2])),
                           10, (0, 0, 255), -1)
            if interaction["left_hand_touching"]:
                cv2.circle(vis_image,
                           tuple(map(int, pose_result["keypoints"][9][:2])),
                           10, (0, 0, 255), -1)
        return Image.fromarray(vis_image)

    def process_image(self, input_image):
        """Process an image and return the visualizations plus interaction details."""
        if input_image is None:
            # Match the number of Gradio outputs: image, segmentation, depth, text
            return None, None, None, ""

        # Convert to PIL Image if necessary
        if isinstance(input_image, np.ndarray):
            image = Image.fromarray(input_image)
        else:
            image = input_image
        image = image.resize((1280, 720))

        # Detect interactions
        interactions, pose_results, segmentation_map, depth_map = self.detect_wall_interaction(image)

        # Visualize results
        result_image = self.visualize_results(image, interactions, pose_results)

        # Build the interaction information text
        info_text = []
        for interaction in interactions:
            info_text.append(f"\nPerson {interaction['person_id'] + 1}:")
            if interaction["right_hand_touching"]:
                info_text.append(f"Right hand is touching {interaction['right_hand_touching_object']}")
            if interaction["left_hand_touching"]:
                info_text.append(f"Left hand is touching {interaction['left_hand_touching_object']}")
            info_text.append(f"Right hand distance to nearest object: {interaction['right_hand_distance']:.2f}")
            info_text.append(f"Left hand distance to nearest object: {interaction['left_hand_distance']:.2f}")

        # Color the segmentation map for display
        mask = np.zeros((*segmentation_map.shape, 3), dtype=np.uint8)
        colors = np.random.randint(0, 255, size=(100, 3))
        for cl_id in np.unique(segmentation_map):
            mask_array = np.array(segmentation_map == cl_id)
            color = colors[cl_id % len(colors)]
            mask[mask_array] = color

        return result_image, mask, depth_map, "\n".join(info_text)
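
# Programmatic usage (a minimal sketch, independent of the Gradio UI below;
# the path refers to one of the bundled example images):
#
#   detector = InteractionDetector()
#   image = Image.open("images/276757975.jpg")
#   vis, seg_vis, depth, info = detector.process_image(image)
#   print(info)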


def create_gradio_interface():
    """Create the Gradio interface."""
    detector = InteractionDetector()

    with gr.Blocks() as interface:
        gr.Markdown("# Object Interaction Detection")
        gr.Markdown("Upload an image to detect when people are touching objects.")

        with gr.Row():
            with gr.Column():
                input_image = gr.Image(label="Input Image")
                process_button = gr.Button("Detect Interactions")
            with gr.Column():
                output_image = gr.Image(label="Detection Results")
                interaction_info = gr.Textbox(
                    label="Interaction Information",
                    lines=10,
                    placeholder="Interaction details will appear here...",
                )
                segmentation_im = gr.Image(label="Segmentation Results")
                depth_im = gr.Image(label="Depth Results")

        process_button.click(
            fn=detector.process_image,
            inputs=input_image,
            outputs=[output_image, segmentation_im, depth_im, interaction_info],
        )

        gr.Examples(
            examples=[
                "images/1-8ea4418f.jpg",
                "images/276757975.jpg",
            ],
            inputs=input_image,
        )
    return interface


interface = create_gradio_interface()

if __name__ == "__main__":
    interface.launch(debug=True)
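
# To run locally (assuming this file is saved as the Space's app.py):
#     python app.py
# then open the local URL that Gradio prints.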