# NOTE: removed non-code artifacts accidentally captured at the top of this
# file (repository web-page header: username, commit hash, "raw",
# "history blame", file size) — they are not valid Python.
from pathlib import Path
import os
import time
import numpy as np
import torch
import cv2
import open3d as o3d
from threadpoolctl import threadpool_limits
import multiprocess as mp
from functools import partial
from PIL import Image
import supervision as sv
from pgnd.utils import get_root
root: Path = get_root(__file__)
from camera.multi_realsense import MultiRealsense
from camera.single_realsense import SingleRealsense
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
from sam2.build_sam import build_sam2, build_sam2_video_predictor
from sam2.sam2_image_predictor import SAM2ImagePredictor
from utils.pcd_utils import depth2fgpcd
def get_mask_raw(depth, intr, extr, bbox, depth_threshold=(0, 2)):
    """Compute a boolean foreground mask for a depth image.

    A pixel is kept when its depth lies strictly inside ``depth_threshold``
    and its back-projected 3D point (transformed by the inverse extrinsics)
    falls inside the x/y extent of ``bbox``. The z axis is deliberately not
    checked (see inline comment below).

    Args:
        depth: (H, W) depth image in meters.
        intr: (3, 3) camera intrinsics matrix.
        extr: (4, 4) camera extrinsics matrix; points are mapped through
            ``inv(extr)`` — presumably camera-to-world; confirm with caller.
        bbox: indexable as ``bbox[0] = (x_min, x_max)``, ``bbox[1] = (y_min, y_max)``.
        depth_threshold: (near, far) depth limits in meters. Changed from a
            mutable list default to a tuple to avoid the shared-mutable-default
            pitfall; indexing behavior is identical.

    Returns:
        (H, W) boolean mask.
    """
    # Back-project every pixel to a 3D point (camera frame).
    points = depth2fgpcd(depth, intr).reshape(-1, 3)
    # Keep pixels whose depth is strictly inside the (near, far) window.
    mask = np.logical_and((depth > depth_threshold[0]), (depth < depth_threshold[1]))  # (H, W)
    # Map points through the inverse extrinsics (homogeneous coordinates).
    points = (np.linalg.inv(extr) @ np.concatenate([points, np.ones((points.shape[0], 1)).astype(np.float32)], axis=1).T).T[:, :3]  # (N, 3)
    # Restrict to the workspace bounding box in x and y.
    mask_bbox = np.logical_and(
        np.logical_and(points[:, 0] > bbox[0][0], points[:, 0] < bbox[0][1]),
        np.logical_and(points[:, 1] > bbox[1][0], points[:, 1] < bbox[1][1])
    )  # does not include z axis
    mask_bbox = mask_bbox.reshape(depth.shape[0], depth.shape[1])
    mask = np.logical_and(mask, mask_bbox)
    return mask
def segment_process_func(cameras_output, intrs, extrs, text_prompts, processor, grounding_model, image_predictor, bbox, device, show_annotation=True):
    """Detect and segment prompted objects in every camera view; return point clouds.

    Per camera: runs Grounding-DINO on the color image with ``text_prompts``,
    vets the detected boxes against a depth/workspace mask, prompts SAM2 with
    the surviving boxes, and back-projects the mask-selected depth pixels to
    3D points via the inverse extrinsics.

    Args:
        cameras_output: per-camera dicts with "color" (H, W, 3 uint8) and
            "depth" (H, W, millimeters) arrays — assumed; confirm with caller.
        intrs / extrs: per-camera (3, 3) intrinsics and (4, 4) extrinsics,
            indexed the same way as ``cameras_output`` keys.
        text_prompts: grounding text prompt string.
        processor / grounding_model: Grounding-DINO processor and model.
        image_predictor: SAM2 image predictor.
        bbox: workspace bounds forwarded to ``get_mask_raw``.
        device: torch device for the grounding-model inputs.
        show_annotation: when True, "color" entries are annotated frames.

    Returns:
        dict with keys "color", "depth", "pts", each a per-camera list;
        all lists empty when calibration is missing or no object is detected.
    """
    # Fix: this guard was previously unreachable — it sat inside the
    # multi-object branch AFTER intrs[ck]/extrs[ck] had already been
    # dereferenced, so a missing calibration crashed before the check ran.
    if intrs is None or extrs is None:
        print("No camera intrinsics and extrinsics provided")
        return {
            "color": [],
            "depth": [],
            "pts": [],
        }
    colors_list = []
    depths_list = []
    pts_list = []
    for ck, cv in cameras_output.items():
        image = cv["color"].copy()
        depth = cv["depth"].copy() / 1000.0  # millimeters -> meters
        image = Image.fromarray(image)
        # Ground: open-vocabulary detection from the text prompt.
        inputs = processor(images=image, text=text_prompts, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = grounding_model(**inputs)
        results = processor.post_process_grounded_object_detection(
            outputs,
            inputs.input_ids,
            box_threshold=0.325,
            text_threshold=0.3,
            target_sizes=[image.size[::-1]]
        )
        input_boxes = results[0]["boxes"].cpu().numpy()
        objects = results[0]["labels"]
        # Workspace mask: used both to vet candidate boxes and to clip the
        # final segmentation masks.
        depth_mask = get_mask_raw(depth, intrs[ck], extrs[ck], bbox)
        multi_objs = False
        if len(objects) > 1:
            # Keep only labeled, plausibly-sized boxes that overlap the
            # workspace mask.
            objects_masked = []
            input_boxes_masked = []
            for i, obj in enumerate(objects):
                if obj == '':
                    continue
                box = input_boxes[i].astype(int)
                if (box[3] - box[1]) * (box[2] - box[0]) > 500 * 400:
                    continue  # implausibly large box, likely background
                depth_mask_box = depth_mask[box[1]:box[3], box[0]:box[2]]
                if depth_mask_box.sum() > 0:
                    objects_masked.append(obj)
                    input_boxes_masked.append(box)
            objects = objects_masked
            input_boxes = input_boxes_masked
        if len(objects) == 0:
            print("No objects detected")
            return {
                "color": [],
                "depth": [],
                "pts": [],
            }
        elif len(objects) > 1:
            multi_objs = True
        # Segment: prompt SAM2 with the surviving boxes.
        image_predictor.set_image(np.array(image.convert("RGB")))
        masks, scores, logits = image_predictor.predict(
            point_coords=None,
            point_labels=None,
            box=input_boxes,
            multimask_output=False,
        )
        # SAM2 returns (n, H, W) for one box prompt, (n, 1, H, W) for several.
        if masks.ndim == 4:
            assert multi_objs
            masks = masks.squeeze(1)
        masks = masks.astype(bool)
        if show_annotation:
            # Annotation-only objects are now built inside this branch; the
            # original constructed them even when annotation was disabled.
            ID_TO_OBJECTS = {i: obj for i, obj in enumerate(objects, start=1)}
            object_ids = np.arange(1, len(objects) + 1)
            detections = sv.Detections(
                xyxy=sv.mask_to_xyxy(masks),  # (n, 4)
                mask=masks,  # (n, h, w)
                class_id=np.array(object_ids, dtype=np.int32),
            )
            annotated_frame = sv.BoxAnnotator().annotate(scene=np.array(image).astype(np.uint8), detections=detections)
            annotated_frame = sv.LabelAnnotator().annotate(annotated_frame, detections=detections, labels=[ID_TO_OBJECTS[i] for i in object_ids])
            annotated_frame = sv.MaskAnnotator().annotate(scene=annotated_frame, detections=detections)
            colors_list.append(annotated_frame)
        else:
            colors_list.append(np.array(image))
        depths_list.append(cv["depth"].copy())
        # Union the per-object masks, clip by the workspace mask, flatten to
        # a per-pixel boolean index.
        masks = np.logical_or.reduce(masks, axis=0, keepdims=True)
        masks = np.logical_and(masks, depth_mask)
        masks = masks.reshape(-1)
        assert masks.shape[0] == depth.shape[0] * depth.shape[1]
        # Back-project the selected pixels to 3D points.
        points = depth2fgpcd(depth, intrs[ck]).reshape(-1, 3)
        points = (np.linalg.inv(extrs[ck]) @ np.concatenate([points, np.ones((points.shape[0], 1)).astype(np.float32)], axis=1).T).T[:, :3]  # (N, 3)
        points = points[masks]
        pts_list.append(points)
    return {
        "color": colors_list,
        "depth": depths_list,
        "pts": pts_list,
    }
class SegmentPerception(mp.Process):
    """Background process that grabs RealSense frames, segments the prompted
    object(s) with Grounding-DINO + SAM2, and publishes the newest result on
    a size-1 queue.

    Camera calibration is shared with the parent process through flat
    ``mp.Array`` buffers (``intrs``: num_cam*3*3, ``extrs``: num_cam*4*4) so
    it can be supplied or updated after the process has started.
    """

    def __init__(
        self,
        realsense: MultiRealsense | SingleRealsense,
        capture_fps,
        record_fps,
        record_time,
        exp_name=None,
        bbox=None,
        data_dir="data",
        text_prompts="white cotton rope.",
        show_annotation=True,
        device=None,
        verbose=False,
    ):
        super().__init__()
        self.verbose = verbose
        self.capture_fps = capture_fps
        self.record_fps = record_fps
        self.record_time = record_time
        self.exp_name = exp_name
        self.data_dir = data_dir
        self.bbox = bbox
        self.text_prompts = text_prompts
        self.show_annotation = show_annotation
        # Recording is only meaningful when an experiment name is given.
        if self.exp_name is None:
            assert self.record_fps == 0
        self.realsense = realsense
        # maxsize=1 so consumers always see the freshest perception result.
        self.perception_q = mp.Queue(maxsize=1)
        self.num_cam = len(realsense.cameras.keys())
        self.alive = mp.Value('b', False)
        self.record_restart = mp.Value('b', False)
        self.record_stop = mp.Value('b', False)
        self.do_process = mp.Value('b', True)
        # Flat shared calibration buffers, written by update_intrinsics /
        # update_extrinsics and read inside run().
        self.intrs = mp.Array('d', [0.0] * 9 * self.num_cam)
        self.extrs = mp.Array('d', [0.0] * 16 * self.num_cam)

    def log(self, msg):
        # Green-colored debug print, gated by ``verbose``.
        if self.verbose:
            print(f"\033[92m{self.name}: {msg}\033[0m")

    @property
    def can_record(self):
        # Recording is enabled iff a nonzero record fps was configured.
        return self.record_fps != 0

    def update_intrinsics(self, intrs):
        # Expects an array flattenable to num_cam * 9 doubles.
        self.intrs[:] = intrs.flatten()

    def update_extrinsics(self, extrs):
        # Expects an array flattenable to num_cam * 16 doubles.
        self.extrs[:] = extrs.flatten()

    def run(self):
        """Main perception loop; executes in the child process until stopped."""
        import traceback  # local: only needed for error reporting here

        # Limit intra-op threads so the child does not starve camera capture.
        threadpool_limits(1)
        cv2.setNumThreads(1)
        realsense = self.realsense
        cameras_output = None
        # NOTE(review): never set True in this loop — the recording wiring
        # (record_restart / record_stop) looks incomplete; confirm upstream.
        is_recording = False
        # NOTE(review): never opened in this method; closed defensively below.
        timestamps_f = None

        # Model checkpoints / configuration.
        checkpoint = str(root.parent / "weights/sam2/sam2.1_hiera_large.pt")
        model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
        model_id = "IDEA-Research/grounding-dino-tiny"
        device = "cuda" if torch.cuda.is_available() else "cpu"
        processor = AutoProcessor.from_pretrained(model_id)
        grounding_model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
        image_predictor = SAM2ImagePredictor(build_sam2(model_cfg, checkpoint))

        # Bind everything except (cameras_output, intrs, extrs) up front.
        process_func = partial(
            segment_process_func,
            text_prompts=self.text_prompts,
            processor=processor,
            grounding_model=grounding_model,
            image_predictor=image_predictor,
            bbox=self.bbox,
            device=device,
            show_annotation=self.show_annotation,
        )

        while self.alive.value:
            try:
                if not self.do_process.value:
                    # Processing paused: drain any stale result and idle.
                    if not self.perception_q.empty():
                        self.perception_q.get()
                    time.sleep(1)
                    continue
                cameras_output = realsense.get(out=cameras_output)
                get_time = time.time()
                timestamps = [cameras_output[i]['timestamp'].item() for i in range(self.num_cam)]  # type: ignore
                # Warn when adjacent cameras captured more than 50 ms apart.
                if is_recording and not all([abs(timestamps[i] - timestamps[i + 1]) < 0.05 for i in range(self.num_cam - 1)]):
                    print(f"Captured at different timestamps: {[f'{x:.2f}' for x in timestamps]}")
                # treat captured time and record time as the same
                process_start_time = get_time
                # Read the shared calibration buffers written by the parent.
                intrs = np.frombuffer(self.intrs.get_obj()).reshape((self.num_cam, 3, 3))
                extrs = np.frombuffer(self.extrs.get_obj()).reshape((self.num_cam, 4, 4))
                if intrs.sum() == 0 or extrs.sum() == 0:
                    print("No camera intrinsics and extrinsics provided")
                    time.sleep(1)
                    continue
                process_out = process_func(cameras_output, intrs, extrs)
                self.log(f"process time: {time.time() - process_start_time}")
                if not self.perception_q.full():
                    self.perception_q.put(process_out)
            except BaseException:
                # Fix: the original called e.with_traceback() with no argument,
                # which itself raises TypeError and masked the real error.
                print("Perception error: ", traceback.format_exc())
                break

        if self.can_record:
            if timestamps_f is not None and not timestamps_f.closed:
                timestamps_f.close()
        self.stop()
        print("Perception process stopped")

    def start(self):
        # Mark alive before the child process spins up its loop.
        self.alive.value = True
        super().start()

    def stop(self):
        # Idempotent: flips the loop flag and closes the result queue.
        self.alive.value = False
        self.perception_q.close()

    def set_record_start(self):
        """Request a (re)start of recording; no-op when recording is disabled."""
        if self.record_fps == 0:
            print("record disabled because record_fps is 0")
            assert self.record_restart.value == False
        else:
            self.record_restart.value = True
            print("record restart cmd received")

    def set_record_stop(self):
        """Request a stop of recording; no-op when recording is disabled."""
        if self.record_fps == 0:
            print("record disabled because record_fps is 0")
            assert self.record_stop.value == False
        else:
            self.record_stop.value = True
            print("record stop cmd received")