# NOTE: removed non-code artifacts accidentally captured at the top of this
# file (repository web-page header: username, commit hash, "raw",
# "history blame", file size) — they are not valid Python.
from pathlib import Path
import os
import time
import numpy as np
import torch
import cv2
import open3d as o3d
from threadpoolctl import threadpool_limits
import multiprocess as mp
from functools import partial
from PIL import Image
import supervision as sv
from pgnd.utils import get_root
root: Path = get_root(__file__)
from camera.multi_realsense import MultiRealsense
from camera.single_realsense import SingleRealsense
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
from sam2.build_sam import build_sam2, build_sam2_video_predictor
from sam2.sam2_image_predictor import SAM2ImagePredictor
from utils.pcd_utils import depth2fgpcd
def get_mask_raw(depth, intr, extr, bbox, depth_threshold=(0, 2)):
    """Compute a boolean foreground mask for a depth image.

    A pixel is kept when its depth lies strictly inside ``depth_threshold``
    and its back-projected 3D point (transformed by the inverse extrinsics)
    falls inside the x/y extent of ``bbox``. The z axis is deliberately not
    checked (see inline comment below).

    Args:
        depth: (H, W) depth image in meters.
        intr: (3, 3) camera intrinsics matrix.
        extr: (4, 4) camera extrinsics matrix; points are mapped through
            ``inv(extr)`` — presumably camera-to-world; confirm with caller.
        bbox: indexable as ``bbox[0] = (x_min, x_max)``, ``bbox[1] = (y_min, y_max)``.
        depth_threshold: (near, far) depth limits in meters. Changed from a
            mutable list default to a tuple to avoid the shared-mutable-default
            pitfall; indexing behavior is identical.

    Returns:
        (H, W) boolean mask.
    """
    # Back-project every pixel to a 3D point (camera frame).
    points = depth2fgpcd(depth, intr).reshape(-1, 3)
    # Keep pixels whose depth is strictly inside the (near, far) window.
    mask = np.logical_and((depth > depth_threshold[0]), (depth < depth_threshold[1]))  # (H, W)
    # Map points through the inverse extrinsics (homogeneous coordinates).
    points = (np.linalg.inv(extr) @ np.concatenate([points, np.ones((points.shape[0], 1)).astype(np.float32)], axis=1).T).T[:, :3]  # (N, 3)
    # Restrict to the workspace bounding box in x and y.
    mask_bbox = np.logical_and(
        np.logical_and(points[:, 0] > bbox[0][0], points[:, 0] < bbox[0][1]),
        np.logical_and(points[:, 1] > bbox[1][0], points[:, 1] < bbox[1][1])
    )  # does not include z axis
    mask_bbox = mask_bbox.reshape(depth.shape[0], depth.shape[1])
    mask = np.logical_and(mask, mask_bbox)
    return mask
def segment_process_func(cameras_output, intrs, extrs, text_prompts, processor, grounding_model, image_predictor, bbox, device, show_annotation=True):
    """Detect and segment prompted objects in every camera view; return point clouds.

    Per camera: runs Grounding-DINO on the color image with ``text_prompts``,
    vets the detected boxes against a depth/workspace mask, prompts SAM2 with
    the surviving boxes, and back-projects the mask-selected depth pixels to
    3D points via the inverse extrinsics.

    Args:
        cameras_output: per-camera dicts with "color" (H, W, 3 uint8) and
            "depth" (H, W, millimeters) arrays — assumed; confirm with caller.
        intrs / extrs: per-camera (3, 3) intrinsics and (4, 4) extrinsics,
            indexed the same way as ``cameras_output`` keys.
        text_prompts: grounding text prompt string.
        processor / grounding_model: Grounding-DINO processor and model.
        image_predictor: SAM2 image predictor.
        bbox: workspace bounds forwarded to ``get_mask_raw``.
        device: torch device for the grounding-model inputs.
        show_annotation: when True, "color" entries are annotated frames.

    Returns:
        dict with keys "color", "depth", "pts", each a per-camera list;
        all lists empty when calibration is missing or no object is detected.
    """
    # Fix: this guard was previously unreachable — it sat inside the
    # multi-object branch AFTER intrs[ck]/extrs[ck] had already been
    # dereferenced, so a missing calibration crashed before the check ran.
    if intrs is None or extrs is None:
        print("No camera intrinsics and extrinsics provided")
        return {
            "color": [],
            "depth": [],
            "pts": [],
        }
    colors_list = []
    depths_list = []
    pts_list = []
    for ck, cv in cameras_output.items():
        image = cv["color"].copy()
        depth = cv["depth"].copy() / 1000.0  # millimeters -> meters
        image = Image.fromarray(image)
        # Ground: open-vocabulary detection from the text prompt.
        inputs = processor(images=image, text=text_prompts, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = grounding_model(**inputs)
        results = processor.post_process_grounded_object_detection(
            outputs,
            inputs.input_ids,
            box_threshold=0.325,
            text_threshold=0.3,
            target_sizes=[image.size[::-1]]
        )
        input_boxes = results[0]["boxes"].cpu().numpy()
        objects = results[0]["labels"]
        # Workspace mask: used both to vet candidate boxes and to clip the
        # final segmentation masks.
        depth_mask = get_mask_raw(depth, intrs[ck], extrs[ck], bbox)
        multi_objs = False
        if len(objects) > 1:
            # Keep only labeled, plausibly-sized boxes that overlap the
            # workspace mask.
            objects_masked = []
            input_boxes_masked = []
            for i, obj in enumerate(objects):
                if obj == '':
                    continue
                box = input_boxes[i].astype(int)
                if (box[3] - box[1]) * (box[2] - box[0]) > 500 * 400:
                    continue  # implausibly large box, likely background
                depth_mask_box = depth_mask[box[1]:box[3], box[0]:box[2]]
                if depth_mask_box.sum() > 0:
                    objects_masked.append(obj)
                    input_boxes_masked.append(box)
            objects = objects_masked
            input_boxes = input_boxes_masked
        if len(objects) == 0:
            print("No objects detected")
            return {
                "color": [],
                "depth": [],
                "pts": [],
            }
        elif len(objects) > 1:
            multi_objs = True
        # Segment: prompt SAM2 with the surviving boxes.
        image_predictor.set_image(np.array(image.convert("RGB")))
        masks, scores, logits = image_predictor.predict(
            point_coords=None,
            point_labels=None,
            box=input_boxes,
            multimask_output=False,
        )
        # SAM2 returns (n, H, W) for one box prompt, (n, 1, H, W) for several.
        if masks.ndim == 4:
            assert multi_objs
            masks = masks.squeeze(1)
        masks = masks.astype(bool)
        if show_annotation:
            # Annotation-only objects are now built inside this branch; the
            # original constructed them even when annotation was disabled.
            ID_TO_OBJECTS = {i: obj for i, obj in enumerate(objects, start=1)}
            object_ids = np.arange(1, len(objects) + 1)
            detections = sv.Detections(
                xyxy=sv.mask_to_xyxy(masks),  # (n, 4)
                mask=masks,  # (n, h, w)
                class_id=np.array(object_ids, dtype=np.int32),
            )
            annotated_frame = sv.BoxAnnotator().annotate(scene=np.array(image).astype(np.uint8), detections=detections)
            annotated_frame = sv.LabelAnnotator().annotate(annotated_frame, detections=detections, labels=[ID_TO_OBJECTS[i] for i in object_ids])
            annotated_frame = sv.MaskAnnotator().annotate(scene=annotated_frame, detections=detections)
            colors_list.append(annotated_frame)
        else:
            colors_list.append(np.array(image))
        depths_list.append(cv["depth"].copy())
        # Union the per-object masks, clip by the workspace mask, flatten to
        # a per-pixel boolean index.
        masks = np.logical_or.reduce(masks, axis=0, keepdims=True)
        masks = np.logical_and(masks, depth_mask)
        masks = masks.reshape(-1)
        assert masks.shape[0] == depth.shape[0] * depth.shape[1]
        # Back-project the selected pixels to 3D points.
        points = depth2fgpcd(depth, intrs[ck]).reshape(-1, 3)
        points = (np.linalg.inv(extrs[ck]) @ np.concatenate([points, np.ones((points.shape[0], 1)).astype(np.float32)], axis=1).T).T[:, :3]  # (N, 3)
        points = points[masks]
        pts_list.append(points)
    return {
        "color": colors_list,
        "depth": depths_list,
        "pts": pts_list,
    }
class SegmentPerception(mp.Process):
    """Background process that grabs RealSense frames, segments the prompted
    object(s) with Grounding-DINO + SAM2, and publishes the newest result on
    a size-1 queue.

    Camera calibration is shared with the parent process through flat
    ``mp.Array`` buffers (``intrs``: num_cam*3*3, ``extrs``: num_cam*4*4) so
    it can be supplied or updated after the process has started.
    """

    def __init__(
        self,
        realsense: MultiRealsense | SingleRealsense,
        capture_fps,
        record_fps,
        record_time,
        exp_name=None,
        bbox=None,
        data_dir="data",
        text_prompts="white cotton rope.",
        show_annotation=True,
        device=None,
        verbose=False,
    ):
        super().__init__()
        self.verbose = verbose
        self.capture_fps = capture_fps
        self.record_fps = record_fps
        self.record_time = record_time
        self.exp_name = exp_name
        self.data_dir = data_dir
        self.bbox = bbox
        self.text_prompts = text_prompts
        self.show_annotation = show_annotation
        # Recording is only meaningful when an experiment name is given.
        if self.exp_name is None:
            assert self.record_fps == 0
        self.realsense = realsense
        # maxsize=1 so consumers always see the freshest perception result.
        self.perception_q = mp.Queue(maxsize=1)
        self.num_cam = len(realsense.cameras.keys())
        self.alive = mp.Value('b', False)
        self.record_restart = mp.Value('b', False)
        self.record_stop = mp.Value('b', False)
        self.do_process = mp.Value('b', True)
        # Flat shared calibration buffers, written by update_intrinsics /
        # update_extrinsics and read inside run().
        self.intrs = mp.Array('d', [0.0] * 9 * self.num_cam)
        self.extrs = mp.Array('d', [0.0] * 16 * self.num_cam)

    def log(self, msg):
        # Green-colored debug print, gated by ``verbose``.
        if self.verbose:
            print(f"\033[92m{self.name}: {msg}\033[0m")

    @property
    def can_record(self):
        # Recording is enabled iff a nonzero record fps was configured.
        return self.record_fps != 0

    def update_intrinsics(self, intrs):
        # Expects an array flattenable to num_cam * 9 doubles.
        self.intrs[:] = intrs.flatten()

    def update_extrinsics(self, extrs):
        # Expects an array flattenable to num_cam * 16 doubles.
        self.extrs[:] = extrs.flatten()

    def run(self):
        """Main perception loop; executes in the child process until stopped."""
        import traceback  # local: only needed for error reporting here

        # Limit intra-op threads so the child does not starve camera capture.
        threadpool_limits(1)
        cv2.setNumThreads(1)
        realsense = self.realsense
        cameras_output = None
        # NOTE(review): never set True in this loop — the recording wiring
        # (record_restart / record_stop) looks incomplete; confirm upstream.
        is_recording = False
        # NOTE(review): never opened in this method; closed defensively below.
        timestamps_f = None

        # Model checkpoints / configuration.
        checkpoint = str(root.parent / "weights/sam2/sam2.1_hiera_large.pt")
        model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
        model_id = "IDEA-Research/grounding-dino-tiny"
        device = "cuda" if torch.cuda.is_available() else "cpu"
        processor = AutoProcessor.from_pretrained(model_id)
        grounding_model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
        image_predictor = SAM2ImagePredictor(build_sam2(model_cfg, checkpoint))

        # Bind everything except (cameras_output, intrs, extrs) up front.
        process_func = partial(
            segment_process_func,
            text_prompts=self.text_prompts,
            processor=processor,
            grounding_model=grounding_model,
            image_predictor=image_predictor,
            bbox=self.bbox,
            device=device,
            show_annotation=self.show_annotation,
        )

        while self.alive.value:
            try:
                if not self.do_process.value:
                    # Processing paused: drain any stale result and idle.
                    if not self.perception_q.empty():
                        self.perception_q.get()
                    time.sleep(1)
                    continue
                cameras_output = realsense.get(out=cameras_output)
                get_time = time.time()
                timestamps = [cameras_output[i]['timestamp'].item() for i in range(self.num_cam)]  # type: ignore
                # Warn when adjacent cameras captured more than 50 ms apart.
                if is_recording and not all([abs(timestamps[i] - timestamps[i + 1]) < 0.05 for i in range(self.num_cam - 1)]):
                    print(f"Captured at different timestamps: {[f'{x:.2f}' for x in timestamps]}")
                # treat captured time and record time as the same
                process_start_time = get_time
                # Read the shared calibration buffers written by the parent.
                intrs = np.frombuffer(self.intrs.get_obj()).reshape((self.num_cam, 3, 3))
                extrs = np.frombuffer(self.extrs.get_obj()).reshape((self.num_cam, 4, 4))
                if intrs.sum() == 0 or extrs.sum() == 0:
                    print("No camera intrinsics and extrinsics provided")
                    time.sleep(1)
                    continue
                process_out = process_func(cameras_output, intrs, extrs)
                self.log(f"process time: {time.time() - process_start_time}")
                if not self.perception_q.full():
                    self.perception_q.put(process_out)
            except BaseException:
                # Fix: the original called e.with_traceback() with no argument,
                # which itself raises TypeError and masked the real error.
                print("Perception error: ", traceback.format_exc())
                break

        if self.can_record:
            if timestamps_f is not None and not timestamps_f.closed:
                timestamps_f.close()
        self.stop()
        print("Perception process stopped")

    def start(self):
        # Mark alive before the child process spins up its loop.
        self.alive.value = True
        super().start()

    def stop(self):
        # Idempotent: flips the loop flag and closes the result queue.
        self.alive.value = False
        self.perception_q.close()

    def set_record_start(self):
        """Request a (re)start of recording; no-op when recording is disabled."""
        if self.record_fps == 0:
            print("record disabled because record_fps is 0")
            assert self.record_restart.value == False
        else:
            self.record_restart.value = True
            print("record restart cmd received")

    def set_record_stop(self):
        """Request a stop of recording; no-op when recording is disabled."""
        if self.record_fps == 0:
            print("record disabled because record_fps is 0")
            assert self.record_stop.value == False
        else:
            self.record_stop.value = True
            print("record stop cmd received")