import cv2
import torch
import random
import numpy as np

from . import transforms


def do_augmentation(scale_factor=0.2, trans_factor=0.1):
    scale = random.uniform(1.2 - scale_factor, 1.2 + scale_factor)
    trans_x = random.uniform(-trans_factor, trans_factor)
    trans_y = random.uniform(-trans_factor, trans_factor)
    return scale, trans_x, trans_y


def get_transform(center, scale, res, rot=0):
    """Generate transformation matrix."""
    # res: (height, width), (rows, cols)
    crop_aspect_ratio = res[0] / float(res[1])
    h = 200 * scale
    w = h / crop_aspect_ratio
    t = np.zeros((3, 3))
    t[0, 0] = float(res[1]) / w
    t[1, 1] = float(res[0]) / h
    t[0, 2] = res[1] * (-float(center[0]) / w + .5)
    t[1, 2] = res[0] * (-float(center[1]) / h + .5)
    t[2, 2] = 1
    if not rot == 0:
        rot = -rot  # To match direction of rotation from cropping
        rot_mat = np.zeros((3, 3))
        rot_rad = rot * np.pi / 180
        sn, cs = np.sin(rot_rad), np.cos(rot_rad)
        rot_mat[0, :2] = [cs, -sn]
        rot_mat[1, :2] = [sn, cs]
        rot_mat[2, 2] = 1
        # Need to rotate around center
        t_mat = np.eye(3)
        t_mat[0, 2] = -res[1] / 2
        t_mat[1, 2] = -res[0] / 2
        t_inv = t_mat.copy()
        t_inv[:2, 2] *= -1
        t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t)))
    return t


def transform(pt, center, scale, res, invert=0, rot=0):
    """Transform pixel location to different reference."""
    t = get_transform(center, scale, res, rot=rot)
    if invert:
        t = np.linalg.inv(t)
    new_pt = np.array([pt[0] - 1, pt[1] - 1, 1.]).T
    new_pt = np.dot(t, new_pt)
    return np.array([round(new_pt[0]), round(new_pt[1])], dtype=int) + 1


def crop_cliff(img, center, scale, res):
    """
    Crop image according to the supplied bounding box.
    res: [rows, cols]
    """
    # Upper left point
    ul = np.array(transform([1, 1], center, scale, res, invert=1)) - 1
    # Bottom right point
    br = np.array(transform([res[1] + 1, res[0] + 1], center, scale, res, invert=1)) - 1

    # Padding so that when rotated proper amount of context is included
    pad = int(np.linalg.norm(br - ul) / 2 - float(br[1] - ul[1]) / 2)

    new_shape = [br[1] - ul[1], br[0] - ul[0]]
    if len(img.shape) > 2:
        new_shape += [img.shape[2]]
    new_img = np.zeros(new_shape, dtype=np.float32)

    # Range to fill new array
    new_x = max(0, -ul[0]), min(br[0], len(img[0])) - ul[0]
    new_y = max(0, -ul[1]), min(br[1], len(img)) - ul[1]
    # Range to sample from original image
    old_x = max(0, ul[0]), min(len(img[0]), br[0])
    old_y = max(0, ul[1]), min(len(img), br[1])
    try:
        new_img[new_y[0]:new_y[1], new_x[0]:new_x[1]] = img[old_y[0]:old_y[1], old_x[0]:old_x[1]]
    except Exception as e:
        print(e)

    new_img = cv2.resize(new_img, (res[1], res[0]))  # (cols, rows)

    return new_img, ul, br


def obtain_bbox(center, scale, res, org_res):
    # Upper left point
    ul = np.array(transform([1, 1], center, scale, res, invert=1)) - 1
    # Bottom right point
    br = np.array(transform([res[1] + 1, res[0] + 1], center, scale, res, invert=1)) - 1

    # Padding so that when rotated proper amount of context is included
    pad = int(np.linalg.norm(br - ul) / 2 - float(br[1] - ul[1]) / 2)

    # Range to sample from original image
    old_x = max(0, ul[0]), min(org_res[0], br[0])
    old_y = max(0, ul[1]), min(org_res[1], br[1])
    return old_x, old_y
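
# Usage sketch (not part of the original module): shows how `crop_cliff` turns a
# bbox given as (center, scale) into a fixed-resolution crop. The image, center,
# scale, and output resolution below are made-up placeholder values.
def _example_crop_cliff():
    img = np.random.rand(480, 640, 3).astype(np.float32)  # stand-in for a loaded frame
    center = np.array([320., 240.])  # hypothetical bbox center (x, y) in the full image
    scale = 1.0                      # bbox height / 200, per the convention used above
    res = (256, 192)                 # target crop size as (rows, cols)
    crop, ul, br = crop_cliff(img, center, scale, res)
    # crop has shape (256, 192, 3); ul/br are the crop corners in full-image coordinates
    return crop, ul, br
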
def cam_crop2full(crop_cam, bbox, full_img_shape, focal_length=None):
    """
    Convert the camera parameters from the crop camera to the full camera.
    :param crop_cam: shape=(N, T, 3) weak perspective camera in cropped img coordinates (s, tx, ty)
    :param bbox: shape=(N, T, 3) bbox as (c_x, c_y, b / 200), i.e. center plus square bbox size / 200
    :param full_img_shape: shape=(N, 2) original image height and width
    :param focal_length: shape=(N,); if None, estimated as sqrt(img_w**2 + img_h**2)
    :return: full_cam: shape=(N, T, 3) perspective camera translation (tx, ty, tz)
    """
    cx = bbox[..., 0].clone()
    cy = bbox[..., 1].clone()
    b = bbox[..., 2].clone() * 200
    img_h, img_w = full_img_shape[:, 0], full_img_shape[:, 1]
    w_2, h_2 = img_w / 2., img_h / 2.
    bs = b * crop_cam[:, :, 0] + 1e-9

    if focal_length is None:
        focal_length = (img_w * img_w + img_h * img_h) ** 0.5

    tz = 2 * focal_length.unsqueeze(-1) / bs
    tx = (2 * (cx - w_2.unsqueeze(-1)) / bs) + crop_cam[:, :, 1]
    ty = (2 * (cy - h_2.unsqueeze(-1)) / bs) + crop_cam[:, :, 2]

    full_cam = torch.stack([tx, ty, tz], dim=-1)
    return full_cam


def cam_pred2full(crop_cam, center, scale, full_img_shape, focal_length=2000.):
    """
    Reference CLIFF: Carrying Location Information in Full Frames into Human Pose and Shape Estimation.
    Convert the camera parameters from the crop camera to the full camera.
    :param crop_cam: shape=(N, 3) weak perspective camera in cropped img coordinates (s, tx, ty)
    :param center: shape=(N, 2) bbox coordinates (c_x, c_y)
    :param scale: shape=(N,) square bbox resolution (b / 200)
    :param full_img_shape: shape=(N, 2) original image height and width
    :param focal_length: shape=(N,)
    :return: full_cam: shape=(N, 3) perspective camera translation (tx, ty, tz)
    """
    # img_h, img_w = full_img_shape[:, 0], full_img_shape[:, 1]
    # NOTE: unlike cam_crop2full above, full_img_shape is read here as (width, height)
    img_w, img_h = full_img_shape[:, 0], full_img_shape[:, 1]
    cx, cy, b = center[:, 0], center[:, 1], scale * 200
    w_2, h_2 = img_w / 2., img_h / 2.
    bs = b * crop_cam[:, 0] + 1e-9

    tz = 2 * focal_length / bs
    tx = (2 * (cx - w_2) / bs) + crop_cam[:, 1]
    ty = (2 * (cy - h_2) / bs) + crop_cam[:, 2]
    full_cam = torch.stack([tx, ty, tz], dim=-1)
    return full_cam
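
# Illustrative sketch (not from the original code): converts a small batch of
# weak-perspective crop cameras to full-frame translations with `cam_pred2full`.
# All tensor values are made-up placeholders.
def _example_cam_pred2full():
    crop_cam = torch.tensor([[0.9, 0.05, -0.02],
                             [1.1, -0.10, 0.03]])        # (s, tx, ty) per sample
    center = torch.tensor([[320., 240.], [300., 260.]])  # bbox centers (c_x, c_y)
    scale = torch.tensor([1.0, 1.2])                     # square bbox size / 200
    full_img_shape = torch.tensor([[640., 480.],
                                   [640., 480.]])        # as read by cam_pred2full: (width, height)
    full_cam = cam_pred2full(crop_cam, center, scale, full_img_shape, focal_length=2000.)
    # `cam_full2pred` below maps `full_cam` back to (s, tx, ty) in crop coordinates
    return full_cam  # shape (2, 3): (tx, ty, tz) in the full frame
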
def cam_full2pred(full_cam, center, scale, full_img_shape, focal_length=2000.):
    """Inverse of cam_pred2full: map a full-frame translation back to a crop camera (s, tx, ty)."""
    # img_h, img_w = full_img_shape[:, 0], full_img_shape[:, 1]
    # NOTE: as in cam_pred2full, full_img_shape is read here as (width, height)
    img_w, img_h = full_img_shape[:, 0], full_img_shape[:, 1]
    cx, cy, b = center[:, 0], center[:, 1], scale * 200
    w_2, h_2 = img_w / 2., img_h / 2.

    bs = (2 * focal_length / full_cam[:, 2])
    _s = bs / b
    _tx = full_cam[:, 0] - (2 * (cx - w_2) / bs)
    _ty = full_cam[:, 1] - (2 * (cy - h_2) / bs)
    crop_cam = torch.stack([_s, _tx, _ty], dim=-1)
    return crop_cam


def obtain_camera_intrinsics(image_shape, focal_length):
    res_w = image_shape[..., 0].clone()
    res_h = image_shape[..., 1].clone()
    # clone() so K is a writable per-sample copy (writing into an expanded view is not allowed)
    K = torch.eye(3).unsqueeze(0).expand(focal_length.shape[0], -1, -1).clone().to(focal_length.device)
    K[..., 0, 0] = focal_length.clone()
    K[..., 1, 1] = focal_length.clone()
    K[..., 0, 2] = res_w / 2
    K[..., 1, 2] = res_h / 2
    return K.unsqueeze(1)


def trans_point2d(pt_2d, trans):
    src_pt = np.array([pt_2d[0], pt_2d[1], 1.]).T
    dst_pt = np.dot(trans, src_pt)
    return dst_pt[0:2]


def rotate_2d(pt_2d, rot_rad):
    x = pt_2d[0]
    y = pt_2d[1]
    sn, cs = np.sin(rot_rad), np.cos(rot_rad)
    xx = x * cs - y * sn
    yy = x * sn + y * cs
    return np.array([xx, yy], dtype=np.float32)


def gen_trans_from_patch_cv(c_x, c_y, src_width, src_height, dst_width, dst_height, scale, rot, inv=False):
    # augment size with scale
    src_w = src_width * scale
    src_h = src_height * scale
    src_center = np.array([c_x, c_y], dtype=np.float32)

    # augment rotation
    rot_rad = np.pi * rot / 180
    src_downdir = rotate_2d(np.array([0, src_h * 0.5], dtype=np.float32), rot_rad)
    src_rightdir = rotate_2d(np.array([src_w * 0.5, 0], dtype=np.float32), rot_rad)

    dst_w = dst_width
    dst_h = dst_height
    dst_center = np.array([dst_w * 0.5, dst_h * 0.5], dtype=np.float32)
    dst_downdir = np.array([0, dst_h * 0.5], dtype=np.float32)
    dst_rightdir = np.array([dst_w * 0.5, 0], dtype=np.float32)

    src = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = src_center
    src[1, :] = src_center + src_downdir
    src[2, :] = src_center + src_rightdir

    dst = np.zeros((3, 2), dtype=np.float32)
    dst[0, :] = dst_center
    dst[1, :] = dst_center + dst_downdir
    dst[2, :] = dst_center + dst_rightdir

    if inv:
        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
    else:
        trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))

    return trans


def transform_keypoints(kp_2d, bbox, patch_width, patch_height):
    center_x, center_y, scale = bbox[:3]
    width = height = scale * 200
    # scale, rot = 1.2, 0
    scale, rot = 1.0, 0

    # generate transformation
    trans = gen_trans_from_patch_cv(
        center_x, center_y, width, height,
        patch_width, patch_height,
        scale, rot, inv=False,
    )

    for n_jt in range(kp_2d.shape[0]):
        kp_2d[n_jt] = trans_point2d(kp_2d[n_jt], trans)

    return kp_2d, trans


# NOTE: this redefinition overrides the `transform` defined earlier in this file
# (that version rounds to the nearest pixel; this one truncates to int).
def transform(pt, center, scale, res, invert=0, rot=0):
    """Transform pixel location to different reference."""
    t = get_transform(center, scale, res, rot=rot)
    if invert:
        t = np.linalg.inv(t)
    new_pt = np.array([pt[0] - 1, pt[1] - 1, 1.]).T
    new_pt = np.dot(t, new_pt)
    return new_pt[:2].astype(int) + 1
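
# Illustrative sketch (not part of the original module): maps 2D keypoints from
# full-image coordinates into a 224x224 crop via `transform_keypoints`. The
# keypoint values and bbox are made-up placeholders.
def _example_transform_keypoints():
    kp_2d = np.array([[320., 240.], [350., 300.]], dtype=np.float32)  # (J, 2) joints in the full image
    bbox = np.array([330., 260., 1.0])  # (c_x, c_y, scale), with scale = bbox size / 200
    kp_crop, trans = transform_keypoints(kp_2d.copy(), bbox, 224, 224)
    # kp_crop holds the joints in crop pixel coordinates; trans is the 2x3 affine matrix
    return kp_crop, trans
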
def compute_cam_intrinsics(res):
    img_w, img_h = res
    focal_length = (img_w * img_w + img_h * img_h) ** 0.5
    cam_intrinsics = torch.eye(3).repeat(1, 1, 1).float()
    cam_intrinsics[:, 0, 0] = focal_length
    cam_intrinsics[:, 1, 1] = focal_length
    cam_intrinsics[:, 0, 2] = img_w / 2.
    cam_intrinsics[:, 1, 2] = img_h / 2.
    return cam_intrinsics


def flip_kp(kp, img_w=None):
    """Flip keypoints."""
    flipped_parts = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
    kp = kp[..., flipped_parts, :]
    if img_w is not None:
        # Assume 2D keypoints
        kp[..., 0] = img_w - kp[..., 0]
    return kp


def flip_bbox(bbox, img_w, img_h):
    center = bbox[..., :2]
    scale = bbox[..., -1:]

    WH = np.ones_like(center)
    WH[..., 0] *= img_w
    WH[..., 1] *= img_h

    # Mirror the bbox center about the vertical image axis
    center = center - WH / 2
    center[..., 0] = -center[..., 0]
    center = center + WH / 2

    flipped_bbox = np.concatenate((center, scale), axis=-1)
    return flipped_bbox


def flip_pose(rotation, representation='rotation_6d'):
    """Flip pose. The flipping is based on SMPL parameters."""
    BN = rotation.shape[0]

    if representation == 'axis_angle':
        pose = rotation.reshape(BN, -1).transpose(0, 1)
    elif representation == 'matrix':
        pose = transforms.matrix_to_axis_angle(rotation).reshape(BN, -1).transpose(0, 1)
    elif representation == 'rotation_6d':
        pose = transforms.matrix_to_axis_angle(
            transforms.rotation_6d_to_matrix(rotation)
        ).reshape(BN, -1).transpose(0, 1)
    else:
        raise ValueError(f"Unknown representation: {representation}")

    SMPL_JOINTS_FLIP_PERM = [0, 2, 1, 3, 5, 4, 6, 8, 7, 9, 11, 10, 12, 14, 13,
                             15, 17, 16, 19, 18, 21, 20, 23, 22]
    SMPL_POSE_FLIP_PERM = []
    for i in SMPL_JOINTS_FLIP_PERM:
        SMPL_POSE_FLIP_PERM.append(3 * i)
        SMPL_POSE_FLIP_PERM.append(3 * i + 1)
        SMPL_POSE_FLIP_PERM.append(3 * i + 2)

    pose = pose[SMPL_POSE_FLIP_PERM]
    # we also negate the second and the third dimension of the axis-angle
    pose[1::3] = -pose[1::3]
    pose[2::3] = -pose[2::3]
    pose = pose.transpose(0, 1).reshape(BN, -1, 3)

    # Return in the same representation that was passed in
    if representation == 'axis_angle':
        return pose
    elif representation == 'matrix':
        return transforms.axis_angle_to_matrix(pose)
    else:
        return transforms.matrix_to_rotation_6d(
            transforms.axis_angle_to_matrix(pose)
        )


def avg_preds(rotation, shape, flipped_rotation, flipped_shape, representation='rotation_6d'):
    # Rotation
    flipped_rotation = flip_pose(flipped_rotation, representation=representation)

    if representation != 'matrix':
        flipped_rotation = getattr(transforms, f'{representation}_to_matrix')(flipped_rotation)
        rotation = getattr(transforms, f'{representation}_to_matrix')(rotation)

    avg_rotation = torch.stack([rotation, flipped_rotation])
    avg_rotation = transforms.avg_rot(avg_rotation)

    if representation != 'matrix':
        avg_rotation = getattr(transforms, f'matrix_to_{representation}')(avg_rotation)

    # Shape
    avg_shape = (shape + flipped_shape) / 2.0

    return avg_rotation, avg_shape
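

# Illustrative sketch (not from the original code): test-time flip augmentation.
# A model would be run on the image and on its horizontal mirror; `avg_preds`
# then fuses the two predictions (rotations via `transforms.avg_rot`, betas by a
# plain mean). The shapes below assume a 6D rotation representation with 24 SMPL
# joints and 10 shape coefficients, and are placeholders only.
def _example_avg_preds():
    BN = 2
    rotation = torch.randn(BN, 24, 6)          # predictions on the original image
    flipped_rotation = torch.randn(BN, 24, 6)  # predictions on the mirrored image
    shape = torch.randn(BN, 10)
    flipped_shape = torch.randn(BN, 10)
    avg_rotation, avg_shape = avg_preds(
        rotation, shape, flipped_rotation, flipped_shape, representation='rotation_6d'
    )
    return avg_rotation, avg_shape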