import os
import sys

sys.path.append("./")

import pdb
from dataclasses import dataclass

import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image

from engine.ouputs import BaseOutput
from engine.pose_estimation.model import Model

IMG_NORM_MEAN = [0.485, 0.456, 0.406]
IMG_NORM_STD = [0.229, 0.224, 0.225]


@dataclass
class SMPLXOutput(BaseOutput):
    beta: np.ndarray
    is_full_body: bool
    msg: str


def normalize_rgb_tensor(img, imgenet_normalization=True):
    """Scale a uint8 RGB tensor to [0, 1] and optionally apply ImageNet mean/std normalization."""
    img = img / 255.0
    if imgenet_normalization:
        img = (
            img - torch.tensor(IMG_NORM_MEAN, device=img.device).view(1, 3, 1, 1)
        ) / torch.tensor(IMG_NORM_STD, device=img.device).view(1, 3, 1, 1)
    return img


def load_model(ckpt_path, model_path, device=torch.device("cuda")):
    """Open a checkpoint, build Multi-HMR using the saved arguments, and load the model weights."""

    assert os.path.isfile(ckpt_path), f"{ckpt_path} not found"

    ckpt = torch.load(ckpt_path, map_location=device)

    # Rebuild the model constructor arguments from the ones stored in the checkpoint.
    kwargs = {}
    for k, v in vars(ckpt["args"]).items():
        kwargs[k] = v
    print(ckpt["args"].img_size)

    if isinstance(ckpt["args"].img_size, list):
        kwargs["img_size"] = ckpt["args"].img_size[0]
    else:
        kwargs["img_size"] = ckpt["args"].img_size
    kwargs["smplx_dir"] = model_path
    print("Loading model...")
    model = Model(**kwargs).to(device)
    print("Model loaded")

    model.load_state_dict(ckpt["model_state_dict"], strict=False)
    model.output_mesh = True
    model.eval()
    return model


def inverse_perspective_projection(points, K, distance):
    """
    Computes the inverse perspective projection of a set of points given an estimated distance.
    Input:
        points (bs, N, 2): 2D points
        K (bs, 3, 3): camera intrinsics params
        distance (bs, N, 1): distance in the 3D world
    Output:
        points (bs, N, 3): 3D points (camera-frame rays scaled by distance, or unit-depth rays if distance is None)
    Similar to:
        - pts_l_norm = cv2.undistortPoints(np.expand_dims(pts_l, axis=1), cameraMatrix=K_l, distCoeffs=None)
    """

    # Lift to homogeneous coordinates and multiply by K^-1 to get camera-frame rays.
    points = torch.cat([points, torch.ones_like(points[..., :1])], -1)
    points = torch.einsum("bij,bkj->bki", torch.inverse(K), points)

    if distance is None:
        return points
    points = points * distance
    return points
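

# Example (a minimal sketch, not used by the pipeline): back-project the centre
# pixel of an 896x896 image to a 3D point 2 m from the camera, using intrinsics
# built the same way as PoseEstimator.get_camera_parameters() below.
#
#   K = torch.eye(3).unsqueeze(0)                       # (1, 3, 3)
#   K[:, 0, 0] = K[:, 1, 1] = 896 / (2 * np.tan(np.radians(60) / 2))
#   K[:, 0, 2] = K[:, 1, 2] = 896 // 2
#   uv = torch.tensor([[[448.0, 448.0]]])               # (1, 1, 2) pixel coords
#   dist = torch.full((1, 1, 1), 2.0)                   # (1, 1, 1) distance
#   xyz = inverse_perspective_projection(uv, K, dist)   # (1, 1, 3), ~ (0, 0, 2)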


class PoseEstimator(torch.nn.Module):
    def __init__(self, model_path, device="cuda"):
        super().__init__()
        self.device = torch.device(device)
        self.mhmr_model = load_model(
            os.path.join(model_path, "pose_estimate", "multiHMR_896_L.pt"),
            model_path=model_path,
            device=self.device,
        )

        self.pad_ratio = 0.2  # extra border added around the input image
        self.img_size = 896  # Multi-HMR input resolution
        self.fov = 60  # assumed field of view in degrees

    def get_camera_parameters(self):
        K = torch.eye(3)

        # Pinhole focal length from the assumed FOV: f = img_size / (2 * tan(fov / 2)).
        focal = self.img_size / (2 * np.tan(np.radians(self.fov) / 2))
        K[0, 0], K[1, 1] = focal, focal

        # Principal point at the image centre.
        K[0, -1], K[1, -1] = self.img_size // 2, self.img_size // 2

        K = K.unsqueeze(0).to(self.device)
        return K

    def img_center_padding(self, img_np):

        ori_h, ori_w = img_np.shape[:2]

        w = round((1 + self.pad_ratio) * ori_w)
        h = round((1 + self.pad_ratio) * ori_h)

        # Paste the image into the centre of a larger black canvas.
        img_pad_np = np.zeros((h, w, 3), dtype=np.uint8)
        offset_h, offset_w = (h - img_np.shape[0]) // 2, (w - img_np.shape[1]) // 2
        img_pad_np[
            offset_h : offset_h + img_np.shape[0],
            offset_w : offset_w + img_np.shape[1],
        ] = img_np

        return img_pad_np, offset_w, offset_h

    def _preprocess(self, img_np):

        raw_img_size = max(img_np.shape[:2])

        # HWC uint8 numpy image -> NCHW float tensor on the target device.
        img_tensor = (
            torch.Tensor(img_np).to(self.device).unsqueeze(0).permute(0, 3, 1, 2)
        )

        # Resize so the longer side matches the model resolution, keeping the aspect ratio.
        _, _, h, w = img_tensor.shape
        scale_factor = min(self.img_size / w, self.img_size / h)
        img_tensor = F.interpolate(
            img_tensor, scale_factor=scale_factor, mode="bilinear"
        )

        # Zero-pad to a square img_size x img_size input.
        _, _, h, w = img_tensor.shape
        pad_left = (self.img_size - w) // 2
        pad_top = (self.img_size - h) // 2
        pad_right = self.img_size - w - pad_left
        pad_bottom = self.img_size - h - pad_top
        img_tensor = F.pad(
            img_tensor,
            (pad_left, pad_right, pad_top, pad_bottom),
            mode="constant",
            value=0,
        )

        resize_img = normalize_rgb_tensor(img_tensor)

        # Keep the parameters needed to map predictions back to the original image frame.
        annotation = (
            pad_left,
            pad_top,
            scale_factor,
            self.img_size / scale_factor,
            raw_img_size,
        )

        return resize_img, annotation

    @torch.no_grad()
    def forward(self, img_path):

        img_np = np.asarray(Image.open(img_path).convert("RGB"))

        raw_h, raw_w, _ = img_np.shape
        img_np, offset_w, offset_h = self.img_center_padding(img_np)
        img_tensor, annotation = self._preprocess(img_np)
        K = self.get_camera_parameters()

        target_human = self.mhmr_model(
            img_tensor,
            is_training=False,
            nms_kernel_size=int(3),
            det_thresh=0.3,
            K=K,
            idx=None,
            max_dist=None,
        )

        # The pipeline expects exactly one person in the image.
        if len(target_human) != 1:
            return SMPLXOutput(
                beta=None,
                is_full_body=False,
                msg="more than one human detected"
                if len(target_human) > 1
                else "no human detected",
            )

        # Map the predicted 2D joints back to the original (unpadded, unscaled) image frame.
        pad_left, pad_top, scale_factor, _, _ = annotation
        j2d = target_human[0]["j2d"]

        j2d = (
            j2d - torch.tensor([pad_left, pad_top], device=self.device).unsqueeze(0)
        ) / scale_factor
        j2d = j2d - torch.tensor([offset_w, offset_h], device=self.device).unsqueeze(0)

        # Count the joints that fall inside the original image, expanded by a small margin.
        scale_ratio = 0.025

        is_full_body = (
            (
                (j2d[..., 0] >= 0 - raw_w * scale_ratio)
                & (j2d[..., 0] < raw_w * (1 + scale_ratio))
                & (j2d[..., 1] >= 0 - raw_h * scale_ratio)
                & (j2d[..., 1] < raw_h * (1 + scale_ratio))
            )
            .sum(dim=-1)
            .item()
            >= 95
        )

        return SMPLXOutput(
            beta=target_human[0]["shape"].cpu().numpy(),
            is_full_body=is_full_body,
            msg="success" if is_full_body else "no full-body human detected",
        )
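

# Minimal usage sketch (assumptions: `./pretrained_models` contains
# `pose_estimate/multiHMR_896_L.pt` plus the SMPL-X assets expected by Model,
# and `./assets/example.png` is a test image; adjust both paths to your setup).
if __name__ == "__main__":
    estimator = PoseEstimator(model_path="./pretrained_models", device="cuda")
    output = estimator("./assets/example.png")
    print(output.msg)
    if output.is_full_body:
        # `beta` holds the SMPL-X shape coefficients of the detected person.
        print("beta shape:", output.beta.shape)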