Spaces:

Form-Fighter
/

FormFighterAIStack

Sleeping

File size: 12,696 Bytes

c87d1bc

import cv2
import torch
import random
import numpy as np
from . import transforms

def do_augmentation(scale_factor=0.2, trans_factor=0.1):
    """Sample a random scale and translation jitter.

    The scale is drawn uniformly from
    ``[1.2 - scale_factor, 1.2 + scale_factor]`` and each translation
    uniformly from ``[-trans_factor, trans_factor]``.

    Returns:
        Tuple ``(scale, trans_x, trans_y)`` of floats.
    """
    scale_lo, scale_hi = 1.2 - scale_factor, 1.2 + scale_factor
    scale = random.uniform(scale_lo, scale_hi)
    # x is drawn before y so the random stream is consumed in a fixed order.
    trans_x, trans_y = [
        random.uniform(-trans_factor, trans_factor) for _ in range(2)
    ]
    return scale, trans_x, trans_y

def get_transform(center, scale, res, rot=0):
    """Build the 3x3 affine matrix mapping image pixels into a crop.

    Args:
        center: (x, y) centre of the box in the source image.
        scale: box height divided by 200.
        res: output resolution as (rows, cols), i.e. (height, width).
        rot: rotation in degrees; ``0`` skips the rotation step.

    Returns:
        ``np.ndarray`` of shape (3, 3).
    """
    out_h, out_w = res[0], res[1]
    aspect = out_h / float(out_w)
    box_h = 200 * scale
    box_w = box_h / aspect

    mat = np.zeros((3, 3))
    mat[0, 0] = float(out_w) / box_w
    mat[1, 1] = float(out_h) / box_h
    mat[0, 2] = out_w * (-float(center[0]) / box_w + .5)
    mat[1, 2] = out_h * (-float(center[1]) / box_h + .5)
    mat[2, 2] = 1
    if rot == 0:
        return mat

    # Negated to match the direction of rotation used when cropping.
    angle = -rot * np.pi / 180
    sn, cs = np.sin(angle), np.cos(angle)
    rotation = np.array([
        [cs, -sn, 0.],
        [sn, cs, 0.],
        [0., 0., 1.],
    ])

    # Rotate about the crop centre: shift to origin, rotate, shift back.
    to_origin = np.eye(3)
    to_origin[0, 2] = -out_w / 2
    to_origin[1, 2] = -out_h / 2
    from_origin = np.eye(3)
    from_origin[:2, 2] = -to_origin[:2, 2]
    return np.dot(from_origin, np.dot(rotation, np.dot(to_origin, mat)))


def transform(pt, center, scale, res, invert=0, rot=0):
    """Map a 1-indexed pixel location between image and crop coordinates.

    NOTE: a second ``transform`` defined later in this module rebinds the
    name at import time; that later version truncates the result instead
    of rounding it as this one does.

    Args:
        pt: (x, y) location, 1-indexed.
        center, scale, res, rot: forwarded to ``get_transform``.
        invert: when truthy, apply the inverse mapping.

    Returns:
        Integer ``np.ndarray`` ``[x, y]``, 1-indexed.
    """
    mat = get_transform(center, scale, res, rot=rot)
    if invert:
        mat = np.linalg.inv(mat)
    # Work in 0-indexed homogeneous coordinates, then shift back by one.
    homog = np.array([pt[0] - 1, pt[1] - 1, 1.])
    mapped = np.dot(mat, homog)
    x, y = round(mapped[0]), round(mapped[1])
    return np.array([x, y], dtype=int) + 1


def crop_cliff(img, center, scale, res):
    """Crop ``img`` to the given bounding box and resize it to ``res``.

    Parts of the box that fall outside the image are left zero-filled.

    Args:
        img: image array indexed as (rows, cols[, channels]).
        center: (x, y) centre of the box in image pixels.
        scale: box size parameter as interpreted by ``get_transform``.
        res: output resolution as [rows, cols].

    Returns:
        Tuple ``(new_img, ul, br)``: the float32 crop resized to ``res``
        and the upper-left / bottom-right box corners as (x, y) arrays in
        image coordinates.
    """
    # Upper-left and bottom-right corners of the box in image coordinates.
    ul = np.array(transform([1, 1], center, scale, res, invert=1)) - 1
    br = np.array(transform([res[1] + 1, res[0] + 1], center, scale, res, invert=1)) - 1

    new_shape = [br[1] - ul[1], br[0] - ul[0]]
    if len(img.shape) > 2:
        new_shape += [img.shape[2]]
    new_img = np.zeros(new_shape, dtype=np.float32)

    img_h, img_w = img.shape[0], img.shape[1]
    # Range to fill in the new array
    new_x = max(0, -ul[0]), min(br[0], img_w) - ul[0]
    new_y = max(0, -ul[1]), min(br[1], img_h) - ul[1]
    # Range to sample from the original image
    old_x = max(0, ul[0]), min(img_w, br[0])
    old_y = max(0, ul[1]), min(img_h, br[1])

    try:
        new_img[new_y[0]:new_y[1], new_x[0]:new_x[1]] = img[old_y[0]:old_y[1], old_x[0]:old_x[1]]
    except ValueError as e:
        # A box lying (partly) beyond the image can yield mismatched slice
        # shapes; keep the best-effort behaviour of returning the
        # zero-filled crop and report the problem.
        print(e)

    new_img = cv2.resize(new_img, (res[1], res[0]))  # (cols, rows)

    return new_img, ul, br


def obtain_bbox(center, scale, res, org_res):
    """Return the box extent clipped to the original image bounds.

    Args:
        center: (x, y) centre of the box in image pixels.
        scale: box size parameter as interpreted by ``get_transform``.
        res: crop resolution as [rows, cols].
        org_res: original image size; ``org_res[0]`` bounds x and
            ``org_res[1]`` bounds y.

    Returns:
        Tuple ``(old_x, old_y)`` where each item is a ``(start, stop)``
        pair of pixel indices along that axis.
    """
    # Upper-left and bottom-right corners of the box in image coordinates.
    ul = np.array(transform([1, 1], center, scale, res, invert=1)) - 1
    br = np.array(transform([res[1] + 1, res[0] + 1], center, scale, res, invert=1)) - 1

    # Range to sample from the original image
    old_x = max(0, ul[0]), min(org_res[0], br[0])
    old_y = max(0, ul[1]), min(org_res[1], br[1])

    return old_x, old_y


def cam_crop2full(crop_cam, bbox, full_img_shape, focal_length=None):
    """Convert crop-space weak-perspective cameras to full-image translations.

    Args:
        crop_cam: tensor indexed as ``[:, :, k]`` whose last axis holds
            (s, tx, ty) in cropped-image coordinates.
        bbox: tensor whose last axis holds (c_x, c_y, scale); the box size
            in pixels is ``scale * 200``.
        full_img_shape: tensor read as ``[:, 0]`` = image height and
            ``[:, 1]`` = image width.
        focal_length: optional per-sample tensor. When ``None`` the image
            diagonal ``sqrt(w^2 + h^2)`` is used.

    Returns:
        Tensor with (tx, ty, tz) stacked along a new last axis.
    """
    center_x = bbox[..., 0].clone()
    center_y = bbox[..., 1].clone()
    box_size = bbox[..., 2].clone() * 200

    img_h = full_img_shape[:, 0]
    img_w = full_img_shape[:, 1]
    if focal_length is None:
        focal_length = (img_w * img_w + img_h * img_h) ** 0.5

    half_w = (img_w / 2.).unsqueeze(-1)
    half_h = (img_h / 2.).unsqueeze(-1)
    # The small epsilon keeps the divisions below finite when s == 0.
    scaled_box = box_size * crop_cam[:, :, 0] + 1e-9

    trans_z = 2 * focal_length.unsqueeze(-1) / scaled_box
    trans_x = (2 * (center_x - half_w) / scaled_box) + crop_cam[:, :, 1]
    trans_y = (2 * (center_y - half_h) / scaled_box) + crop_cam[:, :, 2]
    return torch.stack([trans_x, trans_y, trans_z], dim=-1)


def cam_pred2full(crop_cam, center, scale, full_img_shape, focal_length=2000.):
    """Convert crop-space weak-perspective cameras to full-image translations.

    Reference: CLIFF: Carrying Location Information in Full Frames into
    Human Pose and Shape Estimation.

    Args:
        crop_cam: shape (N, 3) weak-perspective camera (s, tx, ty) in
            cropped-image coordinates.
        center: shape (N, 2) box centre (c_x, c_y).
        scale: shape (N,) box size divided by 200.
        full_img_shape: shape (N, 2); column 0 is read as the image width
            and column 1 as the image height.
        focal_length: focal length used for the depth term.

    Returns:
        Tensor of shape (N, 3) holding (tx, ty, tz).
    """
    img_w = full_img_shape[:, 0]
    img_h = full_img_shape[:, 1]
    box_size = scale * 200
    # The small epsilon keeps the divisions below finite when s == 0.
    scaled_box = box_size * crop_cam[:, 0] + 1e-9

    offset_x = center[:, 0] - img_w / 2.
    offset_y = center[:, 1] - img_h / 2.

    trans_x = (2 * offset_x / scaled_box) + crop_cam[:, 1]
    trans_y = (2 * offset_y / scaled_box) + crop_cam[:, 2]
    trans_z = 2 * focal_length / scaled_box
    return torch.stack([trans_x, trans_y, trans_z], dim=-1)


def cam_full2pred(full_cam, center, scale, full_img_shape, focal_length=2000.):
    """Convert full-image camera translations back to crop-space cameras.

    Args:
        full_cam: shape (N, 3) translation (tx, ty, tz).
        center: shape (N, 2) box centre (c_x, c_y).
        scale: shape (N,) box size divided by 200.
        full_img_shape: shape (N, 2); column 0 is read as the image width
            and column 1 as the image height.
        focal_length: focal length that was used for the depth term.

    Returns:
        Tensor of shape (N, 3) holding the weak-perspective (s, tx, ty).
    """
    img_w = full_img_shape[:, 0]
    img_h = full_img_shape[:, 1]
    box_size = scale * 200

    # Recover the scaled box size from depth, then undo the centre offset.
    scaled_box = (2 * focal_length / full_cam[:, 2])
    offset_x = center[:, 0] - img_w / 2.
    offset_y = center[:, 1] - img_h / 2.

    weak_s = scaled_box / box_size
    weak_tx = full_cam[:, 0] - (2 * offset_x / scaled_box)
    weak_ty = full_cam[:, 1] - (2 * offset_y / scaled_box)
    return torch.stack([weak_s, weak_tx, weak_ty], dim=-1)


def obtain_camera_intrinsics(image_shape, focal_length):
    """Build per-sample pinhole intrinsics centred on the image.

    Args:
        image_shape: tensor whose last axis is read as (width, height).
        focal_length: tensor with leading dimension N; used for both the
            x and y focal terms.

    Returns:
        Tensor of shape (N, 1, 3, 3) on ``focal_length.device``.
    """
    res_w = image_shape[..., 0]
    res_h = image_shape[..., 1]
    batch = focal_length.shape[0]

    # Use ``repeat`` rather than ``expand``: ``expand`` yields a stride-0
    # view in which every sample aliases the same 3x3 storage, and PyTorch
    # rejects in-place writes into such overlapping memory (this surfaced
    # on CPU whenever the batch held more than one sample).
    K = torch.eye(3, device=focal_length.device).repeat(batch, 1, 1)
    K[..., 0, 0] = focal_length
    K[..., 1, 1] = focal_length
    K[..., 0, 2] = res_w / 2
    K[..., 1, 2] = res_h / 2

    return K.unsqueeze(1)


def trans_point2d(pt_2d, trans):
    """Apply the affine matrix ``trans`` to a 2D point.

    The point is lifted to homogeneous coordinates ``[x, y, 1]`` and the
    first two components of ``trans @ [x, y, 1]`` are returned.
    """
    homogeneous = np.array([pt_2d[0], pt_2d[1], 1.])
    return np.dot(trans, homogeneous)[0:2]

def rotate_2d(pt_2d, rot_rad):
    """Rotate a 2D point about the origin by ``rot_rad`` radians.

    Returns:
        ``np.ndarray`` of dtype float32 with the rotated (x, y).
    """
    px, py = pt_2d[0], pt_2d[1]
    sin_a = np.sin(rot_rad)
    cos_a = np.cos(rot_rad)
    rotated = [px * cos_a - py * sin_a, px * sin_a + py * cos_a]
    return np.array(rotated, dtype=np.float32)

def gen_trans_from_patch_cv(c_x, c_y, src_width, src_height, dst_width, dst_height, scale, rot, inv=False):
    """Compute the affine transform between a source box and a destination patch.

    The mapping is defined by three point correspondences: the box centre,
    the centre shifted half a height "down", and the centre shifted half a
    width "right". On the source side the box is enlarged by ``scale`` and
    the two direction vectors are rotated by ``rot`` degrees.

    Args:
        c_x, c_y: centre of the source box.
        src_width, src_height: size of the source box before scaling.
        dst_width, dst_height: size of the destination patch.
        scale: multiplicative enlargement of the source box.
        rot: rotation of the source box in degrees.
        inv: when true, return the destination-to-source transform.

    Returns:
        The matrix produced by ``cv2.getAffineTransform``.
    """
    angle = np.pi * rot / 180

    # Source triangle: centre, rotated "down" and rotated "right" points.
    src_center = np.zeros(2)
    src_center[0] = c_x
    src_center[1] = c_y
    src_down = rotate_2d(np.array([0, src_height * scale * 0.5], dtype=np.float32), angle)
    src_right = rotate_2d(np.array([src_width * scale * 0.5, 0], dtype=np.float32), angle)
    src = np.stack(
        [src_center, src_center + src_down, src_center + src_right]
    ).astype(np.float32)

    # Destination triangle: the same three landmarks in the output patch.
    dst_center = np.array([dst_width * 0.5, dst_height * 0.5], dtype=np.float32)
    dst_down = np.array([0, dst_height * 0.5], dtype=np.float32)
    dst_right = np.array([dst_width * 0.5, 0], dtype=np.float32)
    dst = np.stack([dst_center, dst_center + dst_down, dst_center + dst_right])

    origin, target = (dst, src) if inv else (src, dst)
    return cv2.getAffineTransform(np.float32(origin), np.float32(target))

def transform_keypoints(kp_2d, bbox, patch_width, patch_height):
    """Map 2D keypoints from image coordinates into a bbox-aligned patch.

    ``kp_2d`` is overwritten row by row and the same object is returned.

    Args:
        kp_2d: keypoints indexed as ``kp_2d[i] -> (x, y)``.
        bbox: sequence whose first three entries are
            (center_x, center_y, scale); the box side is ``scale * 200``.
        patch_width, patch_height: size of the destination patch.

    Returns:
        Tuple ``(kp_2d, trans)`` with the transformed keypoints and the
        affine matrix that was applied.
    """
    center_x, center_y, box_scale = bbox[:3]
    box_side = box_scale * 200

    # The box is used as-is: no extra enlargement (1.0) and no rotation (0).
    trans = gen_trans_from_patch_cv(
        center_x, center_y,
        box_side, box_side,
        patch_width, patch_height,
        1.0, 0,
        inv=False,
    )

    for idx, joint in enumerate(kp_2d):
        kp_2d[idx] = trans_point2d(joint, trans)

    return kp_2d, trans


def transform(pt, center, scale, res, invert=0, rot=0):
    """Map a 1-indexed pixel location between image and crop coordinates.

    NOTE: this definition rebinds the name of an earlier ``transform`` in
    this module. Unlike that one, it truncates the mapped coordinates
    toward zero (``astype(int)``) instead of rounding them.

    Args:
        pt: (x, y) location, 1-indexed.
        center, scale, res, rot: forwarded to ``get_transform``.
        invert: when truthy, apply the inverse mapping.

    Returns:
        Integer ``np.ndarray`` ``[x, y]``, 1-indexed.
    """
    mat = get_transform(center, scale, res, rot=rot)
    mat = np.linalg.inv(mat) if invert else mat
    # Work in 0-indexed homogeneous coordinates, then shift back by one.
    homog = np.array([pt[0] - 1, pt[1] - 1, 1.])
    xy = np.dot(mat, homog)[:2]
    return xy.astype(int) + 1


def compute_cam_intrinsics(res):
    """Return default camera intrinsics for an image of size ``res``.

    The focal length is set to the image diagonal and the principal point
    to the image centre.

    Args:
        res: pair ``(img_w, img_h)``.

    Returns:
        float32 tensor of shape (1, 3, 3).
    """
    img_w, img_h = res
    diagonal = (img_w * img_w + img_h * img_h) ** 0.5

    K = torch.eye(3).float().unsqueeze(0)
    K[:, 0, 0] = diagonal
    K[:, 1, 1] = diagonal
    K[:, 0, 2] = img_w / 2.
    K[:, 1, 2] = img_h / 2.
    return K


def flip_kp(kp, img_w=None):
    """Swap left/right keypoints and optionally mirror their x coordinate.

    Args:
        kp: keypoints with 17 entries on the second-to-last axis.
        img_w: image width; when given, the first coordinate of every
            keypoint becomes ``img_w - x`` (2D keypoints are assumed).

    Returns:
        A re-ordered copy of ``kp``; the input is not modified.
    """
    # Index 0 stays in place; (1, 2), (3, 4), ..., (15, 16) trade places.
    swap_order = [0] + [j for i in range(1, 17, 2) for j in (i + 1, i)]
    mirrored = kp[..., swap_order, :]

    if img_w is None:
        return mirrored

    mirrored[..., 0] = img_w - mirrored[..., 0]
    return mirrored


def flip_bbox(bbox, img_w, img_h):
    """Mirror bounding boxes horizontally about the image centre.

    Args:
        bbox: array whose last axis starts with the (x, y) centre and ends
            with the scale.
        img_w: image width.
        img_h: image height.

    Returns:
        New array with the mirrored centre followed by the unchanged scale.
    """
    half_extent = np.ones_like(bbox[..., :2])
    half_extent[..., 0] *= img_w
    half_extent[..., 1] *= img_h
    half_extent = half_extent / 2

    # Express the centre relative to the image centre, negate x, shift back.
    offset = bbox[..., :2] - half_extent
    offset[..., 0] = -offset[..., 0]
    mirrored_center = offset + half_extent

    return np.concatenate((mirrored_center, bbox[..., -1:]), axis=-1)


def flip_pose(rotation, representation='rotation_6d'):
    """Mirror SMPL joint rotations left-to-right.

    The rotations are converted to axis-angle, left/right joints are
    swapped, the second and third axis-angle components are negated, and
    the result is converted back to the representation that was passed in.

    Args:
        rotation: tensor with leading dimension BN holding the 24 SMPL
            joint rotations of each sample in ``representation``.
        representation: one of ``'axis_angle'``, ``'matrix'`` or
            ``'rotation_6d'``.

    Returns:
        The flipped rotations in the same representation as the input.
        For ``'axis_angle'`` the shape is (BN, 24, 3). The input tensor is
        not modified.

    Raises:
        ValueError: if ``representation`` is not one of the names above.
    """
    BN = rotation.shape[0]

    if representation == 'axis_angle':
        pose = rotation
    elif representation == 'matrix':
        pose = transforms.matrix_to_axis_angle(rotation)
    elif representation == 'rotation_6d':
        pose = transforms.matrix_to_axis_angle(
            transforms.rotation_6d_to_matrix(rotation)
        )
    else:
        raise ValueError(f"Unknown representation: {representation}")

    SMPL_JOINTS_FLIP_PERM = [0, 2, 1, 3, 5, 4, 6, 8, 7, 9, 11, 10, 12, 14, 13, 15, 17, 16, 19, 18, 21, 20, 23, 22]

    # Indexing with a list copies the data, so the in-place negation below
    # never touches the caller's tensor.
    pose = pose.reshape(BN, -1, 3)[:, SMPL_JOINTS_FLIP_PERM]

    # we also negate the second and the third dimension of the axis-angle
    pose[..., 1:] = -pose[..., 1:]

    # Convert back using the SAME representation names accepted above, so
    # 'axis_angle' and 'matrix' inputs are returned in their own format
    # rather than falling through to rotation_6d.
    if representation == 'axis_angle':
        return pose
    if representation == 'matrix':
        return transforms.axis_angle_to_matrix(pose)
    return transforms.matrix_to_rotation_6d(
        transforms.axis_angle_to_matrix(pose)
    )
        
def avg_preds(rotation, shape, flipped_rotation, flipped_shape, representation='rotation_6d'):
    """Average predictions made on an input and on its mirrored copy.

    The flipped rotation is first un-mirrored with ``flip_pose``; both
    rotations are then averaged as rotation matrices via
    ``transforms.avg_rot`` and converted back to ``representation``. The
    shape parameters are averaged arithmetically.

    Args:
        rotation: rotations predicted on the original input.
        shape: shape parameters predicted on the original input.
        flipped_rotation: rotations predicted on the mirrored input.
        flipped_shape: shape parameters predicted on the mirrored input.
        representation: rotation representation name; for anything other
            than ``'matrix'`` the converters
            ``transforms.<representation>_to_matrix`` and
            ``transforms.matrix_to_<representation>`` are used.

    Returns:
        Tuple ``(avg_rotation, avg_shape)``.
    """
    # Rotation
    flipped_rotation = flip_pose(flipped_rotation, representation=representation)

    convert = representation != 'matrix'
    if convert:
        # Resolve the converter by attribute name; this avoids eval() on a
        # string assembled from a caller-supplied argument.
        to_matrix = getattr(transforms, f'{representation}_to_matrix')
        flipped_rotation = to_matrix(flipped_rotation)
        rotation = to_matrix(rotation)

    avg_rotation = torch.stack([rotation, flipped_rotation])
    avg_rotation = transforms.avg_rot(avg_rotation)

    if convert:
        from_matrix = getattr(transforms, f'matrix_to_{representation}')
        avg_rotation = from_matrix(avg_rotation)

    # Shape
    avg_shape = (shape + flipped_shape) / 2.0

    return avg_rotation, avg_shape