"""Data augmentation utilities: keypoint noise, SMPL parameter noise, sequence re-timing, and synthetic camera motion."""
from __future__ import absolute_import | |
from __future__ import print_function | |
from __future__ import division | |
from configs import constants as _C | |
import torch | |
import numpy as np | |
from torch.nn import functional as F | |
from ...utils import transforms | |
__all__ = ['VideoAugmentor', 'SMPLAugmentor', 'SequenceAugmentor', 'CameraAugmentor'] | |
num_joints = _C.KEYPOINTS.NUM_JOINTS | |
class VideoAugmentor():
    """Synthesize realistic 2D/3D keypoint corruption: Gaussian jitter,
    low-frequency peak noise, constant per-joint bias, and occlusion masks.

    Noise statistics are loaded from the COCO augmentation dictionary
    configured in ``_C.KEYPOINTS``.
    """

    def __init__(self, cfg, train=True):
        self.train = train
        # One extra frame beyond the training sequence length.
        self.l = cfg.DATASET.SEQLEN + 1
        self.aug_dict = torch.load(_C.KEYPOINTS.COCO_AUG_DICT)

    def get_jitter(self, ):
        """Per-frame Gaussian jitter scaled by per-joint statistics."""
        sigma = self.aug_dict['jittering'].reshape(1, num_joints, 1).expand(self.l, -1, 3)
        noise = torch.normal(mean=torch.zeros((self.l, num_joints, 3)), std=sigma)
        return noise * _C.KEYPOINTS.S_JITTERING

    def get_lfhp(self, ):
        """Low-frequency, high-peak noise: rare but large per-joint spikes."""
        # Joints whose scaled uniform draw falls below the threshold get a spike.
        spike_mask = torch.rand(self.l, num_joints).float() * self.aug_dict['pmask'].squeeze(0)
        spike_mask = spike_mask < _C.KEYPOINTS.S_PEAK_MASK
        spikes = spike_mask.float().unsqueeze(-1).repeat(1, 1, 3)
        return spikes * torch.randn(3) * self.aug_dict['peak'].reshape(1, -1, 1) * _C.KEYPOINTS.S_PEAK

    def get_bias(self, ):
        """Constant (sequence-wide) per-joint bias noise."""
        sigma = self.aug_dict['bias'].reshape(num_joints, 1)
        noise = torch.normal(mean=torch.zeros((num_joints, 3)), std=sigma)
        return noise.unsqueeze(0) * _C.KEYPOINTS.S_BIAS

    def get_mask(self, scale=None):
        """Random per-frame/per-joint occlusion mask.

        A joint is also masked whenever any of its kinematic parents
        (per ``_C.KEYPOINTS.TREE``) is masked.
        """
        scale = _C.KEYPOINTS.S_MASK if scale is None else scale
        # Independent per-frame, per-joint dropout.
        dropped = torch.rand(self.l, num_joints) < scale
        visible = (~dropped).clone()
        for child in range(num_joints):
            parent = _C.KEYPOINTS.TREE[child]
            if parent == -1:
                continue  # root joint has no parent
            if isinstance(parent, list):
                # Joint defined by two parents: needs both visible.
                visible[:, child] *= (visible[:, parent[0]] * visible[:, parent[1]])
            else:
                visible[:, child] *= visible[:, parent]
        return (~visible).clone()

    def __call__(self, keypoints):
        # Note: mutates `keypoints` in place and returns it.
        keypoints += self.get_bias() + self.get_jitter() + self.get_lfhp()
        return keypoints
class SMPLAugmentor():
    """Augment SMPL targets: random global yaw, shape (beta) noise, and a
    slightly perturbed initial-frame pose to simulate mis-prediction."""

    # Std-dev of the per-joint axis-angle noise for the initial pose.
    noise_scale = 1e-2

    def __init__(self, cfg, augment=True):
        self.n_frames = cfg.DATASET.SEQLEN
        self.augment = augment

    def __call__(self, target):
        if not self.augment:
            # Evaluation path: only synthesize an initial pose if missing.
            if 'init_pose' not in target:
                target['init_pose'] = target['pose'][:1] @ self.get_initial_pose_augmentation()
            return target

        n_frames = target['pose'].shape[0]

        # Random rotation about the world y-axis applied to root pose and translation.
        rot = self.get_global_augmentation()
        target['pose'][:, 0] = rot @ target['pose'][:, 0]
        target['transl'] = (rot.squeeze() @ target['transl'].T).T

        # Additive noise on the shape coefficients (shared across frames).
        target['betas'] = target['betas'] + self.get_shape_augmentation(n_frames)

        # Perturbed first frame mimics an imperfect initial prediction.
        target['init_pose'] = target['pose'][:1] @ self.get_initial_pose_augmentation()
        return target

    def get_global_augmentation(self, ):
        """Random rotation around the y-axis (identity when augment=False)."""
        yaw = torch.rand(1) * 2 * np.pi * float(self.augment)
        axis_angle = torch.tensor([0.0, yaw, 0.0]).float().unsqueeze(0)
        return transforms.axis_angle_to_matrix(axis_angle)

    def get_shape_augmentation(self, n_frames):
        """Gaussian noise on the 10 betas (zero when augment=False)."""
        sigma = torch.ones((1, 10)) * 0.1 * float(self.augment)
        return torch.normal(mean=torch.zeros((1, 10)), std=sigma).expand(n_frames, 10)

    def get_initial_pose_augmentation(self, ):
        """Small random rotation applied to each of the 24 SMPL joints."""
        noise = torch.normal(mean=torch.zeros((24, 3)), std=torch.ones((24, 3))) * self.noise_scale
        return transforms.axis_angle_to_matrix(noise).unsqueeze(0)
class SequenceAugmentor:
    """Randomly re-time a motion sequence (play it faster or slower) by
    linear interpolation back to a fixed length."""

    # Max speed-up / slow-down ratio.
    l_factor = 1.5

    def __init__(self, l_default):
        self.l_default = l_default

    def __call__(self, target):
        # Draw the number of source frames to stretch/compress into l_default.
        low = int(self.l_default / self.l_factor)
        high = int(self.l_default * self.l_factor)
        src_len = torch.randint(low=low, high=high, size=(1, ))

        # Interpolate rotations in the continuous 6D representation.
        pose6d = transforms.matrix_to_rotation_6d(target['pose'])
        pose6d = F.interpolate(
            pose6d[:src_len].permute(1, 2, 0), self.l_default, mode='linear', align_corners=True
        ).permute(2, 0, 1)
        target['pose'] = transforms.rotation_6d_to_matrix(pose6d)

        # Same temporal resampling for the root translation.
        transl = target['transl'].unsqueeze(1)
        target['transl'] = F.interpolate(
            transl[:src_len].permute(1, 2, 0), self.l_default, mode='linear', align_corners=True
        ).squeeze(0).T

        # Betas are frame-constant; just truncate to the target length.
        target['betas'] = target['betas'][:self.l_default]
        return target
class CameraAugmentor:
    """Synthesize a (possibly moving) virtual camera and express the target
    motion in that camera's coordinate frame."""

    # Std-devs (radians) of the extra rotation between first and last frame.
    rx_factor = np.pi/8
    ry_factor = np.pi/4
    rz_factor = np.pi/8
    # Initial orientation: pitch ~ N(pitch_mean, pitch_std), roll ~ N(0, roll_std).
    pitch_std = np.pi/8
    pitch_mean = np.pi/36
    roll_std = np.pi/24
    # Std-dev (per axis) of the translational camera move over the sequence.
    t_factor = 1
    # Subject depth tz ~ U(tz_min, tz_min + tz_scale).
    tz_scale = 10
    tz_min = 2
    # Probability that the camera moves during the sequence.
    motion_prob = 0.75
    # Timeline jitter for the non-linear interpolation, as a fraction of one step.
    interp_noise = 0.2

    def __init__(self, l, w, h, f):
        # l: sequence length (frames); w, h: image width/height; f: focal length.
        # NOTE(review): w and f are presumably in pixels so w*tz/f is metric — confirm.
        self.l = l
        self.w = w
        self.h = h
        self.f = f
        # Max tolerated |x/z| (or |y/z|) before the camera is pushed back.
        self.fov_tol = 1.2 * (0.5 ** 0.5)

    def __call__(self, target):
        """Sample a camera (static or moving) and apply it to `target`."""
        R, T = self.create_camera(target)
        if np.random.rand() < self.motion_prob:
            R = self.create_rotation_move(R)
            T = self.create_translation_move(T)
        return self.apply(target, R, T)

    def create_camera(self, target):
        """Create the initial frame camera pose.

        Returns (R, T) each repeated to sequence length: R is (l, 3, 3),
        T is (l, 3).
        """
        # Orientation: uniform yaw, Gaussian pitch (biased down/up by pitch_mean)
        # and Gaussian roll, composed as roll @ pitch @ yaw.
        yaw = np.random.rand() * 2 * np.pi
        pitch = np.random.normal(scale=self.pitch_std) + self.pitch_mean
        roll = np.random.normal(scale=self.roll_std)
        yaw_rm = transforms.axis_angle_to_matrix(torch.tensor([[0, yaw, 0]]).float())
        pitch_rm = transforms.axis_angle_to_matrix(torch.tensor([[pitch, 0, 0]]).float())
        roll_rm = transforms.axis_angle_to_matrix(torch.tensor([[0, 0, roll]]).float())
        R = (roll_rm @ pitch_rm @ yaw_rm)

        # Place people in the scene
        tz = np.random.rand() * self.tz_scale + self.tz_min
        # Max lateral offset that still projects inside the image half-width at depth tz.
        max_d = self.w * tz / self.f / 2
        tx = np.random.normal(scale=0.25) * max_d
        ty = np.random.normal(scale=0.25) * max_d
        dist = torch.tensor([tx, ty, tz]).float()
        # Choose T so the first-frame subject lands at `dist` in camera coords.
        T = dist - torch.matmul(R, target['transl'][0])

        return R.repeat(self.l, 1, 1), T.repeat(self.l, 1)

    def create_rotation_move(self, R):
        """Create rotational move for the camera.

        Samples a final orientation and interpolates (with noise) from the
        initial one; returns per-frame rotations of shape (l, 3, 3).
        """
        # Create final camera pose
        rx = np.random.normal(scale=self.rx_factor)
        ry = np.random.normal(scale=self.ry_factor)
        rz = np.random.normal(scale=self.rz_factor)
        Rf = R[0] @ transforms.axis_angle_to_matrix(torch.tensor([rx, ry, rz]).float())

        # Inbetweening two poses: interpolate in the 6D rotation representation.
        Rs = torch.stack((R[0], Rf))
        rs = transforms.matrix_to_rotation_6d(Rs).numpy()
        rs_move = self.noisy_interpolation(rs)
        R_move = transforms.rotation_6d_to_matrix(torch.from_numpy(rs_move).float())
        return R_move

    def create_translation_move(self, T):
        """Create translational move for the camera.

        Interpolates from zero offset to a random final offset and adds it
        to the per-frame translation T.
        """
        # Create final camera position
        tx = np.random.normal(scale=self.t_factor)
        ty = np.random.normal(scale=self.t_factor)
        tz = np.random.normal(scale=self.t_factor)
        Ts = np.array([[0, 0, 0], [tx, ty, tz]])

        T_move = self.noisy_interpolation(Ts)
        T_move = torch.from_numpy(T_move).float()
        return T_move + T

    def noisy_interpolation(self, data):
        """Non-linear interpolation with noise.

        `data` is a (2, dim) array of endpoints; returns an (l, dim) array
        interpolated along a jittered timeline (endpoints kept exact).
        """
        dim = data.shape[-1]
        output = np.zeros((self.l, dim))

        linspace = np.stack([np.linspace(0, 1, self.l) for _ in range(dim)])
        # Jitter amplitude proportional to one timeline step.
        noise = (linspace[0, 1] - linspace[0, 0]) * self.interp_noise
        space_noise = np.stack([np.random.uniform(-noise, noise, self.l - 2) for _ in range(dim)])

        # Perturb only interior samples so the endpoints stay at 0 and 1.
        linspace[:, 1:-1] = linspace[:, 1:-1] + space_noise
        for i in range(dim):
            output[:, i] = np.interp(linspace[i], np.array([0., 1.,]), data[:, i])
        return output

    def apply(self, target, R, T):
        """Transform `target` into the sampled camera frame (in place).

        Stores R, T, camera-frame translation, camera-frame poses, and the
        camera angular velocity; also maps 'kp3d' if present.
        """
        target['R'] = R
        target['T'] = T

        # Recompute the translation
        transl_cam = torch.matmul(R, target['transl'].unsqueeze(-1)).squeeze(-1)
        transl_cam = transl_cam + T
        if transl_cam[..., 2].min() < 0.5:    # If the person is too close to the camera
            # Shift the whole trajectory back so the nearest frame sits at z = 1.
            transl_cam[..., 2] = transl_cam[..., 2] + (1.0 - transl_cam[..., 2].min())

        # If the subject is away from the field of view, put the camera behind
        fov = torch.div(transl_cam[..., :2], transl_cam[..., 2:]).abs()
        if fov.max() > self.fov_tol:
            # Frame with the worst |x/z| or |y/z| ratio drives the correction.
            t_max = transl_cam[fov.max(1)[0].max(0)[1].item()]
            # Depth needed for that frame to fall just inside the tolerance.
            z_trg = t_max[:2].abs().max(0)[0] / self.fov_tol
            pad = z_trg - t_max[2]
            transl_cam[..., 2] = transl_cam[..., 2] + pad

        target['transl_cam'] = transl_cam

        # Transform world coordinate to camera coordinate
        target['pose_root'] = target['pose'][:, 0].clone()  # keep the world-frame root pose
        target['pose'][:, 0] = R @ target['pose'][:, 0]    # pose
        target['init_pose'][:, 0] = R[:1] @ target['init_pose'][:, 0]    # init pose

        # Compute angular velocity (frame-to-frame relative rotation in 6D).
        cam_angvel = transforms.matrix_to_rotation_6d(R[:-1] @ R[1:].transpose(-1, -2))
        # Subtract the 6D encoding of the identity so zero motion maps to zero.
        cam_angvel = cam_angvel - torch.tensor([[1, 0, 0, 0, 1, 0]]).to(cam_angvel)    # Normalize
        target['cam_angvel'] = cam_angvel * 3e1    # assume 30-fps

        if 'kp3d' in target:
            # NOTE(review): assumes kp3d is (l, J, 3) given the transpose(1, 2) — confirm.
            target['kp3d'] = torch.matmul(R, target['kp3d'].transpose(1, 2)).transpose(1, 2) + target['transl_cam'].unsqueeze(1)
        return target