Metric3D / training /mono /datasets /scannet_dataset.py

zach

initial commit based on github repo

3ef1661 over 1 year ago

14 kB

	import os
	import json
	import torch
	import torchvision.transforms as transforms
	import os.path
	import numpy as np
	import cv2
	from torch.utils.data import Dataset
	import random
	from .__base_dataset__ import BaseDataset


	class ScanNetDataset(BaseDataset):
	def __init__(self, cfg, phase, **kwargs):
	super(ScanNetDataset, self).__init__(
	cfg=cfg,
	phase=phase,
	**kwargs)
	self.metric_scale = cfg.metric_scale

	# def get_data_for_test(self, idx):
	# anno = self.annotations['files'][idx]
	# curr_rgb_path = os.path.join(self.data_root, anno['rgb'])
	# curr_depth_path = os.path.join(self.depth_root, anno['depth'])
	# meta_data = self.load_meta_data(anno)
	# ori_curr_intrinsic = meta_data['cam_in']

	# curr_rgb, curr_depth = self.load_rgb_depth(curr_rgb_path, curr_depth_path)
	# # curr_rgb = cv2.resize(curr_rgb, dsize=(640, 480), interpolation=cv2.INTER_LINEAR)
	# ori_h, ori_w, _ = curr_rgb.shape
	# # create camera model
	# curr_cam_model = self.create_cam_model(curr_rgb.shape[0], curr_rgb.shape[1], ori_curr_intrinsic)
	# # load tmpl rgb info
	# # tmpl_annos = self.load_tmpl_annos(anno, curr_rgb, meta_data)
	# # tmpl_rgb = tmpl_annos['tmpl_rgb_list'] # list of reference rgbs

	# transform_paras = dict()
	# rgbs, depths, intrinsics, cam_models, _, other_labels, transform_paras = self.img_transforms(
	# images=[curr_rgb, ],
	# labels=[curr_depth, ],
	# intrinsics=[ori_curr_intrinsic,],
	# cam_models=[curr_cam_model, ],
	# transform_paras=transform_paras)
	# # depth in original size
	# depth_out = self.clip_depth(curr_depth) * self.depth_range[1]

	# filename = os.path.basename(anno['rgb'])
	# curr_intrinsic_mat = self.intrinsics_list2mat(intrinsics[0])

	# pad = transform_paras['pad'] if 'pad' in transform_paras else [0,0,0,0]
	# scale_ratio = transform_paras['label_scale_factor'] if 'label_scale_factor' in transform_paras else 1.0
	# cam_models_stacks = [
	# torch.nn.functional.interpolate(cam_models[0][None, :, :, :], size=(cam_models[0].shape[1]//i, cam_models[0].shape[2]//i), mode='bilinear', align_corners=False).squeeze()
	# for i in [2, 4, 8, 16, 32]
	# ]
	# raw_rgb = torch.from_numpy(curr_rgb)
	# data = dict(input=rgbs[0],
	# target=depth_out,
	# intrinsic=curr_intrinsic_mat,
	# filename=filename,
	# dataset=self.data_name,
	# cam_model=cam_models_stacks,
	# ref_input=rgbs[1:],
	# tmpl_flg=False,
	# pad=pad,
	# scale=scale_ratio,
	# raw_rgb=raw_rgb,
	# normal =np.zeros_like(curr_rgb.transpose((2,0,1))),
	# )
	# return data

	def get_data_for_test(self, idx: int, test_mode=True):
	anno = self.annotations['files'][idx]
	meta_data = self.load_meta_data(anno)
	data_path = self.load_data_path(meta_data)
	data_batch = self.load_batch(meta_data, data_path, test_mode)
	# load data
	curr_rgb, curr_depth, curr_normal, curr_cam_model = data_batch['curr_rgb'], data_batch['curr_depth'], data_batch['curr_normal'], data_batch['curr_cam_model']
	ori_curr_intrinsic = meta_data['cam_in']

	# get crop size
	transform_paras = dict()
	rgbs, depths, intrinsics, cam_models, _, other_labels, transform_paras = self.img_transforms(
	images=[curr_rgb,], #+ tmpl_rgbs,
	labels=[curr_depth, ],
	intrinsics=[ori_curr_intrinsic, ], # * (len(tmpl_rgbs) + 1),
	cam_models=[curr_cam_model, ],
	transform_paras=transform_paras)
	# depth in original size and orignial metric***
	depth_out = self.clip_depth(curr_depth) * self.depth_range[1] # self.clip_depth(depths[0]) #
	inv_depth = self.depth2invdepth(depth_out, np.zeros_like(depth_out, dtype=np.bool))
	filename = os.path.basename(meta_data['rgb'])[:-4] + '.jpg'
	curr_intrinsic_mat = self.intrinsics_list2mat(intrinsics[0])

	pad = transform_paras['pad'] if 'pad' in transform_paras else [0,0,0,0]
	scale_ratio = transform_paras['label_scale_factor'] if 'label_scale_factor' in transform_paras else 1.0
	cam_models_stacks = [
	torch.nn.functional.interpolate(cam_models[0][None, :, :, :], size=(cam_models[0].shape[1]//i, cam_models[0].shape[2]//i), mode='bilinear', align_corners=False).squeeze()
	for i in [2, 4, 8, 16, 32]
	]
	raw_rgb = torch.from_numpy(curr_rgb)
	curr_normal = torch.from_numpy(curr_normal.transpose((2,0,1)))


	data = dict(input=rgbs[0],
	target=depth_out,
	intrinsic=curr_intrinsic_mat,
	filename=filename,
	dataset=self.data_name,
	cam_model=cam_models_stacks,
	pad=pad,
	scale=scale_ratio,
	raw_rgb=raw_rgb,
	sample_id=idx,
	data_path=meta_data['rgb'],
	inv_depth=inv_depth,
	normal=curr_normal,
	)
	return data

	def get_data_for_trainval(self, idx: int):
	anno = self.annotations['files'][idx]
	meta_data = self.load_meta_data(anno)

	data_path = self.load_data_path(meta_data)
	data_batch = self.load_batch(meta_data, data_path, test_mode=False)

	# if data_path['sem_path'] is not None:
	# print(self.data_name)

	curr_rgb, curr_depth, curr_normal, curr_sem, curr_cam_model = data_batch['curr_rgb'], data_batch['curr_depth'], data_batch['curr_normal'], data_batch['curr_sem'], data_batch['curr_cam_model']
	#curr_stereo_depth = data_batch['curr_stereo_depth']

	# A patch for stereo depth dataloader (no need to modify specific datasets)
	if 'curr_stereo_depth' in data_batch.keys():
	curr_stereo_depth = data_batch['curr_stereo_depth']
	else:
	curr_stereo_depth = self.load_stereo_depth_label(None, H=curr_rgb.shape[0], W=curr_rgb.shape[1])

	curr_intrinsic = meta_data['cam_in']
	# data augmentation
	transform_paras = dict(random_crop_size = self.random_crop_size) # dict()
	assert curr_rgb.shape[:2] == curr_depth.shape == curr_normal.shape[:2] == curr_sem.shape
	rgbs, depths, intrinsics, cam_models, normals, other_labels, transform_paras = self.img_transforms(
	images=[curr_rgb, ],
	labels=[curr_depth, ],
	intrinsics=[curr_intrinsic,],
	cam_models=[curr_cam_model, ],
	normals = [curr_normal, ],
	other_labels=[curr_sem, curr_stereo_depth],
	transform_paras=transform_paras)
	# process sky masks
	sem_mask = other_labels[0].int()
	# clip depth map
	depth_out = self.normalize_depth(depths[0])
	# set the depth of sky region to the invalid
	depth_out[sem_mask==142] = -1 # self.depth_normalize[1] - 1e-6
	# get inverse depth
	inv_depth = self.depth2invdepth(depth_out, sem_mask==142)
	filename = os.path.basename(meta_data['rgb'])[:-4] + '.jpg'
	curr_intrinsic_mat = self.intrinsics_list2mat(intrinsics[0])
	cam_models_stacks = [
	torch.nn.functional.interpolate(cam_models[0][None, :, :, :], size=(cam_models[0].shape[1]//i, cam_models[0].shape[2]//i), mode='bilinear', align_corners=False).squeeze()
	for i in [2, 4, 8, 16, 32]
	]

	# stereo_depth
	stereo_depth_pre_trans = other_labels[1] * (other_labels[1] > 0.3) * (other_labels[1] < 200)
	stereo_depth = stereo_depth_pre_trans * transform_paras['label_scale_factor']
	stereo_depth = self.normalize_depth(stereo_depth)

	pad = transform_paras['pad'] if 'pad' in transform_paras else [0,0,0,0]
	data = dict(input=rgbs[0],
	target=depth_out,
	intrinsic=curr_intrinsic_mat,
	filename=filename,
	dataset=self.data_name,
	cam_model=cam_models_stacks,
	pad=torch.tensor(pad),
	data_type=[self.data_type, ],
	sem_mask=sem_mask.int(),
	stereo_depth= stereo_depth,
	normal=normals[0],
	inv_depth=inv_depth,
	scale=transform_paras['label_scale_factor'])
	return data

	def load_batch(self, meta_data, data_path, test_mode):

	# print('############')
	# print(data_path['rgb_path'])
	# print(data_path['normal_path'])
	# print('############')

	curr_intrinsic = meta_data['cam_in']
	# load rgb/depth
	curr_rgb, curr_depth = self.load_rgb_depth(data_path['rgb_path'], data_path['depth_path'], test_mode)
	# get semantic labels
	curr_sem = self.load_sem_label(data_path['sem_path'], curr_depth)
	# create camera model
	curr_cam_model = self.create_cam_model(curr_rgb.shape[0], curr_rgb.shape[1], curr_intrinsic)
	# get normal labels
	curr_normal = self.load_norm_label(data_path['normal_path'], H=curr_rgb.shape[0], W=curr_rgb.shape[1], test_mode=test_mode)
	# get depth mask
	depth_mask = self.load_depth_valid_mask(data_path['depth_mask_path'])
	curr_depth[~depth_mask] = -1
	# get stereo depth
	curr_stereo_depth = self.load_stereo_depth_label(data_path['disp_path'], H=curr_rgb.shape[0], W=curr_rgb.shape[1])

	data_batch = dict(
	curr_rgb = curr_rgb,
	curr_depth = curr_depth,
	curr_sem = curr_sem,
	curr_normal = curr_normal,
	curr_cam_model=curr_cam_model,
	curr_stereo_depth=curr_stereo_depth,
	)
	return data_batch

	def load_rgb_depth(self, rgb_path: str, depth_path: str, test_mode: bool):
	"""
	Load the rgb and depth map with the paths.
	"""
	rgb = self.load_data(rgb_path, is_rgb_img=True)
	if rgb is None:
	self.logger.info(f'>>>>{rgb_path} has errors.')

	depth = self.load_data(depth_path)
	if depth is None:
	self.logger.info(f'{depth_path} has errors.')

	# self.check_data(dict(
	# rgb_path=rgb,
	# depth_path=depth,
	# ))
	depth = depth.astype(np.float)
	# if depth.shape != rgb.shape[:2]:
	# print(f'no-equal in {self.data_name}')
	# depth = cv2.resize(depth, rgb.shape[::-1][1:])

	depth = self.process_depth(depth, rgb, test_mode)
	return rgb, depth

	def process_depth(self, depth, rgb, test_mode=False):
	depth[depth>65500] = 0
	depth /= self.metric_scale
	h, w, _ = rgb.shape # to rgb size
	if test_mode==False:
	depth = cv2.resize(depth, (w, h), interpolation=cv2.INTER_NEAREST)
	return depth

	def load_norm_label(self, norm_path, H, W, test_mode):

	if norm_path is None:
	norm_gt = np.zeros((H, W, 3)).astype(np.float32)
	else:
	norm_gt = cv2.imread(norm_path)
	norm_gt = cv2.cvtColor(norm_gt, cv2.COLOR_BGR2RGB)

	norm_gt = np.array(norm_gt).astype(np.uint8)

	mask_path = 'orient-mask'.join(norm_path.rsplit('normal', 1))
	mask_gt = cv2.imread(mask_path)
	mask_gt = np.array(mask_gt).astype(np.uint8)
	valid_mask = np.logical_not(
	np.logical_and(
	np.logical_and(
	mask_gt[:, :, 0] == 0, mask_gt[:, :, 1] == 0),
	mask_gt[:, :, 2] == 0))
	valid_mask = valid_mask[:, :, np.newaxis]

	# norm_valid_mask = np.logical_not(
	# np.logical_and(
	# np.logical_and(
	# norm_gt[:, :, 0] == 0, norm_gt[:, :, 1] == 0),
	# norm_gt[:, :, 2] == 0))
	# norm_valid_mask = norm_valid_mask[:, :, np.newaxis]

	norm_gt = ((norm_gt.astype(np.float32) / 255.0) * 2.0) - 1.0
	norm_valid_mask = (np.linalg.norm(norm_gt, axis=2, keepdims=True) > 0.5) * valid_mask
	norm_gt = norm_gt * norm_valid_mask

	if test_mode==False:
	norm_gt = cv2.resize(norm_gt, (W, H), interpolation=cv2.INTER_NEAREST)

	return norm_gt



	if __name__ == '__main__':
	from mmcv.utils import Config
	cfg = Config.fromfile('mono/configs/Apolloscape_DDAD/convnext_base.cascade.1m.sgd.mae.py')
	dataset_i = NYUDataset(cfg['Apolloscape'], 'train', **cfg.data_basic)
	print(dataset_i)