import os
import os.path as osp
import random
from collections import deque

import cv2
import numpy as np

from eval.mv_recon.base import BaseStereoViewDataset
from dust3r.utils.image import imread_cv2
import eval.mv_recon.dataset_utils.cropping as cropping
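
# Evaluation dataset loaders for multi-view reconstruction on 7-Scenes, DTU,
# and Neural RGB-D (NRGBD). Each class reads RGB-D frames together with
# intrinsics and camera-to-world poses and returns them as a list of per-view
# dicts from _get_views. The expected directory layouts are inferred from the
# file paths used below, not from separate documentation.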


def shuffle_deque(dq, seed=None):
    """Return a shuffled copy of a deque, optionally seeded for reproducibility."""
    if seed is not None:
        # Note: this seeds Python's *global* random state as a side effect
        random.seed(seed)
    # Convert deque to list, shuffle, and convert back
    shuffled_list = list(dq)
    random.shuffle(shuffled_list)
    return deque(shuffled_list)
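
# Usage sketch (hypothetical values): shuffle_deque(deque(["000000", "000001"]), seed=0)
# yields the same ordering on every run with the same seed, which is what the
# shuffle_seed >= 0 branches below rely on.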


class SevenScenes(BaseStereoViewDataset):
    def __init__(
        self,
        num_seq=1,
        num_frames=5,
        min_thresh=10,
        max_thresh=100,
        test_id=None,
        full_video=False,
        tuple_list=None,
        seq_id=None,
        rebuttal=False,
        shuffle_seed=-1,
        kf_every=1,
        *args,
        ROOT,
        **kwargs,
    ):
        self.ROOT = ROOT
        super().__init__(*args, **kwargs)
        self.num_seq = num_seq
        self.num_frames = num_frames
        self.max_thresh = max_thresh
        self.min_thresh = min_thresh
        self.test_id = test_id
        self.full_video = full_video
        self.kf_every = kf_every
        self.seq_id = seq_id
        self.rebuttal = rebuttal
        self.shuffle_seed = shuffle_seed
        # Load all scenes
        self.load_all_tuples(tuple_list)
        self.load_all_scenes(ROOT)

    def __len__(self):
        if self.tuple_list is not None:
            return len(self.tuple_list)
        return len(self.scene_list) * self.num_seq

    def load_all_tuples(self, tuple_list):
        # Tuples, when provided, are pre-computed "scene_id idx0 idx1 ..." strings
        self.tuple_list = tuple_list

    def load_all_scenes(self, base_dir):
        if self.tuple_list is not None:
            # Use the pre-defined SimpleRecon scene ids
            self.scene_list = [
                "stairs/seq-06",
                "stairs/seq-02",
                "pumpkin/seq-06",
                "chess/seq-01",
                "heads/seq-02",
                "fire/seq-02",
                "office/seq-03",
                "pumpkin/seq-03",
                "redkitchen/seq-07",
                "chess/seq-02",
                "office/seq-01",
                "redkitchen/seq-01",
                "fire/seq-01",
            ]
            print(f"Found {len(self.scene_list)} sequences in split {self.split}")
            return

        scenes = os.listdir(base_dir)
        file_split = {"train": "TrainSplit.txt", "test": "TestSplit.txt"}[self.split]
        self.scene_list = []
        for scene in scenes:
            if self.test_id is not None and scene != self.test_id:
                continue
            # Read the official split file for this scene
            with open(osp.join(base_dir, scene, file_split)) as f:
                seq_ids = f.read().splitlines()
            for seq_id in seq_ids:
                # Entries look like "sequence1"; extract the digits and
                # zero-pad to match the "seq-01" directory naming
                num_part = "".join(filter(str.isdigit, seq_id))
                seq_id = f"seq-{num_part.zfill(2)}"
                if self.seq_id is not None and seq_id != self.seq_id:
                    continue
                self.scene_list.append(f"{scene}/{seq_id}")
        print(f"Found {len(self.scene_list)} sequences in split {self.split}")
    def _get_views(self, idx, resolution, rng):
        if self.tuple_list is not None:
            line = self.tuple_list[idx].split(" ")
            scene_id = line[0]
            img_idxs = line[1:]
        else:
            scene_id = self.scene_list[idx // self.num_seq]
            seq_id = idx % self.num_seq
            data_path = osp.join(self.ROOT, scene_id)
            num_files = len([name for name in os.listdir(data_path) if "color" in name])
            img_idxs = [f"{i:06d}" for i in range(num_files)]
            img_idxs = img_idxs[:: self.kf_every]

        # Intrinsics used in SimpleRecon
        fx, fy, cx, cy = 525, 525, 320, 240
        intrinsics_ = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32)

        views = []
        imgs_idxs = deque(img_idxs)
        if self.shuffle_seed >= 0:
            # Pass the seed through so the shuffled order is reproducible
            imgs_idxs = shuffle_deque(imgs_idxs, seed=self.shuffle_seed)
        while len(imgs_idxs) > 0:
            im_idx = imgs_idxs.popleft()
            impath = osp.join(self.ROOT, scene_id, f"frame-{im_idx}.color.png")
            depthpath = osp.join(self.ROOT, scene_id, f"frame-{im_idx}.depth.proj.png")
            posepath = osp.join(self.ROOT, scene_id, f"frame-{im_idx}.pose.txt")

            rgb_image = imread_cv2(impath)
            depthmap = imread_cv2(depthpath, cv2.IMREAD_UNCHANGED)
            rgb_image = cv2.resize(rgb_image, (depthmap.shape[1], depthmap.shape[0]))

            # Clean the depth map: 65535 marks invalid pixels; convert mm -> m
            # and zero out implausible values outside (1e-3, 10] metres
            depthmap[depthmap == 65535] = 0
            depthmap = np.nan_to_num(depthmap.astype(np.float32), nan=0.0) / 1000.0
            depthmap[depthmap > 10] = 0
            depthmap[depthmap < 1e-3] = 0

            camera_pose = np.loadtxt(posepath).astype(np.float32)

            if resolution != (224, 224) or self.rebuttal:
                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                    rgb_image, depthmap, intrinsics_, resolution, rng=rng, info=impath
                )
            else:
                # For the 224x224 setting, resize to 512x384 first and then
                # take a centered 224x224 crop
                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                    rgb_image, depthmap, intrinsics_, (512, 384), rng=rng, info=impath
                )
                W, H = rgb_image.size
                cx = W // 2
                cy = H // 2
                l, t = cx - 112, cy - 112
                r, b = cx + 112, cy + 112
                crop_bbox = (l, t, r, b)
                rgb_image, depthmap, intrinsics = cropping.crop_image_depthmap(
                    rgb_image, depthmap, intrinsics, crop_bbox
                )

            views.append(
                dict(
                    img=rgb_image,
                    depthmap=depthmap,
                    camera_pose=camera_pose,
                    camera_intrinsics=intrinsics,
                    dataset="7scenes",
                    label=osp.join(scene_id, im_idx),
                    instance=impath,
                )
            )
        return views
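

# The DTU loader below expects an MVSNet-style preprocessed layout per scan:
# images/*.jpg, depths/*.npy, binary_masks/*.png, cams/*_cam.txt and pair.txt
# (this layout is inferred from the paths used in _get_views below).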
class DTU(BaseStereoViewDataset):
    def __init__(
        self,
        num_seq=49,
        num_frames=5,
        min_thresh=10,
        max_thresh=30,
        test_id=None,
        full_video=False,
        sample_pairs=False,
        kf_every=1,
        *args,
        ROOT,
        **kwargs,
    ):
        self.ROOT = ROOT
        super().__init__(*args, **kwargs)
        self.num_seq = num_seq
        self.num_frames = num_frames
        self.max_thresh = max_thresh
        self.min_thresh = min_thresh
        self.test_id = test_id
        self.full_video = full_video
        self.kf_every = kf_every
        self.sample_pairs = sample_pairs
        # Load all scenes
        self.load_all_scenes(ROOT)

    def __len__(self):
        return len(self.scene_list) * self.num_seq

    def load_all_scenes(self, base_dir):
        if self.test_id is None:
            self.scene_list = os.listdir(base_dir)
            print(f"Found {len(self.scene_list)} scenes in split {self.split}")
        else:
            if isinstance(self.test_id, list):
                self.scene_list = self.test_id
            else:
                self.scene_list = [self.test_id]
            print(f"Test_id: {self.test_id}")

    def load_cam_mvsnet(self, file, interval_scale=1):
        """Parse an MVSNet-style cam.txt.

        The file stores a 4x4 world-to-camera extrinsic, a 3x3 intrinsic, and
        optionally the depth sampling parameters (depth_min, depth_interval,
        and possibly the number of depth planes and depth_max).
        """
        cam = np.zeros((2, 4, 4))
        words = file.read().split()

        # Read the 4x4 extrinsic matrix (words[1..16])
        for i in range(0, 4):
            for j in range(0, 4):
                extrinsic_index = 4 * i + j + 1
                cam[0][i][j] = words[extrinsic_index]

        # Read the 3x3 intrinsic matrix (words[18..26])
        for i in range(0, 3):
            for j in range(0, 3):
                intrinsic_index = 3 * i + j + 18
                cam[1][i][j] = words[intrinsic_index]

        # Depth range: depending on the file variant, the trailing numbers are
        # (depth_min, depth_interval[, num_planes[, depth_max]])
        if len(words) == 29:
            cam[1][3][0] = words[27]
            cam[1][3][1] = float(words[28]) * interval_scale
            cam[1][3][2] = 192
            cam[1][3][3] = cam[1][3][0] + cam[1][3][1] * cam[1][3][2]
        elif len(words) == 30:
            cam[1][3][0] = words[27]
            cam[1][3][1] = float(words[28]) * interval_scale
            cam[1][3][2] = words[29]
            cam[1][3][3] = cam[1][3][0] + cam[1][3][1] * cam[1][3][2]
        elif len(words) == 31:
            cam[1][3][0] = words[27]
            cam[1][3][1] = float(words[28]) * interval_scale
            cam[1][3][2] = words[29]
            cam[1][3][3] = words[30]
        else:
            cam[1][3][0] = 0
            cam[1][3][1] = 0
            cam[1][3][2] = 0
            cam[1][3][3] = 0

        extrinsic = cam[0].astype(np.float32)
        intrinsic = cam[1].astype(np.float32)
        return intrinsic, extrinsic
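
    # _get_views mirrors the 7-Scenes loader, but additionally masks the depth
    # with an eroded binary object mask before cropping, and converts the
    # MVSNet world-to-camera extrinsic into a camera-to-world pose.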
    def _get_views(self, idx, resolution, rng):
        scene_id = self.scene_list[idx // self.num_seq]
        seq_id = idx % self.num_seq

        print("Scene ID:", scene_id)
        image_path = osp.join(self.ROOT, scene_id, "images")
        depth_path = osp.join(self.ROOT, scene_id, "depths")
        mask_path = osp.join(self.ROOT, scene_id, "binary_masks")
        cam_path = osp.join(self.ROOT, scene_id, "cams")
        pairs_path = osp.join(self.ROOT, scene_id, "pair.txt")

        if not self.full_video:
            # NOTE: self.sample_pairs is assigned a bool in __init__, which
            # shadows any method of the same name; this call only works if a
            # sample_pairs method is provided elsewhere (e.g. the base class)
            img_idxs = self.sample_pairs(pairs_path, seq_id)
        else:
            img_idxs = sorted(os.listdir(image_path))
            img_idxs = img_idxs[:: self.kf_every]

        views = []
        imgs_idxs = deque(img_idxs)
        while len(imgs_idxs) > 0:
            # pop() takes frames from the right, i.e. in reverse order
            im_idx = imgs_idxs.pop()
            impath = osp.join(image_path, im_idx)
            depthpath = osp.join(depth_path, im_idx.replace(".jpg", ".npy"))
            campath = osp.join(cam_path, im_idx.replace(".jpg", "_cam.txt"))
            maskpath = osp.join(mask_path, im_idx.replace(".jpg", ".png"))

            rgb_image = imread_cv2(impath)
            depthmap = np.load(depthpath)
            depthmap = np.nan_to_num(depthmap.astype(np.float32), nan=0.0)

            # Binarize the object mask, resize it to the depth resolution, and
            # erode it to suppress unreliable depth near mask borders
            mask = imread_cv2(maskpath, cv2.IMREAD_UNCHANGED) / 255.0
            mask = mask.astype(np.float32)
            mask[mask > 0.5] = 1.0
            mask[mask < 0.5] = 0.0
            mask = cv2.resize(
                mask,
                (depthmap.shape[1], depthmap.shape[0]),
                interpolation=cv2.INTER_NEAREST,
            )
            kernel = np.ones((10, 10), np.uint8)  # Erosion kernel
            mask = cv2.erode(mask, kernel, iterations=1)
            depthmap = depthmap * mask

            with open(campath, "r") as f:
                cur_intrinsics, camera_pose = self.load_cam_mvsnet(f)
            intrinsics = cur_intrinsics[:3, :3]
            # MVSNet extrinsics are world-to-camera; invert to camera-to-world
            camera_pose = np.linalg.inv(camera_pose)

            if resolution != (224, 224):
                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                    rgb_image, depthmap, intrinsics, resolution, rng=rng, info=impath
                )
            else:
                # Resize to 512x384 first, then take a centered 224x224 crop
                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                    rgb_image, depthmap, intrinsics, (512, 384), rng=rng, info=impath
                )
                W, H = rgb_image.size
                cx = W // 2
                cy = H // 2
                l, t = cx - 112, cy - 112
                r, b = cx + 112, cy + 112
                crop_bbox = (l, t, r, b)
                rgb_image, depthmap, intrinsics = cropping.crop_image_depthmap(
                    rgb_image, depthmap, intrinsics, crop_bbox
                )

            views.append(
                dict(
                    img=rgb_image,
                    depthmap=depthmap,
                    camera_pose=camera_pose,
                    camera_intrinsics=intrinsics,
                    dataset="dtu",
                    label=osp.join(scene_id, im_idx),
                    instance=impath,
                )
            )
        return views
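

# The NRGBD loader below targets the Neural RGB-D layout implied by the paths
# in _get_views: <scene>/images/img<i>.png, <scene>/depth/depth<i>.png, and a
# single poses.txt with one 4x4 camera-to-world matrix per frame (OpenGL
# convention, converted to OpenCV below).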
class NRGBD(BaseStereoViewDataset):
    def __init__(
        self,
        num_seq=1,
        num_frames=5,
        min_thresh=10,
        max_thresh=100,
        test_id=None,
        full_video=False,
        tuple_list=None,
        seq_id=None,
        rebuttal=False,
        shuffle_seed=-1,
        kf_every=1,
        *args,
        ROOT,
        **kwargs,
    ):
        self.ROOT = ROOT
        super().__init__(*args, **kwargs)
        self.num_seq = num_seq
        self.num_frames = num_frames
        self.max_thresh = max_thresh
        self.min_thresh = min_thresh
        self.test_id = test_id
        self.full_video = full_video
        self.kf_every = kf_every
        self.seq_id = seq_id
        self.rebuttal = rebuttal
        self.shuffle_seed = shuffle_seed
        # Load all scenes
        self.load_all_tuples(tuple_list)
        self.load_all_scenes(ROOT)

    def __len__(self):
        if self.tuple_list is not None:
            return len(self.tuple_list)
        return len(self.scene_list) * self.num_seq

    def load_all_tuples(self, tuple_list):
        # Tuples, when provided, are pre-computed "scene_id idx0 idx1 ..." strings
        self.tuple_list = tuple_list

    def load_all_scenes(self, base_dir):
        scenes = [
            d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))
        ]
        if self.test_id is not None:
            self.scene_list = [self.test_id]
        else:
            self.scene_list = scenes
        print(f"Found {len(self.scene_list)} sequences in split {self.split}")

    def load_poses(self, path):
        """Read poses.txt: one 4x4 matrix per frame, four lines per matrix.

        A frame is flagged invalid when its first matrix row contains "nan";
        such frames get an identity pose.
        """
        with open(path, "r") as f:
            lines = f.readlines()

        poses = []
        valid = []
        lines_per_matrix = 4
        for i in range(0, len(lines), lines_per_matrix):
            if "nan" in lines[i]:
                valid.append(False)
                poses.append(np.eye(4, 4, dtype=np.float32).tolist())
            else:
                valid.append(True)
                pose_floats = [
                    [float(x) for x in line.split()]
                    for line in lines[i : i + lines_per_matrix]
                ]
                poses.append(pose_floats)
        return np.array(poses, dtype=np.float32), valid

    def _get_views(self, idx, resolution, rng):
        if self.tuple_list is not None:
            line = self.tuple_list[idx].split(" ")
            scene_id = line[0]
            img_idxs = line[1:]
        else:
            scene_id = self.scene_list[idx // self.num_seq]
            num_files = len(os.listdir(os.path.join(self.ROOT, scene_id, "images")))
            img_idxs = [f"{i}" for i in range(num_files)]
            # Keyframe stride, capped so at least two frames survive and the
            # slice step can never reach zero
            img_idxs = img_idxs[:: max(1, min(self.kf_every, len(img_idxs) // 2))]

        # Pinhole intrinsics for the 640x480 Neural RGB-D renders
        fx, fy, cx, cy = 554.2562584220408, 554.2562584220408, 320, 240
        intrinsics_ = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32)

        posepath = osp.join(self.ROOT, scene_id, "poses.txt")
        camera_poses, valids = self.load_poses(posepath)

        imgs_idxs = deque(img_idxs)
        if self.shuffle_seed >= 0:
            # Pass the seed through so the shuffled order is reproducible
            imgs_idxs = shuffle_deque(imgs_idxs, seed=self.shuffle_seed)
        views = []
        while len(imgs_idxs) > 0:
            im_idx = imgs_idxs.popleft()
            impath = osp.join(self.ROOT, scene_id, "images", f"img{im_idx}.png")
            depthpath = osp.join(self.ROOT, scene_id, "depth", f"depth{im_idx}.png")

            rgb_image = imread_cv2(impath)
            depthmap = imread_cv2(depthpath, cv2.IMREAD_UNCHANGED)

            # Convert mm -> m and zero out implausible values outside (1e-3, 10] metres
            depthmap = np.nan_to_num(depthmap.astype(np.float32), nan=0.0) / 1000.0
            depthmap[depthmap > 10] = 0
            depthmap[depthmap < 1e-3] = 0

            rgb_image = cv2.resize(rgb_image, (depthmap.shape[1], depthmap.shape[0]))

            # Note: frames flagged invalid in `valids` are not filtered here;
            # they carry the identity pose substituted by load_poses
            camera_pose = camera_poses[int(im_idx)]
            # OpenGL -> OpenCV camera convention: flip the Y and Z axes
            camera_pose[:, 1:3] *= -1.0

            if resolution != (224, 224) or self.rebuttal:
                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                    rgb_image, depthmap, intrinsics_, resolution, rng=rng, info=impath
                )
            else:
                # Resize to 512x384 first, then take a centered 224x224 crop
                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                    rgb_image, depthmap, intrinsics_, (512, 384), rng=rng, info=impath
                )
                W, H = rgb_image.size
                cx = W // 2
                cy = H // 2
                l, t = cx - 112, cy - 112
                r, b = cx + 112, cy + 112
                crop_bbox = (l, t, r, b)
                rgb_image, depthmap, intrinsics = cropping.crop_image_depthmap(
                    rgb_image, depthmap, intrinsics, crop_bbox
                )

            views.append(
                dict(
                    img=rgb_image,
                    depthmap=depthmap,
                    camera_pose=camera_pose,
                    camera_intrinsics=intrinsics,
                    dataset="nrgbd",
                    label=osp.join(scene_id, im_idx),
                    instance=impath,
                )
            )
        return views
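

# Usage sketch (assumed): the exact constructor arguments (resolution/split
# handling, __getitem__ behavior) come from BaseStereoViewDataset in
# eval.mv_recon.base, so the keyword names below are illustrative only.
#
#   dataset = SevenScenes(
#       ROOT="data/7scenes",   # hypothetical dataset path
#       split="test",
#       resolution=(224, 224),
#       kf_every=10,
#       full_video=True,
#   )
#   views = dataset[0]         # list of per-frame dicts produced by _get_views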