Spaces:
Running
on
Zero
Running
on
Zero
| from typing import Union, Tuple, List, Callable | |
| import numpy as np | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from einops import repeat | |
| from tqdm import tqdm | |
| from .attention_blocks import CrossAttentionDecoder | |
| from .attention_processors import FlashVDMCrossAttentionProcessor, FlashVDMTopMCrossAttentionProcessor | |
| from ...utils import logger | |
| def extract_near_surface_volume_fn(input_tensor: torch.Tensor, alpha: float): | |
| """ | |
| 修复维度问题的PyTorch实现 | |
| Args: | |
| input_tensor: shape [D, D, D], torch.float16 | |
| alpha: 标量偏移值 | |
| Returns: | |
| mask: shape [D, D, D], torch.int32 表面掩码 | |
| """ | |
| device = input_tensor.device | |
| D = input_tensor.shape[0] | |
| signed_val = 0.0 | |
| # 添加偏移并处理无效值 | |
| val = input_tensor + alpha | |
| valid_mask = val > -9000 # 假设-9000是无效值 | |
| # 改进的邻居获取函数(保持维度一致) | |
| def get_neighbor(t, shift, axis): | |
| """根据指定轴进行位移并保持维度一致""" | |
| if shift == 0: | |
| return t.clone() | |
| # 确定填充轴(输入为[D, D, D]对应z,y,x轴) | |
| pad_dims = [0, 0, 0, 0, 0, 0] # 格式:[x前,x后,y前,y后,z前,z后] | |
| # 根据轴类型设置填充 | |
| if axis == 0: # x轴(最后一个维度) | |
| pad_idx = 0 if shift > 0 else 1 | |
| pad_dims[pad_idx] = abs(shift) | |
| elif axis == 1: # y轴(中间维度) | |
| pad_idx = 2 if shift > 0 else 3 | |
| pad_dims[pad_idx] = abs(shift) | |
| elif axis == 2: # z轴(第一个维度) | |
| pad_idx = 4 if shift > 0 else 5 | |
| pad_dims[pad_idx] = abs(shift) | |
| # 执行填充(添加batch和channel维度适配F.pad) | |
| padded = F.pad(t.unsqueeze(0).unsqueeze(0), pad_dims[::-1], mode='replicate') # 反转顺序适配F.pad | |
| # 构建动态切片索引 | |
| slice_dims = [slice(None)] * 3 # 初始化为全切片 | |
| if axis == 0: # x轴(dim=2) | |
| if shift > 0: | |
| slice_dims[0] = slice(shift, None) | |
| else: | |
| slice_dims[0] = slice(None, shift) | |
| elif axis == 1: # y轴(dim=1) | |
| if shift > 0: | |
| slice_dims[1] = slice(shift, None) | |
| else: | |
| slice_dims[1] = slice(None, shift) | |
| elif axis == 2: # z轴(dim=0) | |
| if shift > 0: | |
| slice_dims[2] = slice(shift, None) | |
| else: | |
| slice_dims[2] = slice(None, shift) | |
| # 应用切片并恢复维度 | |
| padded = padded.squeeze(0).squeeze(0) | |
| sliced = padded[slice_dims] | |
| return sliced | |
| # 获取各方向邻居(确保维度一致) | |
| left = get_neighbor(val, 1, axis=0) # x方向 | |
| right = get_neighbor(val, -1, axis=0) | |
| back = get_neighbor(val, 1, axis=1) # y方向 | |
| front = get_neighbor(val, -1, axis=1) | |
| down = get_neighbor(val, 1, axis=2) # z方向 | |
| up = get_neighbor(val, -1, axis=2) | |
| # 处理边界无效值(使用where保持维度一致) | |
| def safe_where(neighbor): | |
| return torch.where(neighbor > -9000, neighbor, val) | |
| left = safe_where(left) | |
| right = safe_where(right) | |
| back = safe_where(back) | |
| front = safe_where(front) | |
| down = safe_where(down) | |
| up = safe_where(up) | |
| # 计算符号一致性(转换为float32确保精度) | |
| sign = torch.sign(val.to(torch.float32)) | |
| neighbors_sign = torch.stack([ | |
| torch.sign(left.to(torch.float32)), | |
| torch.sign(right.to(torch.float32)), | |
| torch.sign(back.to(torch.float32)), | |
| torch.sign(front.to(torch.float32)), | |
| torch.sign(down.to(torch.float32)), | |
| torch.sign(up.to(torch.float32)) | |
| ], dim=0) | |
| # 检查所有符号是否一致 | |
| same_sign = torch.all(neighbors_sign == sign, dim=0) | |
| # 生成最终掩码 | |
| mask = (~same_sign).to(torch.int32) | |
| return mask * valid_mask.to(torch.int32) | |
| def generate_dense_grid_points( | |
| bbox_min: np.ndarray, | |
| bbox_max: np.ndarray, | |
| octree_resolution: int, | |
| indexing: str = "ij", | |
| ): | |
| length = bbox_max - bbox_min | |
| num_cells = octree_resolution | |
| x = np.linspace(bbox_min[0], bbox_max[0], int(num_cells) + 1, dtype=np.float32) | |
| y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32) | |
| z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32) | |
| [xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing) | |
| xyz = np.stack((xs, ys, zs), axis=-1) | |
| grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1] | |
| return xyz, grid_size, length | |
| class VanillaVolumeDecoder: | |
| def __call__( | |
| self, | |
| latents: torch.FloatTensor, | |
| geo_decoder: Callable, | |
| bounds: Union[Tuple[float], List[float], float] = 1.01, | |
| num_chunks: int = 10000, | |
| octree_resolution: int = None, | |
| enable_pbar: bool = True, | |
| **kwargs, | |
| ): | |
| device = latents.device | |
| dtype = latents.dtype | |
| batch_size = latents.shape[0] | |
| # 1. generate query points | |
| if isinstance(bounds, float): | |
| bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds] | |
| bbox_min, bbox_max = np.array(bounds[0:3]), np.array(bounds[3:6]) | |
| xyz_samples, grid_size, length = generate_dense_grid_points( | |
| bbox_min=bbox_min, | |
| bbox_max=bbox_max, | |
| octree_resolution=octree_resolution, | |
| indexing="ij" | |
| ) | |
| xyz_samples = torch.from_numpy(xyz_samples).to(device, dtype=dtype).contiguous().reshape(-1, 3) | |
| # 2. latents to 3d volume | |
| batch_logits = [] | |
| for start in tqdm(range(0, xyz_samples.shape[0], num_chunks), desc=f"Volume Decoding", | |
| disable=not enable_pbar): | |
| chunk_queries = xyz_samples[start: start + num_chunks, :] | |
| chunk_queries = repeat(chunk_queries, "p c -> b p c", b=batch_size) | |
| logits = geo_decoder(queries=chunk_queries, latents=latents) | |
| batch_logits.append(logits) | |
| grid_logits = torch.cat(batch_logits, dim=1) | |
| grid_logits = grid_logits.view((batch_size, *grid_size)).float() | |
| return grid_logits | |
| class HierarchicalVolumeDecoding: | |
| def __call__( | |
| self, | |
| latents: torch.FloatTensor, | |
| geo_decoder: Callable, | |
| bounds: Union[Tuple[float], List[float], float] = 1.01, | |
| num_chunks: int = 10000, | |
| mc_level: float = 0.0, | |
| octree_resolution: int = None, | |
| min_resolution: int = 63, | |
| enable_pbar: bool = True, | |
| **kwargs, | |
| ): | |
| device = latents.device | |
| dtype = latents.dtype | |
| resolutions = [] | |
| if octree_resolution < min_resolution: | |
| resolutions.append(octree_resolution) | |
| while octree_resolution >= min_resolution: | |
| resolutions.append(octree_resolution) | |
| octree_resolution = octree_resolution // 2 | |
| resolutions.reverse() | |
| # 1. generate query points | |
| if isinstance(bounds, float): | |
| bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds] | |
| bbox_min = np.array(bounds[0:3]) | |
| bbox_max = np.array(bounds[3:6]) | |
| bbox_size = bbox_max - bbox_min | |
| xyz_samples, grid_size, length = generate_dense_grid_points( | |
| bbox_min=bbox_min, | |
| bbox_max=bbox_max, | |
| octree_resolution=resolutions[0], | |
| indexing="ij" | |
| ) | |
| dilate = nn.Conv3d(1, 1, 3, padding=1, bias=False, device=device, dtype=dtype) | |
| dilate.weight = torch.nn.Parameter(torch.ones(dilate.weight.shape, dtype=dtype, device=device)) | |
| grid_size = np.array(grid_size) | |
| xyz_samples = torch.from_numpy(xyz_samples).to(device, dtype=dtype).contiguous().reshape(-1, 3) | |
| # 2. latents to 3d volume | |
| batch_logits = [] | |
| batch_size = latents.shape[0] | |
| for start in tqdm(range(0, xyz_samples.shape[0], num_chunks), | |
| desc=f"Hierarchical Volume Decoding [r{resolutions[0] + 1}]"): | |
| queries = xyz_samples[start: start + num_chunks, :] | |
| batch_queries = repeat(queries, "p c -> b p c", b=batch_size) | |
| logits = geo_decoder(queries=batch_queries, latents=latents) | |
| batch_logits.append(logits) | |
| grid_logits = torch.cat(batch_logits, dim=1).view((batch_size, grid_size[0], grid_size[1], grid_size[2])) | |
| for octree_depth_now in resolutions[1:]: | |
| grid_size = np.array([octree_depth_now + 1] * 3) | |
| resolution = bbox_size / octree_depth_now | |
| next_index = torch.zeros(tuple(grid_size), dtype=dtype, device=device) | |
| next_logits = torch.full(next_index.shape, -10000., dtype=dtype, device=device) | |
| curr_points = extract_near_surface_volume_fn(grid_logits.squeeze(0), mc_level) | |
| curr_points += grid_logits.squeeze(0).abs() < 0.95 | |
| if octree_depth_now == resolutions[-1]: | |
| expand_num = 0 | |
| else: | |
| expand_num = 1 | |
| for i in range(expand_num): | |
| curr_points = dilate(curr_points.unsqueeze(0).to(dtype)).squeeze(0) | |
| (cidx_x, cidx_y, cidx_z) = torch.where(curr_points > 0) | |
| next_index[cidx_x * 2, cidx_y * 2, cidx_z * 2] = 1 | |
| for i in range(2 - expand_num): | |
| next_index = dilate(next_index.unsqueeze(0)).squeeze(0) | |
| nidx = torch.where(next_index > 0) | |
| next_points = torch.stack(nidx, dim=1) | |
| next_points = (next_points * torch.tensor(resolution, dtype=next_points.dtype, device=device) + | |
| torch.tensor(bbox_min, dtype=next_points.dtype, device=device)) | |
| batch_logits = [] | |
| for start in tqdm(range(0, next_points.shape[0], num_chunks), | |
| desc=f"Hierarchical Volume Decoding [r{octree_depth_now + 1}]"): | |
| queries = next_points[start: start + num_chunks, :] | |
| batch_queries = repeat(queries, "p c -> b p c", b=batch_size) | |
| logits = geo_decoder(queries=batch_queries.to(latents.dtype), latents=latents) | |
| batch_logits.append(logits) | |
| grid_logits = torch.cat(batch_logits, dim=1) | |
| next_logits[nidx] = grid_logits[0, ..., 0] | |
| grid_logits = next_logits.unsqueeze(0) | |
| grid_logits[grid_logits == -10000.] = float('nan') | |
| return grid_logits | |
| class FlashVDMVolumeDecoding: | |
| def __init__(self, topk_mode='mean'): | |
| if topk_mode not in ['mean', 'merge']: | |
| raise ValueError(f'Unsupported topk_mode {topk_mode}, available: {["mean", "merge"]}') | |
| if topk_mode == 'mean': | |
| self.processor = FlashVDMCrossAttentionProcessor() | |
| else: | |
| self.processor = FlashVDMTopMCrossAttentionProcessor() | |
| def __call__( | |
| self, | |
| latents: torch.FloatTensor, | |
| geo_decoder: CrossAttentionDecoder, | |
| bounds: Union[Tuple[float], List[float], float] = 1.01, | |
| num_chunks: int = 10000, | |
| mc_level: float = 0.0, | |
| octree_resolution: int = None, | |
| min_resolution: int = 63, | |
| mini_grid_num: int = 4, | |
| enable_pbar: bool = True, | |
| **kwargs, | |
| ): | |
| processor = self.processor | |
| geo_decoder.set_cross_attention_processor(processor) | |
| device = latents.device | |
| dtype = latents.dtype | |
| resolutions = [] | |
| if octree_resolution < min_resolution: | |
| resolutions.append(octree_resolution) | |
| while octree_resolution >= min_resolution: | |
| resolutions.append(octree_resolution) | |
| octree_resolution = octree_resolution // 2 | |
| resolutions.reverse() | |
| resolutions[0] = round(resolutions[0] / mini_grid_num) * mini_grid_num - 1 | |
| for i, resolution in enumerate(resolutions[1:]): | |
| resolutions[i + 1] = resolutions[0] * 2 ** (i + 1) | |
| logger.info(f"FlashVDMVolumeDecoding Resolution: {resolutions}") | |
| # 1. generate query points | |
| if isinstance(bounds, float): | |
| bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds] | |
| bbox_min = np.array(bounds[0:3]) | |
| bbox_max = np.array(bounds[3:6]) | |
| bbox_size = bbox_max - bbox_min | |
| xyz_samples, grid_size, length = generate_dense_grid_points( | |
| bbox_min=bbox_min, | |
| bbox_max=bbox_max, | |
| octree_resolution=resolutions[0], | |
| indexing="ij" | |
| ) | |
| dilate = nn.Conv3d(1, 1, 3, padding=1, bias=False, device=device, dtype=dtype) | |
| dilate.weight = torch.nn.Parameter(torch.ones(dilate.weight.shape, dtype=dtype, device=device)) | |
| grid_size = np.array(grid_size) | |
| # 2. latents to 3d volume | |
| xyz_samples = torch.from_numpy(xyz_samples).to(device, dtype=dtype) | |
| batch_size = latents.shape[0] | |
| mini_grid_size = xyz_samples.shape[0] // mini_grid_num | |
| xyz_samples = xyz_samples.view( | |
| mini_grid_num, mini_grid_size, | |
| mini_grid_num, mini_grid_size, | |
| mini_grid_num, mini_grid_size, 3 | |
| ).permute( | |
| 0, 2, 4, 1, 3, 5, 6 | |
| ).reshape( | |
| -1, mini_grid_size * mini_grid_size * mini_grid_size, 3 | |
| ) | |
| batch_logits = [] | |
| num_batchs = max(num_chunks // xyz_samples.shape[1], 1) | |
| for start in tqdm(range(0, xyz_samples.shape[0], num_batchs), | |
| desc=f"FlashVDM Volume Decoding", disable=not enable_pbar): | |
| queries = xyz_samples[start: start + num_batchs, :] | |
| batch = queries.shape[0] | |
| batch_latents = repeat(latents.squeeze(0), "p c -> b p c", b=batch) | |
| processor.topk = True | |
| logits = geo_decoder(queries=queries, latents=batch_latents) | |
| batch_logits.append(logits) | |
| grid_logits = torch.cat(batch_logits, dim=0).reshape( | |
| mini_grid_num, mini_grid_num, mini_grid_num, | |
| mini_grid_size, mini_grid_size, | |
| mini_grid_size | |
| ).permute(0, 3, 1, 4, 2, 5).contiguous().view( | |
| (batch_size, grid_size[0], grid_size[1], grid_size[2]) | |
| ) | |
| for octree_depth_now in resolutions[1:]: | |
| grid_size = np.array([octree_depth_now + 1] * 3) | |
| resolution = bbox_size / octree_depth_now | |
| next_index = torch.zeros(tuple(grid_size), dtype=dtype, device=device) | |
| next_logits = torch.full(next_index.shape, -10000., dtype=dtype, device=device) | |
| curr_points = extract_near_surface_volume_fn(grid_logits.squeeze(0), mc_level) | |
| curr_points += grid_logits.squeeze(0).abs() < 0.95 | |
| if octree_depth_now == resolutions[-1]: | |
| expand_num = 0 | |
| else: | |
| expand_num = 1 | |
| for i in range(expand_num): | |
| curr_points = dilate(curr_points.unsqueeze(0).to(dtype)).squeeze(0) | |
| (cidx_x, cidx_y, cidx_z) = torch.where(curr_points > 0) | |
| next_index[cidx_x * 2, cidx_y * 2, cidx_z * 2] = 1 | |
| for i in range(2 - expand_num): | |
| next_index = dilate(next_index.unsqueeze(0)).squeeze(0) | |
| nidx = torch.where(next_index > 0) | |
| next_points = torch.stack(nidx, dim=1) | |
| next_points = (next_points * torch.tensor(resolution, dtype=torch.float32, device=device) + | |
| torch.tensor(bbox_min, dtype=torch.float32, device=device)) | |
| query_grid_num = 6 | |
| min_val = next_points.min(axis=0).values | |
| max_val = next_points.max(axis=0).values | |
| vol_queries_index = (next_points - min_val) / (max_val - min_val) * (query_grid_num - 0.001) | |
| index = torch.floor(vol_queries_index).long() | |
| index = index[..., 0] * (query_grid_num ** 2) + index[..., 1] * query_grid_num + index[..., 2] | |
| index = index.sort() | |
| next_points = next_points[index.indices].unsqueeze(0).contiguous() | |
| unique_values = torch.unique(index.values, return_counts=True) | |
| grid_logits = torch.zeros((next_points.shape[1]), dtype=latents.dtype, device=latents.device) | |
| input_grid = [[], []] | |
| logits_grid_list = [] | |
| start_num = 0 | |
| sum_num = 0 | |
| for grid_index, count in zip(unique_values[0].cpu().tolist(), unique_values[1].cpu().tolist()): | |
| if sum_num + count < num_chunks or sum_num == 0: | |
| sum_num += count | |
| input_grid[0].append(grid_index) | |
| input_grid[1].append(count) | |
| else: | |
| processor.topk = input_grid | |
| logits_grid = geo_decoder(queries=next_points[:, start_num:start_num + sum_num], latents=latents) | |
| start_num = start_num + sum_num | |
| logits_grid_list.append(logits_grid) | |
| input_grid = [[grid_index], [count]] | |
| sum_num = count | |
| if sum_num > 0: | |
| processor.topk = input_grid | |
| logits_grid = geo_decoder(queries=next_points[:, start_num:start_num + sum_num], latents=latents) | |
| logits_grid_list.append(logits_grid) | |
| logits_grid = torch.cat(logits_grid_list, dim=1) | |
| grid_logits[index.indices] = logits_grid.squeeze(0).squeeze(-1) | |
| next_logits[nidx] = grid_logits | |
| grid_logits = next_logits.unsqueeze(0) | |
| grid_logits[grid_logits == -10000.] = float('nan') | |
| return grid_logits | |