Spaces:

silentchen
/

Shap_Editor_demo

Runtime error

App Files Files Community

Shap_Editor_demo / shap_e / models / transmitter / multiview_encoder.py

silentchen

first commit

19c4ddf almost 2 years ago

raw

history blame

7.15 kB

	from typing import Any, Dict, List, Optional, Tuple, Union

	import numpy as np
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from PIL import Image

	from shap_e.models.generation.transformer import Transformer
	from shap_e.rendering.view_data import ProjectiveCamera
	from shap_e.util.collections import AttrDict

	from .base import VectorEncoder


	class MultiviewTransformerEncoder(VectorEncoder):
	    """
	    Encode cameras and views using a transformer model with extra output
	    token(s) used to extract a latent vector.

	    Each view contributes one token per image patch plus one camera token.
	    `latent_ctx` learned output tokens are appended to that sequence, and the
	    transformer outputs at those positions are projected and flattened into
	    a vector of size `d_latent`.
	    """

	    def __init__(
	        self,
	        *,
	        device: torch.device,
	        dtype: torch.dtype,
	        param_shapes: Dict[str, Tuple[int]],
	        params_proj: Dict[str, Any],
	        latent_bottleneck: Optional[Dict[str, Any]] = None,
	        d_latent: int = 512,
	        latent_ctx: int = 1,
	        num_views: int = 20,
	        image_size: int = 256,
	        patch_size: int = 32,
	        use_depth: bool = False,
	        max_depth: float = 5.0,
	        width: int = 512,
	        layers: int = 12,
	        heads: int = 8,
	        init_scale: float = 0.25,
	        pos_emb_init_scale: float = 1.0,
	    ):
	        """
	        Build the encoder.

	        :param device: device on which parameters are created and to which
	                       inputs are moved.
	        :param dtype: dtype of the created parameters.
	        :param param_shapes: forwarded unchanged to the base class.
	        :param params_proj: forwarded unchanged to the base class.
	        :param latent_bottleneck: forwarded unchanged to the base class.
	        :param d_latent: size of the encoded vector; must be divisible by
	                         latent_ctx.
	        :param latent_ctx: number of learned output tokens.
	        :param num_views: number of views expected per batch element.
	        :param image_size: side length views and depth maps are resized to.
	        :param patch_size: kernel size and stride of the patch embedding.
	        :param use_depth: if True, a depth channel is concatenated to the
	                          RGB channels.
	        :param max_depth: depths are clamped to this value and divided by it.
	        :param width: transformer channel width.
	        :param layers: number of transformer layers.
	        :param heads: number of attention heads.
	        :param init_scale: forwarded to the Transformer backbone.
	        :param pos_emb_init_scale: scale of the random positional embedding
	                                   initialization.
	        """
	        super().__init__(
	            device=device,
	            param_shapes=param_shapes,
	            params_proj=params_proj,
	            latent_bottleneck=latent_bottleneck,
	            d_latent=d_latent,
	        )
	        self.num_views = num_views
	        self.image_size = image_size
	        self.patch_size = patch_size
	        self.use_depth = use_depth
	        self.max_depth = max_depth
	        # Per view: one camera token plus one token per image patch.
	        self.n_ctx = num_views * (1 + (image_size // patch_size) ** 2)
	        self.latent_ctx = latent_ctx
	        self.width = width

	        # Each output token yields d_latent // latent_ctx values.
	        assert d_latent % latent_ctx == 0

	        self.ln_pre = nn.LayerNorm(width, device=device, dtype=dtype)
	        self.backbone = Transformer(
	            device=device,
	            dtype=dtype,
	            n_ctx=self.n_ctx + latent_ctx,
	            width=width,
	            layers=layers,
	            heads=heads,
	            init_scale=init_scale,
	        )
	        self.ln_post = nn.LayerNorm(width, device=device, dtype=dtype)
	        self.register_parameter(
	            "output_tokens",
	            nn.Parameter(torch.randn(latent_ctx, width, device=device, dtype=dtype)),
	        )
	        self.register_parameter(
	            "pos_emb",
	            nn.Parameter(
	                pos_emb_init_scale * torch.randn(self.n_ctx, width, device=device, dtype=dtype)
	            ),
	        )
	        self.patch_emb = nn.Conv2d(
	            in_channels=3 if not use_depth else 4,
	            out_channels=width,
	            kernel_size=patch_size,
	            stride=patch_size,
	            device=device,
	            dtype=dtype,
	        )
	        self.camera_emb = nn.Sequential(
	            nn.Linear(
	                3 * 4 + 1, width, device=device, dtype=dtype
	            ),  # input size is for x+y+z+origin+fov (order used by cameras_to_tensor)
	            nn.GELU(),
	            nn.Linear(width, width, device=device, dtype=dtype),
	        )
	        self.output_proj = nn.Linear(width, d_latent // latent_ctx, device=device, dtype=dtype)

	    def encode_to_vector(self, batch: AttrDict, options: Optional[AttrDict] = None) -> torch.Tensor:
	        """
	        Encode a batch of multiview observations into latent vectors.

	        :param batch: object exposing `views`, `cameras` and, when
	                      use_depth is set, `depths`; each may be a tensor or
	                      nested lists as accepted by the *_to_tensor helpers.
	        :param options: unused by this encoder.
	        :return: a [batch_size x d_latent] tensor.
	        """
	        _ = options

	        all_views = self.views_to_tensor(batch.views).to(self.device)
	        if self.use_depth:
	            # depths_to_tensor returns tensor inputs untouched, so move the
	            # result to the model device like the views and cameras; otherwise
	            # the concatenation fails on a device mismatch.
	            all_depths = self.depths_to_tensor(batch.depths).to(self.device)
	            all_views = torch.cat([all_views, all_depths], dim=2)
	        all_cameras = self.cameras_to_tensor(batch.cameras).to(self.device)

	        batch_size, num_views, _, _, _ = all_views.shape

	        # Fold the view axis into the batch axis for the 2D patch convolution.
	        views_proj = self.patch_emb(
	            all_views.reshape([batch_size * num_views, *all_views.shape[2:]])
	        )
	        views_proj = (
	            views_proj.reshape([batch_size, num_views, self.width, -1])
	            .permute(0, 1, 3, 2)
	            .contiguous()
	        )  # [batch_size x num_views x n_patches x width]

	        cameras_proj = self.camera_emb(all_cameras).reshape([batch_size, num_views, 1, self.width])

	        # Per view: patch tokens followed by the camera token, then all views
	        # are flattened into a single sequence.
	        h = torch.cat([views_proj, cameras_proj], dim=2).reshape([batch_size, -1, self.width])
	        h = h + self.pos_emb
	        # Append the learned output tokens after the n_ctx input tokens.
	        h = torch.cat([h, self.output_tokens[None].repeat(len(h), 1, 1)], dim=1)
	        h = self.ln_pre(h)
	        h = self.backbone(h)
	        h = self.ln_post(h)
	        # Keep only the positions of the output tokens.
	        h = h[:, self.n_ctx :]
	        h = self.output_proj(h).flatten(1)

	        return h

	    def views_to_tensor(self, views: Union[torch.Tensor, List[List[Image.Image]]]) -> torch.Tensor:
	        """
	        Returns a [batch x num_views x 3 x size x size] tensor in the range [-1, 1].

	        A tensor input is returned unchanged (no resizing or rescaling).
	        Images are resized to image_size, converted to RGB and mapped from
	        [0, 255] to [-1, 1].
	        """
	        if isinstance(views, torch.Tensor):
	            return views

	        tensor_batch = []
	        for inner_list in views:
	            assert len(inner_list) == self.num_views
	            inner_batch = []
	            for img in inner_list:
	                img = img.resize((self.image_size,) * 2).convert("RGB")
	                inner_batch.append(
	                    torch.from_numpy(np.array(img)).to(device=self.device, dtype=torch.float32)
	                    / 127.5
	                    - 1
	                )
	            tensor_batch.append(torch.stack(inner_batch, dim=0))
	        # Channels-last [B, V, H, W, C] -> channels-first [B, V, C, H, W].
	        return torch.stack(tensor_batch, dim=0).permute(0, 1, 4, 2, 3)

	    def depths_to_tensor(
	        self, depths: Union[torch.Tensor, List[List[np.ndarray]]]
	    ) -> torch.Tensor:
	        """
	        Returns a [batch x num_views x 1 x size x size] tensor in the range [-1, 1].

	        A tensor input is returned unchanged. Otherwise each element must be
	        a 2D NumPy array (it is passed to torch.from_numpy); values are
	        clamped to max_depth, divided by max_depth and mapped to [-1, 1]
	        (the lower bound assumes non-negative depths), then resized to
	        image_size with nearest-neighbor interpolation.
	        """
	        if isinstance(depths, torch.Tensor):
	            return depths

	        tensor_batch = []
	        for inner_list in depths:
	            assert len(inner_list) == self.num_views
	            inner_batch = []
	            for arr in inner_list:
	                tensor = torch.from_numpy(arr).clamp(max=self.max_depth) / self.max_depth
	                tensor = tensor * 2 - 1
	                # interpolate expects [N x C x H x W], hence the two leading axes.
	                tensor = F.interpolate(
	                    tensor[None, None],
	                    (self.image_size,) * 2,
	                    mode="nearest",
	                )
	                inner_batch.append(tensor.to(device=self.device, dtype=torch.float32))
	            # Each entry is [1 x 1 x size x size]; concatenating gives the view axis.
	            tensor_batch.append(torch.cat(inner_batch, dim=0))
	        return torch.stack(tensor_batch, dim=0)

	    def cameras_to_tensor(
	        self, cameras: Union[torch.Tensor, List[List[ProjectiveCamera]]]
	    ) -> torch.Tensor:
	        """
	        Returns a [batch x num_views x 3*4+1] tensor of camera information.

	        A tensor input is returned unchanged. Otherwise each camera is packed
	        as its x, y, z and origin vectors followed by x_fov, and the result
	        is a float32 tensor.
	        """
	        if isinstance(cameras, torch.Tensor):
	            return cameras
	        outer_batch = []
	        for inner_list in cameras:
	            inner_batch = []
	            for camera in inner_list:
	                inner_batch.append(
	                    np.array(
	                        [
	                            *camera.x,
	                            *camera.y,
	                            *camera.z,
	                            *camera.origin,
	                            camera.x_fov,
	                        ]
	                    )
	                )
	            outer_batch.append(np.stack(inner_batch, axis=0))
	        return torch.from_numpy(np.stack(outer_batch, axis=0)).float()