delinqu committed on
Commit fd1764e · verified · parent: 88dee11

Delete files modeling_ego3d.py with huggingface_hub

Files changed (1): modeling_ego3d.py +0 -126
modeling_ego3d.py DELETED
@@ -1,126 +0,0 @@
- # MIT License
- # Copyright (c) 2025 IPEC at Shanghai AI Laboratory
- # Permission is hereby granted, free of charge, to use, copy, modify, merge, publish,
- # distribute, sublicense, and/or sell copies of the Software, subject to the following conditions:
- # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
- # coding=utf-8
-
- """Modified Flash version of the Zoe model for fast training."""
-
- import math
-
- import numpy as np
- import torch
- import torchvision.transforms.functional as F
- from torch import nn
- from transformers.utils import logging
-
- logger = logging.get_logger(__name__)
-
-
- class Ego3DPositionEmbeddingMLP(nn.Module):
-     """Absolute pos embedding, learned.
-     https://github.com/kwea123/nerf_pl/blob/52aeb387da64a9ad9a0f914ea9b049ffc598b20c/models/nerf.py#L4
-     """
-
-     def __init__(self, in_channels=3, num_pos_feats=768, n_freqs=8, logscale=True):
-         super().__init__()
-         self.n_freqs = n_freqs
-         self.freq_out_channels = in_channels * (2 * n_freqs + 1)
-         if logscale:
-             freq_bands = 2 ** torch.linspace(0, n_freqs - 1, n_freqs)
-         else:
-             freq_bands = torch.linspace(1, 2 ** (n_freqs - 1), n_freqs)
-
-         center = torch.tensor([0.0, 0.0, 2.0]).repeat(in_channels // 3)
-         self.register_buffer("freq_bands", freq_bands, persistent=False)
-         self.register_buffer("center", center, persistent=False)
-
-         self.position_embedding_head = nn.Sequential(
-             nn.Linear(self.freq_out_channels, num_pos_feats),
-             nn.LayerNorm(num_pos_feats),
-             nn.ReLU(),
-             nn.Linear(num_pos_feats, num_pos_feats),
-         )
-         self._reset_parameters()
-
-     def _reset_parameters(self):
-         """Init with small weights to keep training stable."""
-         for p in self.parameters():
-             if p.dim() > 1:
-                 nn.init.xavier_uniform_(p, gain=0.01)
-
-     @torch.no_grad()
-     def frequency_encoding(self, xyz):
-         r"""
-         Embeds x to (x, sin(2^k x), cos(2^k x), ...).
-         Different from the paper, "x" itself is also part of the output.
-         See https://github.com/bmild/nerf/issues/12
-         x \in [-2, 2], y \in [-2, 2], z \in [0, 4]
-         Inputs:
-             xyz: (b, n, m)
-         Outputs:
-             encoding: (b, n, m * (2 * n_freqs + 1))
-         """
-         xyz_n = ((xyz - self.center) / 2.0).to(self.freq_bands.dtype)
-         xyz_freq = xyz_n.unsqueeze(-1) * self.freq_bands  # (b, n, m, n_freqs)
-         sin_xyz, cos_xyz = torch.sin(xyz_freq), torch.cos(xyz_freq)  # (b, n, m, n_freqs)
-         encoding = torch.cat([xyz_n.unsqueeze(-1), sin_xyz, cos_xyz], -1).reshape(*xyz.shape[:2], -1)
-         return encoding
-
-     def forward(self, xyz):
-         """Forward pass; xyz is (B, N, 3 or 6), output is (B, N, F)."""
-         # TODO: encoding with 3D position
-         freq_encoding = self.frequency_encoding(xyz)
-         position_embedding = self.position_embedding_head(freq_encoding)
-         return position_embedding
-
-
- def get_resize_output_image_size(
-     input_height: int,
-     input_width: int,
-     output_size: tuple = (384, 512),
-     keep_aspect_ratio: bool = True,
-     multiple: int = 32,
- ):
-     def constrain_to_multiple_of(val, multiple, min_val=0):
-         x = (np.round(val / multiple) * multiple).astype(int)
-         if x < min_val:
-             x = math.ceil(val / multiple) * multiple
-         return x
-
-     output_height, output_width = output_size
-     scale_height = output_height / input_height
-     scale_width = output_width / input_width
-
-     if keep_aspect_ratio:
-         # scale as little as possible: adopt the factor closer to 1 for both sides
-         if abs(1 - scale_width) < abs(1 - scale_height):
-             scale_height = scale_width
-         else:
-             scale_width = scale_height
-
-     new_height = constrain_to_multiple_of(scale_height * input_height, multiple=multiple)
-     new_width = constrain_to_multiple_of(scale_width * input_width, multiple=multiple)
-
-     return (int(new_height), int(new_width))
-
-
- def process_zoe(pixel_values, pad_mode="reflect", output_size=(384, 512)):
-     """https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/zoedepth/image_processing_zoedepth.py"""
-     # h, w = pixel_values.shape[-2:]
-     # pad images
-     ph, pw = 31, 31  # int((h / 2) ** 0.5 * 3), int((w / 2) ** 0.5 * 3)  # 32, 31
-     images = torch.nn.functional.pad(pixel_values, (pw, pw, ph, ph), mode=pad_mode)
-
-     # resize images
-     size = (384, 384)  # get_resize_output_image_size(h, w, output_size=output_size, keep_aspect_ratio=True, multiple=32)
-     images = torch.nn.functional.interpolate(images, size=size, mode="bicubic", align_corners=True)
-
-     # NOTE: ZoeDepth does padding -> resize -> normalize, but the SigLIP processor
-     # already outputs normalized images, so we simply follow
-     # `normalize -> padding -> resize` with reflect padding.
-     ZOE_MEAN, ZOE_STD = (0.5, 0.5, 0.5), (0.5, 0.5, 0.5)
-     images = F.normalize(images, mean=ZOE_MEAN, std=ZOE_STD)
-     return images, ph, pw
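
For context on what this commit removes, here is a minimal usage sketch of Ego3DPositionEmbeddingMLP with its default arguments. It assumes a local copy of the deleted file is still importable; the batch and point counts are illustrative, not values from the repository.

import torch
from modeling_ego3d import Ego3DPositionEmbeddingMLP  # local copy of the deleted file

pe = Ego3DPositionEmbeddingMLP(in_channels=3, num_pos_feats=768, n_freqs=8)
xyz = torch.rand(2, 1024, 3)  # (B, N, 3) points, nominally in [-2, 2] x [-2, 2] x [0, 4]
emb = pe(xyz)                 # (B, N, num_pos_feats)

# Each coordinate contributes itself plus sin/cos at n_freqs bands:
# 3 * (2 * 8 + 1) = 51 frequency features feed the MLP head.
assert pe.freq_out_channels == 51
assert emb.shape == (2, 1024, 768)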
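
The deleted resize helper mirrors ZoeDepth's constrain-to-multiple-of logic: with keep_aspect_ratio it adopts whichever scale factor is closer to 1, then snaps each side to a multiple of 32. A quick check with assumed input sizes:

from modeling_ego3d import get_resize_output_image_size  # local copy of the deleted file

# 480x640 scales by 0.8 on both sides; both outputs are already multiples of 32.
assert get_resize_output_image_size(480, 640, output_size=(384, 512)) == (384, 512)

# 720x1280: the height scale (384/720 ≈ 0.533) is closer to 1 than the width
# scale (0.4), so it is used for both sides; the width snaps to 672.
assert get_resize_output_image_size(720, 1280, output_size=(384, 512)) == (384, 672)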
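
Finally, a sketch of the full process_zoe path: reflect-pad by 31 pixels per side, bicubic-resize to the fixed 384x384, then re-normalize with the Zoe mean/std. The 336x336 input size is an assumption for illustration.

import torch
from modeling_ego3d import process_zoe  # local copy of the deleted file

pixel_values = torch.randn(1, 3, 336, 336)  # already SigLIP-normalized in the real pipeline
images, ph, pw = process_zoe(pixel_values, pad_mode="reflect")

assert (ph, pw) == (31, 31)
assert images.shape == (1, 3, 384, 384)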