SchulzR97
/

TUC-AR-C3D

Video Classification

English

robotics

Model card Files Files and versions Community

Robert Schulz committed on Jan 31

Commit

2511aa0

1 Parent(s): d7dea55

commit files to HF hub

Browse files

Files changed (3) hide show

model.py +731 -0
tuc-ar.bin +0 -3
ucf101.bin +0 -3

model.py ADDED Viewed

	@@ -0,0 +1,731 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision.models import resnet50
+class Conv2DBlock(nn.Module):
+    def __init__(
+            self,
+            in_channels:int,
+            out_channels:int,
+            kernel_size_conv:tuple[int, int],
+            kernel_size_pool:tuple[int, int],
+            stride:tuple[int, int],
+            padding_conv:int = 0,
+            p_dropout:float = 0.5
+    ):
+        super(Conv2DBlock, self).__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size_conv, padding=padding_conv)
+        self.pool = nn.MaxPool2d(kernel_size=kernel_size_pool, stride=stride)
+        self.dropout = nn.Dropout2d(p_dropout)
+        self.relu = nn.LeakyReLU()
+    def forward(self, X):
+        Y = self.conv(X)
+        Y = self.pool(Y)
+        Y = self.dropout(Y)
+        Y = self.relu(Y)
+        return Y
+class Conv3DBlock(nn.Module):
+    def __init__(
+            self,
+            in_channels:int,
+            out_channels:int,
+            kernel_size_conv:tuple[int, int, int],
+            kernel_size_pool:tuple[int, int, int],
+            stride:tuple[int, int, int],
+            padding_conv:int = 0,
+            p_dropout:float = 0.5
+    ):
+        super(Conv3DBlock, self).__init__()
+        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size=kernel_size_conv, padding=padding_conv)
+        self.pool = nn.MaxPool3d(kernel_size=kernel_size_pool, stride=stride)
+        self.dropout = nn.Dropout3d(p_dropout)
+        self.batchnorm = nn.BatchNorm3d(out_channels)
+        self.relu = nn.LeakyReLU()
+    def forward(self, X):
+        Y = self.conv(X)
+        Y = self.pool(Y)
+        Y = self.batchnorm(Y)
+        Y = self.dropout(Y)
+        Y = self.relu(Y)
+        return Y
+class SelfAttention(nn.Module):
+    def __init__(
+            self,
+            d_q:int = 2,
+            d_k:int = 2,
+            d_v:int = 4,
+            embed_dim:int = 3
+        ):
+        super().__init__()
+        self.d_q = d_q
+        self.d_k = d_k
+        self.d_v = d_v
+        self.W_q = nn.Parameter(torch.rand(embed_dim, d_q))
+        self.W_k = nn.Parameter(torch.rand(embed_dim, d_k))
+        self.W_v = nn.Parameter(torch.rand(embed_dim, d_v))
+        pass
+    def forward(self, X):
+        Z = []
+        # iterate over batch_size
+        for x in X:
+            Q = x @ self.W_q    # Queries
+            K = x @ self.W_k    # Keys
+            V = x @ self.W_v    # Values
+            omega = Q @ K.T                                     # omega ...unnormalized attantion weights
+            alpha = F.softmax(omega / self.d_k**0.5, dim=0)     # alpha ...normalized attention weights
+            z = alpha @ V                                       # z     ...context vector -> attention-weighted version of original query input x_i
+            Z.append(z)
+        Z = torch.stack(Z)
+        return Z
+class MultiHeadSelfAttention(nn.Module):
+    def __init__(
+            self,
+            num_heads:int,
+            d_q:int = 2,
+            d_k:int = 2,
+            d_v:int = 4,
+            embed_dim:int = 3
+        ):
+        super().__init__()
+        self.d_q = d_q
+        self.d_k = d_k
+        self.d_v = d_v
+        self.heads = nn.ModuleList([SelfAttention(d_q, d_k, d_v, embed_dim) for _ in range(num_heads)])
+    def forward(self, X):
+        return torch.cat([head(X) for head in self.heads], dim=-1)
+class model001(nn.Module):
+    def __init__(
+            self,
+            sequence_length = 30,
+            num_actions:int = 10
+        ):
+        super(model001, self).__init__()
+        self.conv1 = nn.Conv3d(sequence_length, 64, kernel_size=(2, 7, 7))
+        self.maxPool1 = nn.MaxPool3d(kernel_size=(1, 7, 7), stride=(1, 5, 5))
+        self.batchnorm1 = nn.BatchNorm3d(64)
+        self.conv2 = nn.Conv3d(64, 96, kernel_size=(2, 5, 5))
+        self.maxPool2 = nn.MaxPool3d(kernel_size=(1, 5, 5), stride=(1, 3, 3))
+        self.batchnorm2 = nn.BatchNorm3d(96)
+        self.conv3 = nn.Conv3d(96, 128, kernel_size=(2, 5, 5))
+        self.maxPool3 = nn.MaxPool3d(kernel_size=(1, 5, 5), stride=(1, 3, 3))
+        self.batchnorm3 = nn.BatchNorm3d(128)
+        self.flatten = nn.Flatten()
+        self.readout = nn.Linear(4608, num_actions)
+        self.dropout1d = nn.Dropout1d(p = 0.2)
+        self.dropout3d = nn.Dropout3d(p = 0.2)
+        self.relu = nn.ReLU()
+        self.softmax = nn.Softmax(dim = 1)
+        self.sigmoid = nn.Sigmoid()
+        self.num_actions = num_actions
+    def forward(self, X):
+        #X = X.permute(0, 2, 1, 3, 4)
+        Y = X
+        Y = self.conv1(Y)
+        Y = self.maxPool1(Y)
+        Y = self.batchnorm1(Y)
+        Y = self.dropout3d(Y)
+        Y = self.relu(Y)
+        Y = self.conv2(Y)
+        Y = self.maxPool2(Y)
+        Y = self.batchnorm2(Y)
+        Y = self.dropout3d(Y)
+        Y = self.relu(Y)
+        Y = self.conv3(Y)
+        Y = self.maxPool3(Y)
+        Y = self.batchnorm3(Y)
+        Y = self.dropout3d(Y)
+        Y = self.relu(Y)
+        Y = self.flatten(Y)
+        Y = self.readout(Y)
+        Y = self.dropout1d(Y)
+        Y = self.softmax(Y)
+        #Y = self.sigmoid(Y)
+        return Y
+class model002(nn.Module):
+    def __init__(
+            self,
+            sequence_length = 30,
+            num_actions:int = 10
+        ):
+        super(model002, self).__init__()
+        self.sequence_length = sequence_length
+        self.input_size = (400, 400)
+        self.conv1 = Conv3DBlock(
+            in_channels = sequence_length,
+            out_channels = 64,
+            kernel_size_conv = (2, 7, 7),
+            kernel_size_pool = (1, 7, 7),
+            stride = (1, 5, 5)
+        )
+        self.conv2 = Conv3DBlock(
+            in_channels = 64,
+            out_channels = 96,
+            kernel_size_conv = (2, 5, 5),
+            kernel_size_pool = (1, 5, 5),
+            stride = (1, 3, 3)
+        )
+        self.conv3 = Conv3DBlock(
+            in_channels = 96,
+            out_channels = 128,
+            kernel_size_conv = (2, 5, 5),
+            kernel_size_pool = (1, 5, 5),
+            stride = (1, 3, 3)
+        )
+        self.conv4 = Conv3DBlock(
+            in_channels = 128,
+            out_channels = 160,
+            kernel_size_conv = (1, 3, 3),
+            kernel_size_pool = (1, 3, 3),
+            stride = (1, 2, 2)
+        )
+        self.flatten = nn.Flatten(start_dim=1)
+        self.dropout = nn.Dropout()
+        self.readout = nn.Linear(160, num_actions)
+        self.softmax = nn.Softmax(dim=1)
+        self.num_actions = num_actions
+    def forward(self, X):
+        assert X.shape[1] == self.sequence_length and X.shape[2] == 4 and X.shape[3] == self.input_size[0] and X.shape[4] == self.input_size[1],\
+            f'Expected input shape (batch_size, sequence_length={self.sequence_length}, channels=4, width={self.input_size[0]}, height={self.input_size[1]}), but got ({X.shape})'
+        Y = X
+        Y = self.conv1(Y)
+        #print(Y.shape)
+        Y = self.conv2(Y)
+        #print(Y.shape)
+        Y = self.conv3(Y)
+        #print(Y.shape)
+        Y = self.conv4(Y)
+        #print(Y.shape)
+        Y = self.flatten(Y)
+        Y = self.dropout(Y)
+        #print(Y.shape)
+        Y = self.readout(Y)
+        Y = self.softmax(Y)
+        return Y
+class model003(nn.Module):
+    def __init__(
+            self,
+            sequence_length = 30,
+            num_actions:int = 10
+        ):
+        super(model003, self).__init__()
+        self.embed = resnet50(weights='DEFAULT')
+        self.attention = MultiHeadSelfAttention(num_heads=16, embed_dim=1000)
+        self.flatten = nn.Flatten(start_dim=1)
+        readout_dim1 = sequence_length * len(self.attention.heads) * self.attention.d_v
+        self.readout = nn.Linear(readout_dim1, num_actions)
+        self.softmax = nn.Softmax(dim=1)
+        self.num_actions = num_actions
+    def forward(self, X):
+        embeddings = []
+        for x in X:
+            with torch.no_grad():
+                embedded = self.embed(x)
+            embeddings.append(embedded)
+        embeddings = torch.stack(embeddings)
+        Y = self.attention(embeddings)
+        Y = self.flatten(Y)
+        Y = self.readout(Y)
+        Y = self.softmax(Y)
+        return Y
+class model004(nn.Module):
+    def __init__(
+            self,
+            sequence_length = 30,
+            num_actions:int = 10
+        ):
+        super().__init__()
+        self.sequence_length = sequence_length,
+        self.num_actions = num_actions
+        self.embed = nn.Embedding(sequence_length, 256)
+        self.conv1 = Conv2DBlock(
+            in_channels = 3,
+            out_channels = 16,
+            kernel_size_conv = (9, 9),
+            kernel_size_pool = (7, 7),
+            stride = (5, 5),
+            padding_conv=1,
+            p_dropout = 0
+        )
+        self.conv2 = Conv2DBlock(
+            in_channels = 16,
+            out_channels = 32,
+            kernel_size_conv = (7, 7),
+            kernel_size_pool = (5, 5),
+            stride = (3, 3),
+            p_dropout = 0
+        )
+        self.conv3 = Conv2DBlock(
+            in_channels = 32,
+            out_channels = 64,
+            kernel_size_conv = (5, 5),
+            kernel_size_pool = (3, 3),
+            stride = (2, 2),
+            p_dropout = 0
+        )
+        # self.conv4 = Conv2DBlock(
+        #     in_channels = 64,
+        #     out_channels = 128,
+        #     kernel_size_conv = (5, 5),
+        #     kernel_size_pool = (3, 3),
+        #     stride = (2, 2)
+        # )
+        self.attention = MultiHeadSelfAttention(num_heads=16, embed_dim=960)
+        self.flatten = nn.Flatten(start_dim=1)
+        readout_dim1 = sequence_length * len(self.attention.heads) * self.attention.d_v
+        self.readout = nn.Linear(readout_dim1, num_actions)
+        self.softmax = nn.Softmax(dim=1)
+    def forward(self, X:torch.Tensor):
+        Y = X.reshape((X.shape[0] * X.shape[1], X.shape[2], X.shape[3], X.shape[4]))
+        #print(Y.shape)
+        Y = self.conv1(Y)
+        #print(Y.shape)
+        Y = self.conv2(Y)
+        #print(Y.shape)
+        Y = self.conv3(Y)
+        #print(Y.shape)
+        #Y = self.conv4(Y)
+        #print(Y.shape)
+        Y = Y.reshape((X.shape[0], X.shape[1], Y.shape[1] * Y.shape[2] * Y.shape[3]))
+        #print(Y.shape)
+        Y = self.attention(Y)
+        #print(Y.shape)
+        Y = self.flatten(Y)
+        #print(Y.shape)
+        Y = self.readout(Y)
+        Y = self.softmax(Y)
+        return Y
+class model005(nn.Module):
+    def __init__(
+            self,
+            sequence_length = 30,
+            num_actions:int = 10
+        ):
+        super().__init__()
+        self.sequence_length = sequence_length
+        self.num_actions = num_actions
+        self.input_size = (300, 300)
+        self.embed = nn.Embedding(sequence_length, 1000)
+        self.conv1 = Conv2DBlock(
+            in_channels = 3,
+            out_channels = 16,
+            kernel_size_conv = (7, 7),
+            kernel_size_pool = (5, 5),
+            stride = (4, 4),
+            padding_conv=1,
+            p_dropout = 0.2
+        )
+        self.conv2 = Conv2DBlock(
+            in_channels = 16,
+            out_channels = 32,
+            kernel_size_conv = (7, 7),
+            kernel_size_pool = (5, 5),
+            stride = (3, 3),
+            p_dropout = 0.2
+        )
+        self.conv3 = Conv2DBlock(
+            in_channels = 32,
+            out_channels = 64,
+            kernel_size_conv = (5, 5),
+            kernel_size_pool = (3, 3),
+            stride = (2, 2),
+            p_dropout = 0.2
+        )
+        self.conv4 = Conv2DBlock(
+            in_channels = 64,
+            out_channels = 128,
+            kernel_size_conv = (5, 5),
+            kernel_size_pool = (3, 3),
+            stride = (2, 2),
+            p_dropout = 0.2
+        )
+        self.attention = MultiHeadSelfAttention(num_heads=16, embed_dim=128)
+        self.flatten = nn.Flatten(start_dim=1)
+        readout_dim1 = sequence_length * len(self.attention.heads) * self.attention.d_v
+        self.readout = nn.Linear(readout_dim1, num_actions)
+        self.softmax = nn.Softmax(dim=1)
+        self.dropout = nn.Dropout(p = 0.2)
+    def forward(self, X:torch.Tensor):
+        assert X.shape[1] == self.sequence_length and X.shape[2] == 3 and X.shape[3] == self.input_size[0] and X.shape[4] == self.input_size[1],\
+            f'Expected input shape (batch_size, sequence_length={self.sequence_length}, channels=3, width={self.input_size[0]}, height={self.input_size[1]}), but got ({X.shape})'
+        Y = X.reshape((X.shape[0] * X.shape[1], X.shape[2], X.shape[3], X.shape[4]))
+        #print(Y.shape)
+        Y = self.conv1(Y)
+        #print(Y.shape)
+        Y = self.conv2(Y)
+        #print(Y.shape)
+        Y = self.conv3(Y)
+        #print(Y.shape)
+        Y = self.conv4(Y)
+        #print(Y.shape)
+        Y = Y.reshape((X.shape[0], X.shape[1], Y.shape[1] * Y.shape[2] * Y.shape[3]))
+        #print(Y.shape)
+        Y = self.attention(Y)
+        #print(Y.shape)
+        Y = self.flatten(Y)
+        Y = self.dropout(Y)
+        #print(Y.shape)
+        Y = self.readout(Y)
+        Y = self.dropout(Y)
+        Y = self.softmax(Y)
+        return Y
+class model006(nn.Module):
+    def __init__(
+            self,
+            sequence_length = 30,
+            num_actions:int = 10
+        ):
+        super().__init__()
+        self.sequence_length = sequence_length
+        self.num_actions = num_actions
+        self.input_size = (300, 300)
+        #self.embed = nn.Embedding(sequence_length, 1000)
+        self.conv1 = Conv2DBlock(
+            in_channels = 4,
+            out_channels = 16,
+            kernel_size_conv = (7, 7),
+            kernel_size_pool = (5, 5),
+            stride = (4, 4),
+            padding_conv=1,
+            p_dropout = 0.2
+        )
+        self.conv2 = Conv2DBlock(
+            in_channels = 16,
+            out_channels = 32,
+            kernel_size_conv = (7, 7),
+            kernel_size_pool = (5, 5),
+            stride = (3, 3),
+            p_dropout = 0.2
+        )
+        self.conv3 = Conv2DBlock(
+            in_channels = 32,
+            out_channels = 64,
+            kernel_size_conv = (5, 5),
+            kernel_size_pool = (3, 3),
+            stride = (2, 2),
+            p_dropout = 0.2
+        )
+        self.conv4 = Conv2DBlock(
+            in_channels = 64,
+            out_channels = 128,
+            kernel_size_conv = (5, 5),
+            kernel_size_pool = (3, 3),
+            stride = (2, 2),
+            p_dropout = 0.2
+        )
+        self.attention = MultiHeadSelfAttention(num_heads=32, embed_dim=128, d_q = 4, d_k = 4, d_v = 8)
+        self.flatten = nn.Flatten(start_dim=1)
+        readout_dim1 = sequence_length * len(self.attention.heads) * self.attention.d_v
+        self.readout = nn.Linear(readout_dim1, num_actions)
+        self.softmax = nn.Softmax(dim=1)
+        self.dropout = nn.Dropout(p = 0.2)
+    def forward(self, X:torch.Tensor):
+        assert X.shape[1] == self.sequence_length and X.shape[2] == 4 and X.shape[3] == self.input_size[0] and X.shape[4] == self.input_size[1],\
+            f'Expected input shape (batch_size, sequence_length={self.sequence_length}, channels=4, width={self.input_size[0]}, height={self.input_size[1]}), but got ({X.shape})'
+        Y = X.reshape((X.shape[0] * X.shape[1], X.shape[2], X.shape[3], X.shape[4]))
+        #print(Y.shape)
+        Y = self.conv1(Y)
+        #print(Y.shape)
+        Y = self.conv2(Y)
+        #print(Y.shape)
+        Y = self.conv3(Y)
+        #print(Y.shape)
+        Y = self.conv4(Y)
+        #print(Y.shape)
+        Y = Y.reshape((X.shape[0], X.shape[1], Y.shape[1] * Y.shape[2] * Y.shape[3]))
+        #print(Y.shape)
+        Y = self.attention(Y)
+        #print(Y.shape)
+        Y = self.flatten(Y)
+        Y = self.dropout(Y)
+        #print(Y.shape)
+        Y = self.readout(Y)
+        Y = self.dropout(Y)
+        Y = self.softmax(Y)
+        return Y
+class model007(nn.Module):
+    def __init__(
+            self,
+            sequence_length = 30,
+            num_actions:int = 10
+        ):
+        super().__init__()
+        self.sequence_length = sequence_length
+        self.num_actions = num_actions
+        self.input_size = (300, 300)
+        self.conv1 = Conv3DBlock(
+            in_channels = sequence_length,
+            out_channels = 32,
+            kernel_size_conv = (2, 7, 7),
+            kernel_size_pool = (1, 7, 7),
+            stride=(1, 5, 5),
+            p_dropout = 0.2
+        )
+        self.conv2 = Conv3DBlock(
+            in_channels = 32,
+            out_channels = 64,
+            kernel_size_conv = (2, 5, 5),
+            kernel_size_pool = (1, 5, 5),
+            stride=(1, 3, 3),
+            p_dropout = 0.2
+        )
+        self.conv3 = Conv3DBlock(
+            in_channels = 96,
+            out_channels = 192,
+            kernel_size_conv = (2, 5, 5),
+            kernel_size_pool = (1, 3, 3),
+            stride=(1, 2, 2),
+            p_dropout = 0.2
+        )
+        self.conv4 = Conv3DBlock(
+            in_channels = 288,
+            out_channels = 675,
+            kernel_size_conv = (1, 5, 5),
+            kernel_size_pool = (1, 2, 2),
+            stride=(1, 2, 2),
+            p_dropout = 0.2
+        )
+        self.downsample13 = nn.MaxPool3d(kernel_size=(2,7,7), stride=(1,3,3))
+        self.downsample14 = nn.MaxPool3d(kernel_size=(2,9,9), stride=(2,8,8))
+        self.downsample24 = nn.MaxPool3d(kernel_size=(2,7,7), stride=(2,2,2))
+        self.flatten = nn.Flatten(start_dim = 1)
+        self.readout = nn.Linear(2700, num_actions)
+        self.relu = nn.LeakyReLU()
+        self.dropout = nn.Dropout(p = 0.5)
+        self.softmax = nn.Softmax(dim = 1)
+    def forward(self, X):
+        Y = X
+        Y1 = self.conv1(Y)
+        Y2 = self.conv2(Y1)
+        Y13 = self.downsample13(Y1)
+        Y14 = self.downsample14(Y1)
+        Y24 = self.downsample24(Y2)
+        Y2_cat = torch.cat([Y2, Y13], dim=1)
+        Y3 = self.conv3(Y2_cat)
+        Y3_cat = torch.cat([Y3, Y14, Y24], dim=1)
+        Y4 = self.conv4(Y3_cat)
+        Y = self.flatten(Y4)
+        # print('X', X.shape)
+        # print('Y1', Y1.shape)
+        # print('Y2', Y2.shape)
+        # print('Y3', Y3.shape)
+        # print('Y4', Y4.shape)
+        # print('Y', Y.shape)
+        # print('Y13', Y13.shape)
+        # print('Y14', Y14.shape)
+        # print('Y24', Y24.shape)
+        # print('Y2_cat', Y2_cat.shape)
+        # print('Y3_cat', Y3_cat.shape)
+        Y = self.readout(Y)
+        Y = self.softmax(Y)
+        return Y
+class model008(nn.Module):
+    def __init__(
+            self,
+            use_depth_channel:bool,
+            sequence_length = 30,
+            num_actions:int = 10,
+            apply_softmax:bool = True
+        ):
+        super().__init__()
+        self.sequence_length = sequence_length
+        self.num_actions = num_actions
+        self.use_depth_channel = use_depth_channel
+        self.conv1 = Conv3DBlock(
+            in_channels = sequence_length,
+            out_channels = 64,
+            kernel_size_conv = (2, 7, 7),
+            kernel_size_pool = (1, 7, 7),
+            stride=(1, 5, 5),
+            p_dropout = 0.2
+        )
+        self.conv2 = Conv3DBlock(
+            in_channels = 64,
+            out_channels = 128,
+            kernel_size_conv = (2, 5, 5),
+            kernel_size_pool = (1, 5, 5),
+            stride=(1, 3, 3),
+            p_dropout = 0.2
+        )
+        self.conv3 = Conv3DBlock(
+            in_channels = 192,
+            out_channels = 384,
+            kernel_size_conv = (2, 5, 5) if self.use_depth_channel else (1, 5, 5),
+            kernel_size_pool = (1, 3, 3),
+            stride=(1, 2, 2),
+            p_dropout = 0.2
+        )
+        self.conv4 = Conv3DBlock(
+            in_channels = 576,
+            out_channels = 1152,
+            kernel_size_conv = (1, 3, 3),
+            kernel_size_pool = (1, 2, 2),
+            stride=(1, 2, 2),
+            p_dropout = 0.2
+        )
+        self.downsample13 = nn.MaxPool3d(kernel_size=(2,7,7), stride=(1,3,3))
+        self.downsample14 = nn.MaxPool3d(kernel_size=(2,9,9), stride=(2,8,8))
+        if self.use_depth_channel:
+            self.downsample24 = nn.MaxPool3d(kernel_size=(2,7,7), stride=(2,2,2))
+        else:
+            self.downsample24 = nn.MaxPool3d(kernel_size=(1,7,7), stride=(1,2,2))
+        self.downsample1e = nn.MaxPool3d(kernel_size=(2,28,28), stride=(2,21,21))
+        self.downsample2e = nn.MaxPool3d(kernel_size=(2,9,9) if self.use_depth_channel else (1,9,9), stride=(1,6,6))
+        self.downsample3e = nn.MaxPool3d(kernel_size=(1,5,5), stride=(1,2,2))
+        self.dropout3d = nn.Dropout3d(p=0.2)
+        self.flatten = nn.Flatten(start_dim = 1)
+        self.readout = nn.Linear(15552, num_actions)
+        self.relu = nn.LeakyReLU()
+        self.dropout = nn.Dropout(p = 0.2)
+        self.softmax = nn.Softmax(dim = 1)
+        self.sigmoid = nn.Sigmoid()
+        self.apply_softmax = apply_softmax
+    def forward(self, X):
+        Y = X
+        Y1 = self.conv1(Y)
+        Y2 = self.conv2(Y1)
+        Y13 = self.downsample13(Y1)
+        Y14 = self.downsample14(Y1)
+        Y24 = self.downsample24(Y2)
+        Y2_cat = torch.cat([Y2, Y13], dim=1)
+        Y3 = self.conv3(Y2_cat)
+        Y3_cat = torch.cat([Y3, Y14, Y24], dim=1)
+        Y4 = self.conv4(Y3_cat)
+        Y1e = self.downsample1e(Y1)
+        Y2e = self.downsample2e(Y2)
+        Y3e = self.downsample3e(Y3)
+        Y4_cat = torch.cat([Y4, Y1e, Y2e, Y3e], dim=1)
+        Y = self.flatten(Y4_cat)
+        # print('X', X.shape)
+        # print('Y1', Y1.shape)
+        # print('Y2', Y2.shape)
+        # print('Y3', Y3.shape)
+        # print('Y4', Y4.shape)
+        # print('Y', Y.shape)
+        # print('Y13', Y13.shape)
+        # print('Y14', Y14.shape)
+        # print('Y24', Y24.shape)
+        # print('Y2_cat', Y2_cat.shape)
+        # print('Y3_cat', Y3_cat.shape)
+        Y = self.readout(Y)
+        if self.apply_softmax:
+            Y = self.softmax(Y)
+        else:
+            Y = self.sigmoid(Y)
+        return Y
+if __name__ == '__main__':
+    batch_size = 4
+    seq_len = 30
+    embed_dim = 3
+    image_size = (400, 40)
+    X = torch.rand((batch_size, seq_len, 3, image_size[0], image_size[1]))
+    model3 = model003()
+    model3.to('cpu')
+    X = X.to('cpu')
+    Y = model3(X)
+    pass

tuc-ar.bin DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6f928b8a21f5d7089395bb6f51e7556f7a0c0fa22951709016ff09bc9e1ac68d
-size 41698458

ucf101.bin DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d743f2b218846ef6ad770e3f4efcd95e2ba852e121cb67194381c311ece23405
-size 40739610