Robert Schulz committed on
Commit 2511aa0 · 1 Parent(s): d7dea55

commit files to HF hub

Files changed (3)
  1. model.py +731 -0
  2. tuc-ar.bin +0 -3
  3. ucf101.bin +0 -3
model.py ADDED
@@ -0,0 +1,731 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision.models import resnet50
+
+class Conv2DBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size_conv: tuple[int, int],
+        kernel_size_pool: tuple[int, int],
+        stride: tuple[int, int],
+        padding_conv: int = 0,
+        p_dropout: float = 0.5
+    ):
+        super(Conv2DBlock, self).__init__()
+
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size_conv, padding=padding_conv)
+        self.pool = nn.MaxPool2d(kernel_size=kernel_size_pool, stride=stride)
+        self.dropout = nn.Dropout2d(p_dropout)
+        self.relu = nn.LeakyReLU()
+
+    def forward(self, X):
+        # conv -> pool -> dropout -> activation
+        Y = self.conv(X)
+        Y = self.pool(Y)
+        Y = self.dropout(Y)
+        Y = self.relu(Y)
+
+        return Y
+
+class Conv3DBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size_conv: tuple[int, int, int],
+        kernel_size_pool: tuple[int, int, int],
+        stride: tuple[int, int, int],
+        padding_conv: int = 0,
+        p_dropout: float = 0.5
+    ):
+        super(Conv3DBlock, self).__init__()
+
+        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size=kernel_size_conv, padding=padding_conv)
+        self.pool = nn.MaxPool3d(kernel_size=kernel_size_pool, stride=stride)
+        self.dropout = nn.Dropout3d(p_dropout)
+        self.batchnorm = nn.BatchNorm3d(out_channels)
+        self.relu = nn.LeakyReLU()
+
+    def forward(self, X):
+        # conv -> pool -> batchnorm -> dropout -> activation
+        Y = self.conv(X)
+        Y = self.pool(Y)
+        Y = self.batchnorm(Y)
+        Y = self.dropout(Y)
+        Y = self.relu(Y)
+
+        return Y
+
+class SelfAttention(nn.Module):
+    def __init__(
+        self,
+        d_q: int = 2,
+        d_k: int = 2,
+        d_v: int = 4,
+        embed_dim: int = 3
+    ):
+        super().__init__()
+
+        self.d_q = d_q
+        self.d_k = d_k
+        self.d_v = d_v
+
+        self.W_q = nn.Parameter(torch.rand(embed_dim, d_q))
+        self.W_k = nn.Parameter(torch.rand(embed_dim, d_k))
+        self.W_v = nn.Parameter(torch.rand(embed_dim, d_v))
+
+    def forward(self, X):
+        Z = []
+        # iterate over batch_size
+        for x in X:
+            Q = x @ self.W_q  # queries
+            K = x @ self.W_k  # keys
+            V = x @ self.W_v  # values
+
+            omega = Q @ K.T  # omega ...unnormalized attention weights
+            alpha = F.softmax(omega / self.d_k**0.5, dim=-1)  # alpha ...attention weights, normalized over the keys
+            z = alpha @ V  # z ...context vectors -> attention-weighted combination of the values
+            Z.append(z)
+
+        Z = torch.stack(Z)
+        return Z
+
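+# Note (a hedged aside, not part of the original design): the loop above is plain
+# scaled dot-product attention applied per batch item. On PyTorch >= 2.0 the same
+# context vectors could be computed in one batched call via
+# F.scaled_dot_product_attention(Q, K, V), which likewise normalizes over the keys.
+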
+class MultiHeadSelfAttention(nn.Module):
+    def __init__(
+        self,
+        num_heads: int,
+        d_q: int = 2,
+        d_k: int = 2,
+        d_v: int = 4,
+        embed_dim: int = 3
+    ):
+        super().__init__()
+
+        self.d_q = d_q
+        self.d_k = d_k
+        self.d_v = d_v
+
+        self.heads = nn.ModuleList([SelfAttention(d_q, d_k, d_v, embed_dim) for _ in range(num_heads)])
+
+    def forward(self, X):
+        return torch.cat([head(X) for head in self.heads], dim=-1)
+
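+# Shape sketch (illustrative, using the default hyperparameters): each head maps
+# (batch, seq, embed_dim=3) to (batch, seq, d_v=4); concatenating num_heads=16
+# heads along the last dimension yields (batch, seq, 64).
+# _X = torch.rand(2, 5, 3)
+# assert MultiHeadSelfAttention(num_heads=16)(_X).shape == (2, 5, 64)
+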
+class model001(nn.Module):
+    def __init__(
+        self,
+        sequence_length: int = 30,
+        num_actions: int = 10
+    ):
+        super(model001, self).__init__()
+
+        self.conv1 = nn.Conv3d(sequence_length, 64, kernel_size=(2, 7, 7))
+        self.maxPool1 = nn.MaxPool3d(kernel_size=(1, 7, 7), stride=(1, 5, 5))
+        self.batchnorm1 = nn.BatchNorm3d(64)
+
+        self.conv2 = nn.Conv3d(64, 96, kernel_size=(2, 5, 5))
+        self.maxPool2 = nn.MaxPool3d(kernel_size=(1, 5, 5), stride=(1, 3, 3))
+        self.batchnorm2 = nn.BatchNorm3d(96)
+
+        self.conv3 = nn.Conv3d(96, 128, kernel_size=(2, 5, 5))
+        self.maxPool3 = nn.MaxPool3d(kernel_size=(1, 5, 5), stride=(1, 3, 3))
+        self.batchnorm3 = nn.BatchNorm3d(128)
+
+        self.flatten = nn.Flatten()
+        self.readout = nn.Linear(4608, num_actions)
+
+        self.dropout1d = nn.Dropout1d(p=0.2)
+        self.dropout3d = nn.Dropout3d(p=0.2)
+
+        self.relu = nn.ReLU()
+        self.softmax = nn.Softmax(dim=1)
+        self.sigmoid = nn.Sigmoid()
+        self.num_actions = num_actions
+
+    def forward(self, X):
+        #X = X.permute(0, 2, 1, 3, 4)
+        Y = X
+
+        Y = self.conv1(Y)
+        Y = self.maxPool1(Y)
+        Y = self.batchnorm1(Y)
+        Y = self.dropout3d(Y)
+        Y = self.relu(Y)
+
+        Y = self.conv2(Y)
+        Y = self.maxPool2(Y)
+        Y = self.batchnorm2(Y)
+        Y = self.dropout3d(Y)
+        Y = self.relu(Y)
+
+        Y = self.conv3(Y)
+        Y = self.maxPool3(Y)
+        Y = self.batchnorm3(Y)
+        Y = self.dropout3d(Y)
+        Y = self.relu(Y)
+
+        Y = self.flatten(Y)
+
+        Y = self.readout(Y)
+        Y = self.dropout1d(Y)
+        Y = self.softmax(Y)
+        #Y = self.sigmoid(Y)
+
+        return Y
+
+class model002(nn.Module):
+    def __init__(
+        self,
+        sequence_length: int = 30,
+        num_actions: int = 10
+    ):
+        super(model002, self).__init__()
+
+        self.sequence_length = sequence_length
+        self.input_size = (400, 400)
+
+        self.conv1 = Conv3DBlock(
+            in_channels=sequence_length,
+            out_channels=64,
+            kernel_size_conv=(2, 7, 7),
+            kernel_size_pool=(1, 7, 7),
+            stride=(1, 5, 5)
+        )
+        self.conv2 = Conv3DBlock(
+            in_channels=64,
+            out_channels=96,
+            kernel_size_conv=(2, 5, 5),
+            kernel_size_pool=(1, 5, 5),
+            stride=(1, 3, 3)
+        )
+        self.conv3 = Conv3DBlock(
+            in_channels=96,
+            out_channels=128,
+            kernel_size_conv=(2, 5, 5),
+            kernel_size_pool=(1, 5, 5),
+            stride=(1, 3, 3)
+        )
+        self.conv4 = Conv3DBlock(
+            in_channels=128,
+            out_channels=160,
+            kernel_size_conv=(1, 3, 3),
+            kernel_size_pool=(1, 3, 3),
+            stride=(1, 2, 2)
+        )
+        self.flatten = nn.Flatten(start_dim=1)
+        self.dropout = nn.Dropout()
+        self.readout = nn.Linear(160, num_actions)
+        self.softmax = nn.Softmax(dim=1)
+        self.num_actions = num_actions
+
+    def forward(self, X):
+        assert X.shape[1] == self.sequence_length and X.shape[2] == 4 and X.shape[3] == self.input_size[0] and X.shape[4] == self.input_size[1],\
+            f'Expected input shape (batch_size, sequence_length={self.sequence_length}, channels=4, width={self.input_size[0]}, height={self.input_size[1]}), but got ({X.shape})'
+        Y = X
+
+        Y = self.conv1(Y)
+        Y = self.conv2(Y)
+        Y = self.conv3(Y)
+        Y = self.conv4(Y)
+        Y = self.flatten(Y)
+        Y = self.dropout(Y)
+        Y = self.readout(Y)
+
+        Y = self.softmax(Y)
+        return Y
+
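+# Shape trace for model002 with its asserted (batch, 30, 4, 400, 400) input:
+# conv1 -> (batch, 64, 3, 78, 78); conv2 -> (batch, 96, 2, 24, 24);
+# conv3 -> (batch, 128, 1, 6, 6); conv4 -> (batch, 160, 1, 1, 1);
+# flatten -> (batch, 160), which matches the nn.Linear(160, num_actions) readout.
+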
+class model003(nn.Module):
+    def __init__(
+        self,
+        sequence_length: int = 30,
+        num_actions: int = 10
+    ):
+        super(model003, self).__init__()
+
+        self.embed = resnet50(weights='DEFAULT')
+
+        self.attention = MultiHeadSelfAttention(num_heads=16, embed_dim=1000)
+        self.flatten = nn.Flatten(start_dim=1)
+
+        readout_dim1 = sequence_length * len(self.attention.heads) * self.attention.d_v
+        self.readout = nn.Linear(readout_dim1, num_actions)
+        self.softmax = nn.Softmax(dim=1)
+        self.num_actions = num_actions
+
+    def forward(self, X):
+        # embed each frame sequence with the ResNet-50 backbone
+        embeddings = []
+        for x in X:
+            with torch.no_grad():
+                embedded = self.embed(x)
+            embeddings.append(embedded)
+        embeddings = torch.stack(embeddings)
+
+        Y = self.attention(embeddings)
+        Y = self.flatten(Y)
+        Y = self.readout(Y)
+        Y = self.softmax(Y)
+        return Y
+
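+# Note: the ResNet-50 backbone runs under torch.no_grad(), so only the attention
+# heads and the linear readout receive gradients when model003 is trained.
+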
+class model004(nn.Module):
+    def __init__(
+        self,
+        sequence_length: int = 30,
+        num_actions: int = 10
+    ):
+        super().__init__()
+        self.sequence_length = sequence_length
+        self.num_actions = num_actions
+
+        self.embed = nn.Embedding(sequence_length, 256)  # note: not used in forward
+
+        self.conv1 = Conv2DBlock(
+            in_channels=3,
+            out_channels=16,
+            kernel_size_conv=(9, 9),
+            kernel_size_pool=(7, 7),
+            stride=(5, 5),
+            padding_conv=1,
+            p_dropout=0
+        )
+        self.conv2 = Conv2DBlock(
+            in_channels=16,
+            out_channels=32,
+            kernel_size_conv=(7, 7),
+            kernel_size_pool=(5, 5),
+            stride=(3, 3),
+            p_dropout=0
+        )
+        self.conv3 = Conv2DBlock(
+            in_channels=32,
+            out_channels=64,
+            kernel_size_conv=(5, 5),
+            kernel_size_pool=(3, 3),
+            stride=(2, 2),
+            p_dropout=0
+        )
+        # self.conv4 = Conv2DBlock(
+        #     in_channels = 64,
+        #     out_channels = 128,
+        #     kernel_size_conv = (5, 5),
+        #     kernel_size_pool = (3, 3),
+        #     stride = (2, 2)
+        # )
+
+        self.attention = MultiHeadSelfAttention(num_heads=16, embed_dim=960)
+        self.flatten = nn.Flatten(start_dim=1)
+
+        readout_dim1 = sequence_length * len(self.attention.heads) * self.attention.d_v
+        self.readout = nn.Linear(readout_dim1, num_actions)
+        self.softmax = nn.Softmax(dim=1)
+
+    def forward(self, X: torch.Tensor):
+        # fold the sequence dimension into the batch dimension for the 2D convs
+        Y = X.reshape((X.shape[0] * X.shape[1], X.shape[2], X.shape[3], X.shape[4]))
+        Y = self.conv1(Y)
+        Y = self.conv2(Y)
+        Y = self.conv3(Y)
+        #Y = self.conv4(Y)
+        # restore the sequence dimension: (batch, sequence, features)
+        Y = Y.reshape((X.shape[0], X.shape[1], Y.shape[1] * Y.shape[2] * Y.shape[3]))
+        Y = self.attention(Y)
+        Y = self.flatten(Y)
+        Y = self.readout(Y)
+        Y = self.softmax(Y)
+        return Y
+
+class model005(nn.Module):
+    def __init__(
+        self,
+        sequence_length: int = 30,
+        num_actions: int = 10
+    ):
+        super().__init__()
+        self.sequence_length = sequence_length
+        self.num_actions = num_actions
+        self.input_size = (300, 300)
+
+        self.embed = nn.Embedding(sequence_length, 1000)  # note: not used in forward
+
+        self.conv1 = Conv2DBlock(
+            in_channels=3,
+            out_channels=16,
+            kernel_size_conv=(7, 7),
+            kernel_size_pool=(5, 5),
+            stride=(4, 4),
+            padding_conv=1,
+            p_dropout=0.2
+        )
+        self.conv2 = Conv2DBlock(
+            in_channels=16,
+            out_channels=32,
+            kernel_size_conv=(7, 7),
+            kernel_size_pool=(5, 5),
+            stride=(3, 3),
+            p_dropout=0.2
+        )
+        self.conv3 = Conv2DBlock(
+            in_channels=32,
+            out_channels=64,
+            kernel_size_conv=(5, 5),
+            kernel_size_pool=(3, 3),
+            stride=(2, 2),
+            p_dropout=0.2
+        )
+        self.conv4 = Conv2DBlock(
+            in_channels=64,
+            out_channels=128,
+            kernel_size_conv=(5, 5),
+            kernel_size_pool=(3, 3),
+            stride=(2, 2),
+            p_dropout=0.2
+        )
+
+        self.attention = MultiHeadSelfAttention(num_heads=16, embed_dim=128)
+        self.flatten = nn.Flatten(start_dim=1)
+
+        readout_dim1 = sequence_length * len(self.attention.heads) * self.attention.d_v
+        self.readout = nn.Linear(readout_dim1, num_actions)
+        self.softmax = nn.Softmax(dim=1)
+
+        self.dropout = nn.Dropout(p=0.2)
+
+    def forward(self, X: torch.Tensor):
+        assert X.shape[1] == self.sequence_length and X.shape[2] == 3 and X.shape[3] == self.input_size[0] and X.shape[4] == self.input_size[1],\
+            f'Expected input shape (batch_size, sequence_length={self.sequence_length}, channels=3, width={self.input_size[0]}, height={self.input_size[1]}), but got ({X.shape})'
+        # fold the sequence dimension into the batch dimension for the 2D convs
+        Y = X.reshape((X.shape[0] * X.shape[1], X.shape[2], X.shape[3], X.shape[4]))
+        Y = self.conv1(Y)
+        Y = self.conv2(Y)
+        Y = self.conv3(Y)
+        Y = self.conv4(Y)
+        # restore the sequence dimension: (batch, sequence, features)
+        Y = Y.reshape((X.shape[0], X.shape[1], Y.shape[1] * Y.shape[2] * Y.shape[3]))
+        Y = self.attention(Y)
+        Y = self.flatten(Y)
+        Y = self.dropout(Y)
+        Y = self.readout(Y)
+        Y = self.dropout(Y)
+        Y = self.softmax(Y)
+        return Y
+
+class model006(nn.Module):
+    def __init__(
+        self,
+        sequence_length: int = 30,
+        num_actions: int = 10
+    ):
+        super().__init__()
+        self.sequence_length = sequence_length
+        self.num_actions = num_actions
+        self.input_size = (300, 300)
+
+        #self.embed = nn.Embedding(sequence_length, 1000)
+
+        self.conv1 = Conv2DBlock(
+            in_channels=4,
+            out_channels=16,
+            kernel_size_conv=(7, 7),
+            kernel_size_pool=(5, 5),
+            stride=(4, 4),
+            padding_conv=1,
+            p_dropout=0.2
+        )
+        self.conv2 = Conv2DBlock(
+            in_channels=16,
+            out_channels=32,
+            kernel_size_conv=(7, 7),
+            kernel_size_pool=(5, 5),
+            stride=(3, 3),
+            p_dropout=0.2
+        )
+        self.conv3 = Conv2DBlock(
+            in_channels=32,
+            out_channels=64,
+            kernel_size_conv=(5, 5),
+            kernel_size_pool=(3, 3),
+            stride=(2, 2),
+            p_dropout=0.2
+        )
+        self.conv4 = Conv2DBlock(
+            in_channels=64,
+            out_channels=128,
+            kernel_size_conv=(5, 5),
+            kernel_size_pool=(3, 3),
+            stride=(2, 2),
+            p_dropout=0.2
+        )
+
+        self.attention = MultiHeadSelfAttention(num_heads=32, embed_dim=128, d_q=4, d_k=4, d_v=8)
+        self.flatten = nn.Flatten(start_dim=1)
+
+        readout_dim1 = sequence_length * len(self.attention.heads) * self.attention.d_v
+        self.readout = nn.Linear(readout_dim1, num_actions)
+        self.softmax = nn.Softmax(dim=1)
+
+        self.dropout = nn.Dropout(p=0.2)
+
+    def forward(self, X: torch.Tensor):
+        assert X.shape[1] == self.sequence_length and X.shape[2] == 4 and X.shape[3] == self.input_size[0] and X.shape[4] == self.input_size[1],\
+            f'Expected input shape (batch_size, sequence_length={self.sequence_length}, channels=4, width={self.input_size[0]}, height={self.input_size[1]}), but got ({X.shape})'
+        # fold the sequence dimension into the batch dimension for the 2D convs
+        Y = X.reshape((X.shape[0] * X.shape[1], X.shape[2], X.shape[3], X.shape[4]))
+        Y = self.conv1(Y)
+        Y = self.conv2(Y)
+        Y = self.conv3(Y)
+        Y = self.conv4(Y)
+        # restore the sequence dimension: (batch, sequence, features)
+        Y = Y.reshape((X.shape[0], X.shape[1], Y.shape[1] * Y.shape[2] * Y.shape[3]))
+        Y = self.attention(Y)
+        Y = self.flatten(Y)
+        Y = self.dropout(Y)
+        Y = self.readout(Y)
+        Y = self.dropout(Y)
+        Y = self.softmax(Y)
+        return Y
+
+class model007(nn.Module):
+    def __init__(
+        self,
+        sequence_length: int = 30,
+        num_actions: int = 10
+    ):
+        super().__init__()
+        self.sequence_length = sequence_length
+        self.num_actions = num_actions
+        self.input_size = (300, 300)
+
+        self.conv1 = Conv3DBlock(
+            in_channels=sequence_length,
+            out_channels=32,
+            kernel_size_conv=(2, 7, 7),
+            kernel_size_pool=(1, 7, 7),
+            stride=(1, 5, 5),
+            p_dropout=0.2
+        )
+        self.conv2 = Conv3DBlock(
+            in_channels=32,
+            out_channels=64,
+            kernel_size_conv=(2, 5, 5),
+            kernel_size_pool=(1, 5, 5),
+            stride=(1, 3, 3),
+            p_dropout=0.2
+        )
+        self.conv3 = Conv3DBlock(
+            in_channels=96,   # Y2 (64) concatenated with downsampled Y1 (32)
+            out_channels=192,
+            kernel_size_conv=(2, 5, 5),
+            kernel_size_pool=(1, 3, 3),
+            stride=(1, 2, 2),
+            p_dropout=0.2
+        )
+        self.conv4 = Conv3DBlock(
+            in_channels=288,  # Y3 (192) concatenated with downsampled Y1 (32) and Y2 (64)
+            out_channels=675,
+            kernel_size_conv=(1, 5, 5),
+            kernel_size_pool=(1, 2, 2),
+            stride=(1, 2, 2),
+            p_dropout=0.2
+        )
+
+        self.downsample13 = nn.MaxPool3d(kernel_size=(2, 7, 7), stride=(1, 3, 3))
+        self.downsample14 = nn.MaxPool3d(kernel_size=(2, 9, 9), stride=(2, 8, 8))
+        self.downsample24 = nn.MaxPool3d(kernel_size=(2, 7, 7), stride=(2, 2, 2))
+
+        self.flatten = nn.Flatten(start_dim=1)
+
+        self.readout = nn.Linear(2700, num_actions)
+
+        self.relu = nn.LeakyReLU()
+        self.dropout = nn.Dropout(p=0.5)
+        self.softmax = nn.Softmax(dim=1)
+
+    def forward(self, X):
+        Y = X
+
+        Y1 = self.conv1(Y)
+        Y2 = self.conv2(Y1)
+        Y13 = self.downsample13(Y1)  # skip connection from stage 1 into stage 3
+        Y14 = self.downsample14(Y1)  # skip connection from stage 1 into stage 4
+        Y24 = self.downsample24(Y2)  # skip connection from stage 2 into stage 4
+        Y2_cat = torch.cat([Y2, Y13], dim=1)
+        Y3 = self.conv3(Y2_cat)
+        Y3_cat = torch.cat([Y3, Y14, Y24], dim=1)
+
+        Y4 = self.conv4(Y3_cat)
+
+        Y = self.flatten(Y4)
+
+        Y = self.readout(Y)
+        Y = self.softmax(Y)
+
+        return Y
+
+class model008(nn.Module):
+    def __init__(
+        self,
+        use_depth_channel: bool,
+        sequence_length: int = 30,
+        num_actions: int = 10,
+        apply_softmax: bool = True
+    ):
+        super().__init__()
+        self.sequence_length = sequence_length
+        self.num_actions = num_actions
+        self.use_depth_channel = use_depth_channel
+
+        self.conv1 = Conv3DBlock(
+            in_channels=sequence_length,
+            out_channels=64,
+            kernel_size_conv=(2, 7, 7),
+            kernel_size_pool=(1, 7, 7),
+            stride=(1, 5, 5),
+            p_dropout=0.2
+        )
+        self.conv2 = Conv3DBlock(
+            in_channels=64,
+            out_channels=128,
+            kernel_size_conv=(2, 5, 5),
+            kernel_size_pool=(1, 5, 5),
+            stride=(1, 3, 3),
+            p_dropout=0.2
+        )
+        self.conv3 = Conv3DBlock(
+            in_channels=192,
+            out_channels=384,
+            kernel_size_conv=(2, 5, 5) if self.use_depth_channel else (1, 5, 5),
+            kernel_size_pool=(1, 3, 3),
+            stride=(1, 2, 2),
+            p_dropout=0.2
+        )
+        self.conv4 = Conv3DBlock(
+            in_channels=576,
+            out_channels=1152,
+            kernel_size_conv=(1, 3, 3),
+            kernel_size_pool=(1, 2, 2),
+            stride=(1, 2, 2),
+            p_dropout=0.2
+        )
+
+        self.downsample13 = nn.MaxPool3d(kernel_size=(2, 7, 7), stride=(1, 3, 3))
+        self.downsample14 = nn.MaxPool3d(kernel_size=(2, 9, 9), stride=(2, 8, 8))
+        if self.use_depth_channel:
+            self.downsample24 = nn.MaxPool3d(kernel_size=(2, 7, 7), stride=(2, 2, 2))
+        else:
+            self.downsample24 = nn.MaxPool3d(kernel_size=(1, 7, 7), stride=(1, 2, 2))
+
+        self.downsample1e = nn.MaxPool3d(kernel_size=(2, 28, 28), stride=(2, 21, 21))
+        self.downsample2e = nn.MaxPool3d(kernel_size=(2, 9, 9) if self.use_depth_channel else (1, 9, 9), stride=(1, 6, 6))
+        self.downsample3e = nn.MaxPool3d(kernel_size=(1, 5, 5), stride=(1, 2, 2))
+
+        self.dropout3d = nn.Dropout3d(p=0.2)
+
+        self.flatten = nn.Flatten(start_dim=1)
+
+        self.readout = nn.Linear(15552, num_actions)
+
+        self.relu = nn.LeakyReLU()
+        self.dropout = nn.Dropout(p=0.2)
+        self.softmax = nn.Softmax(dim=1)
+        self.sigmoid = nn.Sigmoid()
+
+        self.apply_softmax = apply_softmax
+
+    def forward(self, X):
+        Y = X
+
+        Y1 = self.conv1(Y)
+        Y2 = self.conv2(Y1)
+        Y13 = self.downsample13(Y1)  # skip connection from stage 1 into stage 3
+        Y14 = self.downsample14(Y1)  # skip connection from stage 1 into stage 4
+        Y24 = self.downsample24(Y2)  # skip connection from stage 2 into stage 4
+        Y2_cat = torch.cat([Y2, Y13], dim=1)
+        Y3 = self.conv3(Y2_cat)
+        Y3_cat = torch.cat([Y3, Y14, Y24], dim=1)
+
+        Y4 = self.conv4(Y3_cat)
+
+        # downsampled copies of the earlier stages, concatenated before the readout
+        Y1e = self.downsample1e(Y1)
+        Y2e = self.downsample2e(Y2)
+        Y3e = self.downsample3e(Y3)
+
+        Y4_cat = torch.cat([Y4, Y1e, Y2e, Y3e], dim=1)
+
+        Y = self.flatten(Y4_cat)
+
+        Y = self.readout(Y)
+
+        if self.apply_softmax:
+            Y = self.softmax(Y)
+        else:
+            Y = self.sigmoid(Y)
+
+        return Y
+
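+# Channel arithmetic for the model008 skip connections: conv3 receives
+# Y2 (128) + Y13 (64) = 192 channels, and conv4 receives
+# Y3 (384) + Y14 (64) + Y24 (128) = 576 channels, matching the in_channels above.
+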
+if __name__ == '__main__':
+    batch_size = 4
+    seq_len = 30
+    embed_dim = 3
+    image_size = (400, 400)
+
+    X = torch.rand((batch_size, seq_len, 3, image_size[0], image_size[1]))
+
+    model3 = model003()
+    model3.to('cpu')
+    X = X.to('cpu')
+    Y = model3(X)
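+
+    # Additional smoke test (a sketch; model005 asserts a 30-frame sequence of
+    # 3-channel 300x300 inputs, and its readout produces num_actions=10 logits):
+    X5 = torch.rand((2, 30, 3, 300, 300))
+    model5 = model005()
+    Y5 = model5(X5)
+    assert Y5.shape == (2, model5.num_actions)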
tuc-ar.bin DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6f928b8a21f5d7089395bb6f51e7556f7a0c0fa22951709016ff09bc9e1ac68d
-size 41698458
ucf101.bin DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d743f2b218846ef6ad770e3f4efcd95e2ba852e121cb67194381c311ece23405
-size 40739610