madhurjindal committed
Commit c269304 · 1 Parent(s): 8e9576b

Upload 26 files

Store/.DS_Store ADDED
Binary file (6.15 kB).
 
Store/epoch=39-step=16560.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e4328995f50c43b7aa83f6ab30421825bcefb3a4cbc64df23b85506c8d24e16
3
+ size 740104377
Store/examples/.DS_Store ADDED
Binary file (6.15 kB).
 
Store/examples/000004.jpg ADDED
Store/examples/000006.jpg ADDED
Store/examples/000007.jpg ADDED
Store/examples/00001.jpeg ADDED
Store/examples/000012.jpg ADDED
Store/examples/000013.jpg ADDED
Store/examples/000014.jpg ADDED
Store/examples/000018.jpg ADDED
Store/examples/000022.jpg ADDED
Store/examples/airplane.png ADDED
Store/examples/bird.webp ADDED
Store/examples/car.jpg ADDED
Store/examples/cat.jpeg ADDED
Store/examples/horse.jpg ADDED
Store/examples/shipp.jpg ADDED
Utilities/__init__.py ADDED
File without changes
Utilities/config.py ADDED
@@ -0,0 +1,150 @@
1
+ import os
2
+
3
+ import torch
4
+
5
+ MAIN_DIR = "/kaggle/working/S13/"
6
+ # DATASET = os.path.join(MAIN_DIR, "../data/PASCAL_VOC")
7
+ DATASET = "/kaggle/input/pascal-voc-dataset-used-in-yolov3-video/PASCAL_VOC"
8
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
9
+ # DEVICE = "mps"
10
+ # seed_everything() # If you want deterministic behavior
11
+ NUM_WORKERS = 2
12
+ BATCH_SIZE = 40
13
+ IMAGE_SIZE = 416
14
+ INPUT_RESOLUTIONS = [416, 544]
15
+ INPUT_RESOLUTIONS_CUM_PROBS = [50, 100]
16
+ NUM_CLASSES = 20
17
+ LEARNING_RATE = 1e-5
18
+ WEIGHT_DECAY = 1e-4
19
+ NUM_EPOCHS = 40
20
+ CONF_THRESHOLD = 0.05
21
+ MAP_IOU_THRESH = 0.5
22
+ NMS_IOU_THRESH = 0.45
23
+ S = [IMAGE_SIZE // 32, IMAGE_SIZE // 16, IMAGE_SIZE // 8]
24
+ PIN_MEMORY = True
25
+ LOAD_MODEL = False
26
+ SAVE_MODEL = True
27
+ CHECKPOINT_PATH = os.path.join(MAIN_DIR, "Store/checkpoints/")
28
+ IMG_DIR = DATASET + "/images/"
29
+ LABEL_DIR = DATASET + "/labels/"
30
+ TRAIN_MOSAIC_PERCENTAGE = 0.67
31
+ TEST_MOSAIC_PERCENTAGE = 0.00
32
+
33
+ MODEL_CHECKPOINT_PATH = "./Store/epoch=39-step=16560.ckpt"
34
+ EXAMPLE_IMG_PATH = "./Store/examples/"
35
+
36
+ ANCHORS = [
37
+ [(0.28, 0.22), (0.38, 0.48), (0.9, 0.78)],
38
+ [(0.07, 0.15), (0.15, 0.11), (0.14, 0.29)],
39
+ [(0.02, 0.03), (0.04, 0.07), (0.08, 0.06)],
40
+ ] # Note these have been rescaled to be between [0, 1]
41
+
42
+ means = [0.485, 0.456, 0.406]
43
+
44
+ scale = 1.1
45
+
46
+ PASCAL_CLASSES = [
47
+ "aeroplane",
48
+ "bicycle",
49
+ "bird",
50
+ "boat",
51
+ "bottle",
52
+ "bus",
53
+ "car",
54
+ "cat",
55
+ "chair",
56
+ "cow",
57
+ "diningtable",
58
+ "dog",
59
+ "horse",
60
+ "motorbike",
61
+ "person",
62
+ "pottedplant",
63
+ "sheep",
64
+ "sofa",
65
+ "train",
66
+ "tvmonitor",
67
+ ]
68
+
69
+ COCO_LABELS = [
70
+ "person",
71
+ "bicycle",
72
+ "car",
73
+ "motorcycle",
74
+ "airplane",
75
+ "bus",
76
+ "train",
77
+ "truck",
78
+ "boat",
79
+ "traffic light",
80
+ "fire hydrant",
81
+ "stop sign",
82
+ "parking meter",
83
+ "bench",
84
+ "bird",
85
+ "cat",
86
+ "dog",
87
+ "horse",
88
+ "sheep",
89
+ "cow",
90
+ "elephant",
91
+ "bear",
92
+ "zebra",
93
+ "giraffe",
94
+ "backpack",
95
+ "umbrella",
96
+ "handbag",
97
+ "tie",
98
+ "suitcase",
99
+ "frisbee",
100
+ "skis",
101
+ "snowboard",
102
+ "sports ball",
103
+ "kite",
104
+ "baseball bat",
105
+ "baseball glove",
106
+ "skateboard",
107
+ "surfboard",
108
+ "tennis racket",
109
+ "bottle",
110
+ "wine glass",
111
+ "cup",
112
+ "fork",
113
+ "knife",
114
+ "spoon",
115
+ "bowl",
116
+ "banana",
117
+ "apple",
118
+ "sandwich",
119
+ "orange",
120
+ "broccoli",
121
+ "carrot",
122
+ "hot dog",
123
+ "pizza",
124
+ "donut",
125
+ "cake",
126
+ "chair",
127
+ "couch",
128
+ "potted plant",
129
+ "bed",
130
+ "dining table",
131
+ "toilet",
132
+ "tv",
133
+ "laptop",
134
+ "mouse",
135
+ "remote",
136
+ "keyboard",
137
+ "cell phone",
138
+ "microwave",
139
+ "oven",
140
+ "toaster",
141
+ "sink",
142
+ "refrigerator",
143
+ "book",
144
+ "clock",
145
+ "vase",
146
+ "scissors",
147
+ "teddy bear",
148
+ "hair drier",
149
+ "toothbrush",
150
+ ]
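
For orientation, a minimal sketch of how this config is consumed downstream (assuming the constants above are importable as Utilities.config, as the rest of the repo does): the normalized ANCHORS are rescaled by the grid sizes in S, which is exactly what YOLOv3.scaled_anchors computes in Utilities/model.py below.

import torch

from Utilities import config

# S == [13, 26, 52] when IMAGE_SIZE == 416
print(config.S)

# Rescale the normalized anchors (fractions of the image) into grid-cell units,
# one set of three (w, h) anchors per prediction scale.
scaled_anchors = (
    torch.tensor(config.ANCHORS)
    * torch.tensor(config.S).unsqueeze(1).unsqueeze(1).repeat(1, 3, 2)
)
print(scaled_anchors.shape)  # torch.Size([3, 3, 2])
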
Utilities/gradio_utils.py ADDED
@@ -0,0 +1,95 @@
1
+ import numpy as np
2
+ import pytorch_lightning as pl
3
+ import torch
4
+ from pytorch_grad_cam import GradCAM
5
+ from pytorch_grad_cam.utils.image import show_cam_on_image
6
+
7
+ from Utilities.model import YOLOv3
8
+ from Utilities.transforms import test_transforms
9
+ from Utilities.utils import cells_to_bboxes, non_max_suppression, plot_image
10
+
11
+
12
+ def plot_bboxes(
13
+ input_img,
14
+ model,
15
+ thresh=0.6,
16
+ iou_thresh=0.5,
17
+ anchors=None,
18
+ ):
19
+ input_img = test_transforms(image=input_img)["image"]
20
+ input_img = input_img.unsqueeze(0)
21
+ model.eval()
22
+ with torch.no_grad():
23
+ out = model(input_img)
24
+ for i in range(3):
25
+ batch_size, A, S, _, _ = out[i].shape
26
+ anchor = anchors[i]
27
+ boxes_scale_i = cells_to_bboxes(out[i], anchor, S=S, is_preds=True)
28
+ bboxes = boxes_scale_i[0]
29
+
30
+ nms_boxes = non_max_suppression(
31
+ bboxes,
32
+ iou_threshold=iou_thresh,
33
+ threshold=thresh,
34
+ box_format="midpoint",
35
+ )
36
+ fig = plot_image(input_img[0].permute(1, 2, 0).detach().cpu(), nms_boxes)
37
+ return fig, input_img
38
+
39
+
40
+ def return_top_objectness_class_preds(model, input_img, gradcam_output_stream):
41
+ out = model(input_img)[gradcam_output_stream]
42
+
43
+ # Step 1: Extract objectness scores
44
+ objectness_scores = out[..., 0]
45
+
46
+ # Step 2: Get the index of the highest objectness score
47
+ max_obj_arg = torch.argmax(objectness_scores)
48
+
49
+ max_obj_arg_onehot = torch.zeros(objectness_scores.flatten().shape[0])
50
+ max_obj_arg_onehot[max_obj_arg] = 1
51
+
52
+ max_obj_arg_onehot = max_obj_arg_onehot.reshape_as(
53
+ objectness_scores,
54
+ ).int()
55
+
56
+ selected_elements = out[max_obj_arg_onehot == 1]
57
+ selected_elements = selected_elements[:, 5:]
58
+
59
+ return selected_elements
60
+
61
+
62
+ class TopObjectnessClassPreds(pl.LightningModule):
63
+ def __init__(self, model, gradcam_output_stream):
64
+ super().__init__()
65
+ self.model = model
66
+ self.gradcam_output_stream = gradcam_output_stream
67
+
68
+ def forward(self, x):
69
+ return return_top_objectness_class_preds(
70
+ self.model, x, self.gradcam_output_stream
71
+ )
72
+
73
+
74
+ def generate_gradcam_output(org_img, model, input_img, gradcam_output_stream: int = 0):
75
+ TopObjectnessClassPredsObj = TopObjectnessClassPreds(model, gradcam_output_stream)
76
+ gradcam_model_layer = [15, 22, 29]
77
+ cam = GradCAM(
78
+ model=TopObjectnessClassPredsObj,
79
+ target_layers=[
80
+ TopObjectnessClassPredsObj.model.layers[
81
+ gradcam_model_layer[gradcam_output_stream]
82
+ ]
83
+ ],
84
+ )
85
+ grayscale_cam = cam(input_tensor=input_img, targets=None)
86
+ grayscale_cam = np.sum(grayscale_cam, axis=-1)
87
+ grayscale_cam = grayscale_cam[0, :]
88
+
89
+ visualization = show_cam_on_image(
90
+ org_img / 255,
91
+ grayscale_cam,
92
+ use_rgb=True,
93
+ image_weight=0.5,
94
+ )
95
+ return visualization
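
A quick shape check for the GradCAM wrapper above. This is only a sketch with a randomly initialized model (an assumption; app.py below loads the trained checkpoint instead), meant to show that the wrapper exposes the 20 class logits of the single highest-objectness cell in the chosen output stream:

import torch

from Utilities.gradio_utils import TopObjectnessClassPreds
from Utilities.model import YOLOv3

model = YOLOv3(num_classes=20)  # untrained weights; shapes are all that matter here
wrapper = TopObjectnessClassPreds(model, gradcam_output_stream=0)

x = torch.randn(1, 3, 416, 416)
with torch.no_grad():
    print(wrapper(x).shape)  # torch.Size([1, 20]) -> class logits that GradCAM targets
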
Utilities/loss.py ADDED
@@ -0,0 +1,92 @@
1
+ """
2
+ Implementation of the YOLO loss function, similar to the one in the YOLOv3 paper;
3
+ the difference, from what I can tell, is that I use CrossEntropy for the classes
4
+ instead of BinaryCrossEntropy.
5
+ """
6
+ import random
7
+
8
+ import pytorch_lightning as pl
9
+ import torch
10
+ import torch.nn as nn
11
+
12
+ from .utils import intersection_over_union
13
+
14
+
15
+ class YoloLoss(pl.LightningModule):
16
+ def __init__(self):
17
+ super().__init__()
18
+ self.mse = nn.MSELoss()
19
+ self.bce = nn.BCEWithLogitsLoss()
20
+ self.entropy = nn.CrossEntropyLoss()
21
+ self.sigmoid = nn.Sigmoid()
22
+
23
+ # Constants signifying how much to pay for each respective part of the loss
24
+ self.lambda_class = 1
25
+ self.lambda_noobj = 5
26
+ self.lambda_obj = 1
27
+ self.lambda_box = 5
28
+
29
+ def forward(self, predictions, target, anchors):
30
+ # Check where obj and noobj (we ignore if target == -1)
31
+ obj = target[..., 0] == 1 # in paper this is Iobj_i
32
+ noobj = target[..., 0] == 0 # in paper this is Inoobj_i
33
+
34
+ # ======================= #
35
+ # FOR NO OBJECT LOSS #
36
+ # ======================= #
37
+
38
+ no_object_loss = self.bce(
39
+ (predictions[..., 0:1][noobj]),
40
+ (target[..., 0:1][noobj]),
41
+ )
42
+
43
+ # ==================== #
44
+ # FOR OBJECT LOSS #
45
+ # ==================== #
46
+
47
+ anchors = anchors.reshape(1, 3, 1, 1, 2)
48
+
49
+ box_preds = torch.cat(
50
+ [
51
+ self.sigmoid(predictions[..., 1:3]),
52
+ torch.exp(predictions[..., 3:5]) * anchors,
53
+ ],
54
+ dim=-1,
55
+ )
56
+ ious = intersection_over_union(box_preds[obj], target[..., 1:5][obj]).detach()
57
+ object_loss = self.mse(
58
+ self.sigmoid(predictions[..., 0:1][obj]), ious * target[..., 0:1][obj]
59
+ )
60
+
61
+ # ======================== #
62
+ # FOR BOX COORDINATES #
63
+ # ======================== #
64
+
65
+ predictions[..., 1:3] = self.sigmoid(predictions[..., 1:3]) # x,y coordinates
66
+ target[..., 3:5] = torch.log(
67
+ (1e-16 + target[..., 3:5] / anchors)
68
+ ) # width, height coordinates
69
+ box_loss = self.mse(predictions[..., 1:5][obj], target[..., 1:5][obj])
70
+
71
+ # ================== #
72
+ # FOR CLASS LOSS #
73
+ # ================== #
74
+
75
+ class_loss = self.entropy(
76
+ (predictions[..., 5:][obj]),
77
+ (target[..., 5][obj].long()),
78
+ )
79
+
80
+ # print("__________________________________")
81
+ # print(self.lambda_box * box_loss)
82
+ # print(self.lambda_obj * object_loss)
83
+ # print(self.lambda_noobj * no_object_loss)
84
+ # print(self.lambda_class * class_loss)
85
+ # print("\n")
86
+
87
+ return (
88
+ self.lambda_box * box_loss
89
+ + self.lambda_obj * object_loss
90
+ + self.lambda_noobj * no_object_loss
91
+ + self.lambda_class * class_loss
92
+ )
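
A small sanity-check sketch of the tensor layout the loss expects (the 2-image batch, the 13x13 grid and the single synthetic target box are assumptions for illustration; the anchors come from Utilities/config.py):

import torch

from Utilities import config
from Utilities.loss import YoloLoss

S = 13  # coarsest prediction grid for a 416x416 input
scaled_anchors = torch.tensor(config.ANCHORS[0]) * S

# predictions: (N, 3 anchors, S, S, 5 + num_classes)
# target:      (N, 3 anchors, S, S, 6) = [objectness, x, y, w, h, class_idx]
preds = torch.randn(2, 3, S, S, 5 + config.NUM_CLASSES)
target = torch.zeros(2, 3, S, S, 6)
target[0, 0, 6, 6] = torch.tensor([1.0, 0.5, 0.5, 2.0, 3.0, 7.0])  # one labelled object

loss = YoloLoss()(preds, target, scaled_anchors)
print(loss)  # a single scalar combining the box, obj, no-obj and class terms
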
Utilities/model.py ADDED
@@ -0,0 +1,286 @@
1
+ """
2
+ Implementation of YOLOv3 architecture
3
+ """
4
+
5
+ import random
6
+ from typing import Any, Optional
7
+
8
+ import pytorch_lightning as pl
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.optim as optim
12
+ from pytorch_lightning.utilities.types import STEP_OUTPUT
13
+ from torch.optim.lr_scheduler import OneCycleLR
14
+
15
+ from . import config
16
+ from .loss import YoloLoss
17
+
18
+ """
19
+ Information about architecture config:
20
+ Tuple is structured by (filters, kernel_size, stride)
21
+ Every conv is a same convolution.
22
+ List is structured by "B" indicating a residual block followed by the number of repeats
23
+ "S" is for scale prediction block and computing the yolo loss
24
+ "U" is for upsampling the feature map and concatenating with a previous layer
25
+ """
26
+
27
+ model_config = [
28
+ (32, 3, 1),
29
+ (64, 3, 2),
30
+ ["B", 1],
31
+ (128, 3, 2),
32
+ ["B", 2],
33
+ (256, 3, 2),
34
+ ["B", 8],
35
+ (512, 3, 2),
36
+ ["B", 8],
37
+ (1024, 3, 2),
38
+ ["B", 4], # To this point is Darknet-53
39
+ (512, 1, 1),
40
+ (1024, 3, 1),
41
+ "S",
42
+ (256, 1, 1),
43
+ "U",
44
+ (256, 1, 1),
45
+ (512, 3, 1),
46
+ "S",
47
+ (128, 1, 1),
48
+ "U",
49
+ (128, 1, 1),
50
+ (256, 3, 1),
51
+ "S",
52
+ ]
53
+
54
+
55
+ class CNNBlock(pl.LightningModule):
56
+ def __init__(self, in_channels, out_channels, bn_act=True, **kwargs):
57
+ super().__init__()
58
+ self.conv = nn.Conv2d(in_channels, out_channels, bias=not bn_act, **kwargs)
59
+ self.bn = nn.BatchNorm2d(out_channels)
60
+ self.leaky = nn.LeakyReLU(0.1)
61
+ self.use_bn_act = bn_act
62
+
63
+ def forward(self, x):
64
+ if self.use_bn_act:
65
+ return self.leaky(self.bn(self.conv(x)))
66
+ else:
67
+ return self.conv(x)
68
+
69
+
70
+ class ResidualBlock(pl.LightningModule):
71
+ def __init__(self, channels, use_residual=True, num_repeats=1):
72
+ super().__init__()
73
+ self.layers = nn.ModuleList()
74
+ for repeat in range(num_repeats):
75
+ self.layers += [
76
+ nn.Sequential(
77
+ CNNBlock(channels, channels // 2, kernel_size=1),
78
+ CNNBlock(channels // 2, channels, kernel_size=3, padding=1),
79
+ )
80
+ ]
81
+
82
+ self.use_residual = use_residual
83
+ self.num_repeats = num_repeats
84
+
85
+ def forward(self, x):
86
+ for layer in self.layers:
87
+ if self.use_residual:
88
+ x = x + layer(x)
89
+ else:
90
+ x = layer(x)
91
+
92
+ return x
93
+
94
+
95
+ class ScalePrediction(pl.LightningModule):
96
+ def __init__(self, in_channels, num_classes):
97
+ super().__init__()
98
+ self.pred = nn.Sequential(
99
+ CNNBlock(in_channels, 2 * in_channels, kernel_size=3, padding=1),
100
+ CNNBlock(
101
+ 2 * in_channels, (num_classes + 5) * 3, bn_act=False, kernel_size=1
102
+ ),
103
+ )
104
+ self.num_classes = num_classes
105
+
106
+ def forward(self, x):
107
+ return (
108
+ self.pred(x)
109
+ .reshape(x.shape[0], 3, self.num_classes + 5, x.shape[2], x.shape[3])
110
+ .permute(0, 1, 3, 4, 2)
111
+ )
112
+
113
+
114
+ class YOLOv3(pl.LightningModule):
115
+ def __init__(self, in_channels=3, num_classes=20):
116
+ super().__init__()
117
+ self.num_classes = num_classes
118
+ self.in_channels = in_channels
119
+ self.layers = self._create_conv_layers()
120
+
121
+ self.scaled_anchors = (
122
+ torch.tensor(config.ANCHORS)
123
+ * torch.tensor(config.S).unsqueeze(1).unsqueeze(1).repeat(1, 3, 2)
124
+ ).to(config.DEVICE)
125
+
126
+ self.learning_rate = config.LEARNING_RATE
127
+ self.weight_decay = config.WEIGHT_DECAY
128
+ self.best_lr = 1e-3
129
+
130
+ def forward(self, x):
131
+ outputs = [] # for each scale
132
+ route_connections = []
133
+ for layer in self.layers:
134
+ if isinstance(layer, ScalePrediction):
135
+ outputs.append(layer(x))
136
+ continue
137
+
138
+ x = layer(x)
139
+
140
+ if isinstance(layer, ResidualBlock) and layer.num_repeats == 8:
141
+ route_connections.append(x)
142
+
143
+ elif isinstance(layer, nn.Upsample):
144
+ x = torch.cat([x, route_connections[-1]], dim=1)
145
+ route_connections.pop()
146
+
147
+ return outputs
148
+
149
+ def _create_conv_layers(self):
150
+ layers = nn.ModuleList()
151
+ in_channels = self.in_channels
152
+
153
+ for module in model_config:
154
+ if isinstance(module, tuple):
155
+ out_channels, kernel_size, stride = module
156
+ layers.append(
157
+ CNNBlock(
158
+ in_channels,
159
+ out_channels,
160
+ kernel_size=kernel_size,
161
+ stride=stride,
162
+ padding=1 if kernel_size == 3 else 0,
163
+ )
164
+ )
165
+ in_channels = out_channels
166
+
167
+ elif isinstance(module, list):
168
+ num_repeats = module[1]
169
+ layers.append(
170
+ ResidualBlock(
171
+ in_channels,
172
+ num_repeats=num_repeats,
173
+ )
174
+ )
175
+
176
+ elif isinstance(module, str):
177
+ if module == "S":
178
+ layers += [
179
+ ResidualBlock(in_channels, use_residual=False, num_repeats=1),
180
+ CNNBlock(in_channels, in_channels // 2, kernel_size=1),
181
+ ScalePrediction(in_channels // 2, num_classes=self.num_classes),
182
+ ]
183
+ in_channels = in_channels // 2
184
+
185
+ elif module == "U":
186
+ layers.append(
187
+ nn.Upsample(scale_factor=2),
188
+ )
189
+ in_channels = in_channels * 3
190
+
191
+ return layers
192
+
193
+ def yololoss(self):
194
+ return YoloLoss()
195
+
196
+ def training_step(self, batch, batch_idx):
197
+ x, y = batch
198
+ y0, y1, y2 = y[0], y[1], y[2]
199
+ out = self(x)
200
+ # print(out[0].shape, y0.shape)
201
+ loss = (
202
+ self.yololoss()(out[0], y0, self.scaled_anchors[0])
203
+ + self.yololoss()(out[1], y1, self.scaled_anchors[1])
204
+ + self.yololoss()(out[2], y2, self.scaled_anchors[2])
205
+ )
206
+
207
+ self.log(
208
+ "train_loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True
209
+ )
210
+
211
+ # config.IMAGE_SIZE = 416 if random.random() < 0.5 else 544
212
+ # config.S = [
213
+ # config.IMAGE_SIZE // 32,
214
+ # config.IMAGE_SIZE // 16,
215
+ # config.IMAGE_SIZE // 8,
216
+ # ]
217
+ # print(f"{self.trainer.datamodule.train_dataset.S=}")
218
+ # self.trainer.datamodule.train_dataset.S = [
219
+ # config.IMAGE_SIZE // 32,
220
+ # config.IMAGE_SIZE // 16,
221
+ # config.IMAGE_SIZE // 8,
222
+ # ]
223
+ # self.trainer.datamodule.train_dataset.image_size = config.IMAGE_SIZE
224
+ # self.scaled_anchors = (
225
+ # torch.tensor(config.ANCHORS)
226
+ # * torch.tensor(config.S).unsqueeze(1).unsqueeze(1).repeat(1, 3, 2)
227
+ # ).to(config.DEVICE)
228
+ return loss
229
+
230
+ def on_train_epoch_end(self) -> None:
231
+ print(
232
+ f"EPOCH: {self.current_epoch}, Loss: {self.trainer.callback_metrics['train_loss_epoch']}"
233
+ )
234
+
235
+ def configure_optimizers(self) -> Any:
236
+ optimizer = optim.Adam(
237
+ self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay
238
+ )
239
+ scheduler = OneCycleLR(
240
+ optimizer,
241
+ max_lr=self.best_lr,
242
+ steps_per_epoch=len(self.trainer.datamodule.train_dataloader()),
243
+ epochs=config.NUM_EPOCHS,
244
+ pct_start=8 / config.NUM_EPOCHS,
245
+ div_factor=100,
246
+ three_phase=False,
247
+ final_div_factor=100,
248
+ anneal_strategy="linear",
249
+ )
250
+
251
+ return [optimizer], [
252
+ {"scheduler": scheduler, "interval": "step", "frequency": 1}
253
+ ]
254
+
255
+ def on_train_end(self) -> None:
256
+ torch.save(self.state_dict(), config.MODEL_STATE_DICT_PATH)
257
+
258
+
259
+ if __name__ == "__main__":
260
+ num_classes = 20
261
+ IMAGE_SIZE = 416
262
+ model = YOLOv3(num_classes=num_classes)
263
+ x = torch.randn((2, 3, IMAGE_SIZE, IMAGE_SIZE))
264
+ out = model(x)
265
+ assert model(x)[0].shape == (
266
+ 2,
267
+ 3,
268
+ IMAGE_SIZE // 32,
269
+ IMAGE_SIZE // 32,
270
+ num_classes + 5,
271
+ )
272
+ assert model(x)[1].shape == (
273
+ 2,
274
+ 3,
275
+ IMAGE_SIZE // 16,
276
+ IMAGE_SIZE // 16,
277
+ num_classes + 5,
278
+ )
279
+ assert model(x)[2].shape == (
280
+ 2,
281
+ 3,
282
+ IMAGE_SIZE // 8,
283
+ IMAGE_SIZE // 8,
284
+ num_classes + 5,
285
+ )
286
+ print("Success!")
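
Because every convolution is a same convolution and all downsampling comes from the five stride-2 layers, the three prediction grids track the input resolution. A short sketch, using 544 (the second entry of INPUT_RESOLUTIONS in Utilities/config.py) as an assumed input size:

import torch

from Utilities.model import YOLOv3

model = YOLOv3(num_classes=20)
x = torch.randn(1, 3, 544, 544)
with torch.no_grad():
    outs = model(x)
print([o.shape[2] for o in outs])  # [17, 34, 68] == 544 // 32, 544 // 16, 544 // 8
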
Utilities/transforms.py ADDED
@@ -0,0 +1,29 @@
1
+ import albumentations as A
2
+ import cv2
3
+ from albumentations.pytorch import ToTensorV2
4
+
5
+ from .config import IMAGE_SIZE
6
+
7
+ test_transforms = A.Compose(
8
+ [
9
+ A.LongestMaxSize(max_size=IMAGE_SIZE),
10
+ A.PadIfNeeded(
11
+ min_height=IMAGE_SIZE, min_width=IMAGE_SIZE, border_mode=cv2.BORDER_CONSTANT
12
+ ),
13
+ A.Normalize(
14
+ mean=[0, 0, 0],
15
+ std=[1, 1, 1],
16
+ max_pixel_value=255,
17
+ ),
18
+ ToTensorV2(),
19
+ ],
20
+ )
21
+
22
+ resize_transforms = A.Compose(
23
+ [
24
+ A.LongestMaxSize(max_size=IMAGE_SIZE),
25
+ A.PadIfNeeded(
26
+ min_height=IMAGE_SIZE, min_width=IMAGE_SIZE, border_mode=cv2.BORDER_CONSTANT
27
+ ),
28
+ ]
29
+ )
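
A brief sketch of what the two pipelines return (the 300x500 random image is an assumption): resize_transforms only letterboxes to IMAGE_SIZE and keeps an HWC uint8 array for display, while test_transforms additionally scales pixels to [0, 1] and converts to a CHW tensor for the model.

import numpy as np

from Utilities.transforms import resize_transforms, test_transforms

img = np.random.randint(0, 256, size=(300, 500, 3), dtype=np.uint8)

resized = resize_transforms(image=img)["image"]  # letterboxed HWC uint8 array
tensor = test_transforms(image=img)["image"]     # normalized CHW torch tensor

print(resized.shape)  # (416, 416, 3)
print(tensor.shape)   # torch.Size([3, 416, 416])
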
Utilities/utils.py ADDED
@@ -0,0 +1,538 @@
1
+ import os
2
+ import random
3
+ from collections import Counter
4
+
5
+ import matplotlib.patches as patches
6
+ import matplotlib.pyplot as plt
7
+ import numpy as np
8
+ import torch
9
+ from tqdm import tqdm
10
+
11
+ from . import config
12
+
13
+
14
+ def iou_width_height(boxes1, boxes2):
15
+ """
16
+ Parameters:
17
+ boxes1 (tensor): width and height of the first bounding boxes
18
+ boxes2 (tensor): width and height of the second bounding boxes
19
+ Returns:
20
+ tensor: Intersection over union of the corresponding boxes
21
+ """
22
+ intersection = torch.min(boxes1[..., 0], boxes2[..., 0]) * torch.min(
23
+ boxes1[..., 1], boxes2[..., 1]
24
+ )
25
+ union = (
26
+ boxes1[..., 0] * boxes1[..., 1] + boxes2[..., 0] * boxes2[..., 1] - intersection
27
+ )
28
+ return intersection / union
29
+
30
+
31
+ def intersection_over_union(boxes_preds, boxes_labels, box_format="midpoint"):
32
+ """
33
+ Video explanation of this function:
34
+ https://youtu.be/XXYG5ZWtjj0
35
+
36
+ This function calculates intersection over union (iou) given pred boxes
37
+ and target boxes.
38
+
39
+ Parameters:
40
+ boxes_preds (tensor): Predictions of Bounding Boxes (BATCH_SIZE, 4)
41
+ boxes_labels (tensor): Correct labels of Bounding Boxes (BATCH_SIZE, 4)
42
+ box_format (str): midpoint/corners, if boxes (x,y,w,h) or (x1,y1,x2,y2)
43
+
44
+ Returns:
45
+ tensor: Intersection over union for all examples
46
+ """
47
+
48
+ if box_format == "midpoint":
49
+ box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
50
+ box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
51
+ box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
52
+ box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2
53
+ box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
54
+ box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
55
+ box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
56
+ box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2
57
+
58
+ if box_format == "corners":
59
+ box1_x1 = boxes_preds[..., 0:1]
60
+ box1_y1 = boxes_preds[..., 1:2]
61
+ box1_x2 = boxes_preds[..., 2:3]
62
+ box1_y2 = boxes_preds[..., 3:4]
63
+ box2_x1 = boxes_labels[..., 0:1]
64
+ box2_y1 = boxes_labels[..., 1:2]
65
+ box2_x2 = boxes_labels[..., 2:3]
66
+ box2_y2 = boxes_labels[..., 3:4]
67
+
68
+ x1 = torch.max(box1_x1, box2_x1)
69
+ y1 = torch.max(box1_y1, box2_y1)
70
+ x2 = torch.min(box1_x2, box2_x2)
71
+ y2 = torch.min(box1_y2, box2_y2)
72
+
73
+ intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
74
+ box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
75
+ box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))
76
+
77
+ return intersection / (box1_area + box2_area - intersection + 1e-6)
78
+
79
+
80
+ def non_max_suppression(bboxes, iou_threshold, threshold, box_format="corners"):
81
+ """
82
+ Video explanation of this function:
83
+ https://youtu.be/YDkjWEN8jNA
84
+
85
+ Does Non Max Suppression given bboxes
86
+
87
+ Parameters:
88
+ bboxes (list): list of lists containing all bboxes with each bboxes
89
+ specified as [class_pred, prob_score, x1, y1, x2, y2]
90
+ iou_threshold (float): IoU above which a lower-scoring box of the same class is suppressed
91
+ threshold (float): threshold to remove predicted bboxes (independent of IoU)
92
+ box_format (str): "midpoint" or "corners" used to specify bboxes
93
+
94
+ Returns:
95
+ list: bboxes after performing NMS given a specific IoU threshold
96
+ """
97
+
98
+ assert type(bboxes) == list
99
+
100
+ bboxes = [box for box in bboxes if box[1] > threshold]
101
+ bboxes = sorted(bboxes, key=lambda x: x[1], reverse=True)
102
+ bboxes_after_nms = []
103
+
104
+ while bboxes:
105
+ chosen_box = bboxes.pop(0)
106
+
107
+ bboxes = [
108
+ box
109
+ for box in bboxes
110
+ if box[0] != chosen_box[0]
111
+ or intersection_over_union(
112
+ torch.tensor(chosen_box[2:]),
113
+ torch.tensor(box[2:]),
114
+ box_format=box_format,
115
+ )
116
+ < iou_threshold
117
+ ]
118
+
119
+ bboxes_after_nms.append(chosen_box)
120
+
121
+ return bboxes_after_nms
122
+
123
+
124
+ def mean_average_precision(
125
+ pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint", num_classes=20
126
+ ):
127
+ """
128
+ Video explanation of this function:
129
+ https://youtu.be/FppOzcDvaDI
130
+
131
+ This function calculates mean average precision (mAP)
132
+
133
+ Parameters:
134
+ pred_boxes (list): list of lists containing all bboxes with each bboxes
135
+ specified as [train_idx, class_prediction, prob_score, x1, y1, x2, y2]
136
+ true_boxes (list): Similar to pred_boxes except containing the ground-truth boxes
137
+ iou_threshold (float): IoU threshold above which a predicted bbox counts as correct
138
+ box_format (str): "midpoint" or "corners" used to specify bboxes
139
+ num_classes (int): number of classes
140
+
141
+ Returns:
142
+ float: mAP value across all classes given a specific IoU threshold
143
+ """
144
+
145
+ # list storing all AP for respective classes
146
+ average_precisions = []
147
+
148
+ # used for numerical stability later on
149
+ epsilon = 1e-6
150
+
151
+ for c in range(num_classes):
152
+ detections = []
153
+ ground_truths = []
154
+
155
+ # Go through all predictions and targets,
156
+ # and only add the ones that belong to the
157
+ # current class c
158
+ for detection in pred_boxes:
159
+ if detection[1] == c:
160
+ detections.append(detection)
161
+
162
+ for true_box in true_boxes:
163
+ if true_box[1] == c:
164
+ ground_truths.append(true_box)
165
+
166
+ # find the amount of bboxes for each training example
167
+ # Counter here finds how many ground truth bboxes we get
168
+ # for each training example, so let's say img 0 has 3,
169
+ # img 1 has 5 then we will obtain a dictionary with:
170
+ # amount_bboxes = {0:3, 1:5}
171
+ amount_bboxes = Counter([gt[0] for gt in ground_truths])
172
+
173
+ # We then go through each key, val in this dictionary
174
+ # and convert to the following (w.r.t same example):
175
+ # amount_bboxes = {0: torch.tensor([0,0,0]), 1: torch.tensor([0,0,0,0,0])}
176
+ for key, val in amount_bboxes.items():
177
+ amount_bboxes[key] = torch.zeros(val)
178
+
179
+ # sort by box probabilities which is index 2
180
+ detections.sort(key=lambda x: x[2], reverse=True)
181
+ TP = torch.zeros((len(detections)))
182
+ FP = torch.zeros((len(detections)))
183
+ total_true_bboxes = len(ground_truths)
184
+
185
+ # If none exists for this class then we can safely skip
186
+ if total_true_bboxes == 0:
187
+ continue
188
+
189
+ for detection_idx, detection in enumerate(detections):
190
+ # Only take out the ground_truths that have the same
191
+ # training idx as detection
192
+ ground_truth_img = [
193
+ bbox for bbox in ground_truths if bbox[0] == detection[0]
194
+ ]
195
+
196
+ num_gts = len(ground_truth_img)
197
+ best_iou = 0
198
+
199
+ for idx, gt in enumerate(ground_truth_img):
200
+ iou = intersection_over_union(
201
+ torch.tensor(detection[3:]),
202
+ torch.tensor(gt[3:]),
203
+ box_format=box_format,
204
+ )
205
+
206
+ if iou > best_iou:
207
+ best_iou = iou
208
+ best_gt_idx = idx
209
+
210
+ if best_iou > iou_threshold:
211
+ # only detect ground truth detection once
212
+ if amount_bboxes[detection[0]][best_gt_idx] == 0:
213
+ # true positive and add this bounding box to seen
214
+ TP[detection_idx] = 1
215
+ amount_bboxes[detection[0]][best_gt_idx] = 1
216
+ else:
217
+ FP[detection_idx] = 1
218
+
219
+ # if IOU is lower then the detection is a false positive
220
+ else:
221
+ FP[detection_idx] = 1
222
+
223
+ TP_cumsum = torch.cumsum(TP, dim=0)
224
+ FP_cumsum = torch.cumsum(FP, dim=0)
225
+ recalls = TP_cumsum / (total_true_bboxes + epsilon)
226
+ precisions = TP_cumsum / (TP_cumsum + FP_cumsum + epsilon)
227
+ precisions = torch.cat((torch.tensor([1]), precisions))
228
+ recalls = torch.cat((torch.tensor([0]), recalls))
229
+ # torch.trapz for numerical integration
230
+ average_precisions.append(torch.trapz(precisions, recalls))
231
+
232
+ return sum(average_precisions) / len(average_precisions)
233
+
234
+
235
+ def plot_image(image, boxes):
236
+ """Plots predicted bounding boxes on the image"""
237
+ cmap = plt.get_cmap("tab20b")
238
+ class_labels = (
239
+ config.COCO_LABELS if config.DATASET == "COCO" else config.PASCAL_CLASSES
240
+ )
241
+ colors = [cmap(i) for i in np.linspace(0, 1, len(class_labels))]
242
+ im = np.array(image)
243
+ height, width, _ = im.shape
244
+
245
+ # Create figure and axes
246
+ fig, ax = plt.subplots(1)
247
+ # Display the image
248
+ ax.imshow(im)
249
+
250
+ # box[0] is x midpoint, box[2] is width
251
+ # box[1] is y midpoint, box[3] is height
252
+
253
+ # Create a Rectangle patch
254
+ for box in boxes:
255
+ assert (
256
+ len(box) == 6
257
+ ), "box should contain class pred, confidence, x, y, width, height"
258
+ class_pred = box[0]
259
+ box = box[2:]
260
+ upper_left_x = box[0] - box[2] / 2
261
+ upper_left_y = box[1] - box[3] / 2
262
+ rect = patches.Rectangle(
263
+ (upper_left_x * width, upper_left_y * height),
264
+ box[2] * width,
265
+ box[3] * height,
266
+ linewidth=2,
267
+ edgecolor=colors[int(class_pred)],
268
+ facecolor="none",
269
+ )
270
+ # Add the patch to the Axes
271
+ ax.add_patch(rect)
272
+ plt.text(
273
+ upper_left_x * width,
274
+ upper_left_y * height,
275
+ s=class_labels[int(class_pred)],
276
+ color="white",
277
+ verticalalignment="top",
278
+ bbox={"color": colors[int(class_pred)], "pad": 0},
279
+ )
280
+ # plt.close()
281
+ return fig
282
+
283
+
284
+ def get_evaluation_bboxes(
285
+ loader,
286
+ model,
287
+ iou_threshold,
288
+ anchors,
289
+ threshold,
290
+ box_format="midpoint",
291
+ device="cuda",
292
+ ):
293
+ # make sure model is in eval before get bboxes
294
+ model.eval()
295
+ train_idx = 0
296
+ all_pred_boxes = []
297
+ all_true_boxes = []
298
+ for batch_idx, (x, labels) in enumerate(tqdm(loader)):
299
+ x = x.to(device)
300
+
301
+ with torch.no_grad():
302
+ predictions = model(x)
303
+
304
+ batch_size = x.shape[0]
305
+ bboxes = [[] for _ in range(batch_size)]
306
+ for i in range(3):
307
+ S = predictions[i].shape[2]
308
+ anchor = torch.tensor([*anchors[i]]).to(device) * S
309
+ boxes_scale_i = cells_to_bboxes(predictions[i], anchor, S=S, is_preds=True)
310
+ for idx, (box) in enumerate(boxes_scale_i):
311
+ bboxes[idx] += box
312
+
313
+ # we just want one bbox for each label, not one for each scale
314
+ true_bboxes = cells_to_bboxes(labels[2], anchor, S=S, is_preds=False)
315
+
316
+ for idx in range(batch_size):
317
+ nms_boxes = non_max_suppression(
318
+ bboxes[idx],
319
+ iou_threshold=iou_threshold,
320
+ threshold=threshold,
321
+ box_format=box_format,
322
+ )
323
+
324
+ for nms_box in nms_boxes:
325
+ all_pred_boxes.append([train_idx] + nms_box)
326
+
327
+ for box in true_bboxes[idx]:
328
+ if box[1] > threshold:
329
+ all_true_boxes.append([train_idx] + box)
330
+
331
+ train_idx += 1
332
+
333
+ model.train()
334
+ return all_pred_boxes, all_true_boxes
335
+
336
+
337
+ def cells_to_bboxes(predictions, anchors, S, is_preds=True):
338
+ """
339
+ Scales the predictions coming from the model to
340
+ be relative to the entire image so that they can, for example, later
341
+ be plotted or evaluated.
342
+ INPUT:
343
+ predictions: tensor of size (N, 3, S, S, num_classes+5)
344
+ anchors: the anchors used for the predictions
345
+ S: the number of cells the image is divided in on the width (and height)
346
+ is_preds: whether the input is predictions or the true bounding boxes
347
+ OUTPUT:
348
+ converted_bboxes: the converted boxes of shape (N, num_anchors * S * S, 6) with class index,
349
+ object score, bounding box coordinates
350
+ """
351
+ BATCH_SIZE = predictions.shape[0]
352
+ num_anchors = len(anchors)
353
+ box_predictions = predictions[..., 1:5]
354
+ if is_preds:
355
+ anchors = anchors.reshape(1, len(anchors), 1, 1, 2)
356
+ box_predictions[..., 0:2] = torch.sigmoid(box_predictions[..., 0:2])
357
+ box_predictions[..., 2:] = torch.exp(box_predictions[..., 2:]) * anchors
358
+ scores = torch.sigmoid(predictions[..., 0:1])
359
+ best_class = torch.argmax(predictions[..., 5:], dim=-1).unsqueeze(-1)
360
+ else:
361
+ scores = predictions[..., 0:1]
362
+ best_class = predictions[..., 5:6]
363
+
364
+ cell_indices = (
365
+ torch.arange(S)
366
+ .repeat(predictions.shape[0], 3, S, 1)
367
+ .unsqueeze(-1)
368
+ .to(predictions.device)
369
+ )
370
+ x = 1 / S * (box_predictions[..., 0:1] + cell_indices)
371
+ y = 1 / S * (box_predictions[..., 1:2] + cell_indices.permute(0, 1, 3, 2, 4))
372
+ w_h = 1 / S * box_predictions[..., 2:4]
373
+ converted_bboxes = torch.cat((best_class, scores, x, y, w_h), dim=-1).reshape(
374
+ BATCH_SIZE, num_anchors * S * S, 6
375
+ )
376
+ return converted_bboxes.tolist()
377
+
378
+
379
+ def check_class_accuracy(model, loader, threshold):
380
+ model.eval()
381
+ tot_class_preds, correct_class = 0, 0
382
+ tot_noobj, correct_noobj = 0, 0
383
+ tot_obj, correct_obj = 0, 0
384
+
385
+ for idx, (x, y) in enumerate(tqdm(loader)):
386
+ x = x.to(config.DEVICE)
387
+ with torch.no_grad():
388
+ out = model(x)
389
+
390
+ for i in range(3):
391
+ y[i] = y[i].to(config.DEVICE)
392
+ obj = y[i][..., 0] == 1 # in paper this is Iobj_i
393
+ noobj = y[i][..., 0] == 0 # in paper this is Inoobj_i
394
+
395
+ correct_class += torch.sum(
396
+ torch.argmax(out[i][..., 5:][obj], dim=-1) == y[i][..., 5][obj]
397
+ )
398
+ tot_class_preds += torch.sum(obj)
399
+
400
+ obj_preds = torch.sigmoid(out[i][..., 0]) > threshold
401
+ correct_obj += torch.sum(obj_preds[obj] == y[i][..., 0][obj])
402
+ tot_obj += torch.sum(obj)
403
+ correct_noobj += torch.sum(obj_preds[noobj] == y[i][..., 0][noobj])
404
+ tot_noobj += torch.sum(noobj)
405
+
406
+ class_acc = (correct_class / (tot_class_preds + 1e-16)) * 100
407
+ no_obj_acc = (correct_noobj / (tot_noobj + 1e-16)) * 100
408
+ obj_acc = (correct_obj / (tot_obj + 1e-16)) * 100
409
+
410
+ print(f"Class accuracy is: {class_acc:2f}%")
411
+ print(f"No obj accuracy is: {no_obj_acc:2f}%")
412
+ print(f"Obj accuracy is: {obj_acc:2f}%")
413
+ model.train()
414
+ return class_acc, no_obj_acc, obj_acc
415
+
416
+
417
+ def get_mean_std(loader):
418
+ # var[X] = E[X**2] - E[X]**2
419
+ channels_sum, channels_sqrd_sum, num_batches = 0, 0, 0
420
+
421
+ for data, _ in tqdm(loader):
422
+ channels_sum += torch.mean(data, dim=[0, 2, 3])
423
+ channels_sqrd_sum += torch.mean(data**2, dim=[0, 2, 3])
424
+ num_batches += 1
425
+
426
+ mean = channels_sum / num_batches
427
+ std = (channels_sqrd_sum / num_batches - mean**2) ** 0.5
428
+
429
+ return mean, std
430
+
431
+
432
+ def save_checkpoint(model, optimizer, filename="my_checkpoint.pth.tar"):
433
+ print("=> Saving checkpoint")
434
+ checkpoint = {
435
+ "state_dict": model.state_dict(),
436
+ "optimizer": optimizer.state_dict(),
437
+ }
438
+ torch.save(checkpoint, filename)
439
+
440
+
441
+ def load_checkpoint(checkpoint_file, model, optimizer, lr):
442
+ print("=> Loading checkpoint")
443
+ checkpoint = torch.load(checkpoint_file, map_location=config.DEVICE)
444
+ model.load_state_dict(checkpoint["state_dict"])
445
+ optimizer.load_state_dict(checkpoint["optimizer"])
446
+
447
+ # If we don't do this then it will just have learning rate of old checkpoint
448
+ # and it will lead to many hours of debugging \:
449
+ for param_group in optimizer.param_groups:
450
+ param_group["lr"] = lr
451
+
452
+
453
+ def plot_couple_examples(model, loader, thresh, iou_thresh, anchors):
454
+ model.eval()
455
+ x, y = next(iter(loader))
456
+ x = x.to(config.DEVICE)
457
+
458
+ with torch.no_grad():
459
+ out = model(x)
460
+ bboxes = [[] for _ in range(x.shape[0])]
461
+ for i in range(3):
462
+ batch_size, A, S, _, _ = out[i].shape
463
+ anchor = anchors[i]
464
+ boxes_scale_i = cells_to_bboxes(out[i], anchor, S=S, is_preds=True)
465
+ for idx, (box) in enumerate(boxes_scale_i):
466
+ bboxes[idx] += box
467
+
468
+ model.train()
469
+
470
+ for i in range(batch_size // 4):
471
+ nms_boxes = non_max_suppression(
472
+ bboxes[i],
473
+ iou_threshold=iou_thresh,
474
+ threshold=thresh,
475
+ box_format="midpoint",
476
+ )
477
+ plot_image(x[i].permute(1, 2, 0).detach().cpu(), nms_boxes)
478
+
479
+
480
+ def seed_everything(seed=42):
481
+ os.environ["PYTHONHASHSEED"] = str(seed)
482
+ random.seed(seed)
483
+ np.random.seed(seed)
484
+ torch.manual_seed(seed)
485
+ torch.cuda.manual_seed(seed)
486
+ torch.cuda.manual_seed_all(seed)
487
+ torch.backends.cudnn.deterministic = True
488
+ torch.backends.cudnn.benchmark = False
489
+
490
+
491
+ def clip_coords(boxes, img_shape):
492
+ # Clip bounding xyxy bounding boxes to image shape (height, width)
493
+ boxes[:, 0].clamp_(0, img_shape[1]) # x1
494
+ boxes[:, 1].clamp_(0, img_shape[0]) # y1
495
+ boxes[:, 2].clamp_(0, img_shape[1]) # x2
496
+ boxes[:, 3].clamp_(0, img_shape[0]) # y2
497
+
498
+
499
+ def xywhn2xyxy(x, w=640, h=640, padw=0, padh=0):
500
+ # Convert nx4 boxes from [x, y, w, h] normalized to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
501
+ y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
502
+ y[..., 0] = w * (x[..., 0] - x[..., 2] / 2) + padw # top left x
503
+ y[..., 1] = h * (x[..., 1] - x[..., 3] / 2) + padh # top left y
504
+ y[..., 2] = w * (x[..., 0] + x[..., 2] / 2) + padw # bottom right x
505
+ y[..., 3] = h * (x[..., 1] + x[..., 3] / 2) + padh # bottom right y
506
+ return y
507
+
508
+
509
+ def xyn2xy(x, w=640, h=640, padw=0, padh=0):
510
+ # Convert normalized segments into pixel segments, shape (n,2)
511
+ y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
512
+ y[..., 0] = w * x[..., 0] + padw # top left x
513
+ y[..., 1] = h * x[..., 1] + padh # top left y
514
+ return y
515
+
516
+
517
+ def xyxy2xywhn(x, w=640, h=640, clip=False, eps=0.0):
518
+ # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] normalized where xy1=top-left, xy2=bottom-right
519
+ if clip:
520
+ clip_boxes(x, (h - eps, w - eps)) # warning: inplace clip
521
+ y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
522
+ y[..., 0] = ((x[..., 0] + x[..., 2]) / 2) / w # x center
523
+ y[..., 1] = ((x[..., 1] + x[..., 3]) / 2) / h # y center
524
+ y[..., 2] = (x[..., 2] - x[..., 0]) / w # width
525
+ y[..., 3] = (x[..., 3] - x[..., 1]) / h # height
526
+ return y
527
+
528
+
529
+ def clip_boxes(boxes, shape):
530
+ # Clip boxes (xyxy) to image shape (height, width)
531
+ if isinstance(boxes, torch.Tensor): # faster individually
532
+ boxes[..., 0].clamp_(0, shape[1]) # x1
533
+ boxes[..., 1].clamp_(0, shape[0]) # y1
534
+ boxes[..., 2].clamp_(0, shape[1]) # x2
535
+ boxes[..., 3].clamp_(0, shape[0]) # y2
536
+ else: # np.array (faster grouped)
537
+ boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1]) # x1, x2
538
+ boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0]) # y1, y2
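
To make the box conventions above concrete, a toy sketch of non_max_suppression on hand-made midpoint boxes ([class_pred, prob_score, x, y, w, h]; the numbers are assumptions chosen for illustration):

import torch

from Utilities.utils import intersection_over_union, non_max_suppression

boxes = [
    [0, 0.9, 0.50, 0.50, 0.30, 0.30],  # highest-scoring box of class 0
    [0, 0.8, 0.52, 0.50, 0.30, 0.30],  # near-duplicate of the box above
    [1, 0.7, 0.20, 0.20, 0.10, 0.10],  # different class, never compared with class 0
]

iou = intersection_over_union(torch.tensor(boxes[0][2:]), torch.tensor(boxes[1][2:]))
print(float(iou))  # ~0.87, well above the 0.45 NMS_IOU_THRESH in config

kept = non_max_suppression(boxes, iou_threshold=0.45, threshold=0.5, box_format="midpoint")
print(len(kept))  # 2 -- the lower-scoring near-duplicate is dropped
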
app.py ADDED
@@ -0,0 +1,106 @@
1
+ import gradio as gr
2
+ import torch
3
+ from Utilities import config
4
+ from Utilities.gradio_utils import generate_gradcam_output, plot_bboxes
5
+ from Utilities.model import YOLOv3
6
+ from Utilities.transforms import resize_transforms
7
+
8
+ model = YOLOv3.load_from_checkpoint(
9
+ config.MODEL_CHECKPOINT_PATH,
10
+ map_location=torch.device("cpu"),
11
+ )
12
+ # model = YOLOv3.load_from_checkpoint(
13
+ # "/Users/madhurjindal/WorkProjects/ERA-v1/S13/Gradio App/Store/epoch=39-step=16560.ckpt",
14
+ # map_location=torch.device("cpu"),
15
+ # )
16
+
17
+ examples = [
18
+ [config.EXAMPLE_IMG_PATH + "cat.jpeg", 1],
19
+ [config.EXAMPLE_IMG_PATH + "horse.jpg", 1],
20
+ [config.EXAMPLE_IMG_PATH + "000018.jpg", 2],
21
+ [config.EXAMPLE_IMG_PATH + "bird.webp", 2],
22
+ [config.EXAMPLE_IMG_PATH + "000022.jpg", 2],
23
+ [config.EXAMPLE_IMG_PATH + "airplane.png", 0],
24
+ [config.EXAMPLE_IMG_PATH + "00001.jpeg", 2],
25
+ [config.EXAMPLE_IMG_PATH + "shipp.jpg", 0],
26
+ [config.EXAMPLE_IMG_PATH + "car.jpg", 1],
27
+ [config.EXAMPLE_IMG_PATH + "000007.jpg", 1],
28
+ [config.EXAMPLE_IMG_PATH + "000013.jpg", 2],
29
+ [config.EXAMPLE_IMG_PATH + "000012.jpg", 2],
30
+ [config.EXAMPLE_IMG_PATH + "000006.jpg", 1],
31
+ [config.EXAMPLE_IMG_PATH + "000004.jpg", 1],
32
+ [config.EXAMPLE_IMG_PATH + "000014.jpg", 0],
33
+ ]
34
+
35
+ title = "Object Detection (YOLOv3) with GradCAM"
36
+ description = """Introducing the YOLOv3 Object Detection Explorer 🕵️‍♀️🔍
37
+ ---
38
+ Are you curious about the world of computer vision and object detection? Look no further! Our interactive Gradio app powered by Hugging Face Spaces brings the excitement of object detection to your fingertips.
39
+
40
+ 🎉 Key Features:
41
+ ---
42
+ YOLOv3 at Your Fingertips: Our app is built around the YOLOv3 model, meticulously trained from scratch using the comprehensive Pascal VOC dataset comprising 20 diverse classes. This ensures accurate and robust object detection.
43
+
44
+ Precision with GradCAM: Experience the power of GradCAM (Gradient-weighted Class Activation Mapping), a cutting-edge technique that delves into the inner workings of the model. By harnessing gradients, it unveils the specific areas in an image that heavily influence the classification score. This level of insight is unprecedented and helps demystify the model's decision-making process.
45
+
46
+ Streamline Your Object Detection: With three different output streams providing sizes of 13x13, 26x26, and 52x52, you have the flexibility to focus on objects of varying sizes. Smaller outputs excel at capturing large objects, while larger ones excel at handling more intricate details. Tailor your approach based on the nature of your task.
47
+
48
+ 📸 How It Works:
49
+ ---
50
+ Simply upload an image that you'd like to subject to object detection.
51
+ Select the output stream that you believe is most appropriate for your task.
52
+ Sit back and watch as our YOLOv3 model deftly identifies and annotates objects within the image.
53
+ For an added layer of enlightenment, explore the GradCAM visualization. Uncover the regions that the model identifies as pivotal in its classification decision, all in real-time!
54
+
55
+ ✅ Pascal VOC Classes:
56
+ ---
57
+ aeroplane, bicycle, bird, boat, bottle, bus, car, cat, chair, cow, diningtable, dog, horse, motorbike, person, pottedplant, sheep, sofa, train, tvmonitor
58
+
59
+ 🌟 Explore and Learn:
60
+ ---
61
+ Our "Examples" tab is a treasure trove of visual insights. Explore pre-loaded images with varying complexities to witness the prowess of YOLOv3 in action. Study the GradCAM outputs to gain a deeper understanding of how different output streams affect the model's attention.
62
+
63
+ Ready to embark on an object detection journey like never before? Give our YOLOv3 Object Detection Explorer a try and discover the captivating world of computer vision today!
64
+ """
65
+
66
+
67
+ def generate_gradio_output(
68
+ input_img,
69
+ gradcam_output_stream=0,
70
+ ):
71
+ input_img = resize_transforms(image=input_img)["image"]
72
+ fig, processed_img = plot_bboxes(
73
+ input_img=input_img,
74
+ model=model,
75
+ thresh=0.6,
76
+ iou_thresh=0.5,
77
+ anchors=model.scaled_anchors,
78
+ )
79
+ visualization = generate_gradcam_output(
80
+ org_img=input_img,
81
+ model=model,
82
+ input_img=processed_img,
83
+ gradcam_output_stream=gradcam_output_stream,
84
+ )
85
+ return fig, visualization
86
+
87
+
88
+ # generate_gradio_output(torch.zeros(416, 416, 3).numpy())
89
+
90
+ gr.Interface(
91
+ fn=generate_gradio_output,
92
+ inputs=[
93
+ gr.Image(label="Input Image"),
94
+ gr.Slider(0, 2, step=1, label="GradCAM Output Stream (13, 26, 52)"),
95
+ ],
96
+ outputs=[
97
+ gr.Plot(
98
+ visible=True,
99
+ label="Bounding Box Predictions",
100
+ ),
101
+ gr.Image(label="GradCAM Output").style(width=416, height=416),
102
+ ],
103
+ examples=examples,
104
+ title=title,
105
+ description=description,
106
+ ).launch()
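
If the .launch() call above is skipped, the same pipeline can be smoke-tested headlessly, mirroring the commented-out call earlier in this file; the blank 416x416 image is just an assumption for a quick end-to-end check:

import torch

fig, cam = generate_gradio_output(torch.zeros(416, 416, 3).numpy(), gradcam_output_stream=0)
print(type(fig), cam.shape)  # a Matplotlib figure and an RGB GradCAM overlay array
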