Upload 13 files

- .gitattributes +2 -0
- app.py +100 -0
- assets/examples/image1.jpg +0 -0
- assets/examples/image2.jpg +0 -0
- assets/examples/image3.jpg +3 -0
- assets/examples/image4.jpg +0 -0
- assets/examples/image5.jpg +3 -0
- requirements.txt +11 -0
- src/dataset.py +173 -0
- src/loss.py +177 -0
- src/models/yolov3.py +114 -0
- src/train.py +427 -0
- src/utils.py +11 -0
- weights/checkpoint-best.pth +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/examples/image3.jpg filter=lfs diff=lfs merge=lfs -text
+assets/examples/image5.jpg filter=lfs diff=lfs merge=lfs -text
app.py
ADDED
@@ -0,0 +1,100 @@
from typing import List
import gradio as gr
import PIL.Image, PIL.ImageOps
import torch
import numpy as np
import torchvision.transforms as T

from src.models.yolov3 import YOLOv3
from src.train import draw_bounding_boxes, decode_predictions_3scales
from src.dataset import ANCHORS, resize_with_padding


device = torch.device("cpu")
model_weight = "weights/checkpoint-best.pth"
label_colors = {"without_mask": (178, 34, 34), "with_mask": (34, 139, 34), "mask_worn_incorrectly": (184, 134, 11)}

model = YOLOv3()
model.load_state_dict(torch.load(model_weight, map_location=device))
model.eval()


def create_combined_image(img: torch.Tensor, results: List[torch.Tensor], mean: List[float] = [0.485, 0.456, 0.406], std: List[float] = [0.229, 0.224, 0.225]):
    batch_size, _, height, width = img.shape
    combined_height = height
    combined_width = width * batch_size
    combined_image = np.zeros((combined_height, combined_width, 3), dtype=np.uint8)

    for i in range(batch_size):
        image = img[i].cpu().permute(1, 2, 0).numpy()
        image = (image * std + mean).clip(0, 1)
        image = (image * 255).astype(np.uint8)
        pred_image = PIL.Image.fromarray(image.copy())
        draw_bounding_boxes(pred_image, results[i], show_conf=True)
        combined_image[:height, i * width:(i + 1) * width, :] = np.array(pred_image)
    return PIL.Image.fromarray(combined_image)


transform = T.Compose([
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


def detect_mask(image, conf_threshold: float) -> PIL.Image.Image:
    img_resized, _, _, _ = resize_with_padding(image)
    img_tensor = transform(img_resized)
    with torch.no_grad():
        out_l, out_m, out_s = model(img_tensor.unsqueeze(0))
    results = decode_predictions_3scales(out_l, out_m, out_s, ANCHORS["large"], ANCHORS["medium"], ANCHORS["small"], conf_threshold=conf_threshold)
    combined_image = create_combined_image(img_tensor.unsqueeze(0), results)
    return combined_image


def generate_legend_html_compact() -> str:
    legend_html = """
    <div style="display: flex; flex-wrap: wrap; gap: 10px; justify-content: center;">
    """
    for idx, (label, color) in enumerate(label_colors.items()):
        legend_html += f"""
        <div style="display: flex; align-items: center; justify-content: center;
                    padding: 5px 10px; border: 1px solid rgb{color};
                    background-color: rgb{color}; border-radius: 5px;
                    color: white; font-size: 12px; text-align: center;">
            {label}
        </div>
        """
    legend_html += "</div>"
    return legend_html


examples = [
    ["assets/examples/image1.jpg"],
    ["assets/examples/image2.jpg"],
    ["assets/examples/image3.jpg"],
    ["assets/examples/image4.jpg"],
    ["assets/examples/image5.jpg"]
]


with gr.Blocks() as demo:
    gr.Markdown("## Mask Detection with YOLOv3")
    with gr.Row():
        with gr.Column():
            pic = gr.Image(label="Upload Human Image", type="pil", height=300, width=300)
            conf_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.01, label="Confidence Threshold")
            with gr.Row():
                with gr.Column(scale=1):
                    predict_btn = gr.Button("Predict")
                with gr.Column(scale=1):
                    clear_btn = gr.Button("Clear")

        with gr.Column():
            output = gr.Image(label="Detection", type="pil", height=300, width=300)
            legend = gr.HTML(label="Legend", value=generate_legend_html_compact())

    predict_btn.click(fn=detect_mask, inputs=[pic, conf_slider], outputs=output, api_name="predict")
    clear_btn.click(lambda: (None, None), outputs=[pic, output])
    gr.Examples(examples=examples, inputs=[pic])

demo.launch()
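The same pipeline can be exercised without the Gradio UI. This is a minimal sketch under assumptions: the checkpoint, the example images and the pinned dependencies are in place, and, because src/train.py imports its siblings as top-level modules (from dataset import ...), src/ is put on sys.path first.

import sys
sys.path.insert(0, "src")  # train.py imports "dataset", "models", "loss" as top-level modules

import torch
import PIL.Image
import torchvision.transforms as T
from models.yolov3 import YOLOv3
from train import draw_bounding_boxes, decode_predictions_3scales
from dataset import ANCHORS, resize_with_padding

model = YOLOv3()
model.load_state_dict(torch.load("weights/checkpoint-best.pth", map_location="cpu"))
model.eval()

transform = T.Compose([T.ToTensor(), T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
image, _, _, _ = resize_with_padding(PIL.Image.open("assets/examples/image1.jpg").convert("RGB"))
with torch.no_grad():
    out_l, out_m, out_s = model(transform(image).unsqueeze(0))
boxes = decode_predictions_3scales(out_l, out_m, out_s, ANCHORS["large"], ANCHORS["medium"], ANCHORS["small"], conf_threshold=0.9)[0]
draw_bounding_boxes(image, boxes, show_conf=True)  # draws onto the 416x416 letterboxed image in place
image.save("prediction.jpg")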
assets/examples/image1.jpg
ADDED
assets/examples/image2.jpg
ADDED
assets/examples/image3.jpg
ADDED (stored with Git LFS)
assets/examples/image4.jpg
ADDED
assets/examples/image5.jpg
ADDED (stored with Git LFS)
requirements.txt
ADDED
@@ -0,0 +1,11 @@
torch==2.6.0
tqdm==4.67.1
Pillow==10.4.0
bs4==0.0.2
scikit-learn==1.6.0
torchvision==0.21.0
wandb==0.19.1
lxml==5.3.0
accelerate==1.1.0
kaggle==1.6.17
gradio==5.14.0
src/dataset.py
ADDED
@@ -0,0 +1,173 @@
from typing import List, Tuple, Dict
from pathlib import Path
import PIL.Image
import numpy as np
import torchvision.transforms as T
import torch
from torch.utils.data import Dataset
from bs4 import BeautifulSoup
from bs4.element import Tag


ANCHORS = {
    "small": [(26, 28), (17, 19), (10, 11)],
    "medium": [(78, 88), (55, 59), (37, 42)],
    "large": [(128, 152), (182, 205), (103, 124)]
}
GRID_SIZES = [13, 26, 52]
IMAGE_SIZE = (416, 416)
NUM_CLASSES = 3


def generate_box(obj: Tag) -> List[int]:
    xmin = int(obj.find("xmin").text) - 1
    ymin = int(obj.find("ymin").text) - 1
    xmax = int(obj.find("xmax").text) - 1
    ymax = int(obj.find("ymax").text) - 1
    if obj.find("name").text == "without_mask":
        class_id = 0
    elif obj.find("name").text == "with_mask":
        class_id = 1
    else:
        class_id = 2
    return [xmin, ymin, xmax, ymax, class_id]


def resize_boxes(box: List[int], scale: float, pad_x: int, pad_y: int) -> Tuple[int]:
    xmin, ymin, xmax, ymax, class_id = box
    xmin = int(xmin * scale + pad_x)
    ymin = int(ymin * scale + pad_y)
    xmax = int(xmax * scale + pad_x)
    ymax = int(ymax * scale + pad_y)
    return (xmin, ymin, xmax, ymax, class_id)


def resize_with_padding(image: PIL.Image.Image, target_size: Tuple[int] = IMAGE_SIZE, fill: Tuple[int] = (255, 255, 255)) -> Tuple[PIL.Image.Image, float, int, int]:
    target_w, target_h = target_size
    orig_w, orig_h = image.size
    scale = min(target_w / orig_w, target_h / orig_h)
    new_w = int(orig_w * scale)
    new_h = int(orig_h * scale)
    image_resized = image.resize((new_w, new_h), resample=PIL.Image.LANCZOS)
    new_image = PIL.Image.new("RGB", (target_w, target_h), color=fill)
    pad_x = (target_w - new_w) // 2
    pad_y = (target_h - new_h) // 2
    new_image.paste(image_resized, (pad_x, pad_y))
    return new_image, scale, pad_x, pad_y


def build_targets_3scale(bboxes: List[Tuple[int]], image_size: Tuple[int] = IMAGE_SIZE, anchors: Dict[str, List[Tuple[int]]] = ANCHORS, grid_sizes: List[int] = GRID_SIZES, num_classes: int = NUM_CLASSES) -> Tuple[torch.Tensor]:
    img_w, img_h = image_size
    t_large = torch.zeros((grid_sizes[0], grid_sizes[0], 3, 5 + num_classes), dtype=torch.float32)
    t_medium = torch.zeros((grid_sizes[1], grid_sizes[1], 3, 5 + num_classes), dtype=torch.float32)
    t_small = torch.zeros((grid_sizes[2], grid_sizes[2], 3, 5 + num_classes), dtype=torch.float32)
    all_anchors = anchors["large"] + anchors["medium"] + anchors["small"]
    for (xmin, ymin, xmax, ymax, cls_id) in bboxes:
        box_w = xmax - xmin
        box_h = ymax - ymin
        x_center = (xmax + xmin) / 2
        y_center = (ymax + ymin) / 2
        if box_w <= 0 or box_h <= 0:
            continue
        best_iou = 0
        best_idx = 0
        for i, (aw, ah) in enumerate(all_anchors):
            inter = min(box_w, aw) * min(box_h, ah)
            union = box_w * box_h + aw * ah - inter
            iou = inter / union if union > 0 else 0
            if iou > best_iou:
                best_iou = iou
                best_idx = i
        if best_idx <= 2:
            s = grid_sizes[0]
            t = t_large
            local_anchor_id = best_idx
            anchor_w, anchor_h = anchors["large"][local_anchor_id]
        elif best_idx <= 5:
            s = grid_sizes[1]
            t = t_medium
            local_anchor_id = best_idx - 3
            anchor_w, anchor_h = anchors["medium"][local_anchor_id]
        else:
            s = grid_sizes[2]
            t = t_small
            local_anchor_id = best_idx - 6
            anchor_w, anchor_h = anchors["small"][local_anchor_id]
        cell_w = img_w / s
        cell_h = img_h / s
        gx = int(x_center // cell_w)
        gy = int(y_center // cell_h)
        tx = (x_center / cell_w) - gx
        ty = (y_center / cell_h) - gy
        tw = np.log((box_w / (anchor_w + 1e-16)) + 1e-16)
        th = np.log((box_h / (anchor_h + 1e-16)) + 1e-16)
        t[gy, gx, local_anchor_id, 0] = tx
        t[gy, gx, local_anchor_id, 1] = ty
        t[gy, gx, local_anchor_id, 2] = tw
        t[gy, gx, local_anchor_id, 3] = th
        t[gy, gx, local_anchor_id, 4] = 1.0
        t[gy, gx, local_anchor_id, 5 + cls_id] = 1.0
    return t_large, t_medium, t_small


class MaskDataset(Dataset):
    def __init__(self, root: str, train: bool = True, test_size: float = 0.25) -> None:
        super().__init__()
        self.class_counts = [0, 0, 0]
        self.root = root
        self.train = train
        all_imgs = sorted(list((Path(root) / "images").glob("*.png")))
        all_anns = sorted(list((Path(root) / "annotations").glob("*.xml")))
        n_test = int(len(all_imgs) * test_size)
        if train:
            self.images = all_imgs[n_test:]
            self.annots = all_anns[n_test:]
        else:
            self.images = all_imgs[:n_test]
            self.annots = all_anns[:n_test]
        self.transform = T.Compose([
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        for ann in self.annots:
            with open(ann, "r") as f:
                data = f.read()
            soup = BeautifulSoup(data, "lxml")
            for obj in soup.find_all("object"):
                cls = obj.find("name").text
                self.class_counts[0 if cls == "without_mask" else 1 if cls == "with_mask" else 2] += 1

    def __len__(self) -> int:
        return len(self.images)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, Tuple[torch.Tensor]]:
        img_path = self.images[idx]
        ann_path = self.annots[idx]
        img = PIL.Image.open(img_path).convert("RGB")
        img_resized, scale, pad_x, pad_y = resize_with_padding(img)
        with open(ann_path, "r") as f:
            data = f.read()
        soup = BeautifulSoup(data, "lxml")
        objs = soup.find_all("object")
        resized_boxes = []
        for obj in objs:
            b = generate_box(obj)
            b2 = resize_boxes(b, scale, pad_x, pad_y)
            resized_boxes.append(b2)
        t_large, t_medium, t_small = build_targets_3scale(resized_boxes)
        img_tensor = self.transform(img_resized)
        return img_tensor, (t_large, t_medium, t_small)


def collate_fn(batch: List[Tuple[torch.Tensor, Tuple[torch.Tensor]]]) -> Tuple[torch.Tensor, Tuple[torch.Tensor]]:
    imgs, t_l, t_m, t_s = [], [], [], []
    for (img, (tl, tm, ts)) in batch:
        imgs.append(img)
        t_l.append(tl)
        t_m.append(tm)
        t_s.append(ts)
    imgs = torch.stack(imgs, dim=0)
    t_l = torch.stack(t_l, dim=0)
    t_m = torch.stack(t_m, dim=0)
    t_s = torch.stack(t_s, dim=0)
    return imgs, (t_l, t_m, t_s)
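A quick sanity check of build_targets_3scale, assuming it is run from the repository root: a single box of roughly 100x120 px is matched by width/height IoU to one of the "large" anchors, so exactly one cell of the 13x13 target receives objectness 1.

import torch
from src.dataset import build_targets_3scale

# one ground-truth box of ~100x120 px centred at (208, 208), class 1 ("with_mask")
t_large, t_medium, t_small = build_targets_3scale([(158, 148, 258, 268, 1)])
print(t_large.shape, t_medium.shape, t_small.shape)  # (13, 13, 3, 8) (26, 26, 3, 8) (52, 52, 3, 8)
print(t_large[..., 4].nonzero())                     # a single assigned cell/anchor, at grid cell (6, 6)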
src/loss.py
ADDED
@@ -0,0 +1,177 @@
from typing import Tuple, List
import torch
import torch.nn as nn
import torch.nn.functional as F


def box_iou_xyxy(boxes1: torch.Tensor, boxes2: torch.Tensor) -> torch.Tensor:
    N = boxes1.size(0)
    M = boxes2.size(0)
    x1_1, y1_1, x2_1, y2_1 = boxes1[:, 0], boxes1[:, 1], boxes1[:, 2], boxes1[:, 3]
    x1_2, y1_2, x2_2, y2_2 = boxes2[:, 0], boxes2[:, 1], boxes2[:, 2], boxes2[:, 3]
    x1_1 = x1_1.unsqueeze(1).expand(N, M)
    y1_1 = y1_1.unsqueeze(1).expand(N, M)
    x2_1 = x2_1.unsqueeze(1).expand(N, M)
    y2_1 = y2_1.unsqueeze(1).expand(N, M)
    x1_2 = x1_2.unsqueeze(0).expand(N, M)
    y1_2 = y1_2.unsqueeze(0).expand(N, M)
    x2_2 = x2_2.unsqueeze(0).expand(N, M)
    y2_2 = y2_2.unsqueeze(0).expand(N, M)
    interX1 = torch.max(x1_1, x1_2)
    interY1 = torch.max(y1_1, y1_2)
    interX2 = torch.min(x2_1, x2_2)
    interY2 = torch.min(y2_1, y2_2)
    interW = (interX2 - interX1).clamp(min=0)
    interH = (interY2 - interY1).clamp(min=0)
    interArea = interW * interH
    area1 = (x2_1 - x1_1).clamp(min=0) * (y2_1 - y1_1).clamp(min=0)
    area2 = (x2_2 - x1_2).clamp(min=0) * (y2_2 - y1_2).clamp(min=0)
    union = area1 + area2 - interArea + 1e-16
    iou = interArea / union
    return iou


def box_giou_xyxy(boxes1: torch.Tensor, boxes2: torch.Tensor) -> torch.Tensor:
    xA = torch.max(boxes1[:, 0], boxes2[:, 0])
    yA = torch.max(boxes1[:, 1], boxes2[:, 1])
    xB = torch.min(boxes1[:, 2], boxes2[:, 2])
    yB = torch.min(boxes1[:, 3], boxes2[:, 3])
    interW = (xB - xA).clamp(min=0)
    interH = (yB - yA).clamp(min=0)
    interArea = interW * interH
    area1 = (boxes1[:, 2] - boxes1[:, 0]).clamp(min=0) * (boxes1[:, 3] - boxes1[:, 1]).clamp(min=0)
    area2 = (boxes2[:, 2] - boxes2[:, 0]).clamp(min=0) * (boxes2[:, 3] - boxes2[:, 1]).clamp(min=0)
    union = area1 + area2 - interArea + 1e-16
    iou = interArea / union
    xC1 = torch.min(boxes1[:, 0], boxes2[:, 0])
    yC1 = torch.min(boxes1[:, 1], boxes2[:, 1])
    xC2 = torch.max(boxes1[:, 2], boxes2[:, 2])
    yC2 = torch.max(boxes1[:, 3], boxes2[:, 3])
    encloseW = (xC2 - xC1).clamp(min=0)
    encloseH = (yC2 - yC1).clamp(min=0)
    encloseArea = encloseW * encloseH + 1e-16
    giou = iou - (encloseArea - union) / encloseArea
    return giou


class YoloLoss(nn.Module):
    def __init__(self, class_counts: List[int], anchors_l: List[Tuple[int, int]] = [(128, 152), (182, 205), (103, 124)], anchors_m: List[Tuple[int, int]] = [(78, 88), (55, 59), (37, 42)], anchors_s: List[Tuple[int, int]] = [(26, 28), (17, 19), (10, 11)], image_size: Tuple[int, int] = (416, 416), num_classes: int = 3, ignore_thresh: float = 0.7, lambda_noobj: float = 5.0):
        super().__init__()
        self.anchors_l = anchors_l
        self.anchors_m = anchors_m
        self.anchors_s = anchors_s
        self.image_size = image_size
        self.num_classes = num_classes
        self.ignore_thresh = ignore_thresh
        self.lambda_noobj = lambda_noobj
        total = sum(class_counts)
        w_list = [total / (c + 1e-5) * (2.0 if c_id == 0 else (3.0 if c_id == 2 else 1.0)) for c_id, c in enumerate(class_counts)]
        self.class_weight = torch.tensor(w_list, dtype=torch.float32)
        self.bce_obj = nn.BCEWithLogitsLoss(reduction="none")
        self.bce_cls = nn.BCEWithLogitsLoss(weight=self.class_weight, reduction="none")

    def forward(self, outputs: Tuple[torch.Tensor], targets: Tuple[torch.Tensor]) -> torch.Tensor:
        out_l, out_m, out_s = outputs
        t_l, t_m, t_s = targets
        loss_l = self._loss_single_scale(out_l, t_l, self.anchors_l, scale_wh=(13, 13))
        loss_m = self._loss_single_scale(out_m, t_m, self.anchors_m, scale_wh=(26, 26))
        loss_s = self._loss_single_scale(out_s, t_s, self.anchors_s, scale_wh=(52, 52))
        return loss_l + loss_m + loss_s

    def _loss_single_scale(self, pred: torch.Tensor, target: torch.Tensor, anchors: List[Tuple[int]], scale_wh: Tuple[int]) -> torch.Tensor:
        device = pred.device
        B, _, H, W = pred.shape
        A = len(anchors)
        pred = pred.view(B, A, (5 + self.num_classes), H, W)
        pred = pred.permute(0, 3, 4, 1, 2).contiguous()
        pred_tx = pred[..., 0]
        pred_ty = pred[..., 1]
        pred_tw = pred[..., 2]
        pred_th = pred[..., 3]
        pred_obj = pred[..., 4]
        pred_cls = pred[..., 5:]
        tgt_tx = target[..., 0]
        tgt_ty = target[..., 1]
        tgt_tw = target[..., 2]
        tgt_th = target[..., 3]
        tgt_obj = target[..., 4]
        tgt_cls = target[..., 5:]
        obj_mask = (tgt_obj == 1)
        noobj_mask = (tgt_obj == 0)
        img_w, img_h = self.image_size
        stride_x = img_w / W
        stride_y = img_h / H
        grid_x = torch.arange(W, device=device).view(1, 1, W, 1).expand(1, H, W, 1)
        grid_y = torch.arange(H, device=device).view(1, H, 1, 1).expand(1, H, W, 1)
        anchors_t = torch.tensor(anchors, dtype=torch.float, device=device)
        anchor_w = anchors_t[:, 0].view(1, 1, 1, A)
        anchor_h = anchors_t[:, 1].view(1, 1, 1, A)
        pred_box_xc = (grid_x + torch.sigmoid(pred_tx)) * stride_x
        pred_box_yc = (grid_y + torch.sigmoid(pred_ty)) * stride_y
        pred_box_w = torch.exp(pred_tw) * anchor_w
        pred_box_h = torch.exp(pred_th) * anchor_h
        pred_x1 = pred_box_xc - pred_box_w / 2
        pred_y1 = pred_box_yc - pred_box_h / 2
        pred_x2 = pred_box_xc + pred_box_w / 2
        pred_y2 = pred_box_yc + pred_box_h / 2
        gt_box_xc = (grid_x + tgt_tx) * stride_x
        gt_box_yc = (grid_y + tgt_ty) * stride_y
        gt_box_w = torch.exp(tgt_tw) * anchor_w
        gt_box_h = torch.exp(tgt_th) * anchor_h
        gt_x1 = gt_box_xc - gt_box_w / 2
        gt_y1 = gt_box_yc - gt_box_h / 2
        gt_x2 = gt_box_xc + gt_box_w / 2
        gt_y2 = gt_box_yc + gt_box_h / 2
        with torch.no_grad():
            ignore_mask_buf = torch.zeros_like(tgt_obj, dtype=torch.bool)
            noobj_flat = noobj_mask.view(-1)
            obj_flat = obj_mask.view(-1)
            px1f = pred_x1.view(-1)
            py1f = pred_y1.view(-1)
            px2f = pred_x2.view(-1)
            py2f = pred_y2.view(-1)
            gx1f = gt_x1.view(-1)[obj_flat]
            gy1f = gt_y1.view(-1)[obj_flat]
            gx2f = gt_x2.view(-1)[obj_flat]
            gy2f = gt_y2.view(-1)[obj_flat]
            if noobj_flat.sum() > 0 and obj_flat.sum() > 0:
                noobj_idx = noobj_flat.nonzero(as_tuple=True)[0]
                noobj_boxes_xyxy = torch.stack([px1f[noobj_idx], py1f[noobj_idx], px2f[noobj_idx], py2f[noobj_idx]], dim=-1)
                obj_boxes_xyxy = torch.stack([gx1f, gy1f, gx2f, gy2f], dim=-1)
                ious = box_iou_xyxy(noobj_boxes_xyxy, obj_boxes_xyxy)
                best_iou, _ = ious.max(dim=1)
                ignore_flags = (best_iou > self.ignore_thresh)
                all_idx = noobj_idx[ignore_flags]
                ignore_mask_buf.view(-1)[all_idx] = True
            ignore_mask = ignore_mask_buf
        obj_loss = self.bce_obj(pred_obj[obj_mask], torch.ones_like(pred_obj[obj_mask]))
        obj_loss = obj_loss.mean() if obj_loss.numel() > 0 else torch.tensor(0., device=device)
        noobj_mask_final = (noobj_mask & (~ignore_mask))
        noobj_loss = self.bce_obj(pred_obj[noobj_mask_final], torch.zeros_like(pred_obj[noobj_mask_final]))
        noobj_loss = noobj_loss.mean() if noobj_loss.numel() > 0 else torch.tensor(0., device=device)
        objectness_loss = obj_loss + self.lambda_noobj * noobj_loss
        class_loss = torch.tensor(0., device=device, requires_grad=True)
        if obj_mask.sum() > 0:
            self.bce_cls.weight = self.class_weight.to(device)
            cls_pred = pred_cls[obj_mask].to(device)
            cls_gt = tgt_cls[obj_mask].to(device)
            c_loss = self.bce_cls(cls_pred, cls_gt)
            class_loss = c_loss.mean()
        giou_loss = torch.tensor(0., device=device, requires_grad=True)
        if obj_mask.sum() > 0:
            px1_ = pred_x1[obj_mask]
            py1_ = pred_y1[obj_mask]
            px2_ = pred_x2[obj_mask]
            py2_ = pred_y2[obj_mask]
            p_xyxy = torch.stack([px1_, py1_, px2_, py2_], dim=-1)
            gx1_ = gt_x1[obj_mask]
            gy1_ = gt_y1[obj_mask]
            gx2_ = gt_x2[obj_mask]
            gy2_ = gt_y2[obj_mask]
            g_xyxy = torch.stack([gx1_, gy1_, gx2_, gy2_], dim=-1)
            giou = box_giou_xyxy(p_xyxy, g_xyxy)
            giou_loss = (1. - giou).mean()
        total_loss = objectness_loss + class_loss + giou_loss
        return total_loss
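A shape-level smoke test for YoloLoss with model-shaped random outputs, all-zero targets and made-up class counts (run from the repository root): with no assigned cells, only the weighted no-object BCE term contributes.

import torch
from src.loss import YoloLoss

criterion = YoloLoss(class_counts=[100, 300, 30])  # hypothetical class frequencies
outputs = (torch.randn(2, 24, 13, 13),             # 3 anchors * (5 + 3 classes) = 24 channels
           torch.randn(2, 24, 26, 26),
           torch.randn(2, 24, 52, 52))
targets = (torch.zeros(2, 13, 13, 3, 8),
           torch.zeros(2, 26, 26, 3, 8),
           torch.zeros(2, 52, 52, 3, 8))
loss = criterion(outputs, targets)
print(loss)  # scalar tensor; only the no-object objectness term is non-trivial here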
src/models/yolov3.py
ADDED
@@ -0,0 +1,114 @@
from typing import Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F


def conv_batch(in_ch: int, out_ch: int, kernel_size: int = 3, padding: int = 1, stride: int = 1) -> nn.Sequential:
    return nn.Sequential(
        nn.Conv2d(in_ch, out_ch, kernel_size=kernel_size, stride=stride, padding=padding, bias=False),
        nn.BatchNorm2d(out_ch),
        nn.LeakyReLU()
    )


class DarkResidualBlock(nn.Module):
    def __init__(self, in_channels: int) -> None:
        super().__init__()
        reduced_channels = in_channels // 2
        self.layer1 = conv_batch(in_channels, reduced_channels, kernel_size=1, padding=0)
        self.layer2 = conv_batch(reduced_channels, in_channels)

    def forward(self, x):
        return x + self.layer2(self.layer1(x))


class Darknet53(nn.Module):
    def __init__(self, block: nn.Module = DarkResidualBlock) -> None:
        super().__init__()
        self.conv1 = conv_batch(3, 32)
        self.conv2 = conv_batch(32, 64, stride=2)
        self.residual_block1 = self.make_layer(block, in_channels=64, num_blocks=1)
        self.conv3 = conv_batch(64, 128, stride=2)
        self.residual_block2 = self.make_layer(block, in_channels=128, num_blocks=2)
        self.conv4 = conv_batch(128, 256, stride=2)
        self.residual_block3 = self.make_layer(block, in_channels=256, num_blocks=8)
        self.conv5 = conv_batch(256, 512, stride=2)
        self.residual_block4 = self.make_layer(block, in_channels=512, num_blocks=8)
        self.conv6 = conv_batch(512, 1024, stride=2)
        self.residual_block5 = self.make_layer(block, in_channels=1024, num_blocks=4)

    def make_layer(self, block: nn.Module, in_channels: int, num_blocks: int) -> nn.Sequential:
        layers = []
        for _ in range(num_blocks):
            layers.append(block(in_channels))
        return nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor]:
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.residual_block1(x)
        x = self.conv3(x)
        x = self.residual_block2(x)
        x = self.conv4(x)
        x = self.residual_block3(x)
        c4 = x
        x = self.conv5(x)
        x = self.residual_block4(x)
        c5 = x
        x = self.conv6(x)
        x = self.residual_block5(x)
        c6 = x
        return c4, c5, c6


def conv_leaky(in_ch: int, out_ch: int, k: int = 1, s: int = 1, p: int = 0):
    return nn.Sequential(
        nn.Conv2d(in_ch, out_ch, kernel_size=k, stride=s, padding=p, bias=False),
        nn.BatchNorm2d(out_ch),
        nn.LeakyReLU(0.1, inplace=True)
    )


class DetectionHead(nn.Module):
    def __init__(self, in_ch: int, mid_ch: int, num_anchors: int = 3, num_classes: int = 3) -> None:
        super().__init__()
        self.block = nn.Sequential(
            conv_leaky(in_ch, mid_ch, k=1, s=1, p=0),
            conv_leaky(mid_ch, mid_ch * 2, k=3, s=1, p=1),
            conv_leaky(mid_ch * 2, mid_ch, k=1, s=1, p=0),
            conv_leaky(mid_ch, mid_ch * 2, k=3, s=1, p=1),
            conv_leaky(mid_ch * 2, mid_ch, k=1, s=1, p=0)
        )
        self.out_conv = nn.Conv2d(mid_ch, num_anchors * (5 + num_classes), kernel_size=1, stride=1, padding=0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.block(x)
        out = self.out_conv(x)
        return out


class YOLOv3(nn.Module):
    def __init__(self, num_classes: int = 3) -> None:
        super().__init__()
        self.backbone = Darknet53()
        self.num_classes = num_classes
        self.num_anchors = 3
        self.head_large = DetectionHead(in_ch=1024, mid_ch=512, num_anchors=3, num_classes=num_classes)
        self.head_medium = DetectionHead(in_ch=1024, mid_ch=256, num_anchors=3, num_classes=num_classes)
        self.head_small = DetectionHead(in_ch=512, mid_ch=128, num_anchors=3, num_classes=num_classes)
        self.conv_upsample_l2 = conv_leaky(1024, 512, k=1, s=1, p=0)
        self.conv_upsample_l3 = conv_leaky(1024, 256, k=1, s=1, p=0)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor]:
        c4, c5, c6 = self.backbone(x)
        out_l = self.head_large(c6)
        x_l2 = self.conv_upsample_l2(c6)
        x_l2_up = F.interpolate(x_l2, scale_factor=2, mode="nearest")
        x_merge_l2 = torch.cat([x_l2_up, c5], dim=1)
        out_m = self.head_medium(x_merge_l2)
        x_l3 = self.conv_upsample_l3(x_merge_l2)
        x_l3_up = F.interpolate(x_l3, scale_factor=2, mode="nearest")
        x_merge_l3 = torch.cat([x_l3_up, c4], dim=1)
        out_s = self.head_small(x_merge_l3)
        return out_l, out_m, out_s
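A quick shape check of the three heads on a 416x416 input (run from the repository root); each head predicts 3 anchors * (5 + 3 classes) = 24 channels per cell at strides 32, 16 and 8.

import torch
from src.models.yolov3 import YOLOv3

model = YOLOv3(num_classes=3).eval()
with torch.no_grad():
    out_l, out_m, out_s = model(torch.randn(1, 3, 416, 416))
print(out_l.shape)  # torch.Size([1, 24, 13, 13])
print(out_m.shape)  # torch.Size([1, 24, 26, 26])
print(out_s.shape)  # torch.Size([1, 24, 52, 52])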
src/train.py
ADDED
@@ -0,0 +1,427 @@
import math
from pathlib import Path
from typing import List, Tuple, Dict
from tqdm import tqdm
import argparse
from accelerate import Accelerator
from accelerate.utils import set_seed
import wandb
import torch
from torch import nn
from torch.utils.data import DataLoader
import torchvision.ops as ops
import PIL.Image, PIL.ImageDraw
import numpy as np

from dataset import MaskDataset, collate_fn, ANCHORS
from utils import EMA
from models.yolov3 import YOLOv3
from loss import YoloLoss


class WarmupCosineAnnealingLR(torch.optim.lr_scheduler._LRScheduler):
    def __init__(self, optimizer: torch.optim.Optimizer, warmup_steps: int, total_steps: int, eta_min: int = 0, last_epoch: int = -1) -> None:
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        self.eta_min = eta_min
        super().__init__(optimizer, last_epoch)

    def get_lr(self) -> List[float]:
        if self.last_epoch < self.warmup_steps:
            return [base_lr * (self.last_epoch / max(1, self.warmup_steps)) for base_lr in self.base_lrs]
        else:
            current_step = self.last_epoch - self.warmup_steps
            cosine_steps = max(1, self.total_steps - self.warmup_steps)
            return [self.eta_min + (base_lr - self.eta_min) * 0.5 * (1 + math.cos(math.pi * current_step / cosine_steps)) for base_lr in self.base_lrs]


def draw_bounding_boxes(image: PIL.Image.Image, boxes: torch.Tensor, colors: Dict[int, int] = {0: (178, 34, 34), 1: (34, 139, 34), 2: (184, 134, 11)}, labels = {0: "without_mask", 1: "with_mask", 2: "weared_incorrect"}, show_conf = False) -> None:
    draw = PIL.ImageDraw.Draw(image)
    for box in boxes:
        xmin, ymin, xmax, ymax, class_id = int(box[0]), int(box[1]), int(box[2]), int(box[3]), int(box[-1])
        conf_text = ""
        if show_conf and box.shape[0] == 6:
            conf = float(box[4])
            conf_text = f" {conf:.2f}"
        color = colors.get(class_id, (255, 255, 255))
        label = labels.get(class_id, "Unknown") + conf_text
        draw.rectangle([xmin, ymin, xmax, ymax], outline=color, width=2)
        text_bbox = draw.textbbox((xmin, ymin), label)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]
        draw.rectangle([xmin, ymin - text_height - 2, xmin + text_width + 2, ymin], fill=color)
        draw.text((xmin + 1, ymin - text_height - 1), label, fill="white")


def create_combined_image(img: torch.Tensor, gt_batch: List[torch.Tensor], results: List[torch.Tensor], mean: List[float] = [0.485, 0.456, 0.406], std: List[float] = [0.229, 0.224, 0.225]):
    batch_size, _, height, width = img.shape
    combined_height = height * 2
    combined_width = width * batch_size
    combined_image = np.zeros((combined_height, combined_width, 3), dtype=np.uint8)

    for i in range(batch_size):
        image = img[i].cpu().permute(1, 2, 0).numpy()
        image = (image * std + mean).clip(0, 1)
        image = (image * 255).astype(np.uint8)
        gt_image = PIL.Image.fromarray(image.copy())
        pred_image = PIL.Image.fromarray(image.copy())
        draw_bounding_boxes(gt_image, gt_batch[i])
        draw_bounding_boxes(pred_image, results[i], show_conf=True)
        combined_image[:height, i * width:(i + 1) * width, :] = np.array(gt_image)
        combined_image[height:, i * width:(i + 1) * width, :] = np.array(pred_image)
    return PIL.Image.fromarray(combined_image)


def decode_yolo_output_single(prediction: torch.Tensor, anchors: List[Tuple[int]], image_size: Tuple[int] = (416, 416), conf_threshold: float = 0.5, iou_threshold: float = 0.3, apply_nms: bool = True, num_classes: int = 3) -> List[torch.Tensor]:
    device = prediction.device
    B, _, H, W = prediction.shape
    A = len(anchors)
    prediction = prediction.view(B, A, 5 + num_classes, H, W)
    prediction = prediction.permute(0, 1, 3, 4, 2).contiguous()
    tx = prediction[..., 0]
    ty = prediction[..., 1]
    tw = prediction[..., 2]
    th = prediction[..., 3]
    obj = prediction[..., 4]
    class_scores = prediction[..., 5:]
    tx = tx.sigmoid()
    ty = ty.sigmoid()
    obj = obj.sigmoid()
    class_scores = class_scores.softmax(dim=-1)
    img_w, img_h = image_size
    cell_w = img_w / W
    cell_h = img_h / H
    grid_x = torch.arange(W, device=device).view(1, 1, W).expand(1, H, W)
    grid_y = torch.arange(H, device=device).view(1, H, 1).expand(1, H, W)
    anchors_tensor = torch.tensor(anchors, dtype=torch.float32, device=device)
    anchor_w = anchors_tensor[:, 0].view(1, A, 1, 1)
    anchor_h = anchors_tensor[:, 1].view(1, A, 1, 1)
    x_center = (grid_x + tx) * cell_w
    y_center = (grid_y + ty) * cell_h
    w = torch.exp(tw) * anchor_w
    h = torch.exp(th) * anchor_h
    xmin = x_center - w / 2
    ymin = y_center - h / 2
    xmax = x_center + w / 2
    ymax = y_center + h / 2
    max_class_probs, class_ids = class_scores.max(dim=-1)
    confidence = obj * max_class_probs
    outputs = []
    for b_i in range(B):
        box_xmin = xmin[b_i].view(-1)
        box_ymin = ymin[b_i].view(-1)
        box_xmax = xmax[b_i].view(-1)
        box_ymax = ymax[b_i].view(-1)
        conf = confidence[b_i].view(-1)
        cls_id = class_ids[b_i].view(-1).float()
        mask = (conf > conf_threshold)
        box_xmin = box_xmin[mask]
        box_ymin = box_ymin[mask]
        box_xmax = box_xmax[mask]
        box_ymax = box_ymax[mask]
        conf = conf[mask]
        cls_id = cls_id[mask]
        if mask.sum() == 0:
            outputs.append(torch.empty((0, 6), device=device))
            continue
        boxes = torch.stack([box_xmin, box_ymin, box_xmax, box_ymax], dim=-1)
        if apply_nms:
            keep = ops.nms(boxes, conf, iou_threshold)
            boxes = boxes[keep]
            conf = conf[keep]
            cls_id = cls_id[keep]
        out = torch.cat([boxes, conf.unsqueeze(-1), cls_id.unsqueeze(-1)], dim=-1)
        outputs.append(out)
    return outputs


def decode_predictions_3scales(out_l: torch.Tensor, out_m: torch.Tensor, out_s: torch.Tensor, anchors_l: List[Tuple[int]], anchors_m: List[Tuple[int, int]], anchors_s: List[Tuple[int, int]], image_size: Tuple[int, int] = (416, 416), conf_threshold: float = 0.5, iou_threshold: float = 0.45, num_classes: int = 3) -> List[torch.Tensor]:
    b_l = decode_yolo_output_single(out_l, anchors_l, image_size, conf_threshold, iou_threshold, apply_nms=False, num_classes=num_classes)
    b_m = decode_yolo_output_single(out_m, anchors_m, image_size, conf_threshold, iou_threshold, apply_nms=False, num_classes=num_classes)
    b_s = decode_yolo_output_single(out_s, anchors_s, image_size, conf_threshold, iou_threshold, apply_nms=False, num_classes=num_classes)
    results = []
    B = len(b_l)
    for i in range(B):
        boxes_all = torch.cat([b_l[i], b_m[i], b_s[i]], dim=0)
        if boxes_all.numel() == 0:
            results.append(boxes_all)
            continue
        xyxy = boxes_all[:, :4]
        scores = boxes_all[:, 4]
        keep = ops.nms(xyxy, scores, iou_threshold)
        final = boxes_all[keep]
        results.append(final)
    return results


def decode_target_single(target: torch.Tensor, anchors: List[Tuple[int]], image_size: Tuple[int] = (416, 416), obj_threshold: float = 0.5) -> List[torch.Tensor]:
    args = parse_args()
    target = target.to(args.device)
    B, S, _, A, _ = target.shape
    img_w, img_h = image_size
    cell_w = img_w / S
    cell_h = img_h / S
    anchors_tensor = torch.tensor(anchors, dtype=torch.float)
    tx = target[..., 0]
    ty = target[..., 1]
    tw = target[..., 2]
    th = target[..., 3]
    tobj = target[..., 4]
    tcls = target[..., 5:]
    results = []
    for b_i in range(B):
        bx_list = []
        tx_b = tx[b_i]
        ty_b = ty[b_i]
        tw_b = tw[b_i]
        th_b = th[b_i]
        tobj_b = tobj[b_i]
        tcls_b = tcls[b_i]
        for i in range(S):
            for j in range(S):
                for a_i in range(A):
                    if tobj_b[i, j, a_i] < obj_threshold:
                        continue
                    cls_one_hot = tcls_b[i, j, a_i]
                    cls_id = cls_one_hot.argmax().item()
                    x_center = (j + tx_b[i, j, a_i].item()) * cell_w
                    y_center = (i + ty_b[i, j, a_i].item()) * cell_h
                    anchor_w = anchors_tensor[a_i, 0]
                    anchor_h = anchors_tensor[a_i, 1]
                    box_w = torch.exp(tw_b[i, j, a_i]) * anchor_w
                    box_h = torch.exp(th_b[i, j, a_i]) * anchor_h
                    xmin = x_center - box_w / 2
                    ymin = y_center - box_h / 2
                    xmax = x_center + box_w / 2
                    ymax = y_center + box_h / 2
                    bx_list.append([xmin.item(), ymin.item(), xmax.item(), ymax.item(), cls_id])
        if len(bx_list) == 0:
            results.append(torch.empty((0, 5), dtype=torch.float32, device=args.device))
        else:
            results.append(torch.tensor(bx_list, dtype=torch.float32, device=args.device))
    return results


def decode_target_3scales(t_l: torch.Tensor, t_m: torch.Tensor, t_s: torch.Tensor, anchors_l: List[Tuple[int]], anchors_m: List[Tuple[int]], anchors_s: List[Tuple[int]], image_size: Tuple[int] = (416, 416), obj_threshold: float = 0.5) -> List[torch.Tensor]:
    dec_l = decode_target_single(t_l, anchors_l, image_size, obj_threshold)
    dec_m = decode_target_single(t_m, anchors_m, image_size, obj_threshold)
    dec_s = decode_target_single(t_s, anchors_s, image_size, obj_threshold)
    results = []
    B = len(dec_l)
    for i in range(B):
        boxes_l = dec_l[i]
        boxes_m = dec_m[i]
        boxes_s = dec_s[i]
        if boxes_l.numel() == 0 and boxes_m.numel() == 0 and boxes_s.numel() == 0:
            results.append(torch.empty((0, 5), dtype=torch.float32, device=boxes_l.device))
        else:
            all_ = torch.cat([boxes_l, boxes_m, boxes_s], dim=0)
            results.append(all_)
    return results


def iou_xyxy(box1: List[int | float], box2: List[int | float]) -> float:
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])
    w = max(0., x2 - x1)
    h = max(0., y2 - y1)
    inter = w * h
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area1 + area2 - inter
    return inter / union if union > 0 else 0.0


def compute_ap_per_class(boxes_pred: List[List[float]], boxes_gt: List[List[float]], iou_threshold: float = 0.45) -> float:
    boxes_pred = sorted(boxes_pred, key=lambda x: x[4], reverse=True)
    n_gt = len(boxes_gt)
    if n_gt == 0 and len(boxes_pred) == 0:
        return 1.0
    if n_gt == 0:
        return 0.0
    matched = [False] * n_gt
    tps = []
    fps = []
    for i, pred in enumerate(boxes_pred):
        best_iou = 0.0
        best_j = -1
        for j, gt in enumerate(boxes_gt):
            if matched[j]:
                continue
            iou = iou_xyxy(pred, gt)
            if iou > best_iou:
                best_iou = iou
                best_j = j
        if best_iou > iou_threshold and best_j >= 0:
            tps.append(1)
            fps.append(0)
            matched[best_j] = True
        else:
            tps.append(0)
            fps.append(1)
    tps_cum = []
    fps_cum = []
    s_tp = 0
    s_fp = 0
    for i in range(len(tps)):
        s_tp += tps[i]
        s_fp += fps[i]
        tps_cum.append(s_tp)
        fps_cum.append(s_fp)
    precisions = []
    recalls = []
    for i in range(len(tps)):
        prec = tps_cum[i] / (tps_cum[i] + fps_cum[i]) if (tps_cum[i] + fps_cum[i]) > 0 else 0
        rec = tps_cum[i] / n_gt
        precisions.append(prec)
        recalls.append(rec)
    recalls = [0.0] + recalls + [1.0]
    precisions = [1.0] + precisions + [0.0]
    for i in range(len(precisions) - 2, -1, -1):
        precisions[i] = max(precisions[i], precisions[i + 1])
    ap = 0.0
    for i in range(len(precisions) - 1):
        ap += (recalls[i + 1] - recalls[i]) * precisions[i + 1]
    return ap


def compute_map(all_pred: List[float], all_gt: List[float], num_classes: int = 3, iou_threshold: float = 0.45) -> float:
    APs = []
    for c in range(num_classes):
        ap_c = compute_ap_per_class(all_pred[c], all_gt[c], iou_threshold)
        APs.append(ap_c)
    mAP = sum(APs) / len(APs) if len(APs) > 0 else 0.0
    return mAP


def parse_args():
    parser = argparse.ArgumentParser(description="Train a model on the face mask detection dataset")
    parser.add_argument("--root", type=str, default="data/masks", help="Path to the data")
    parser.add_argument("--batch-size", type=int, default=16, help="Batch size for training and testing")
    parser.add_argument("--logs-dir", type=str, default="yolo-logs", help="Path to save logs")
    parser.add_argument("--pin-memory", type=bool, default=True, help="Pin Memory for DataLoader")
    parser.add_argument("--num-workers", type=int, default=0, help="Number of workers for DataLoader")
    parser.add_argument("--num-epochs", type=int, default=100, help="Number of training epochs")
    parser.add_argument("--optimizer", type=str, default="AdamW", help="Optimizer type")
    parser.add_argument("--learning-rate", type=float, default=5e-4, help="Learning rate for the optimizer")
    parser.add_argument("--save-frequency", type=int, default=4, help="Frequency of saving model weights")
    parser.add_argument("--max-norm", type=float, default=10.0, help="Maximum gradient norm for clipping")
    parser.add_argument("--project-name", type=str, default="YOLOv3, mask detection", help="Wandb project name")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device to run the training on")
    parser.add_argument("--weights-path", type=str, default="weights/darknet53.pth", help="Path to the weights")
    parser.add_argument("--seed", type=int, default=42, help="Value of the seed")
    parser.add_argument("--mixed-precision", type=str, default="fp16", choices=["fp16", "bf16", "fp8", "no"], help="Value of the mixed precision")
    parser.add_argument("--gradient-accumulation-steps", type=int, default=2, help="Value of the gradient accumulation steps")
    parser.add_argument("--log-steps", type=int, default=13, help="Number of steps between logging training images and metrics")
    parser.add_argument("--num-warmup-steps", type=int, default=400, help="Number of steps")
    return parser.parse_args()


def main() -> None:
    args = parse_args()
    set_seed(args.seed)
    accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision)
    with accelerator.main_process_first():
        logs_dir = Path(args.logs_dir)
        logs_dir.mkdir(exist_ok=True)
        wandb.init(project=args.project_name, dir=logs_dir)
    train_dataset = MaskDataset(root=args.root, train=True)
    test_dataset = MaskDataset(root=args.root, train=False)
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, pin_memory=args.pin_memory, num_workers=args.num_workers, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, pin_memory=args.pin_memory, num_workers=args.num_workers, collate_fn=collate_fn)
    model = YOLOv3().to(accelerator.device)
    optimizer_class = getattr(torch.optim, args.optimizer)
    if args.weights_path:
        weights = torch.load(args.weights_path, map_location="cpu", weights_only=True)
        model.backbone.load_state_dict(weights)
    optimizer = optimizer_class(model.parameters(), lr=args.learning_rate)
    criterion = YoloLoss(class_counts=train_dataset.class_counts)
    scheduler = WarmupCosineAnnealingLR(optimizer, warmup_steps=args.num_warmup_steps // args.gradient_accumulation_steps, total_steps=args.num_epochs * len(train_loader) // args.gradient_accumulation_steps, eta_min=1e-7)

    model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)
    best_map = 0.0
    train_loss_ema = EMA()
    for epoch in range(1, args.num_epochs + 1):
        model.train()
        pbar = tqdm(train_loader, desc=f"Train epoch {epoch} / {args.num_epochs}")
        for images, (t_l, t_m, t_s) in pbar:
            images = images.to(accelerator.device)
            t_l = t_l.to(accelerator.device)
            t_m = t_m.to(accelerator.device)
            t_s = t_s.to(accelerator.device)
            with accelerator.accumulate(model):
                with accelerator.autocast():
                    out_l, out_m, out_s = model(images)
                    loss = criterion((out_l, out_m, out_s), (t_l, t_m, t_s))
                accelerator.backward(loss)
                grad_norm = None
                if accelerator.sync_gradients:
                    grad_norm = accelerator.clip_grad_norm_(model.parameters(), args.max_norm).item()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
            lr = scheduler.get_last_lr()[0]
            pbar.set_postfix({"loss": train_loss_ema(loss.item())})
            log_data = {
                "train/epoch": epoch,
                "train/loss": loss.item(),
                "train/lr": lr
            }
            if grad_norm is not None:
                log_data["train/grad_norm"] = grad_norm
            if accelerator.is_main_process:
                wandb.log(log_data)
        accelerator.wait_for_everyone()
        model.eval()
        all_pred = [[] for _ in range(model.num_classes)]
        all_gt = [[] for _ in range(model.num_classes)]
        with torch.inference_mode():
            test_loss = 0.0
            pbar = tqdm(test_loader, desc=f"Test epoch {epoch} / {args.num_epochs}")
            for index, (images, (t_l, t_m, t_s)) in enumerate(pbar):
                images = images.to(accelerator.device)
                t_l = t_l.to(accelerator.device)
                t_m = t_m.to(accelerator.device)
                t_s = t_s.to(accelerator.device)
                out_l, out_m, out_s = model(images)
                loss = criterion((out_l, out_m, out_s), (t_l, t_m, t_s))
                test_loss += loss.item()
                results = decode_predictions_3scales(out_l, out_m, out_s, ANCHORS["large"], ANCHORS["medium"], ANCHORS["small"])
                gt_batch = decode_target_3scales(t_l, t_m, t_s, ANCHORS["large"], ANCHORS["medium"], ANCHORS["small"])
                if (index + 1) % args.log_steps == 0 and accelerator.is_main_process:
                    images_to_log = []
                    combined_image = create_combined_image(images, gt_batch, results)
                    images_to_log.append(wandb.Image(combined_image, caption=f"Combined Image (Test, Epoch {epoch})"))
                    wandb.log({"test_samples": images_to_log})
                for b_i in range(len(images)):
                    dets_b = results[b_i].detach().cpu().numpy()
                    gts_b = gt_batch[b_i].detach().cpu().numpy()
                    for db in dets_b:
                        c = int(db[5])
                        all_pred[c].append([db[0], db[1], db[2], db[3], db[4]])
                    for gb in gts_b:
                        c = int(gb[4])
                        all_gt[c].append([gb[0], gb[1], gb[2], gb[3]])
        test_loss /= len(test_loader)
        test_map = compute_map(all_pred, all_gt)
        accelerator.print(f"loss: {test_loss:.3f}, map: {test_map:.3f}")
        if accelerator.is_main_process:
            wandb.log({
                "epoch": epoch,
                "test/loss": test_loss,
                "test/mAP": test_map
            })
            if test_map > best_map:
                best_map = test_map
                accelerator.save(model.state_dict(), logs_dir / "checkpoint-best.pth")
            elif epoch % args.save_frequency == 0:
                accelerator.save(model.state_dict(), logs_dir / f"checkpoint-{epoch:09}.pth")
            accelerator.wait_for_everyone()
        accelerator.wait_for_everyone()
    wandb.finish()


if __name__ == "__main__":
    main()
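A small sketch of the WarmupCosineAnnealingLR schedule defined at the top of train.py, stepped in isolation with a hypothetical optimizer and step counts; it assumes the pinned dependencies are installed, and src/ goes on sys.path because train.py imports its siblings as top-level modules.

import sys
sys.path.insert(0, "src")
import torch
from train import WarmupCosineAnnealingLR

opt = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=5e-4)
sched = WarmupCosineAnnealingLR(opt, warmup_steps=400, total_steps=5000, eta_min=1e-7)
lrs = []
for _ in range(5000):
    opt.step()
    sched.step()
    lrs.append(sched.get_last_lr()[0])
print(max(lrs), lrs[-1])  # rises linearly to ~5e-4 over the first 400 steps, then decays towards eta_min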
src/utils.py
ADDED
@@ -0,0 +1,11 @@
class EMA:
    def __init__(self, alpha: float = 0.9) -> None:
        self.value = None
        self.alpha = alpha

    def __call__(self, value: float) -> float:
        if self.value is None:
            self.value = value
        else:
            self.value = self.alpha * self.value + (1 - self.alpha) * value
        return self.value
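The EMA helper only smooths the scalar loss shown in the training progress bar; a tiny illustration:

from src.utils import EMA

ema = EMA(alpha=0.9)
print(ema(1.0))   # 1.0   (first value is taken as-is)
print(ema(0.5))   # ~0.95 = 0.9 * 1.0  + 0.1 * 0.5
print(ema(0.25))  # ~0.88 = 0.9 * 0.95 + 0.1 * 0.25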
weights/checkpoint-best.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d496cd707cec1135b6d6cfece5c35b92572063914d81ae2bbbc8ded5c7366e10
size 224442922