File size: 4,040 Bytes
60b0ddc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import numpy as np
import torch
import torchvision.transforms as T
import torchvision.transforms.functional as TF
import requests
import io
from os import path
from typing import Tuple, List
from pathlib import Path
import os


class ImageDatasetBINARY:
    """
    Binary (sick / not-sick) chest X-ray dataset backed by numpy arrays.

    Keeps the data in the more efficient numpy arrays for as long as
    possible and only converts to torch tensors when a sample is requested
    (torch tensors are the objects used to pass the data through the neural
    network and apply weights).

    Labels in the source .npy file are 6-class disease codes, collapsed to
    binary at load time:
        0 (Atelectasis), 1 (Effusion), 2 (Infiltration),
        4 (Nodule), 5 (Pneumonia)  -> 1 (SICK)
        3 (No Finding)             -> 0 (NON SICK)
    """

    def __init__(self, x: Path, y: Path) -> None:
        """
        Args:
            x: local path of a .npy file holding the image array.
            y: local path of a .npy file holding the integer class labels.
        """
        # Target labels
        self.targets = ImageDatasetBINARY.load_numpy_arr_from_npy(y)
        # Images
        self.imgs = ImageDatasetBINARY.load_numpy_arr_from_npy(x)

        # Collapse the 6-class labels to binary.  Both masks are computed
        # BEFORE any assignment: the original remapped classes one at a
        # time in place (0->1, then 1->1, ...), which was only correct by
        # accident of ordering -- e.g. `targets == 1` also matched rows
        # that had just been rewritten from 0, and moving the 3->0 line
        # earlier would have silently corrupted the labels.  Labels outside
        # 0-5 (if any) are left untouched, exactly as before.
        sick = np.isin(self.targets, (0, 1, 2, 4, 5))  # diseased classes -> SICK
        healthy = self.targets == 3                    # "No Finding"    -> NON SICK
        self.targets[sick] = 1
        self.targets[healthy] = 0

    def __len__(self) -> int:
        """Number of samples (one label per image)."""
        return len(self.targets)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, np.ndarray]:
        """
        Return one (image, label) pair with augmentation applied.

        The image is scaled to [0, 1], per-image standardized, resized,
        center-cropped, rotated by a fixed 90 degrees (NOTE(review): the
        rotation is deterministic, not random -- confirm this is intended),
        randomly flipped, and perturbed with additive Gaussian noise.
        """
        # Scale raw pixel values (assumed 0..255 -- TODO confirm dtype of
        # the source array) to [0, 1] and convert to a float tensor.
        image = torch.from_numpy(self.imgs[idx] / 255).float()
        label = self.targets[idx]

        # Per-image statistics for normalization.
        mean = image.mean()
        std = image.std()
        # Robustness fix: a constant image has std == 0, which would make
        # Normalize divide by zero (NaN/inf output or a ValueError,
        # depending on the torchvision version).  Fall back to a no-op
        # scale of 1 so such images pass through unscaled.
        if std == 0:
            std = torch.tensor(1.0)

        # Compose: chains the transforms in order (torch documentation).
        compose = T.Compose([
            T.Normalize(mean, std),  # per-image standardization
            T.Resize(156),  # resize so the short side is 156
            T.CenterCrop(128),  # crop the central 128x128 region
            T.Lambda(lambda x: TF.rotate(x, angle=90)),  # fixed 90-degree rotation
            T.RandomHorizontalFlip(p=0.5),  # horizontal flip with 50% probability
            T.RandomVerticalFlip(p=0.5),  # vertical flip with 50% probability
            T.Lambda(lambda x: x + torch.randn_like(x) * 0.1)  # additive Gaussian noise
        ])

        # Apply the composed transformation.
        image = compose(image)

        return image, label

    def get_labels(self) -> List[int]:
        """Return the binary labels as a plain Python list."""
        # tolist() yields Python ints, not ndarrays, so the annotation
        # reflects that (the original said List[np.ndarray]).
        return self.targets.tolist()

    @staticmethod
    def load_numpy_arr_from_npy(path: Path) -> np.ndarray:
        """
        Loads a numpy array from local storage.

        Input:
        path: local path of file

        Outputs:
        dataset: numpy array with input features or labels
        """
        return np.load(path)


def load_numpy_arr_from_url(url: str, timeout: float = 60.0) -> np.ndarray:
    """
    Loads a numpy array from surfdrive.

    Input:
    url: Download link of dataset
    timeout: seconds to wait for the server before giving up
        (requests has NO default timeout, so the original call could
        hang forever on a stalled connection)

    Outputs:
    dataset: numpy array with input features or labels

    Raises:
    requests.HTTPError: if the server returns an error status.
    requests.Timeout: if the server does not respond within `timeout`.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    return np.load(io.BytesIO(response.content))


if __name__ == "__main__":
    # Ensure the ./data directory exists before downloading.
    # BUG FIX: the original checked path.exists(path.join(cwd + "data/")) --
    # string concatenation with `+` instead of a join argument, producing a
    # path like "/home/userdata/". The check therefore tested the wrong
    # location, and on a re-run os.mkdir raised FileExistsError.
    data_dir = Path(os.getcwd()) / "data"
    if data_dir.exists():
        print("Data directory exists, files may be overwritten!")
    else:
        data_dir.mkdir()
    ### Load labels
    train_y = load_numpy_arr_from_url(
        url="https://surfdrive.surf.nl/files/index.php/s/i6MvQ8nqoiQ9Tci/download"
    )
    np.save("data/Y_train.npy", train_y)
    test_y = load_numpy_arr_from_url(
        url="https://surfdrive.surf.nl/files/index.php/s/wLXiOjVAW4AWlXY/download"
    )
    np.save("data/Y_test.npy", test_y)
    ### Load data
    train_x = load_numpy_arr_from_url(
        url="https://surfdrive.surf.nl/files/index.php/s/4rwSf9SYO1ydGtK/download"
    )
    np.save("data/X_train.npy", train_x)
    test_x = load_numpy_arr_from_url(
        url="https://surfdrive.surf.nl/files/index.php/s/dvY2LpvFo6dHef0/download"
    )
    np.save("data/X_test.npy", test_x)