thoracic-disease-classifier / dc1 /image_dataset.py
ummtushar's picture
initial commit
60b0ddc verified
import numpy as np
import torch
import torchvision.transforms as T
import torchvision.transforms.functional as TF
import requests
import io
from os import path
from typing import Tuple, List
from pathlib import Path
import os
class ImageDataset:
    """
    Dataset backed by two ``.npy`` files: one with images, one with labels.

    The data is kept in the more efficient numpy arrays for as long as
    possible; an image is only converted to a torch tensor when it is
    requested through ``__getitem__`` (torch tensors are the objects used to
    pass the data through the neural network and apply weights).
    """

    def __init__(self, x: Path, y: Path) -> None:
        """
        Load the image and label arrays from local storage.

        Input:
            x: local path of the .npy file with the images
            y: local path of the .npy file with the target labels
        """
        # Target labels
        self.targets = ImageDataset.load_numpy_arr_from_npy(y)
        # Images
        self.imgs = ImageDataset.load_numpy_arr_from_npy(x)

    def __len__(self) -> int:
        """Return the number of samples, taken from the label array."""
        return len(self.targets)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, np.ndarray]:
        """
        Return the preprocessed, randomly augmented image at `idx` and its label.

        The flips and the additive noise are random, so repeated calls with
        the same index return different tensors.

        Input:
            idx: index of the sample
        Outputs:
            image: float tensor after normalization, resize, crop, rotation,
                   random flips and additive noise
            label: the entry of the label array at `idx`
        """
        # Scale the raw values by 1/255 and convert to a float tensor
        # (presumably the stored images are 8-bit, i.e. 0..255 - TODO confirm).
        image = torch.from_numpy(self.imgs[idx] / 255).float()
        label = self.targets[idx]

        # Per-image statistics used for the normalization
        mean = image.mean()
        std = image.std()
        # T.Normalize raises a ValueError when std is zero, which is the case
        # for a constant (e.g. completely blank) image. Fall back to a unit
        # std so such an image is only mean-centred instead of crashing the
        # data loading.
        if std == 0:
            std = torch.ones_like(std)

        # Compose: chains several transforms together (torch documentation)
        pipeline = T.Compose([
            T.Normalize(mean, std),  # Normalization with the per-image statistics
            T.Resize(156),  # Smaller edge is resized to 156 px, aspect ratio kept
            T.CenterCrop(128),  # Cropping to focus on the center 128x128 region
            T.Lambda(lambda x: TF.rotate(x, angle=90)),  # Fixed rotation by 90 degrees
            T.RandomHorizontalFlip(p=0.5),  # Random horizontal flip with a 50% probability
            T.RandomVerticalFlip(p=0.5),  # Random vertical flip with a 50% probability
            T.Lambda(lambda x: x + torch.randn_like(x) * 0.1),  # Gaussian noise, std 0.1
        ])
        # Apply the composed transformation
        return pipeline(image), label

    def get_labels(self) -> list:
        """
        Return all target labels as a plain Python list.

        The conversion is done by ``ndarray.tolist()``, so the elements are
        Python objects (nested lists when the label array has more than one
        dimension), not numpy arrays.
        """
        return self.targets.tolist()

    @staticmethod
    def load_numpy_arr_from_npy(path: Path) -> np.ndarray:
        """
        Loads a numpy array from local storage.

        Input:
            path: local path of file
        Outputs:
            dataset: numpy array with input features or labels
        """
        return np.load(path)
def load_numpy_arr_from_url(url: str, timeout: float = 60.0) -> np.ndarray:
    """
    Loads a numpy array from a download link (e.g. surfdrive).

    Input:
        url: Download link of dataset
        timeout: seconds to wait for the server to connect or to send data
                 before giving up
    Outputs:
        dataset: numpy array with input features or labels
    Raises:
        requests.exceptions.HTTPError: the server answered with an error status
        requests.exceptions.Timeout: the server did not respond within `timeout`
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # server and the download script would hang.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Expose the downloaded bytes as a file-like object for np.load
    return np.load(io.BytesIO(response.content))
if __name__ == "__main__":
    # Download the label and image arrays into <current directory>/data.
    # The directory path is built once with a proper separator so that the
    # existence check and the mkdir call refer to the same location.
    data_dir = path.join(os.getcwd(), "data")
    if path.exists(data_dir):
        print("Data directory exists, files may be overwritten!")
    else:
        os.mkdir(data_dir)

    # (file name inside data/, download link); labels first, then images
    downloads = (
        ("Y_train.npy", "https://surfdrive.surf.nl/files/index.php/s/i6MvQ8nqoiQ9Tci/download"),
        ("Y_test.npy", "https://surfdrive.surf.nl/files/index.php/s/wLXiOjVAW4AWlXY/download"),
        ("X_train.npy", "https://surfdrive.surf.nl/files/index.php/s/4rwSf9SYO1ydGtK/download"),
        ("X_test.npy", "https://surfdrive.surf.nl/files/index.php/s/dvY2LpvFo6dHef0/download"),
    )
    for file_name, url in downloads:
        array = load_numpy_arr_from_url(url=url)
        np.save(path.join(data_dir, file_name), array)