thoracic-disease-classifier / dc1 /processing.py

initial commit

60b0ddc verified 10 months ago

14.5 kB

	# # # # Imports
	# # # import torch
	# # # import numpy as np
	# # # import pandas as pd
	# # # import matplotlib.pyplot as plt
	# # # import seaborn as sns
	# # # # Imports
	# # # import torch
	# # # import numpy as np
	# # # import pandas as pd
	# # # import matplotlib.pyplot as plt
	# # # import seaborn as sns

	# # # from sklearn.metrics import confusion_matrix, roc_curve, auc
	# # # from typing import Callable, List, Tuple
	# # # import torch.nn as nn
	# # # from pathlib import Path
	# # # import torch.nn.functional as F
	# # # from yaml import FlowSequenceStartToken
	# # # from sklearn.metrics import confusion_matrix, roc_curve, auc
	# # # from typing import Callable, List, Tuple
	# # # import torch.nn as nn
	# # # from pathlib import Path
	# # # import torch.nn.functional as F
	# # # from yaml import FlowSequenceStartToken

	# # Import files
	# from image_dataset import ImageDataset
	# from net import Net, ResNetModel, EfficientNetModel
	# from train_test import train_model, test_model
	# from batch_sampler import BatchSampler

	# NOTE: File used in the very beginning of the project. Please ignore!

	# maincolor = '#4a8cffff'
	# secondcolor = '#e06666'

	# # Train data
	# labels_train_path = 'dc1/data/Y_train.npy'
	# data_train_path = 'dc1/data/X_train.npy'
	# # Test data
	# labels_test_path = 'dc1/data/Y_test.npy'
	# data_test_path = 'dc1/data/X_test.npy'


	# y_train = np.load(labels_train_path)
	# unique_labels = np.unique(y_train)
	# data_train = np.load(data_train_path)


	# # Data verification: sanity-check the shape and dtype of the loaded data and labels
	# data_shape = data_train.shape
	# data_type = data_train.dtype
	# labels_shape = y_train.shape
	# labels_type = y_train.dtype
	# print(f"Data Shape: {data_shape}, Data Type: {data_type}")
	# print(f"Labels Shape: {labels_shape}, Labels Type: {labels_type}")

	# # Check the range and distribution of features
	# data_range = (np.min(data_train), np.max(data_train))

	# # Label Encoding in accordance to the diseases
	# class_names_mapping = {
	# 0: 'Atelectasis',
	# 1: 'Effusion',
	# 2: 'Infiltration',
	# 3: 'No Finding',
	# 4: 'Nodule',
	# 5: 'Pneumonia'
	# }

	# print("Unique classes in the training set:")
	# for class_id in unique_labels:
	# print(f"Class ID {class_id}: {class_names_mapping[class_id]}")

	# # df for distribution analysis
	# df_data_range = pd.DataFrame(data_train.reshape(data_train.shape[0], -1))

	# ###################################################################
	# ########### A D V A N C E D   A N A L Y S I S ###########
	# ##################################################################

	# # Y test data (labels)
	# y_test = np.load(labels_test_path)

	# # Initialize model (NET)
	# n_classes = 6
	# # NOTE : change the nn here!
	# model = Net(n_classes=n_classes)
	# # model = ResNetModel(n_classes=n_classes)
	# # model = EfficientNetModel(n_classes=n_classes)

	# # Device for test_model function call
	# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	# model.to(device)

	# # Initialize the loss function
	# loss_function = nn.CrossEntropyLoss() # we could use a different loss; I found this one online, but I was getting errors...


	# # # Data verification: sanity-check the shape and dtype of the loaded data and labels
	# # data_shape = data_train.shape
	# # data_type = data_train.dtype
	# # labels_shape = y_train.shape
	# # labels_type = y_train.dtype
	# # print(f"Data Shape: {data_shape}, Data Type: {data_type}")
	# # print(f"Labels Shape: {labels_shape}, Labels Type: {labels_type}")

	# # # Check the range and distribution of features
	# # data_range = (np.min(data_train), np.max(data_train))

	# # # Label Encoding in accordance to the diseases
	# # class_names_mapping = {
	# # 0: 'Atelectasis',
	# # 1: 'Effusion',
	# # 2: 'Infiltration',
	# # 3: 'No Finding',
	# # 4: 'Nodule',
	# # 5: 'Pneumonia'
	# # }

	# # print("Unique classes in the training set:")
	# # for class_id in unique_labels:
	# # print(f"Class ID {class_id}: {class_names_mapping[class_id]}")

	# # # df for distribution analysis
	# # df_data_range = pd.DataFrame(data_train.reshape(data_train.shape[0], -1))

	# # ###################################################################
	# # ########### A D V A N C E D   A N A L Y S I S ###########
	# # ##################################################################

	# # # Y test data (labels)
	# # y_test = np.load(labels_test_path)

	# # # Initialize model (NET)
	# # n_classes = 6
	# # # NOTE : change the nn here!
	# # model = Net(n_classes=n_classes)
	# # # model = ResNetModel(n_classes=n_classes)
	# # # model = EfficientNetModel(n_classes=n_classes)

	# # # Device for test_model function call
	# # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	# # model.to(device)

	# # # Initialize the loss function
	# # loss_function = nn.CrossEntropyLoss() # we could use a different loss; I found this one online, but I was getting errors...

	# # # Load test dataset w function
	# # test_dataset = ImageDataset(Path("dc1/data/X_test.npy"), Path("dc1/data/Y_test.npy"))

	# # # Initialize the BatchSampler
	# # batch_size = 32
	# # test_loader = BatchSampler(batch_size=batch_size, dataset=test_dataset, balanced=False) # set 'balanced' to True or False depending on what we want

	# # # Function call
	# # losses, predicted_labels, true_labels, probabilities = test_model(model, test_loader, loss_function, device)

	# ##################### R O C C U R V E #####################
	# def plot_multiclass_roc_curve(y_true, y_scores, num_classes):
	# # Compute ROC curve and ROC area for each class
	# fpr = dict()
	# tpr = dict()
	# roc_auc = dict()

	# for i in range(num_classes):
	# fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_scores[:, i])
	# roc_auc[i] = auc(fpr[i], tpr[i])

	# # Plot all ROC curves
	# plt.figure()
	# for i in range(num_classes):
	# plt.plot(fpr[i], tpr[i], label=f'ROC curve of class {i} (area = {roc_auc[i]:.2f})')

	# plt.plot([0, 1], [0, 1], 'k--')
	# plt.xlim([0.0, 1.0])
	# plt.ylim([0.0, 1.05])
	# plt.xlabel('False Positive Rate')
	# plt.ylabel('True Positive Rate')
	# plt.title('Multiclass ROC Curve')
	# plt.legend(loc="lower right")
	# plt.show()

	# # Calculate the probabilities for each class
	# model_predictions = []
	# model_probabilities = []
	# model_probabilities = F.softmax(torch.tensor(model_predictions), dim=0).numpy()

	# plot_multiclass_roc_curve(y_test_binarized, model_probabilities, n_classes)

	# model.eval() # Set the model to evaluation mode
	# with torch.no_grad(): # Turn off gradients for the following block
	# for data, target in test_loader:
	# data, target = data.to(device), target.to(device)
	# output = model(data)

	# # Get class predictions
	# _, preds = torch.max(output, 1)
	# model_predictions.extend(preds.cpu().numpy())

	# # Get probabilities for the positive class
	# probs = F.softmax(output, dim=1)[:, 1] # Adjust the index based on your positive class
	# model_probabilities.extend(probs.cpu().numpy())

	# # # Specificity = number of true negatives / (number of true negatives + number of false positives)
	# # #             = number of true negatives / total number of individuals without the illness

	# # def sensitivity_specificity(conf_matrix):
	# # num_classes = conf_matrix.shape[0]
	# # sensitivity = np.zeros(num_classes)
	# # specificity = np.zeros(num_classes)

	# # for i in range(num_classes):
	# # TP = conf_matrix[i, i]
	# # FN = sum(conf_matrix[i, :]) - TP
	# # FP = sum(conf_matrix[:, i]) - TP
	# # TN = conf_matrix.sum() - (TP + FP + FN)

	# # sensitivity[i] = TP / (TP + FN) if (TP + FN) != 0 else 0
	# # specificity[i] = TN / (TN + FP) if (TN + FP) != 0 else 0

	# # return sensitivity, specificity

	# # from sklearn.preprocessing import label_binarize

	# # # Binarize the labels for the multiclass case (as suggested by an LLM)
	# # y_test_binarized = label_binarize(y_test, classes=np.unique(y_test))

	# # ##################### R O C C U R V E #####################
	# # def plot_multiclass_roc_curve(y_true, y_scores, num_classes):
	# # # Compute ROC curve and ROC area for each class
	# # fpr = dict()
	# # tpr = dict()
	# # roc_auc = dict()

	# # for i in range(num_classes):
	# # fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_scores[:, i])
	# # roc_auc[i] = auc(fpr[i], tpr[i])

	# # # Plot all ROC curves
	# # plt.figure()
	# # for i in range(num_classes):
	# # plt.plot(fpr[i], tpr[i], label=f'ROC curve of class {i} (area = {roc_auc[i]:.2f})')

	# # plt.plot([0, 1], [0, 1], 'k--')
	# # plt.xlim([0.0, 1.0])
	# # plt.ylim([0.0, 1.05])
	# # plt.xlabel('False Positive Rate')
	# # plt.ylabel('True Positive Rate')
	# # plt.title('Multiclass ROC Curve')
	# # plt.legend(loc="lower right")
	# # plt.show()

	# # # Calculate the probabilities for each class
	# # model_predictions = []
	# # model_probabilities = []
	# # model_probabilities = F.softmax(torch.tensor(model_predictions), dim=0).numpy()

	# # plot_multiclass_roc_curve(y_test_binarized, model_probabilities, n_classes)

	# # model.eval() # Set the model to evaluation mode
	# # with torch.no_grad(): # Turn off gradients for the following block
	# # for data, target in test_loader:
	# # data, target = data.to(device), target.to(device)
	# # output = model(data)

	# # # Get class predictions
	# # _, preds = torch.max(output, 1)
	# # model_predictions.extend(preds.cpu().numpy())

	# # # Get probabilities for the positive class
	# # probs = F.softmax(output, dim=1)[:, 1] # Adjust the index based on your positive class
	# # model_probabilities.extend(probs.cpu().numpy())


	# # # Calculate sensitivity and specificity
	# # sensitivity, specificity = sensitivity_specificity(y_test, model_predictions)
	# # print(f"Sensitivity: {sensitivity}")
	# # print(f"Specificity: {specificity}")


	# # ##################################################################################################################################################################

	# # # # Display the images, 1 for each class
	# # # def display_images(images, titles, num_images):
	# # # plt.figure(figsize=(15, 5))
	# # # for i in range(num_images):
	# # # image = np.squeeze(images[i]) # squeeze so the image is easy to display in 2D
	# # # plt.subplot(1, num_images, i + 1)
	# # # plt.imshow(image, cmap='gray')
	# # # plt.title(titles[i])
	# # # plt.axis('off')
	# # # plt.show()

	# # NOTE: leftover merge-conflict marker: >>>>>>> ab59272 (Net / ResNet / EfficientNet Experiments)
	# # # data_train = np.load(data_train_path)


	# # # # Data verification: sanity-check the shape and dtype of the loaded data and labels
	# # # data_shape = data_train.shape
	# # # data_type = data_train.dtype
	# # # labels_shape = y_train.shape
	# # # labels_type = y_train.dtype
	# # # print(f"Data Shape: {data_shape}, Data Type: {data_type}")
	# # # print(f"Labels Shape: {labels_shape}, Labels Type: {labels_type}")

	# # # # Check the range and distribution of features
	# # # data_range = (np.min(data_train), np.max(data_train))

	# # # # Label Encoding in accordance to the diseases
	# # # class_names_mapping = {
	# # # 0: 'Atelectasis',
	# # # 1: 'Effusion',
	# # # 2: 'Infiltration',
	# # # 3: 'No Finding',
	# # # 4: 'Nodule',
	# # # 5: 'Pneumonia'
	# # # }

	# # # print("Unique classes in the training set:")
	# # # for class_id in unique_labels:
	# # # print(f"Class ID {class_id}: {class_names_mapping[class_id]}")

	# # # # df for distribution analysis
	# # # df_data_range = pd.DataFrame(data_train.reshape(data_train.shape[0], -1))


	# # # Calculate the probabilities for each class
	# # model_predictions = []
	# # model_probabilities = []
	# # model_probabilities = F.softmax(torch.tensor(model_predictions), dim=0).numpy()

	# # plot_multiclass_roc_curve(y_test_binarized, model_probabilities, n_classes)

	# # model.eval() # Set the model to evaluation mode
	# # with torch.no_grad(): # Turn off gradients for the following block
	# # for data, target in test_loader:
	# # data, target = data.to(device), target.to(device)
	# # output = model(data)

	# # # Get class predictions
	# # _, preds = torch.max(output, 1)
	# # model_predictions.extend(preds.cpu().numpy())

	# # # Get probabilities for the positive class
	# # probs = F.softmax(output, dim=1)[:, 1] # Adjust the index based on your positive class
	# # model_probabilities.extend(probs.cpu().numpy())


	# # # Calculate sensitivity and specificity
	# # sensitivity, specificity = sensitivity_specificity(y_test, model_predictions)
	# # print(f"Sensitivity: {sensitivity}")
	# # print(f"Specificity: {specificity}")


	# # ##################################################################################################################################################################

	# # # # Display the images, 1 for each class
	# # # def display_images(images, titles, num_images):
	# # # plt.figure(figsize=(15, 5))
	# # # for i in range(num_images):
	# # # image = np.squeeze(images[i]) # squeeze so the image is easy to display in 2D
	# # # plt.subplot(1, num_images, i + 1)
	# # # plt.imshow(image, cmap='gray')
	# # # plt.title(titles[i])
	# # # plt.axis('off')
	# # # plt.show()

	# # NOTE: leftover merge-conflict marker: >>>>>>> ab59272 (Net / ResNet / EfficientNet Experiments)
	# # # data_train = np.load(data_train_path)


	# # # # Data verification: sanity-check the shape and dtype of the loaded data and labels
	# # # data_shape = data_train.shape
	# # # data_type = data_train.dtype
	# # # labels_shape = y_train.shape
	# # # labels_type = y_train.dtype
	# # # print(f"Data Shape: {data_shape}, Data Type: {data_type}")
	# # # print(f"Labels Shape: {labels_shape}, Labels Type: {labels_type}")

	# # # # Check the range and distribution of features
	# # # data_range = (np.min(data_train), np.max(data_train))

	# # # # Label Encoding in accordance to the diseases
	# # # class_names_mapping = {
	# # # 0: 'Atelectasis',
	# # # 1: 'Effusion',
	# # # 2: 'Infiltration',
	# # # 3: 'No Finding',
	# # # 4: 'Nodule',
	# # # 5: 'Pneumonia'
	# # # }

	# # # print("Unique classes in the training set:")
	# # # for class_id in unique_labels:
	# # # print(f"Class ID {class_id}: {class_names_mapping[class_id]}")

	# # # # df for distribution analysis
	# # # df_data_range = pd.DataFrame(data_train.reshape(data_train.shape[0], -1))