Merge pull request #2 from shrits-ai/main
Browse files- .gitignore +2 -0
- README.md +26 -0
- resnet_execute.py +125 -26
- tmppl87qjev/_remote_module_non_scriptable.py +81 -0
.gitignore
CHANGED
@@ -3,4 +3,6 @@ data/
|
|
3 |
__pycache__
|
4 |
ResNet 50_Model.xlsx
|
5 |
~$ResNet 50_Model.xlsx
|
|
|
|
|
6 |
|
|
|
3 |
__pycache__
|
4 |
ResNet 50_Model.xlsx
|
5 |
~$ResNet 50_Model.xlsx
|
6 |
+
checkpoint.pth
|
7 |
+
|
8 |
|
README.md
CHANGED
@@ -6,6 +6,32 @@
|
|
6 |
|
7 |
## Data Augmentations
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
|
11 |
## Model Results
|
|
|
6 |
|
7 |
## Data Augmentations
|
8 |
|
9 |
+
To enhance the model's robustness and generalization capabilities, we apply a series of data augmentations to the training dataset. These augmentations are inspired by the original ResNet paper and implemented using the albumentations library. The augmentations include random resized cropping, horizontal flipping, and color jittering, followed by normalization. These transformations help the model learn invariant features and improve performance on unseen data.
|
10 |
+
|
11 |
+
### Augmentations and Hyperparameters
|
12 |
+
|
13 |
+
1. **Random Resized Crop:**
|
14 |
+
- Height: 224
|
15 |
+
- Width: 224
|
16 |
+
- Scale: (0.08, 1.0)
|
17 |
+
- Aspect Ratio: (3/4, 4/3)
|
18 |
+
- Probability: 1.0
|
19 |
+
|
20 |
+
2. **Horizontal Flip:**
|
21 |
+
- Probability: 0.5
|
22 |
+
|
23 |
+
3. **Color Jitter:**
|
24 |
+
- Brightness: 0.4
|
25 |
+
- Contrast: 0.4
|
26 |
+
- Saturation: 0.4
|
27 |
+
- Hue: 0.1
|
28 |
+
- Probability: 0.8
|
29 |
+
|
30 |
+
4. **Normalization:**
|
31 |
+
- Mean: (0.485, 0.456, 0.406)
|
32 |
+
- Standard Deviation: (0.229, 0.224, 0.225)
|
33 |
+
|
34 |
+
These augmentations are applied only to the training dataset. The test dataset is only resized to 256×256, center-cropped to 224×224, and normalized with the same mean and standard deviation, to ensure consistent evaluation metrics.
|
35 |
|
36 |
|
37 |
## Model Results
|
resnet_execute.py
CHANGED
@@ -8,39 +8,55 @@ from resnet_model import ResNet50
|
|
8 |
from tqdm import tqdm
|
9 |
from torchvision import datasets
|
10 |
from checkpoint import save_checkpoint, load_checkpoint
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
# Define transformations
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
])
|
19 |
|
20 |
# Train dataset and loader
|
21 |
-
trainset = datasets.ImageFolder(root='/mnt/imagenet/ILSVRC/Data/CLS-LOC/train', transform=
|
22 |
-
trainloader = DataLoader(trainset, batch_size=128, shuffle=True, num_workers=
|
23 |
|
24 |
-
testset = datasets.ImageFolder(root='/mnt/imagenet/ILSVRC/Data/CLS-LOC/val', transform=
|
25 |
-
testloader = DataLoader(testset, batch_size=
|
26 |
|
27 |
# Initialize model, loss function, and optimizer
|
28 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
29 |
model = ResNet50()
|
30 |
model = torch.nn.DataParallel(model)
|
31 |
model = model.to(device)
|
|
|
32 |
|
33 |
criterion = nn.CrossEntropyLoss()
|
34 |
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
|
35 |
|
36 |
# Training function
|
37 |
from torch.amp import autocast
|
38 |
-
from tqdm import tqdm
|
39 |
|
40 |
def train(model, device, train_loader, optimizer, criterion, epoch, accumulation_steps=4):
|
41 |
model.train()
|
42 |
running_loss = 0.0
|
43 |
-
|
|
|
44 |
total = 0
|
45 |
pbar = tqdm(train_loader)
|
46 |
|
@@ -58,24 +74,28 @@ def train(model, device, train_loader, optimizer, criterion, epoch, accumulation
|
|
58 |
optimizer.zero_grad()
|
59 |
|
60 |
running_loss += loss.item() * accumulation_steps
|
61 |
-
_, predicted = outputs.
|
62 |
total += targets.size(0)
|
63 |
-
|
|
|
64 |
|
65 |
-
pbar.set_description(desc=f'Epoch {epoch} | Loss: {running_loss / (batch_idx + 1):.4f} |
|
66 |
|
67 |
if (batch_idx + 1) % 50 == 0:
|
68 |
torch.cuda.empty_cache()
|
69 |
|
70 |
-
return 100. *
|
71 |
-
|
72 |
|
73 |
# Testing function
|
74 |
def test(model, device, test_loader, criterion):
|
75 |
model.eval()
|
76 |
test_loss = 0
|
77 |
-
|
|
|
78 |
total = 0
|
|
|
|
|
|
|
79 |
|
80 |
with torch.no_grad():
|
81 |
for inputs, targets in test_loader:
|
@@ -84,13 +104,22 @@ def test(model, device, test_loader, criterion):
|
|
84 |
loss = criterion(outputs, targets)
|
85 |
|
86 |
test_loss += loss.item()
|
87 |
-
_, predicted = outputs.
|
88 |
total += targets.size(0)
|
89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
|
91 |
-
|
92 |
-
|
93 |
-
|
|
|
94 |
|
95 |
# Main execution
|
96 |
if __name__ == '__main__':
|
@@ -105,10 +134,19 @@ if __name__ == '__main__':
|
|
105 |
except FileNotFoundError:
|
106 |
print("No checkpoint found, starting from scratch.")
|
107 |
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
if test_loss < best_loss:
|
113 |
best_loss = test_loss
|
114 |
patience_counter = 0
|
@@ -119,3 +157,64 @@ if __name__ == '__main__':
|
|
119 |
if patience_counter >= patience:
|
120 |
print("Early stopping triggered. Training terminated.")
|
121 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
from tqdm import tqdm
|
9 |
from torchvision import datasets
|
10 |
from checkpoint import save_checkpoint, load_checkpoint
|
11 |
+
import matplotlib.pyplot as plt
|
12 |
+
from torchvision.utils import make_grid
|
13 |
+
import albumentations as A
|
14 |
+
from albumentations.pytorch import ToTensorV2
|
15 |
+
import numpy as np
|
16 |
+
from torchsummary import summary
|
17 |
|
18 |
# Define transformations
|
19 |
+
train_transform = A.Compose([
|
20 |
+
A.RandomResizedCrop(height=224, width=224, scale=(0.08, 1.0), ratio=(3/4, 4/3), p=1.0),
|
21 |
+
A.HorizontalFlip(p=0.5),
|
22 |
+
A.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1, p=0.8),
|
23 |
+
A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
|
24 |
+
ToTensorV2()
|
25 |
+
])
|
26 |
+
|
27 |
+
test_transform = A.Compose([
|
28 |
+
A.Resize(height=256, width=256),
|
29 |
+
A.CenterCrop(height=224, width=224),
|
30 |
+
A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
|
31 |
+
ToTensorV2()
|
32 |
])
|
33 |
|
34 |
# Train dataset and loader
|
35 |
+
trainset = datasets.ImageFolder(root='/mnt/imagenet/ILSVRC/Data/CLS-LOC/train', transform=lambda img: train_transform(image=np.array(img))['image'])
|
36 |
+
trainloader = DataLoader(trainset, batch_size=128, shuffle=True, num_workers=8, pin_memory=True)
|
37 |
|
38 |
+
testset = datasets.ImageFolder(root='/mnt/imagenet/ILSVRC/Data/CLS-LOC/val', transform=lambda img: test_transform(image=np.array(img))['image'])
|
39 |
+
testloader = DataLoader(testset, batch_size=500, shuffle=False, num_workers=8, pin_memory=True)
|
40 |
|
41 |
# Initialize model, loss function, and optimizer
|
42 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
43 |
+
print( device )
|
44 |
model = ResNet50()
|
45 |
model = torch.nn.DataParallel(model)
|
46 |
model = model.to(device)
|
47 |
+
summary(model, input_size=(3, 224, 224))
|
48 |
|
49 |
criterion = nn.CrossEntropyLoss()
|
50 |
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
|
51 |
|
52 |
# Training function
|
53 |
from torch.amp import autocast
|
|
|
54 |
|
55 |
def train(model, device, train_loader, optimizer, criterion, epoch, accumulation_steps=4):
|
56 |
model.train()
|
57 |
running_loss = 0.0
|
58 |
+
correct1 = 0
|
59 |
+
correct5 = 0
|
60 |
total = 0
|
61 |
pbar = tqdm(train_loader)
|
62 |
|
|
|
74 |
optimizer.zero_grad()
|
75 |
|
76 |
running_loss += loss.item() * accumulation_steps
|
77 |
+
_, predicted = outputs.topk(5, 1, True, True)
|
78 |
total += targets.size(0)
|
79 |
+
correct1 += predicted[:, :1].eq(targets.view(-1, 1).expand_as(predicted[:, :1])).sum().item()
|
80 |
+
correct5 += predicted.eq(targets.view(-1, 1).expand_as(predicted)).sum().item()
|
81 |
|
82 |
+
pbar.set_description(desc=f'Epoch {epoch} | Loss: {running_loss / (batch_idx + 1):.4f} | Top-1 Acc: {100. * correct1 / total:.2f} | Top-5 Acc: {100. * correct5 / total:.2f}')
|
83 |
|
84 |
if (batch_idx + 1) % 50 == 0:
|
85 |
torch.cuda.empty_cache()
|
86 |
|
87 |
+
return 100. * correct1 / total, 100. * correct5 / total, running_loss / len(train_loader)
|
|
|
88 |
|
89 |
# Testing function
|
90 |
def test(model, device, test_loader, criterion):
|
91 |
model.eval()
|
92 |
test_loss = 0
|
93 |
+
correct1 = 0
|
94 |
+
correct5 = 0
|
95 |
total = 0
|
96 |
+
misclassified_images = []
|
97 |
+
misclassified_labels = []
|
98 |
+
misclassified_preds = []
|
99 |
|
100 |
with torch.no_grad():
|
101 |
for inputs, targets in test_loader:
|
|
|
104 |
loss = criterion(outputs, targets)
|
105 |
|
106 |
test_loss += loss.item()
|
107 |
+
_, predicted = outputs.topk(5, 1, True, True)
|
108 |
total += targets.size(0)
|
109 |
+
correct1 += predicted[:, :1].eq(targets.view(-1, 1).expand_as(predicted[:, :1])).sum().item()
|
110 |
+
correct5 += predicted.eq(targets.view(-1, 1).expand_as(predicted)).sum().item()
|
111 |
+
|
112 |
+
# Collect misclassified samples
|
113 |
+
for i in range(inputs.size(0)):
|
114 |
+
if targets[i] not in predicted[i, :1]:
|
115 |
+
misclassified_images.append(inputs[i].cpu())
|
116 |
+
misclassified_labels.append(targets[i].cpu())
|
117 |
+
misclassified_preds.append(predicted[i, :1].cpu())
|
118 |
|
119 |
+
test_accuracy1 = 100. * correct1 / total
|
120 |
+
test_accuracy5 = 100. * correct5 / total
|
121 |
+
print(f'Test Loss: {test_loss/len(test_loader):.4f}, Top-1 Accuracy: {test_accuracy1:.2f}, Top-5 Accuracy: {test_accuracy5:.2f}')
|
122 |
+
return test_accuracy1, test_accuracy5, test_loss / len(test_loader), misclassified_images, misclassified_labels, misclassified_preds
|
123 |
|
124 |
# Main execution
|
125 |
if __name__ == '__main__':
|
|
|
134 |
except FileNotFoundError:
|
135 |
print("No checkpoint found, starting from scratch.")
|
136 |
|
137 |
+
# Store results for each epoch
|
138 |
+
results = []
|
139 |
+
learning_rates = []
|
140 |
+
|
141 |
+
# Epochs are numbered 1 through 25; the later `epoch == 25` check relies on this upper bound.
for epoch in range(1, 26): # 25 epochs
|
142 |
+
train_accuracy1, train_accuracy5, train_loss = train(model, device, trainloader, optimizer, criterion, epoch)
|
143 |
+
test_accuracy1, test_accuracy5, test_loss, misclassified_images, misclassified_labels, misclassified_preds = test(model, device, testloader, criterion)
|
144 |
+
print(f'Epoch {epoch} | Train Top-1 Acc: {train_accuracy1:.2f} | Train Top-5 Acc: {train_accuracy5:.2f} | Test Top-1 Acc: {test_accuracy1:.2f} | Test Top-5 Acc: {test_accuracy5:.2f}')
|
145 |
+
|
146 |
+
# Append results for this epoch
|
147 |
+
results.append((epoch, train_accuracy1, train_accuracy5, test_accuracy1, test_accuracy5, train_loss, test_loss))
|
148 |
+
learning_rates.append(optimizer.param_groups[0]['lr'])
|
149 |
+
|
150 |
if test_loss < best_loss:
|
151 |
best_loss = test_loss
|
152 |
patience_counter = 0
|
|
|
157 |
if patience_counter >= patience:
|
158 |
print("Early stopping triggered. Training terminated.")
|
159 |
break
|
160 |
+
|
161 |
+
# Only process misclassified samples after the last epoch
|
162 |
+
if epoch == 25:
|
163 |
+
# Display or process misclassified samples
|
164 |
+
if misclassified_images:
|
165 |
+
print("\nDisplaying some misclassified samples from the last epoch:")
|
166 |
+
misclassified_grid = make_grid(misclassified_images[:16], nrow=4, normalize=True, scale_each=True)
|
167 |
+
plt.figure(figsize=(8, 8))
|
168 |
+
plt.imshow(misclassified_grid.permute(1, 2, 0))
|
169 |
+
plt.title("Misclassified Samples")
|
170 |
+
plt.axis('off')
|
171 |
+
plt.show()
|
172 |
+
|
173 |
+
# Print the Top-1 accuracy results in a tab-separated format
|
174 |
+
print("\nEpoch\tTrain Top-1 Accuracy\tTest Top-1 Accuracy")
|
175 |
+
# Each row of `results` is
# (epoch, train_top1, train_top5, test_top1, test_top5, train_loss, test_loss).
# Skip the train Top-5 slot explicitly so the third printed column really is
# the test Top-1 accuracy promised by the header (previously it was train Top-5).
for epoch, train_acc1, _train_acc5, test_acc1, *_ in results:
    print(f"{epoch}\t{train_acc1:.2f}\t{test_acc1:.2f}")
|
177 |
+
|
178 |
+
# Plotting
|
179 |
+
epochs = [r[0] for r in results]
|
180 |
+
train_acc1 = [r[1] for r in results]
|
181 |
+
train_acc5 = [r[2] for r in results]
|
182 |
+
test_acc1 = [r[3] for r in results]
|
183 |
+
test_acc5 = [r[4] for r in results]
|
184 |
+
train_losses = [r[5] for r in results]
|
185 |
+
test_losses = [r[6] for r in results]
|
186 |
+
|
187 |
+
plt.figure(figsize=(12, 8))
|
188 |
+
plt.subplot(2, 2, 1)
|
189 |
+
plt.plot(epochs, train_acc1, label='Train Top-1 Acc')
|
190 |
+
plt.plot(epochs, test_acc1, label='Test Top-1 Acc')
|
191 |
+
plt.xlabel('Epoch')
|
192 |
+
plt.ylabel('Accuracy')
|
193 |
+
plt.legend()
|
194 |
+
plt.title('Top-1 Accuracy')
|
195 |
+
|
196 |
+
plt.subplot(2, 2, 2)
|
197 |
+
plt.plot(epochs, train_acc5, label='Train Top-5 Acc')
|
198 |
+
plt.plot(epochs, test_acc5, label='Test Top-5 Acc')
|
199 |
+
plt.xlabel('Epoch')
|
200 |
+
plt.ylabel('Accuracy')
|
201 |
+
plt.legend()
|
202 |
+
plt.title('Top-5 Accuracy')
|
203 |
+
|
204 |
+
plt.subplot(2, 2, 3)
|
205 |
+
plt.plot(epochs, train_losses, label='Train Loss')
|
206 |
+
plt.plot(epochs, test_losses, label='Test Loss')
|
207 |
+
plt.xlabel('Epoch')
|
208 |
+
plt.ylabel('Loss')
|
209 |
+
plt.legend()
|
210 |
+
plt.title('Loss')
|
211 |
+
|
212 |
+
plt.subplot(2, 2, 4)
|
213 |
+
plt.plot(epochs, learning_rates, label='Learning Rate')
|
214 |
+
plt.xlabel('Epoch')
|
215 |
+
plt.ylabel('Learning Rate')
|
216 |
+
plt.legend()
|
217 |
+
plt.title('Learning Rate')
|
218 |
+
|
219 |
+
plt.tight_layout()
|
220 |
+
plt.show()
|
tmppl87qjev/_remote_module_non_scriptable.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import *
|
2 |
+
|
3 |
+
import torch
|
4 |
+
import torch.distributed.rpc as rpc
|
5 |
+
from torch import Tensor
|
6 |
+
from torch._jit_internal import Future
|
7 |
+
from torch.distributed.rpc import RRef
|
8 |
+
from typing import Tuple # pyre-ignore: unused import
|
9 |
+
|
10 |
+
|
11 |
+
module_interface_cls = None
|
12 |
+
|
13 |
+
|
14 |
+
def forward_async(self, *args, **kwargs):
|
15 |
+
args = (self.module_rref, self.device, self.is_device_map_set, *args)
|
16 |
+
kwargs = {**kwargs}
|
17 |
+
return rpc.rpc_async(
|
18 |
+
self.module_rref.owner(),
|
19 |
+
_remote_forward,
|
20 |
+
args,
|
21 |
+
kwargs,
|
22 |
+
)
|
23 |
+
|
24 |
+
|
25 |
+
def forward(self, *args, **kwargs):
|
26 |
+
args = (self.module_rref, self.device, self.is_device_map_set, *args)
|
27 |
+
kwargs = {**kwargs}
|
28 |
+
ret_fut = rpc.rpc_async(
|
29 |
+
self.module_rref.owner(),
|
30 |
+
_remote_forward,
|
31 |
+
args,
|
32 |
+
kwargs,
|
33 |
+
)
|
34 |
+
return ret_fut.wait()
|
35 |
+
|
36 |
+
|
37 |
+
_generated_methods = [
|
38 |
+
forward_async,
|
39 |
+
forward,
|
40 |
+
]
|
41 |
+
|
42 |
+
|
43 |
+
|
44 |
+
|
45 |
+
def _remote_forward(
|
46 |
+
module_rref: RRef[module_interface_cls], device: str, is_device_map_set: bool, *args, **kwargs):
|
47 |
+
module = module_rref.local_value()
|
48 |
+
device = torch.device(device)
|
49 |
+
|
50 |
+
if device.type != "cuda":
|
51 |
+
return module.forward(*args, **kwargs)
|
52 |
+
|
53 |
+
# If the module is on a cuda device,
|
54 |
+
# move any CPU tensor in args or kwargs to the same cuda device.
|
55 |
+
# Since torch script does not support generator expression,
|
56 |
+
# have to use concatenation instead of
|
57 |
+
# ``tuple(i.to(device) if isinstance(i, Tensor) else i for i in *args)``.
|
58 |
+
args = (*args,)
|
59 |
+
out_args: Tuple[()] = ()
|
60 |
+
for arg in args:
|
61 |
+
arg = (arg.to(device),) if isinstance(arg, Tensor) else (arg,)
|
62 |
+
out_args = out_args + arg
|
63 |
+
|
64 |
+
kwargs = {**kwargs}
|
65 |
+
for k, v in kwargs.items():
|
66 |
+
if isinstance(v, Tensor):
|
67 |
+
kwargs[k] = kwargs[k].to(device)
|
68 |
+
|
69 |
+
if is_device_map_set:
|
70 |
+
return module.forward(*out_args, **kwargs)
|
71 |
+
|
72 |
+
# If the device map is empty, then only CPU tensors are allowed to send over wire,
|
73 |
+
# so have to move any GPU tensor to CPU in the output.
|
74 |
+
# Since torch script does not support generator expression,
|
75 |
+
# have to use concatenation instead of
|
76 |
+
# ``tuple(i.cpu() if isinstance(i, Tensor) else i for i in module.forward(*out_args, **kwargs))``.
|
77 |
+
ret: Tuple[()] = ()
|
78 |
+
for i in module.forward(*out_args, **kwargs):
|
79 |
+
i = (i.cpu(),) if isinstance(i, Tensor) else (i,)
|
80 |
+
ret = ret + i
|
81 |
+
return ret
|