Commit · 197f827
1 Parent(s): 8b649e6

init gradio
Browse files
- LICENSE +24 -0
- Plybooks.ipynb +199 -0
- README.md +60 -14
- app.py +96 -0
- assets/attention-part.png +0 -0
- assets/embedding.png +0 -0
- assets/patches.png +0 -0
- assets/vit.png +0 -0
- requirements.txt +13 -0
- samples/mr_bean.png +0 -0
- samples/sectional-sofa.png +0 -0
- src/__pycache__/gradcams.cpython-311.pyc +0 -0
- src/datamodule.py +51 -0
- src/gradcams.py +61 -0
- src/old.py.old +194 -0
- src/vit.py +262 -0
LICENSE
ADDED
@@ -0,0 +1,24 @@
This is free and unencumbered software released into the public domain.

Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.

In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <https://unlicense.org>
Plybooks.ipynb
ADDED
@@ -0,0 +1,199 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import lightning as pl \n",
    "from src.datamodule import CIFAR10DataModule\n",
    "from src.vit import ViTLightning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "GPU available: True (cuda), used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "HPU available: False, using: 0 HPUs\n"
     ]
    }
   ],
   "source": [
    "trainer = pl.Trainer(max_epochs=15,accelerator='auto',reload_dataloaders_every_n_epochs=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = ViTLightning()\n",
    "dm = CIFAR10DataModule()\n",
    "dm.setup()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You are using a CUDA device ('NVIDIA GeForce RTX 4050 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Files already downloaded and verified\n",
      "Files already downloaded and verified\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n",
      "Loading `train_dataloader` to estimate number of stepping batches.\n",
      "\n",
      "  | Name      | Type               | Params | Mode \n",
      "---------------------------------------------------------\n",
      "0 | vit       | ViT                | 154 K  | train\n",
      "1 | train_acc | MulticlassAccuracy | 0      | train\n",
      "2 | val_acc   | MulticlassAccuracy | 0      | train\n",
      "3 | test_acc  | MulticlassAccuracy | 0      | train\n",
      "---------------------------------------------------------\n",
      "154 K     Trainable params\n",
      "0         Non-trainable params\n",
      "154 K     Total params\n",
      "0.616     Total estimated model params size (MB)\n",
      "37        Modules in train mode\n",
      "0         Modules in eval mode\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 14: 100%|██████████| 1407/1407 [00:28<00:00, 49.90it/s, v_num=0, train_loss=0.518, train_acc=0.875, val_loss=0.996, val_acc=0.644]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "`Trainer.fit` stopped: `max_epochs=15` reached.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 14: 100%|██████████| 1407/1407 [00:28<00:00, 49.87it/s, v_num=0, train_loss=0.518, train_acc=0.875, val_loss=0.996, val_acc=0.644]\n"
     ]
    }
   ],
   "source": [
    "trainer.fit(datamodule=dm,model=model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Files already downloaded and verified\n",
      "Files already downloaded and verified\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Validation DataLoader 0: 100%|██████████| 157/157 [00:00<00:00, 163.27it/s]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n",
       "┃<span style=\"font-weight: bold\"> Validate metric </span>┃<span style=\"font-weight: bold\"> DataLoader 0 </span>┃\n",
       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n",
       "│<span style=\"color: #008080; text-decoration-color: #008080\"> val_acc </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> 0.6284000277519226 </span>│\n",
       "│<span style=\"color: #008080; text-decoration-color: #008080\"> val_loss </span>│<span style=\"color: #800080; text-decoration-color: #800080\"> 1.0169780254364014 </span>│\n",
       "└───────────────────────────┴───────────────────────────┘\n",
       "</pre>\n"
      ],
      "text/plain": [
       "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n",
       "┃\u001b[1m \u001b[0m\u001b[1m Validate metric \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m DataLoader 0 \u001b[0m\u001b[1m \u001b[0m┃\n",
       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n",
       "│\u001b[36m \u001b[0m\u001b[36m val_acc \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m 0.6284000277519226 \u001b[0m\u001b[35m \u001b[0m│\n",
       "│\u001b[36m \u001b[0m\u001b[36m val_loss \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m 1.0169780254364014 \u001b[0m\u001b[35m \u001b[0m│\n",
       "└───────────────────────────┴───────────────────────────┘\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "[{'val_loss': 1.0169780254364014, 'val_acc': 0.6284000277519226}]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.validate(model,dm)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
README.md
CHANGED
@@ -1,14 +1,60 @@
# ViT
- GitHub source repo⭐:: [VitCiFar](https://github.com/Muthukamalan/VitCiFar)

As we all know, the Transformer architecture has taken the world by storm.

In this repo, I practised implementing it for vision, from scratch. Transformers are data hungry, so don't compare this directly with a CNN (it's not an apples-to-apples comparison here).


#### Model
<div align='center'><img src="https://raw.githubusercontent.com/Muthukamalan/VitCiFar/main/assets/vit.png" width=500 height=300></div>


**Patches**
```python
nn.Conv2d(
    in_chans,
    emb_dim,
    kernel_size = patch_size,
    stride = patch_size
)
```
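
To see what this strided convolution does to a CIFAR-10 image, trace the shapes: a 32×32 input with `patch_size=4` yields an 8×8 grid of patch embeddings, i.e. 64 tokens of length `emb_dim`. A minimal sketch (the concrete sizes are illustrative and follow this repo's CIFAR-10 setup; the variable names are mine):

```python
import torch
import torch.nn as nn

in_chans, emb_dim, patch_size = 3, 48, 4       # CIFAR-10 channels, token width, 4x4 patches
to_patches = nn.Conv2d(in_chans, emb_dim, kernel_size=patch_size, stride=patch_size)

x = torch.randn(1, in_chans, 32, 32)           # one CIFAR-10-sized image
feat = to_patches(x)                           # [1, 48, 8, 8] -> an 8x8 grid of patch embeddings
tokens = feat.flatten(2).transpose(1, 2)       # [1, 64, 48]   -> 64 patch tokens for the encoder
print(tokens.shape)
```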
<div align='center'>
<img src="https://raw.githubusercontent.com/Muthukamalan/VitCiFar/main/assets/patches.png" width=500 height=300 style="display:inline-block; margin-right: 10px;" alt="patches">
<img src="https://raw.githubusercontent.com/Muthukamalan/VitCiFar/main/assets/embedding.png" width=500 height=300 style="display:inline-block;">
</div>


> [!NOTE] CAUSAL MASK
> Unlike with words, we don't use a causal mask here.


<!-- <div align='center'><img src="assets/attention-part.png" width=300 height=500 style="display:inline-block; margin-right: 10px;"></div> -->
<p align="center">
  <img src="https://raw.githubusercontent.com/Muthukamalan/VitCiFar/main/assets/attention-part.png" alt="Attention Visualization" />
</p>


At the final projection layer there are two options:
- pool (combine) the token outputs and project the pooled vector to the prediction layer, or
- add one extra token before the transformer blocks, then pick that token and pass it to the projection layer (like `BERT` does) << ViT chooses this

```python

# Transformer Encoder
xformer_out = self.enc(out)                 # [batch, 65, 384]
if self.is_cls_token:
    token_out = xformer_out[:,0]            # [batch, 384]
else:
    token_out = xformer_out.mean(1)

# MLP Head
projection_out = self.mlp_head(token_out)   # [batch, 10]

```
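End to end, the `ViT` in `src/vit.py` maps a batch of CIFAR-10 images straight to 10 class logits (the shape comments above assume 8 patches per side and a 384-dim embedding; the token count and width scale with the constructor arguments). A quick sanity-check sketch, assuming the configuration that `ViTLightning` passes in this commit:

```python
import torch
from src.vit import ViT

model = ViT(in_c=3, num_classes=10, img_size=32, num_patch_1d=16,
            num_enc_layers=2, hidden_dim=96, mlp_hidden_dim=64,
            num_head=8, is_cls_token=True)

x = torch.randn(4, 3, 32, 32)    # a batch of four CIFAR-10-sized images
logits = model(x)                # CLS token -> MLP head -> class scores
print(logits.shape)              # torch.Size([4, 10])
```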


#### Context Grad-CAM
[Xplain AI](https://github.com/jacobgil/pytorch-grad-cam)

- register_forward_hook:: the hook is executed during the forward pass of the model, which lets us capture the target layer's activations (see the sketch below)
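A minimal sketch of that idea, with hypothetical layer and variable names (`src/gradcams.py` applies the same mechanism to a block of a timm ViT):

```python
import torch
import torch.nn as nn

captured = {}

def save_activation(module, inputs, output):
    # runs every time `module` is called during a forward pass
    captured["features"] = output.detach()

layer = nn.Linear(8, 4)                        # stand-in for the target ViT block
handle = layer.register_forward_hook(save_activation)

_ = layer(torch.randn(2, 8))                   # the forward pass triggers the hook
print(captured["features"].shape)              # torch.Size([2, 4])
handle.remove()                                # detach the hook when done
```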
app.py
ADDED
@@ -0,0 +1,96 @@
############################
#
# Imports
#
############################
import timm
import torch
from skimage import io
from src.gradcams import GradCam
import numpy as np
import cv2
import gradio as gr
from PIL import Image



############################
#
# model
#
############################
model:torch.nn.Module = timm.create_model("vit_small_patch16_224",pretrained=True) # num_classes=10
model.eval()

############################
#
# utility functions
#
############################
def prepare_input(image:np.ndarray)->torch.Tensor:
    image = image.copy()                          # (H,W,C)
    mean = np.array([0.5,.5,.5])
    stds = np.array([.5,.5,.5])
    image -= mean
    image /= stds

    image = np.ascontiguousarray(np.transpose(image,(2,0,1)))  # transpose the image to match the model's input format (C,H,W)
    image = image[np.newaxis,...]                 # (bs, C, H, W)
    return torch.tensor(image,requires_grad=True)


def gen_cam(image, mask):
    # create a heatmap from the Grad-CAM mask
    heatmap = cv2.applyColorMap(np.uint8(255*mask), cv2.COLORMAP_JET)
    heatmap = np.float32(heatmap)/255.
    # superimpose the heatmap on the original image
    cam = (.5*heatmap) + (.5*image.squeeze(0).permute(1,2,0).detach().cpu().numpy())
    # normalize
    cam = cam / np.max(cam)
    return np.uint8(255*cam)



def attn_viz(image, number:int=2):
    image = np.float32(cv2.resize(image,(224,224))) / 255
    image = prepare_input(image)

    target_layer = model.blocks[int(number)]      # gr.Number passes a float; the block index must be an int
    grad_cam = GradCam(model=model,target=target_layer)
    mask = grad_cam(image)
    result = gen_cam(image=image,mask=mask)
    return Image.fromarray(result)


# Create a Gradio Blocks app with two tabs
with gr.Blocks(
    title="AttnViz",
) as demo:
    with gr.Tab("Image Processing"):
        # Create an image input and a number input
        image_input = gr.Image(label="Input Image",type='numpy')
        number_input = gr.Number(label="Number",minimum=0,maximum=11,show_label=True)
        # Create an image output
        image_output = gr.Image(label="Output Image")
        # Set up the event listener for the image processing function
        process_button = gr.Button("Process Image")
        process_button.click(attn_viz, inputs=[image_input, number_input], outputs=image_output)

        gr.Examples(
            examples=[
                ["samples/mr_bean.png", 1],
                ["samples/sectional-sofa.png", 8],
            ],
            inputs=[image_input, number_input],
        )


    with gr.Tab("README"):
        # Add a simple text description in the README tab
        with open("README.md", "r") as file: readme_content = file.read()
        gr.Markdown(readme_content)



if __name__=='__main__':
    demo.launch(show_error=True, share=False)
assets/attention-part.png
ADDED
assets/embedding.png
ADDED
assets/patches.png
ADDED
assets/vit.png
ADDED
requirements.txt
ADDED
@@ -0,0 +1,13 @@
Pillow==10.1.0
opencv-python==4.8.1.78
opencv-python-headless==4.8.1.78
torch==2.5.0
pytorch-gradcam==0.2.1
torchvision==0.20.0
timm==1.0.9
gradio==4.44.1
gradio_client==1.3.0
lightning==2.4.0
lightning-utilities==0.11.6
pytorch-lightning==2.4.0
numpy==1.26.1
samples/mr_bean.png
ADDED
samples/sectional-sofa.png
ADDED
src/__pycache__/gradcams.cpython-311.pyc
ADDED
Binary file (4.17 kB)
src/datamodule.py
ADDED
@@ -0,0 +1,51 @@
import torch
from torch.utils.data import DataLoader, random_split
from torchvision import transforms, datasets
import lightning as pl

class CIFAR10DataModule(pl.LightningDataModule):
    def __init__(self, data_dir: str = r'/home/muthu/GitHub/DATA 📁/CIFAR', batch_size: int = 32, num_workers: int = 4):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.num_workers = num_workers

        # Define data transforms for train, validation and test
        self.transform_train = transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(32, padding=4),
            transforms.Resize((32,32)),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261))
        ])

        self.transform_test = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261))
        ])

    def prepare_data(self):
        # Download CIFAR-10 dataset
        datasets.CIFAR10(root=self.data_dir, train=True, download=True)
        datasets.CIFAR10(root=self.data_dir, train=False, download=True)

    def setup(self, stage=None):
        # Split dataset for training, validation and test
        if stage == 'fit' or stage is None:
            full_train_dataset = datasets.CIFAR10(root=self.data_dir, train=True, transform=self.transform_train)
            self.train_dataset, self.val_dataset = random_split(full_train_dataset, [45000, 5000])

        if stage == 'test' or stage is None:
            self.test_dataset = datasets.CIFAR10(root=self.data_dir, train=False, transform=self.transform_test)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=self.num_workers)

    def predict_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=self.num_workers)
src/gradcams.py
ADDED
@@ -0,0 +1,61 @@
import cv2           # OpenCV for image processing
import numpy as np   # NumPy for numerical operations

class GradCam:
    def __init__(self, model, target):
        self.model = model.eval()   # Set the model to evaluation mode
        self.feature = None         # To store the features from the target layer
        self.gradient = None        # To store the gradients from the target layer
        self.handlers = []          # List to keep track of hooks
        self.target = target        # Target layer for Grad-CAM
        self._get_hook()            # Register hooks to the target layer

    # Hook to get features from the forward pass
    def _get_features_hook(self, module, input, output):
        self.feature = self.reshape_transform(output)   # Store and reshape the output features

    # Hook to get gradients from the backward pass
    def _get_grads_hook(self, module, input_grad, output_grad):
        self.gradient = self.reshape_transform(output_grad)   # Store and reshape the output gradients

        def _store_grad(grad):
            self.gradient = self.reshape_transform(grad)   # Store gradients for later use

        output_grad.register_hook(_store_grad)   # Register hook to store gradients

    # Register forward hooks to the target layer
    def _get_hook(self):
        self.target.register_forward_hook(self._get_features_hook)
        self.target.register_forward_hook(self._get_grads_hook)

    # Function to reshape the tensor for visualization
    def reshape_transform(self, tensor, height=14, width=14):
        result = tensor[:, 1:, :].reshape(tensor.size(0), height, width, tensor.size(2))
        result = result.transpose(2, 3).transpose(1, 2)   # Rearrange dimensions to (C, H, W)
        return result

    # Function to compute the Grad-CAM heatmap
    def __call__(self, inputs):
        self.model.zero_grad()        # Zero the gradients
        output = self.model(inputs)   # Forward pass

        # Get the index of the highest score in the output
        index = np.argmax(output.cpu().data.numpy())
        target = output[0][index]     # Get the target score
        target.backward()             # Backward pass to compute gradients

        # Get the gradients and features
        gradient = self.gradient[0].cpu().data.numpy()
        weight = np.mean(gradient, axis=(1, 2))   # Average the gradients
        feature = self.feature[0].cpu().data.numpy()

        # Compute the weighted sum of the features
        cam = feature * weight[:, np.newaxis, np.newaxis]
        cam = np.sum(cam, axis=0)     # Sum over the channels
        cam = np.maximum(cam, 0)      # Apply ReLU to remove negative values

        # Normalize the heatmap
        cam -= np.min(cam)
        cam /= np.max(cam)
        cam = cv2.resize(cam, (224, 224))   # Resize to match the input image size
        return cam                    # Return the Grad-CAM heatmap
src/old.py.old
ADDED
@@ -0,0 +1,194 @@
import math
from lightning.pytorch.utilities.types import EVAL_DATALOADERS
import torch
from typing import Dict,Optional,Tuple,Union
from dataclasses import dataclass

import lightning as pl
from torchmetrics import Accuracy
# @dataclass
# class ViTCfg:
#     image_size: int
#     patch_size: int
#     num_channels: int
#     model_dim: int
#     num_attn_heads:int
#     attn_dropout: int
#     d_ff: int
#     number_encoders:int
#     classification_heads:int


class PatchEmbedding(torch.nn.Module):
    def __init__(self, cfg:Dict) -> None:
        super().__init__()
        for k,v in cfg.items(): setattr(self,k,v)
        assert self.image_size % self.patch_size==0, "patch_size does not divide image_size evenly"
        self.num_patchs = (self.image_size // self.patch_size)**2
        self.img2flattn:torch.nn.Conv2d = torch.nn.Conv2d(
            in_channels = self.num_channels,
            out_channels=self.model_dim,
            kernel_size = self.patch_size,
            stride = self.patch_size,
            bias=False
        )
    def forward(self,x:torch.Tensor)->torch.Tensor:
        # (bs, 3, 32, 32) >> (bs, model_dim, img_size//patch_size, img_size//patch_size) >> (bs, model_dim, num_patches) >> (bs, num_patches, model_dim)
        return self.img2flattn(x).flatten(2).transpose(1,2)


class Embedding(torch.nn.Module):
    def __init__(self,cfg:Dict ) -> None:
        super().__init__()
        for k,v in cfg.items(): setattr(self,k,v)
        self.patch_embedding:PatchEmbedding = PatchEmbedding(cfg=cfg)

        # single [CLS] token
        self.cls_token:torch.nn.Parameter = torch.nn.Parameter( torch.randn(1,1, self.model_dim ) )

        self.position_embd:torch.nn.Parameter = torch.nn.Parameter(
            torch.randn( 1, int( (self.image_size // self.patch_size)**2 + 1), self.model_dim )
        )
    def forward(self,x:torch.Tensor)->torch.Tensor:
        x = self.patch_embedding(x)
        cls_token = self.cls_token.expand( x.shape[0], -1, -1 )
        x = torch.cat( (cls_token,x) , dim=1)
        x = x + self.position_embd
        return x


class AttentionBlock(torch.nn.Module):
    def __init__(self,cfg:Dict ) -> None:
        super().__init__()
        for k,v in cfg.items(): self.__setattr__(k,v)

        assert self.model_dim % self.num_attn_heads ==0, "model dim is not divisible by n heads"

        self.attn_layer:torch.nn.Linear = torch.nn.Linear(self.model_dim, 3*self.model_dim, bias=False)
        self.out :torch.nn.Linear = torch.nn.Linear(self.model_dim,self.model_dim,bias=False)

        self.attn_dropout:torch.nn.Dropout = torch.nn.Dropout()
        self.resid_dropout:torch.nn.Dropout= torch.nn.Dropout()

        # causal mask to ensure that attention is only applied to the left in the input seq
        # self.register_buffer('bias',tensor= torch.tril(torch.ones(self.block_size,self.block_size)).view(1, 1, self.block_size, self.block_size) )
        '''
        block_size=10
        [[[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
           [1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
           [1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
           [1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
           [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
           [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
           [1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
           [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
           [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
           [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]]

        # Batch-1, Seq-1, Mask-(10,10)
        '''

    def forward(self,x:torch.Tensor, attention_outputs:bool)->Tuple[torch.Tensor, Union[torch.Tensor,None]]:
        '''
        input (bs,seq_len,embedding_dim) >> output (bs,seq_len,embedding_dim)

        x     :: (bs,seq_len,embedding_dim)
        attn  :: (bs, seq_len, 3*embedding_dim)
        .split:: (bs, seq_len, 3*embedding_dim).split(embedding_dim,dim=2)
        # Each chunk (bs,seq_len,embedding_dim) is a view of the original tensor; splitting across embedding_dim gives 3 chunks

        k,q,v >> (bs, seq_len, n_heads, embedding_dim//n_heads) >> (bs, n_heads, seq_len, embedding_dim//n_heads)
        # Each head is responsible for a different context of the sequence
        '''
        B,T,C = x.size() # (bs, seq_len, embedding_dim)

        # calc q,k,v
        q:torch.Tensor
        k:torch.Tensor
        v:torch.Tensor
        q,k,v = self.attn_layer(x).split(split_size=self.model_dim,dim=2)
        q = q.view(B,T,self.num_attn_heads, C//self.num_attn_heads).transpose(1,2)
        k = k.view(B,T,self.num_attn_heads, C//self.num_attn_heads).transpose(1,2)
        v = v.view(B,T,self.num_attn_heads, C//self.num_attn_heads).transpose(1,2)


        attn = (q @ k.transpose(-2,-1)) * (1/math.sqrt(k.size(-1)))
        # attn = attn.masked_fill(self.bias[:,:,:T,:T]==0,float('-inf'))
        attn = torch.nn.functional.softmax(attn,dim=-1)
        attn = self.attn_dropout(attn)

        y:torch.Tensor = attn @ v # (bs, n_heads, T, T) @ (bs, n_heads, T, embedding_dim/n_heads) >> (bs, n_heads, seq_len, embedding_dim/n_heads)
        y:torch.Tensor = y.transpose(1,2).contiguous().view(B,T,C)

        return self.resid_dropout(self.out(y)), (attn if attention_outputs else None)



class MLP(torch.nn.Module):
    def __init__(self,cfg:Dict ) -> None:
        super().__init__()
        for k,v in cfg.items(): self.__setattr__(k,v)
        self.dense_1 = torch.nn.Linear(self.model_dim, self.d_ff)
        self.activation = torch.nn.ReLU()
        self.layernorm = torch.nn.LayerNorm(self.d_ff)
        self.dense_2 = torch.nn.Linear(self.d_ff, self.model_dim)
        self.dropout = torch.nn.Dropout(0.2)
    def forward(self,x:torch.Tensor)->torch.Tensor:
        return self.dropout( self.dense_2( self.layernorm(self.activation( self.dense_1(x) )) ) )


class EncoderBlock(torch.nn.Module):
    def __init__(self,cfg:Dict ) -> None:
        super().__init__()
        for k,v in cfg.items(): self.__setattr__(k,v)
        self.attn_block = AttentionBlock(cfg)
        self.layernorm_1 = torch.nn.LayerNorm(self.model_dim)
        self.mlp = MLP(cfg)
        self.layernorm_2 = torch.nn.LayerNorm(self.model_dim)
    def forward(self,x:torch.Tensor, attention_outputs:bool)->Tuple[torch.Tensor, Union[torch.Tensor,None]]:
        # self-attention
        attention_op, attn = self.attn_block(self.layernorm_1(x), attention_outputs=attention_outputs )
        x = x + attention_op
        # FC
        mlp_output = self.mlp( self.layernorm_2(x) )
        x = x + mlp_output
        return x, (attn if attention_outputs==True else None) # Return the transformer block's output and the attention probabilities (optional)

class Encoder(torch.nn.Module):
    """
    The transformer encoder module.
    """
    def __init__(self,cfg:Dict ) -> None:
        super().__init__()
        for k,v in cfg.items(): self.__setattr__(k,v)
        # Create a list of transformer blocks
        self.blocks = torch.nn.ModuleList([])
        for _ in range(self.number_encoders):
            block = EncoderBlock(cfg)
            self.blocks.append(block)

    def forward(self,x:torch.Tensor,attention_outputs:bool):
        # Calculate the transformer block's output for each block
        all_attn = []
        for block in self.blocks:
            x,attn = block(x,attention_outputs=attention_outputs)
            all_attn.append(attn)
        # Return the encoder's output and the attention probabilities (optional)
        return x, (all_attn if attention_outputs==True else None)


class ViTClassifier(torch.nn.Module):
    def __init__(self, cfg:Dict ) -> None:
        super().__init__()
        for k,v in cfg.items(): self.__setattr__(k,v)
        self.embed:Embedding = Embedding(cfg)
        self.encoders:Encoder = Encoder(cfg=cfg)
        self.classifier:torch.nn.Linear = torch.nn.Linear(self.model_dim ,self.classification_heads,bias=False)

    def forward(self,x:torch.Tensor,attention_outputs=False):
        x = self.embed(x)
        x,attn = self.encoders(x,attention_outputs=attention_outputs)
        return self.classifier(x[:,0]), (attn if attention_outputs else None)
src/vit.py
ADDED
@@ -0,0 +1,262 @@
import torch
import torch.nn as nn


class PatchEmbedding(nn.Module): # Done
    """
    img_size: 1d size of each image (32 for CIFAR-10)
    patch_size: 1d size of each patch (img_size/num_patch_1d, 4 in this experiment)
    in_chans: input channel (3 for RGB images)
    emb_dim: flattened length for each token (or patch)
    """
    def __init__(self, img_size:int, patch_size:int, in_chans:int=3, emb_dim:int=48):
        super(PatchEmbedding, self).__init__()
        self.img_size = img_size
        self.patch_size = patch_size

        self.proj = nn.Conv2d(
            in_chans,
            emb_dim,
            kernel_size = patch_size,
            stride = patch_size
        )

    def forward(self, x):
        with torch.no_grad():   # note: no_grad here means the patch-projection conv itself is not updated during training
            # x: [batch, in_chans, img_size, img_size]
            x = self.proj(x) # [batch, emb_dim, # of patches in a row, # of patches in a col], [batch, 48, 8, 8] in this experiment
            x = x.flatten(2) # [batch, emb_dim, total # of patches], [batch, 48, 64] in this experiment
            x = x.transpose(1, 2) # [batch, total # of patches, emb_dim] => the Transformer encoder requires the dimensions [batch, number of words, word_emb_dim]
        return x


class TransformerEncoder(nn.Module): # Done
    def __init__(self, input_dim:int, mlp_hidden_dim:int, num_head:int=8, dropout:float=0.):
        # input_dim and head for Multi-Head Attention
        super(TransformerEncoder, self).__init__()
        self.norm1 = nn.LayerNorm(input_dim) # LayerNorm is the BatchNorm of NLP
        self.msa = MultiHeadSelfAttention(input_dim, n_heads=num_head)
        self.norm2 = nn.LayerNorm(input_dim)
        # Position-wise Feed-Forward Networks with GELU activation functions
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, mlp_hidden_dim),
            nn.GELU(),
            nn.Linear(mlp_hidden_dim, input_dim),
            nn.GELU(),
        )

    def forward(self, x):
        out = self.msa(self.norm1(x)) + x # add residual connection
        out = self.mlp(self.norm2(out)) + out # add another residual connection
        return out


class MultiHeadSelfAttention(nn.Module):
    """
    dim: dimension of input and output per-token features (emb dim for tokens)
    n_heads: number of heads
    qkv_bias: whether to have bias in qkv linear layers
    attn_p: dropout probability for attention
    proj_p: dropout probability for the last linear layer
    scale: scaling factor for attention (1/sqrt(dk))
    qkv: initial linear layer for the query, key, and value
    proj: last linear layer
    attn_drop, proj_drop: dropout layers for attn and proj
    """
    def __init__(self, dim:int, n_heads:int=8, qkv_bias:bool=True, attn_p:float=0.01, proj_p:float=0.01):
        super(MultiHeadSelfAttention, self).__init__()
        self.n_heads = n_heads
        self.dim = dim # embedding dimension for input
        self.head_dim = dim // n_heads # d_q, d_k, d_v in the paper (int div needed to preserve input dim = output dim)
        self.scale = self.head_dim ** -0.5 # 1/sqrt(d_k)

        self.qkv = nn.Linear(dim, dim*3, bias=qkv_bias) # lower linear layers in Figure 2 of the paper
        self.attn_drop = nn.Dropout(attn_p)
        self.proj = nn.Linear(dim, dim) # upper linear layers in Figure 2 of the paper
        self.proj_drop = nn.Dropout(proj_p)

    def forward(self, x):
        """
        Input and Output shape: [batch_size, n_patches + 1, dim]
        """
        batch_size, n_tokens, x_dim = x.shape # n_tokens = n_patches + 1 (1 is cls_token), x_dim is input dim

        # Sanity Check
        if x_dim != self.dim: # make sure input dim is the same as the concatenated dim (output dim)
            raise ValueError
        if self.dim != self.head_dim*self.n_heads: # make sure dim is divisible by n_heads
            raise ValueError(f"Input & Output dim should be divisible by Number of Heads")

        # Linear Layers for Query, Key, Value
        qkv = self.qkv(x) # (batch_size, n_patches+1, 3*dim)
        qkv = qkv.reshape(batch_size, n_tokens, 3, self.n_heads, self.head_dim) # (batch_size, n_patches+1, 3, n_heads, head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4) # (3, batch_size, n_heads, n_patches+1, head_dim)
        q, k, v = qkv[0], qkv[1], qkv[2] # (batch_size, n_heads, n_patches+1, head_dim)

        # Scaled Dot-Product Attention
        k_t = k.transpose(-2, -1) # K Transpose: (batch_size, n_heads, head_dim, n_patches+1)
        dot_product = (q @ k_t)*self.scale # Query, Key Dot Product with Scale Factor: (batch_size, n_heads, n_patches+1, n_patches+1)
        attn = dot_product.softmax(dim=-1) # Softmax: (batch_size, n_heads, n_patches+1, n_patches+1)
        attn = self.attn_drop(attn) # Attention Dropout: (batch_size, n_heads, n_patches+1, n_patches+1)
        weighted_avg = attn @ v # (batch_size, n_heads, n_patches+1, head_dim)
        weighted_avg = weighted_avg.transpose(1, 2) # (batch_size, n_patches+1, n_heads, head_dim)

        # Concat and Last Linear Layer
        weighted_avg = weighted_avg.flatten(2) # Concat: (batch_size, n_patches+1, dim)
        x = self.proj(weighted_avg) # Last Linear Layer: (batch_size, n_patches+1, dim)
        x = self.proj_drop(x) # Last Linear Layer Dropout: (batch_size, n_patches+1, dim)

        return x

class ViT(nn.Module): # Done
    def __init__(
        self,
        in_c:int=3,
        num_classes:int=10,
        img_size:int=32,
        num_patch_1d:int=16,
        dropout:float=0.1,
        num_enc_layers:int=2,
        hidden_dim:int=128,
        mlp_hidden_dim:int=128//2,
        num_head:int=4,
        is_cls_token:bool=True
    ):
        super(ViT, self).__init__()
        """
        is_cls_token: are we using a class token?
        num_patch_1d: number of patches in one row (or col), 3 in Figure 1 of the paper, 8 in this experiment
        patch_size: 1d size (size of row or col) of each patch, 16 for ImageNet in the paper, 4 in this experiment
        flattened_patch_dim: flattened vec length for each patch (4 x 4 x 3, each side is 4 and 3 color channels), 48 in this experiment
        num_tokens: number of total patches + 1 (class token), 10 in Figure 1 of the paper, 65 in this experiment
        """
        self.is_cls_token = is_cls_token
        self.num_patch_1d = num_patch_1d
        self.patch_size = img_size//self.num_patch_1d
        num_tokens = (self.num_patch_1d**2)+1 if self.is_cls_token else (self.num_patch_1d**2)

        # Divide each image into patches
        self.images_to_patches = PatchEmbedding(
            img_size=img_size,
            patch_size=img_size//num_patch_1d,
            emb_dim=num_patch_1d*num_patch_1d
        )

        # Linear Projection of Flattened Patches
        self.lpfp = nn.Linear(num_patch_1d*num_patch_1d, hidden_dim) # 48 x 384 (384 is the latent vector size D in the paper)

        # Patch + Position Embedding (Learnable)
        self.cls_token = nn.Parameter(torch.randn(1, 1, hidden_dim)) if is_cls_token else None # learnable classification token with dim [1, 1, 384]. 1 in the 2nd dim because there is only one class per image, not per patch
        self.pos_emb = nn.Parameter(torch.randn(1, num_tokens, hidden_dim)) # learnable positional embedding with dim [1, 65, 384]

        # Transformer Encoder
        enc_list = [TransformerEncoder(hidden_dim, mlp_hidden_dim=mlp_hidden_dim, dropout=dropout, num_head=num_head) for _ in range(num_enc_layers)] # num_enc_layers is L in the Transformer Encoder of Figure 1
        self.enc = nn.Sequential(*enc_list) # * should be added when a regular python list is given to nn.Sequential

        # MLP Head (Standard Classifier)
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x): # x: [batch, 3, 32, 32]
        # Images into Patches (including flattening)
        out = self.images_to_patches(x) # [batch, 64, 48]
        # Linear Projection on Flattened Patches
        out = self.lpfp(out) # [batch, 64, 384]

        # Add Class Token and Positional Embedding
        if self.is_cls_token:
            out = torch.cat([self.cls_token.repeat(out.size(0),1,1), out], dim=1) # [batch, 65, 384], added as an extra learnable embedding
        out = out + self.pos_emb # [batch, 65, 384]

        # Transformer Encoder
        out = self.enc(out) # [batch, 65, 384]
        if self.is_cls_token:
            out = out[:,0] # [batch, 384]
        else:
            out = out.mean(1)

        # MLP Head
        out = self.mlp_head(out) # [batch, 10]
        return out



import lightning as pl
from torchmetrics import Accuracy

class ViTLightning(pl.LightningModule):
    def __init__(self, learning_rate: float = 1e-3):
        super(ViTLightning, self).__init__()
        self.vit = ViT(
            in_c=3,
            num_classes=10,
            img_size=32,
            num_patch_1d=16,
            dropout=0.1,
            num_enc_layers=2,
            hidden_dim=96,
            mlp_hidden_dim=64,
            num_head=8,
            is_cls_token=True
        )
        self.train_acc = Accuracy('multiclass',num_classes=10)
        self.val_acc = Accuracy('multiclass',num_classes=10)
        self.test_acc = Accuracy('multiclass',num_classes=10)
        self.learning_rate = learning_rate

    def forward(self, x):
        return self.vit(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        preds = self.forward(x)
        loss = nn.CrossEntropyLoss()(preds, y)
        acc = self.train_acc(preds, y)
        self.log('train_loss', loss, prog_bar=True, logger=True)
        self.log('train_acc', acc, prog_bar=True, logger=True)

        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        preds = self.forward(x)
        loss = nn.CrossEntropyLoss()(preds, y)
        acc = self.val_acc(preds, y)
        self.log('val_loss', loss, prog_bar=True, logger=True)
        self.log('val_acc', acc, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        preds = self.forward(x)
        loss = nn.CrossEntropyLoss()(preds, y)
        acc = self.test_acc(preds, y)
        self.log('test_loss', loss, prog_bar=True, logger=True)
        self.log('test_acc', acc, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.vit.parameters())
        num_epochs = self.trainer.max_epochs
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer=optimizer,
            total_steps=self.trainer.estimated_stepping_batches,
            epochs=num_epochs,
            pct_start= .3,
            div_factor= 100,
            max_lr= 1e-3,
            three_phase= False,
            final_div_factor= 100,
            anneal_strategy='linear'
        )
        return {
            'optimizer':optimizer,
            'lr_scheduler':{
                'scheduler':scheduler,
                'monitor': "val_loss",
                "interval":"step",
                "frequency":1
            }
        }