Soutrik Chowdhury commited on
Commit
2cfb2b4
·
unverified ·
2 Parent(s): 54d0f60 ab682a6

Merge pull request #1 from soutrik71/feat/litserve_gpu_gradio

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dvc/.gitignore +3 -0
  2. .dvc/config +8 -0
  3. .dvcignore +3 -0
  4. .flake8 +27 -0
  5. .gitattributes +2 -0
  6. .github/workflows/cd.yaml +65 -0
  7. .github/workflows/ci.yaml +171 -0
  8. .github/workflows/hf_deploy.yaml +61 -0
  9. .github/workflows/test_deploy.yml +62 -0
  10. .gitignore +32 -0
  11. .gradio/certificate.pem +31 -0
  12. .project-root +0 -0
  13. Dockerfile +76 -0
  14. app.py +115 -0
  15. basic_setup.md +419 -0
  16. client.py +18 -0
  17. configs/callbacks/default.yaml +24 -0
  18. configs/callbacks/early_stopping.yaml +15 -0
  19. configs/callbacks/model_checkpoint.yaml +17 -0
  20. configs/callbacks/rich_model_summary.yaml +4 -0
  21. configs/callbacks/rich_progress_bar.yaml +4 -0
  22. configs/data/catdog.yaml +9 -0
  23. configs/experiment/catdog_experiment.yaml +62 -0
  24. configs/experiment/catdog_experiment_resnet.yaml +59 -0
  25. configs/hydra/default.yaml +19 -0
  26. configs/infer.yaml +52 -0
  27. configs/logger/aim.yaml +6 -0
  28. configs/logger/csv.yaml +7 -0
  29. configs/logger/default.yaml +5 -0
  30. configs/logger/mlflow.yaml +9 -0
  31. configs/logger/tensorboard.yaml +10 -0
  32. configs/model/catdog_classifier.yaml +22 -0
  33. configs/model/catdog_classifier_resnet.yaml +13 -0
  34. configs/paths/catdog.yaml +27 -0
  35. configs/train.yaml +47 -0
  36. configs/trainer/default.yaml +20 -0
  37. data.dvc +6 -0
  38. docker-compose-old.yaml +74 -0
  39. docker-compose.yaml +90 -0
  40. docker_compose_exec.sh +58 -0
  41. dvc.lock +31 -0
  42. dvc.yaml +28 -0
  43. ec2_runner_setup.md +357 -0
  44. image.jpg +0 -0
  45. main.py +5 -0
  46. notebooks/datamodule_lightning.ipynb +301 -0
  47. notebooks/training_lightning_tests.ipynb +1011 -0
  48. poetry.lock +0 -0
  49. pyproject.toml +94 -0
  50. requirements.txt +28 -0
.dvc/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /config.local
2
+ /tmp
3
+ /cache
.dvc/config ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ [core]
2
+ autostage = true
3
+ remote = aws_remote
4
+ ['remote "local_remote"']
5
+ url = /tmp/dvclocalstore
6
+ ['remote "aws_remote"']
7
+ url = s3://deep-bucket-s3/data
8
+ region = ap-south-1
.dvcignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Add patterns of files dvc should ignore, which could improve
2
+ # the performance. Learn more at
3
+ # https://dvc.org/doc/user-guide/dvcignore
.flake8 ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [flake8]
2
+ max-line-length = 120
3
+
4
+ # Exclude the virtual environment, notebooks folder, tests folder, and other unnecessary directories
5
+ exclude =
6
+ .venv,
7
+ __pycache__,
8
+ .git,
9
+ build,
10
+ dist,
11
+ notebooks,
12
+ tests,
13
+ .ipynb_checkpoints,
14
+ .mypy_cache,
15
+ .pytest_cache,
16
+ pytorch_project
17
+
18
+ ignore =
19
+ E203,
20
+ W503,
21
+ E501,
22
+ E402,
23
+ F401,
24
+ E401
25
+
26
+ max-complexity = 10
27
+ show-source = True
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
2
+ checkpoints/*.ckpt filter=lfs diff=lfs merge=lfs -text
.github/workflows/cd.yaml ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Deploy PyTorch Training to ECR with Docker Compose
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ - feat/pytorch-catdog-setup
8
+
9
+ jobs:
10
+ deploy:
11
+ runs-on: self-hosted
12
+
13
+ steps:
14
+ - name: Checkout repository
15
+ uses: actions/checkout@v4
16
+
17
+ - name: Set up Docker Buildx
18
+ uses: docker/setup-buildx-action@v3
19
+
20
+ - name: Configure AWS credentials
21
+ uses: aws-actions/configure-aws-credentials@v4
22
+ with:
23
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
24
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
25
+ aws-region: ${{ secrets.AWS_REGION }}
26
+
27
+ - name: Log in to Amazon ECR
28
+ id: login-ecr
29
+ uses: aws-actions/amazon-ecr-login@v2
30
+
31
+ - name: Create .env file
32
+ run: |
33
+ echo "AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}" >> .env
34
+ echo "AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >> .env
35
+ echo "AWS_REGION=${{ secrets.AWS_REGION }}" >> .env
36
+
37
+ - name: Run Docker Compose for train service
38
+ run: |
39
+ docker-compose stop
40
+ docker-compose build
41
+ docker-compose up -d train
42
+ docker-compose up -d eval
43
+ docker-compose up -d server
44
+ docker-compose up -d client
45
+ docker-compose remove
46
+
47
+ - name: Build, tag, and push Docker image to Amazon ECR
48
+ env:
49
+ REGISTRY: ${{ steps.login-ecr.outputs.registry }}
50
+ REPOSITORY: soutrik71/pytorch_catdog
51
+ IMAGE_TAG: ${{ github.sha }}
52
+ run: |
53
+ docker build -t $REGISTRY/$REPOSITORY:$IMAGE_TAG .
54
+ docker push $REGISTRY/$REPOSITORY:$IMAGE_TAG
55
+ docker tag $REGISTRY/$REPOSITORY:$IMAGE_TAG $REGISTRY/$REPOSITORY:latest
56
+ docker push $REGISTRY/$REPOSITORY:latest
57
+
58
+ - name: Pull Docker image from ECR and verify
59
+ env:
60
+ REGISTRY: ${{ steps.login-ecr.outputs.registry }}
61
+ REPOSITORY: soutrik71/pytorch_catdog
62
+ IMAGE_TAG: ${{ github.sha }}
63
+ run: |
64
+ docker pull $REGISTRY/$REPOSITORY:$IMAGE_TAG
65
+ docker images | grep "$REGISTRY/$REPOSITORY"
.github/workflows/ci.yaml ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CI Pipeline
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ # - feat/pytorch-catdog-setup
8
+ pull_request:
9
+ branches:
10
+ - main
11
+ workflow_dispatch:
12
+
13
+ jobs:
14
+ python_basic_test:
15
+ name: Test current codebase and setup Python environment
16
+ runs-on: self-hosted
17
+
18
+ strategy:
19
+ matrix:
20
+ python-version: [3.10.15]
21
+
22
+ env:
23
+ AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
24
+ AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
25
+ AWS_REGION: ${{ secrets.AWS_REGION }}
26
+
27
+ steps:
28
+ - name: Configure AWS credentials
29
+ uses: aws-actions/configure-aws-credentials@v4
30
+ with:
31
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
32
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
33
+ aws-region: ${{ secrets.AWS_REGION }}
34
+
35
+ - name: Print branch name
36
+ run: echo "Branch name is ${{ github.ref_name }}"
37
+
38
+ - name: Checkout code
39
+ uses: actions/checkout@v3
40
+
41
+ - name: Set up Python ${{ matrix.python-version }}
42
+ uses: actions/setup-python@v4
43
+ with:
44
+ python-version: ${{ matrix.python-version }}
45
+
46
+ - name: Install Poetry
47
+ run: |
48
+ python -m pip install --upgrade pip
49
+ pip install poetry
50
+ poetry config virtualenvs.in-project true
51
+
52
+ - name: Cache Poetry dependencies
53
+ uses: actions/cache@v3
54
+ with:
55
+ path: |
56
+ .venv
57
+ ~/.cache/pypoetry
58
+ key: ${{ runner.os }}-poetry-${{ hashFiles('poetry.lock') }}
59
+ restore-keys: |
60
+ ${{ runner.os }}-poetry-
61
+
62
+ - name: Install dependencies
63
+ run: poetry install --no-root --no-interaction
64
+
65
+ - name: Check Poetry environment
66
+ run: poetry env info
67
+
68
+ - name: Create .env file
69
+ run: |
70
+ echo "AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}" >> .env
71
+ echo "AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}" >> .env
72
+ echo "AWS_REGION=${AWS_REGION}" >> .env
73
+ echo ".env file created"
74
+
75
+ - name: Run lint checks
76
+ run: poetry run flake8 . --exclude=.venv,tests,notebooks
77
+
78
+ - name: black
79
+ run: poetry run black . --exclude="(\.venv|tests|notebooks)"
80
+
81
+ pytorch_code_test:
82
+ name: Test PyTorch code
83
+ runs-on: self-hosted
84
+
85
+ env:
86
+ AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
87
+ AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
88
+ AWS_REGION: ${{ secrets.AWS_REGION }}
89
+
90
+ needs: python_basic_test
91
+
92
+ strategy:
93
+ matrix:
94
+ python-version: [3.10.15]
95
+
96
+ steps:
97
+ - name: Configure AWS credentials
98
+ uses: aws-actions/configure-aws-credentials@v4
99
+ with:
100
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
101
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
102
+ aws-region: ${{ secrets.AWS_REGION }}
103
+
104
+ - name: Checkout code
105
+ uses: actions/checkout@v3
106
+
107
+ - name: Set up Python ${{ matrix.python-version }}
108
+ uses: actions/setup-python@v4
109
+ with:
110
+ python-version: ${{ matrix.python-version }}
111
+
112
+ - name: Install Poetry
113
+ run: |
114
+ python -m pip install --upgrade pip
115
+ pip install poetry
116
+ poetry config virtualenvs.in-project true
117
+
118
+ - name: Cache Poetry dependencies
119
+ uses: actions/cache@v3
120
+ with:
121
+ path: |
122
+ .venv
123
+ ~/.cache/pypoetry
124
+ key: ${{ runner.os }}-poetry-${{ hashFiles('poetry.lock') }}
125
+ restore-keys: |
126
+ ${{ runner.os }}-poetry-
127
+
128
+ - name: Install dependencies
129
+ run: poetry install --no-root --no-interaction
130
+
131
+ - name: Check Poetry environment
132
+ run: poetry env info
133
+
134
+ - name: Get data from DVC
135
+ run: |
136
+ poetry run dvc pull || echo "No data to pull from DVC"
137
+
138
+ - name: Run Train code
139
+ run: |
140
+ echo "Training the model"
141
+ poetry run python -m src.train_optuna_callbacks experiment=catdog_experiment ++task_name=train ++train=True ++test=False || exit 1
142
+ poetry run python -m src.create_artifacts
143
+
144
+ - name: Run Test code
145
+ run: |
146
+ echo "Testing the model"
147
+ poetry run python -m src.train_optuna_callbacks experiment=catdog_experiment ++task_name=test ++train=False ++test=True || exit 1
148
+
149
+ - name: upload model checkpoints
150
+ uses: actions/upload-artifact@v4
151
+ with:
152
+ name: model-checkpoints
153
+ path: ./checkpoints/
154
+
155
+ - name: upload logs
156
+ uses: actions/upload-artifact@v4
157
+ with:
158
+ name: logs
159
+ path: ./logs/
160
+
161
+ - name: upload configs
162
+ uses: actions/upload-artifact@v4
163
+ with:
164
+ name: configs
165
+ path: ./configs/
166
+
167
+ - name: upload artifacts
168
+ uses: actions/upload-artifact@v4
169
+ with:
170
+ name: artifacts
171
+ path: ./artifacts/
.github/workflows/hf_deploy.yaml ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face Hub
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ - feat/litserve_gpu_gradio
8
+ jobs:
9
+ sync-to-hub:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+ with:
14
+ fetch-depth: 0
15
+ lfs: true
16
+
17
+ - name: Install Git LFS
18
+ run: |
19
+ curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
20
+ sudo apt-get install git-lfs
21
+ git lfs install
22
+ git lfs pull
23
+
24
+ - name: Add remote
25
+ run: |
26
+ git remote add space https://$USER:[email protected]/spaces/$USER/$SPACE
27
+ env:
28
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
29
+ USER: soutrik
30
+ SPACE: gradio_demo_CatDogClassifier
31
+
32
+ - name: Add README.md
33
+ run: |
34
+ cat <<EOF > README.md
35
+ ---
36
+ title: My Gradio App CatDog Classifier
37
+ emoji: 🚀
38
+ colorFrom: blue
39
+ colorTo: green
40
+ sdk: gradio
41
+ sdk_version: "5.7.1"
42
+ app_file: app.py
43
+ pinned: false
44
+ ---
45
+ EOF
46
+
47
+ - name: Configure Git identity
48
+ run: |
49
+ git config user.name "soutrik"
50
+ git config user.email "[email protected]"
51
+
52
+ - name: Push to hub
53
+ run: |
54
+ git add README.md
55
+ git commit -m "Add README.md"
56
+ git push --force https://$USER:[email protected]/spaces/$USER/$SPACE main
57
+ env:
58
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
59
+ USER: soutrik
60
+ SPACE: gradio_demo_CatDogClassifier
61
+
.github/workflows/test_deploy.yml ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Deploy to ECR and Run Docker Compose with AWS Actions for GitHub and Docker Buildx
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ - feat/framework-setup
8
+
9
+ jobs:
10
+ deploy:
11
+ runs-on: self-hosted
12
+
13
+ steps:
14
+ - name: Checkout repository
15
+ uses: actions/checkout@v4
16
+
17
+ - name: Set up Docker Buildx
18
+ uses: docker/setup-buildx-action@v3
19
+
20
+ - name: Configure AWS credentials
21
+ uses: aws-actions/configure-aws-credentials@v4
22
+ with:
23
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
24
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
25
+ aws-region: ${{ secrets.AWS_REGION }}
26
+
27
+ - name: Log in to Amazon ECR
28
+ id: login-ecr
29
+ uses: aws-actions/amazon-ecr-login@v2
30
+
31
+ - name: Build, tag, and push docker image to Amazon ECR
32
+ env:
33
+ POSTGRES_DB: ${{ secrets.POSTGRES_DB }}
34
+ POSTGRES_USER: ${{ secrets.POSTGRES_USER }}
35
+ POSTGRES_PASSWORD: ${{ secrets.POSTGRES_PASSWORD }}
36
+ REDIS_PORT: ${{ secrets.REDIS_PORT }}
37
+ REDIS_HOST: ${{ secrets.REDIS_HOST }}
38
+ FLOWER_BASIC_AUTH: ${{ secrets.FLOWER_BASIC_AUTH }}
39
+ REDIS_URL: ${{ secrets.REDIS_URL }}
40
+ DATABASE_URL: ${{ secrets.DATABASE_URL }}
41
+ BROKER_URL: ${{ secrets.BROKER_URL }}
42
+ REGISTRY: ${{ steps.login-ecr.outputs.registry }}
43
+ REPOSITORY: soutrik71/test
44
+ IMAGE_TAG: ${{ github.sha }}
45
+ run: |
46
+ docker build -t $REGISTRY/$REPOSITORY:$IMAGE_TAG .
47
+ docker push $REGISTRY/$REPOSITORY:$IMAGE_TAG
48
+
49
+
50
+ - name: Run Docker Compose
51
+ env:
52
+ POSTGRES_DB: ${{ secrets.POSTGRES_DB }}
53
+ POSTGRES_USER: ${{ secrets.POSTGRES_USER }}
54
+ POSTGRES_PASSWORD: ${{ secrets.POSTGRES_PASSWORD }}
55
+ REDIS_PORT: ${{ secrets.REDIS_PORT }}
56
+ REDIS_HOST: ${{ secrets.REDIS_HOST }}
57
+ FLOWER_BASIC_AUTH: ${{ secrets.FLOWER_BASIC_AUTH }}
58
+ REDIS_URL: ${{ secrets.REDIS_URL }}
59
+ DATABASE_URL: ${{ secrets.DATABASE_URL }}
60
+ BROKER_URL: ${{ secrets.BROKER_URL }}
61
+ run: |
62
+ docker-compose up -d --build app
.gitignore ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aws/
2
+ *.zip
3
+ *.tar.gz
4
+ *.tar.bz2
5
+ .env
6
+ *.pyc
7
+ *.cpython-*.*
8
+ src/__pycache__/
9
+ src/*.egg-info/
10
+ src/dist/
11
+ src/build/
12
+ src/.eggs/
13
+ src/.pytest_cache/
14
+ src/.mypy_cache/
15
+ src/.tox/
16
+ src/.coverage
17
+ src/.vscode/
18
+ src/.vscode-test/
19
+ app/core/__pycache__/
20
+ src/__pycache__/test_infra.cpython-310.pyc
21
+ app/core/__pycache__/config.cpython-310.pyc
22
+ data/
23
+ !configs/data/
24
+ checkpoints/
25
+ logs/
26
+ /data
27
+ artifacts/
28
+ artifacts/*
29
+ *png
30
+ *jpg
31
+ *jpeg
32
+ artifacts/image_prediction.png
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
.project-root ADDED
File without changes
Dockerfile ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Stage 1: Base image with CUDA 12.2, cuDNN 9, and minimal runtime for PyTorch
2
+ FROM nvidia/cuda:12.2.0-runtime-ubuntu20.04 as base
3
+
4
+ LABEL maintainer="Soutrik [email protected]" \
5
+ description="Base Docker image for running a Python app with Poetry and GPU support."
6
+
7
+ # Install necessary system dependencies, including Python 3.10
8
+ RUN apt-get update && apt-get install -y --no-install-recommends \
9
+ software-properties-common && \
10
+ add-apt-repository ppa:deadsnakes/ppa && \
11
+ apt-get update && apt-get install -y --no-install-recommends \
12
+ python3.10 \
13
+ python3.10-venv \
14
+ python3.10-dev \
15
+ python3-pip \
16
+ curl \
17
+ git \
18
+ build-essential && \
19
+ apt-get clean && rm -rf /var/lib/apt/lists/*
20
+
21
+ # Set Python 3.10 as the default
22
+ RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 && \
23
+ update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \
24
+ python --version
25
+
26
+ # Install Poetry
27
+ RUN curl -sSL https://install.python-poetry.org | python3 - && \
28
+ ln -s /root/.local/bin/poetry /usr/local/bin/poetry
29
+
30
+ # Configure Poetry environment
31
+ ENV POETRY_NO_INTERACTION=1 \
32
+ POETRY_VIRTUALENVS_IN_PROJECT=1 \
33
+ POETRY_CACHE_DIR=/tmp/poetry_cache
34
+
35
+ # Set the working directory to /app
36
+ WORKDIR /app
37
+
38
+ # Copy pyproject.toml and poetry.lock to install dependencies
39
+ COPY pyproject.toml poetry.lock /app/
40
+
41
+ # Install Python dependencies without building the app itself
42
+ RUN --mount=type=cache,target=/tmp/poetry_cache poetry install --only main --no-root
43
+
44
+ # Stage 2: Build stage for the application
45
+ FROM base as builder
46
+
47
+ # Copy application source code and necessary files
48
+ COPY src /app/src
49
+ COPY configs /app/configs
50
+ COPY .project-root /app/.project-root
51
+ COPY main.py /app/main.py
52
+
53
+ # Stage 3: Final runtime stage
54
+ FROM base as runner
55
+
56
+ # Copy application source code and dependencies from the builder stage
57
+ COPY --from=builder /app/src /app/src
58
+ COPY --from=builder /app/configs /app/configs
59
+ COPY --from=builder /app/.project-root /app/.project-root
60
+ COPY --from=builder /app/main.py /app/main.py
61
+ COPY --from=builder /app/.venv /app/.venv
62
+
63
+ # Copy client files
64
+ COPY run_client.sh /app/run_client.sh
65
+
66
+ # Set the working directory to /app
67
+ WORKDIR /app
68
+
69
+ # Add virtual environment to PATH
70
+ ENV PATH="/app/.venv/bin:$PATH"
71
+
72
+ # Install PyTorch with CUDA 12.2 support (adjusted for compatibility)
73
+ RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu122
74
+
75
+ # Default command to run the application
76
+ CMD ["python", "-m", "main"]
app.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from PIL import Image
5
+ from pathlib import Path
6
+ from torchvision import transforms
7
+ from src.models.catdog_model_resnet import ResnetClassifier
8
+ from src.utils.aws_s3_services import S3Handler
9
+ from src.utils.logging_utils import setup_logger
10
+ from loguru import logger
11
+ import rootutils
12
+
13
+ # Load environment variables and configure logger
14
+ setup_logger(Path("./logs") / "gradio_app.log")
15
+ # Setup root directory
16
+ root = rootutils.setup_root(__file__, indicator=".project-root")
17
+
18
+
19
+ class ImageClassifier:
20
+ def __init__(self, cfg):
21
+ self.cfg = cfg
22
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
23
+ self.classes = cfg.labels
24
+
25
+ # Download and load model from S3
26
+ logger.info("Downloading model from S3...")
27
+ s3_handler = S3Handler(bucket_name="deep-bucket-s3")
28
+ s3_handler.download_folder("checkpoints", "checkpoints")
29
+
30
+ logger.info("Loading model checkpoint...")
31
+ self.model = ResnetClassifier.load_from_checkpoint(
32
+ checkpoint_path=cfg.ckpt_path
33
+ )
34
+ self.model = self.model.to(self.device)
35
+ self.model.eval()
36
+
37
+ # Image transform
38
+ self.transform = transforms.Compose(
39
+ [
40
+ transforms.Resize((cfg.data.image_size, cfg.data.image_size)),
41
+ transforms.ToTensor(),
42
+ transforms.Normalize(
43
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
44
+ ),
45
+ ]
46
+ )
47
+
48
+ def predict(self, image):
49
+ if image is None:
50
+ return "No image provided.", None
51
+
52
+ # Preprocess the image
53
+ logger.info("Processing input image...")
54
+ img_tensor = self.transform(image).unsqueeze(0).to(self.device)
55
+
56
+ # Inference
57
+ with torch.no_grad():
58
+ output = self.model(img_tensor)
59
+ probabilities = F.softmax(output, dim=1)
60
+ predicted_class_idx = torch.argmax(probabilities, dim=1).item()
61
+ confidence = probabilities[0][predicted_class_idx].item()
62
+
63
+ predicted_label = self.classes[predicted_class_idx]
64
+ logger.info(f"Prediction: {predicted_label} (Confidence: {confidence:.2f})")
65
+ return predicted_label, confidence
66
+
67
+
68
+ def create_gradio_app(cfg):
69
+ classifier = ImageClassifier(cfg)
70
+
71
+ def classify_image(image):
72
+ """Gradio interface function."""
73
+ predicted_label, confidence = classifier.predict(image)
74
+ if predicted_label:
75
+ return f"Predicted: {predicted_label} (Confidence: {confidence:.2f})"
76
+ return "Error during prediction."
77
+
78
+ # Create Gradio interface
79
+ with gr.Blocks() as demo:
80
+ gr.Markdown(
81
+ """
82
+ # Cat vs Dog Classifier
83
+ Upload an image of a cat or a dog to classify it with confidence.
84
+ """
85
+ )
86
+
87
+ with gr.Row():
88
+ with gr.Column():
89
+ input_image = gr.Image(
90
+ label="Input Image", type="pil", image_mode="RGB"
91
+ )
92
+ predict_button = gr.Button("Classify")
93
+ with gr.Column():
94
+ output_text = gr.Textbox(label="Prediction")
95
+
96
+ # Define interaction
97
+ predict_button.click(
98
+ fn=classify_image, inputs=[input_image], outputs=[output_text]
99
+ )
100
+
101
+ return demo
102
+
103
+
104
+ # Hydra config wrapper for launching Gradio app
105
+ if __name__ == "__main__":
106
+ import hydra
107
+ from omegaconf import DictConfig
108
+
109
+ @hydra.main(config_path="configs", config_name="infer", version_base="1.3")
110
+ def main(cfg: DictConfig):
111
+ logger.info("Launching Gradio App...")
112
+ demo = create_gradio_app(cfg)
113
+ demo.launch(share=True, server_name="0.0.0.0", server_port=7860)
114
+
115
+ main()
basic_setup.md ADDED
@@ -0,0 +1,419 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## __POETRY SETUP__
2
+
3
+ ```bash
4
+ # Install poetry
5
+ conda create -n poetry_env python=3.10 -y
6
+ conda activate poetry_env
7
+ pip install poetry
8
+ poetry env info
9
+ poetry new pytorch_project
10
+ cd pytorch_project/
11
+ # fill up the pyproject.toml file without pytorch and torchvision
12
+ poetry install
13
+
14
+ # Add dependencies to the project for pytorch and torchvision
15
+ poetry source add --priority explicit pytorch_cpu https://download.pytorch.org/whl/cpu
16
+ poetry add --source pytorch_cpu torch torchvision
17
+ poetry lock
18
+ poetry show
19
+ poetry install --no-root
20
+
21
+ # Add dependencies to the project
22
+ poetry add matplotlib
23
+ poetry add hydra-core
24
+ poetry add omegaconf
25
+ poetry add hydra_colorlog
26
+ poetry add --dev black #
27
+ poetry lock
28
+ poetry show
29
+
30
+ Type Purpose Installation Command
31
+ Normal Dependency Required for the app to run in production. poetry add <package>
32
+ Development Dependency Needed only during development (e.g., testing, linting). poetry add --dev <package>
33
+ # Add dependencies to the project with specific version
34
+ poetry add <package_name>@<version>
35
+ ```
36
+
37
+ ## __MULTISTAGEDOCKER SETUP__
38
+
39
+ #### Step-by-Step Guide to Creating Dockerfile and docker-compose.yml for a New Code Repo
40
+
41
+ If you're new to the project and need to set up Docker and Docker Compose to run the training and inference steps, follow these steps.
42
+
43
+ ---
44
+
45
+ ### 1. Setting Up the Dockerfile
46
+
47
+ A Dockerfile is a set of instructions that Docker uses to create an image. In this case, we'll use a __multi-stage build__ to make the final image lightweight while managing dependencies with `Poetry`.
48
+
49
+ #### Step-by-Step Process for Creating the Dockerfile
50
+
51
+ 1. __Choose a Base Image__:
52
+ - We need to choose a Python image that matches the project's required version (e.g., Python 3.10.14).
53
+ - Use the lightweight __`slim`__ version to minimize image size.
54
+
55
+ ```Dockerfile
56
+ FROM python:3.10.14-slim as builder
57
+ ```
58
+
59
+ 2. __Install Dependencies in the Build Stage__:
60
+ - We'll use __Poetry__ for dependency management. Install it using `pip`.
61
+ - Next, copy the `pyproject.toml` and `poetry.lock` files to the `/app` directory to install dependencies.
62
+
63
+ ```Dockerfile
64
+ RUN pip3 install poetry==1.7.1
65
+ WORKDIR /app
66
+ COPY pytorch_project/pyproject.toml pytorch_project/poetry.lock /app/
67
+ ```
68
+
69
+ 3. __Configure Poetry__:
70
+ - Configure Poetry to install the dependencies in a virtual environment inside the project directory (not globally). This keeps everything contained and avoids conflicts with the system environment.
71
+
72
+ ```Dockerfile
73
+ ENV POETRY_NO_INTERACTION=1 \
74
+ POETRY_VIRTUALENVS_IN_PROJECT=1 \
75
+ POETRY_VIRTUALENVS_CREATE=true \
76
+ POETRY_CACHE_DIR=/tmp/poetry_cache
77
+ ```
78
+
79
+ 4. __Install Dependencies__:
80
+ - Use `poetry install --no-root` to install only the dependencies and not the package itself. This is because you typically don't need to install the actual project code at this stage.
81
+
82
+ ```Dockerfile
83
+ RUN --mount=type=cache,target=/tmp/poetry_cache poetry install --only main --no-root
84
+ ```
85
+
86
+ 5. __Build the Runtime Stage__:
87
+ - Now, set up the final runtime image. This stage will only include the required application code and the virtual environment created in the first stage.
88
+ - The final image will use the same Python base image but remain small by avoiding the re-installation of dependencies.
89
+
90
+ ```Dockerfile
91
+ FROM python:3.10.14-slim as runner
92
+ WORKDIR /app
93
+ COPY src /app/src
94
+ COPY --from=builder /app/.venv /app/.venv
95
+ ```
96
+
97
+ 6. __Set Up the Path to Use the Virtual Environment__:
98
+ - Update the `PATH` environment variable to use the Python binaries from the virtual environment.
99
+
100
+ ```Dockerfile
101
+ ENV PATH="/app/.venv/bin:$PATH"
102
+ ```
103
+
104
+ 7. __Set a Default Command__:
105
+ - Finally, set the command that will be executed by default when the container is run. You can change or override this later in the Docker Compose file.
106
+
107
+ ```Dockerfile
108
+ CMD ["python", "-m", "src.train"]
109
+ ```
110
+
111
+ ### Final Dockerfile
112
+
113
+ ```Dockerfile
114
+ # Stage 1: Build environment with Poetry and dependencies
115
+ FROM python:3.10.14-slim as builder
116
+ RUN pip3 install poetry==1.7.1
117
+ WORKDIR /app
118
+ COPY pytorch_project/pyproject.toml pytorch_project/poetry.lock /app/
119
+ ENV POETRY_NO_INTERACTION=1 \
120
+ POETRY_VIRTUALENVS_IN_PROJECT=1 \
121
+ POETRY_VIRTUALENVS_CREATE=true \
122
+ POETRY_CACHE_DIR=/tmp/poetry_cache
123
+ RUN --mount=type=cache,target=/tmp/poetry_cache poetry install --only main --no-root
124
+
125
+ # Stage 2: Runtime environment
126
+ FROM python:3.10.14-slim as runner
127
+ WORKDIR /app
128
+ COPY src /app/src
129
+ COPY --from=builder /app/.venv /app/.venv
130
+ ENV PATH="/app/.venv/bin:$PATH"
131
+ CMD ["python", "-m", "src.train"]
132
+ ```
133
+
134
+ ---
135
+
136
+ ### 2. Setting Up the docker-compose.yml File
137
+
138
+ The `docker-compose.yml` file is used to define and run multiple Docker containers as services. In this case, we need two services: one for __training__ and one for __inference__.
139
+
140
+ ### Step-by-Step Process for Creating docker-compose.yml
141
+
142
+ 1. __Define the Version__:
143
+ - Docker Compose uses a versioning system. Use version `3.8`, which is widely supported and offers features such as networking and volume support.
144
+
145
+ ```yaml
146
+ version: '3.8'
147
+ ```
148
+
149
+ 2. __Set Up the `train` Service__:
150
+ - The `train` service is responsible for running the training script. It builds the Docker image, runs the training command, and uses volumes to store the data, checkpoints, and artifacts.
151
+
152
+ ```yaml
153
+ services:
154
+ train:
155
+ build:
156
+ context: .
157
+ command: python -m src.train
158
+ volumes:
159
+ - data:/app/data
160
+ - checkpoints:/app/checkpoints
161
+ - artifacts:/app/artifacts
162
+ shm_size: '2g' # Increase shared memory to prevent DataLoader issues
163
+ networks:
164
+ - default
165
+ env_file:
166
+ - .env # Load environment variables
167
+ ```
168
+
169
+ 3. __Set Up the `inference` Service__:
170
+ - The `inference` service runs after the training has completed. It waits for a file (e.g., `train_done.flag`) to be created by the training process and then runs the inference script.
171
+
172
+ ```yaml
173
+ inference:
174
+ build:
175
+ context: .
176
+ command: /bin/bash -c "while [ ! -f /app/checkpoints/train_done.flag ]; do sleep 10; done; python -m src.infer"
177
+ volumes:
178
+ - checkpoints:/app/checkpoints
179
+ - artifacts:/app/artifacts
180
+ shm_size: '2g'
181
+ networks:
182
+ - default
183
+ depends_on:
184
+ - train
185
+ env_file:
186
+ - .env
187
+ ```
188
+
189
+ 4. __Define Shared Volumes__:
190
+ - Volumes allow services to share data. Here, we define three shared volumes:
191
+ - `data`: Stores the input data.
192
+ - `checkpoints`: Stores the model checkpoints and the flag indicating training is complete.
193
+ - `artifacts`: Stores the final model outputs or artifacts.
194
+
195
+ ```yaml
196
+ volumes:
197
+ data:
198
+ checkpoints:
199
+ artifacts:
200
+ ```
201
+
202
+ 5. __Set Up Networking__:
203
+ - Use the default network to allow the services to communicate.
204
+
205
+ ```yaml
206
+ networks:
207
+ default:
208
+ ```
209
+
210
+ ### Final docker-compose.yml
211
+
212
+ ```yaml
213
+ version: '3.8'
214
+
215
+ services:
216
+ train:
217
+ build:
218
+ context: .
219
+ command: python -m src.train
220
+ volumes:
221
+ - data:/app/data
222
+ - checkpoints:/app/checkpoints
223
+ - artifacts:/app/artifacts
224
+ shm_size: '2g'
225
+ networks:
226
+ - default
227
+ env_file:
228
+ - .env
229
+
230
+ inference:
231
+ build:
232
+ context: .
233
+ command: /bin/bash -c "while [ ! -f /app/checkpoints/train_done.flag ]; do sleep 10; done; python -m src.infer"
234
+ volumes:
235
+ - checkpoints:/app/checkpoints
236
+ - artifacts:/app/artifacts
237
+ shm_size: '2g'
238
+ networks:
239
+ - default
240
+ depends_on:
241
+ - train
242
+ env_file:
243
+ - .env
244
+
245
+ volumes:
246
+ data:
247
+ checkpoints:
248
+ artifacts:
249
+
250
+ networks:
251
+ default:
252
+ ```
253
+
254
+ ---
255
+
256
+ ### Summary
257
+
258
+ 1. __Dockerfile__:
259
+ - A multi-stage Dockerfile is used to create a lightweight image where the dependencies are installed with Poetry and the application code is run using a virtual environment.
260
+ - It ensures that all dependencies are isolated in a virtual environment, and the final container only includes what is necessary for the runtime.
261
+
262
+ 2. __docker-compose.yml__:
263
+ - The `docker-compose.yml` file defines two services:
264
+ - __train__: Runs the training script and stores checkpoints.
265
+ - __inference__: Waits for the training to finish and runs inference based on the saved model.
266
+ - Shared volumes ensure that the services can access data, checkpoints, and artifacts.
267
+ - `shm_size` is increased to prevent issues with DataLoader in PyTorch when using multiple workers.
268
+
269
+ This setup allows for easy management of multiple services using Docker Compose, ensuring reproducibility and simplicity.
270
+
271
+ ## __References__
272
+
273
+ - <https://stackoverflow.com/questions/53835198/integrating-python-poetry-with-docker>
274
+ - <https://github.com/fralik/poetry-with-private-repos/blob/master/Dockerfile>
275
+ - <https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0>
276
+ - <https://www.martinrichards.me/post/python_poetry_docker/>
277
+ - <https://gist.github.com/soof-golan/6ebb97a792ccd87816c0bda1e6e8b8c2>
278
+
279
+ 8. ## __DVC SETUP__
280
+
281
+ First, initialize DVC in the repository using the following commands
282
+
283
+ ```bash
284
+ dvc init
285
+ dvc version
286
+ dvc init -f
287
+ dvc config core.autostage true
288
+ dvc add data
289
+ dvc remote add -d myremote /tmp/dvcstore
290
+ dvc push
291
+ ```
292
+
293
+ Add some more files to the data directory and run the following commands
294
+
295
+ ```bash
296
+ dvc add data
297
+ dvc push
298
+ dvc pull
299
+ ```
300
+
301
+ Next, go back one commit and run the following commands
302
+
303
+ ```bash
304
+ git checkout HEAD~1
305
+ dvc checkout
306
+ # you will get one file less
307
+ ```
308
+
309
+ Next go back to the latest commit and run the following command
310
+
311
+ ```bash
312
+ git checkout -
313
+ dvc checkout
314
+ dvc pull
315
+ dvc commit
316
+ ```
317
+
318
+ Next run the following command to add google drive as a remote
319
+
320
+ ```bash
321
+ dvc remote add --default gdrive gdrive://1w2e3r4t5y6u7i8o9p0
322
+ dvc remote modify gdrive gdrive_acknowledge_abuse true
323
+ dvc remote modify gdrive gdrive_client_id <>
324
+ dvc remote modify gdrive gdrive_client_secret <>
325
+ # does not work when run from a VM with port forwarding to the local machine
326
+ ```
327
+
328
+ Next run the following command to add azure-blob as a remote
329
+
330
+ ```bash
331
+ dvc remote remove azblob
332
+ dvc remote add --default azblob azure://mycontainer/myfolder
333
+ dvc remote modify --local azblob connection_string "<>"
334
+ dvc remote modify azblob allow_anonymous_login true
335
+ dvc push -r azblob
336
+ # this works as-is and requires no explicit login
337
+ ```
338
+
339
+ Next we will add S3 as a remote
340
+
341
+ ```bash
342
+ dvc remote add --default aws_remote s3://deep-bucket-s3/data
343
+ dvc remote modify --local aws_remote access_key_id <>
344
+ dvc remote modify --local aws_remote secret_access_key <>
345
+ dvc remote modify --local aws_remote region ap-south-1
346
+ dvc remote modify aws_remote region ap-south-1
347
+ dvc push -r aws_remote -v
348
+ ```
349
+
350
+ 9. ## __HYDRA SETUP__
351
+
352
+ ```bash
353
+ # Install hydra
354
+ pip install hydra-core hydra_colorlog omegaconf
355
+ # Fillup the configs folder with the files as per the project
356
+ # Run the following command to run the hydra experiment
357
+ # for train
358
+ python -m src.hydra_test experiment=catdog_experiment ++task_name=train ++train=True ++test=False
359
+ # for eval
360
+ python -m src.hydra_test experiment=catdog_experiment ++task_name=eval ++train=False ++test=True
361
+ # for both
362
+ python -m src.hydra_test experiment=catdog_experiment task_name=train train=True test=True # + means adding new key value pair to the existing config and ++ means overriding the existing key value pair
363
+ ```
364
+
365
+ 10. ## __LOCAL SETUP__
366
+
367
+ ```bash
368
+ python -m src.train experiment=catdog_experiment ++task_name=train ++train=True ++test=False
369
+ python -m src.train experiment=catdog_experiment ++task_name=eval ++train=False ++test=True
370
+ python -m src.infer experiment=catdog_experiment
371
+ ```
372
+
373
+ 11. ## _DVC_PIPELINE_SETUP_
374
+
375
+ ```bash
376
+ dvc repro
377
+ ```
378
+ 12. ## _DVC Experiments_
379
+ - To run the dvc experiments keep different experiment_<>.yaml files in the configs folder under experiment folder
380
+ - Make sure to override the default values in the experiment_<>.yaml file for each parameter that you want to change
381
+
382
+ 13. ## _HYDRA Experiments_
383
+ - make sure to declare the config file in YAML format in the configs/hparam folder
384
+ - have hparam null in train and eval config file
385
+ - run the following command to run the hydra experiment
386
+ ```bash
387
+ python -m src.train --multirun experiment=catdog_experiment_convnext ++task_name=train ++train=True ++test=False hparam=catdog_classifier_covnext
388
+ python -m src.create_artifacts
389
+ ```
390
+
391
+ 14. ## __Latest Execution Command__
392
+
393
+ ```bash
394
+ python -m src.train_optuna_callbacks experiment=catdog_experiment ++task_name=train ++train=True ++test=False
395
+ python -m src.train_optuna_callbacks experiment=catdog_experiment ++task_name=test ++train=False ++test=True
396
+ python -m src.infer experiment=catdog_experiment
397
+ ```
398
+
399
+ 15. ## __GPU Setup__
400
+ ```bash
401
+ docker build -t my-gpu-app .
402
+ docker run --gpus all my-gpu-app
403
+ docker exec -it <container_id> /bin/bash
404
+ # pytorch/pytorch:2.2.2-cuda12.1-cudnn8-runtime supports cuda 12.1 and python 3.10.14
405
+ ```
406
+ ```bash
407
+ # for docker compose what we need to is follow similar to the following
408
+ services:
409
+ test:
410
+ image: nvidia/cuda:12.3.1-base-ubuntu20.04
411
+ command: nvidia-smi
412
+ deploy:
413
+ resources:
414
+ reservations:
415
+ devices:
416
+ - driver: nvidia
417
+ count: 1
418
+ capabilities: [gpu]
419
+ ```
client.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Copyright The Lightning AI team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import requests
16
+
17
+ response = requests.post("http://127.0.0.1:8080/predict", json={"input": 4.0})
18
+ print(f"Status: {response.status_code}\nResponse:\n {response.text}")
configs/callbacks/default.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ defaults:
2
+ - model_checkpoint
3
+ - early_stopping
4
+ - rich_model_summary
5
+ - rich_progress_bar
6
+ - _self_
7
+
8
+ model_checkpoint:
9
+ dirpath: ${paths.ckpt_dir}
10
+ monitor: "val_loss"
11
+ mode: "min"
12
+ save_last: False
13
+ auto_insert_metric_name: False
14
+
15
+ early_stopping:
16
+ monitor: "val_loss"
17
+ patience: 3
18
+ mode: "min"
19
+
20
+ rich_model_summary:
21
+ max_depth: -1
22
+
23
+ rich_progress_bar:
24
+ refresh_rate: 1
configs/callbacks/early_stopping.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.EarlyStopping.html
2
+
3
+ early_stopping:
4
+ _target_: lightning.pytorch.callbacks.EarlyStopping
5
+ monitor: val_loss # quantity to be monitored, must be specified !!!
6
+ min_delta: 0. # minimum change in the monitored quantity to qualify as an improvement
7
+ patience: 3 # number of checks with no improvement after which training will be stopped
8
+ verbose: False # verbosity mode
9
+ mode: "min" # "max" means higher metric value is better, can be also "min"
10
+ strict: True # whether to crash the training if monitor is not found in the validation metrics
11
+ check_finite: True # when set True, stops training when the monitor becomes NaN or infinite
12
+ stopping_threshold: null # stop training immediately once the monitored quantity reaches this threshold
13
+ divergence_threshold: null # stop training as soon as the monitored quantity becomes worse than this threshold
14
+ check_on_train_epoch_end: null # whether to run early stopping at the end of the training epoch
15
+ # log_rank_zero_only: False # this keyword argument isn't available in stable version
configs/callbacks/model_checkpoint.yaml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html
2
+
3
+ model_checkpoint:
4
+ _target_: lightning.pytorch.callbacks.ModelCheckpoint
5
+ dirpath: null # directory to save the model file
6
+ filename: best-checkpoint # checkpoint filename
7
+ monitor: val_loss # name of the logged metric which determines when model is improving
8
+ verbose: False # verbosity mode
9
+ save_last: False # additionally always save an exact copy of the last checkpoint to a file last.ckpt
10
+ save_top_k: 1 # save k best models (determined by above metric)
11
+ mode: "min" # "max" means higher metric value is better, can be also "min"
12
+ auto_insert_metric_name: True # when True, the checkpoints filenames will contain the metric name
13
+ save_weights_only: False # if True, then only the model’s weights will be saved
14
+ every_n_train_steps: null # number of training steps between checkpoints
15
+ train_time_interval: null # checkpoints are monitored at the specified time interval
16
+ every_n_epochs: null # number of epochs between checkpoints
17
+ save_on_train_epoch_end: null # whether to run checkpointing at the end of the training epoch or the end of validation
configs/callbacks/rich_model_summary.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.RichModelSummary.html
2
+ rich_model_summary:
3
+ _target_: lightning.pytorch.callbacks.RichModelSummary
4
+ max_depth: 1
configs/callbacks/rich_progress_bar.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.RichProgressBar.html
2
+ rich_progress_bar:
3
+ _target_: lightning.pytorch.callbacks.RichProgressBar
4
+ refresh_rate: 1
configs/data/catdog.yaml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ _target_: src.datamodules.catdog_datamodule.CatDogImageDataModule
2
+ root_dir: ${paths.data_dir}
3
+ data_dir: "cats_and_dogs_filtered"
4
+ url: ${paths.data_url}
5
+ num_workers: 4
6
+ batch_size: 32
7
+ train_val_split: [0.8, 0.2]
8
+ pin_memory: False
9
+ image_size: 224
configs/experiment/catdog_experiment.yaml ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=catdog_ex
5
+
6
+ defaults:
7
+ - override /paths: catdog
8
+ - override /data: catdog
9
+ - override /model: catdog_classifier
10
+ - override /callbacks: default
11
+ - override /logger: default
12
+ - override /trainer: default
13
+
14
+ # all parameters below will be merged with parameters from default configurations set above
15
+ # this allows you to overwrite only specified parameters
16
+
17
+ seed: 42
18
+ name: "catdog_experiment"
19
+
20
+ data:
21
+ data_dir: "cats_and_dogs_filtered"
22
+ batch_size: 64
23
+ num_workers: 8
24
+ pin_memory: True
25
+ image_size: 224
26
+
27
+ model:
28
+ lr: 5e-5
29
+ weight_decay: 1e-5
30
+ factor: 0.5
31
+ patience: 5
32
+ min_lr: 1e-6
33
+ num_classes: 2
34
+ patch_size: 16
35
+ embed_dim: 256
36
+ depth: 4
37
+ num_heads: 4
38
+ mlp_ratio: 4
39
+
40
+ trainer:
41
+ min_epochs: 1
42
+ max_epochs: 5
43
+
44
+ callbacks:
45
+ model_checkpoint:
46
+ monitor: "val_acc"
47
+ mode: "max"
48
+ save_top_k: 1
49
+ save_last: True
50
+ verbose: True
51
+
52
+ early_stopping:
53
+ monitor: "val_acc"
54
+ patience: 10
55
+ mode: "max"
56
+ verbose: True
57
+
58
+ rich_model_summary:
59
+ max_depth: 1
60
+
61
+ rich_progress_bar:
62
+ refresh_rate: 1
configs/experiment/catdog_experiment_resnet.yaml ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=catdog_ex
5
+
6
+ defaults:
7
+ - override /paths: catdog
8
+ - override /data: catdog
9
+ - override /model: catdog_classifier_resnet
10
+ - override /callbacks: default
11
+ - override /logger: default
12
+ - override /trainer: default
13
+
14
+ # all parameters below will be merged with parameters from default configurations set above
15
+ # this allows you to overwrite only specified parameters
16
+
17
+ seed: 42
18
+ name: "catdog_experiment_resnet"
19
+
20
+ # Logger-specific configurations
21
+ logger:
22
+ aim:
23
+ experiment: ${name}
24
+ mlflow:
25
+ experiment_name: ${name}
26
+ tags:
27
+ model_type: "timm_classify"
28
+
29
+ data:
30
+ batch_size: 64
31
+ num_workers: 8
32
+ pin_memory: True
33
+ image_size: 160
34
+
35
+ model:
36
+ base_model: efficientnet_b0
37
+ pretrained: True
38
+ lr: 1e-3
39
+ weight_decay: 1e-5
40
+ factor: 0.1
41
+ patience: 5
42
+ min_lr: 1e-6
43
+ num_classes: 2
44
+
45
+ trainer:
46
+ min_epochs: 1
47
+ max_epochs: 5
48
+
49
+ callbacks:
50
+ model_checkpoint:
51
+ monitor: "val_acc"
52
+ mode: "max"
53
+ save_top_k: 1
54
+ save_last: True
55
+
56
+ early_stopping:
57
+ monitor: "val_acc"
58
+ patience: 3
59
+ mode: "max"
configs/hydra/default.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://hydra.cc/docs/configure_hydra/intro/
2
+
3
+ # enable color logging
4
+ defaults:
5
+ - override hydra_logging: colorlog
6
+ - override job_logging: colorlog
7
+
8
+ # output directory, generated dynamically on each run
9
+ run:
10
+ dir: ${paths.log_dir}/${task_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S}
11
+ sweep:
12
+ dir: ${paths.log_dir}/${task_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S}
13
+ subdir: ${hydra.job.num}
14
+
15
+ job_logging:
16
+ handlers:
17
+ file:
18
+ # Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242
19
+ filename: ${hydra.runtime.output_dir}/${task_name}.log
configs/infer.yaml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # specify here default configuration
4
+ # order of defaults determines the order in which configs override each other
5
+ defaults:
6
+ - _self_
7
+ - data: catdog
8
+ - model: catdog_classifier
9
+ - callbacks: default
10
+ - logger: null # set logger here or use command line (e.g. `python train.py logger=tensorboard`)
11
+ - trainer: default
12
+ - paths: catdog
13
+ - hydra: default
14
+ # experiment configs allow for version control of specific hyperparameters
15
+ # e.g. best hyperparameters for given model and datamodule
16
+ - experiment: catdog_experiment
17
+ # debugging config (enable through command line, e.g. `python train.py debug=default)
18
+ - debug: null
19
+
20
+ # task name, determines output directory path
21
+ task_name: "infer"
22
+
23
+ # tags to help you identify your experiments
24
+ # you can overwrite this in experiment configs
25
+ # overwrite from command line with `python train.py tags="[first_tag, second_tag]"`
26
+ tags: ["dev"]
27
+
28
+ # set False to skip model training
29
+ train: False
30
+
31
+ # evaluate on test set, using best model weights achieved during training
32
+ # lightning chooses best weights based on the metric specified in checkpoint callback
33
+ test: False
34
+
35
+ # simply provide checkpoint path to resume training
36
+ ckpt_path: ${paths.ckpt_dir}/best-checkpoint.ckpt
37
+
38
+ # seed for random number generators in pytorch, numpy and python.random
39
+ seed: 42
40
+
41
+ # name of the experiment
42
+ name: "catdog_experiment"
43
+
44
+ server:
45
+ port: 8080
46
+ max_batch_size: 8
47
+ batch_timeout: 0.01
48
+ accelerator: "auto"
49
+ devices: "auto"
50
+ workers_per_device: 2
51
+
52
+ labels: ["cat", "dog"]
configs/logger/aim.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ aim:
2
+ _target_: aim.pytorch_lightning.AimLogger
3
+ experiment: ${name}
4
+ train_metric_prefix: train_
5
+ test_metric_prefix: test_
6
+ val_metric_prefix: val_
configs/logger/csv.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # csv logger built in lightning
2
+
3
+ csv:
4
+ _target_: lightning.pytorch.loggers.csv_logs.CSVLogger
5
+ save_dir: "${paths.output_dir}"
6
+ name: "csv/"
7
+ prefix: ""
configs/logger/default.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # train with many loggers at once
2
+
3
+ defaults:
4
+ - csv
5
+ - tensorboard
configs/logger/mlflow.yaml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # MLflow logger configuration
2
+
3
+ mlflow:
4
+ _target_: lightning.pytorch.loggers.MLFlowLogger
5
+ experiment_name: ${name}
6
+ tracking_uri: file:${paths.log_dir}/mlruns
7
+ save_dir: ${paths.log_dir}/mlruns
8
+ log_model: False
9
+ prefix: ""
configs/logger/tensorboard.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://www.tensorflow.org/tensorboard/
2
+
3
+ tensorboard:
4
+ _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger
5
+ save_dir: "${paths.output_dir}/tensorboard/"
6
+ name: null
7
+ log_graph: False
8
+ default_hp_metric: True
9
+ prefix: ""
10
+ # version: ""
configs/model/catdog_classifier.yaml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # model class
3
+ _target_: src.models.catdog_model.ViTTinyClassifier
4
+
5
+ # model params
6
+ img_size: ${data.image_size}
7
+ patch_size: 16
8
+ num_classes: 2
9
+ embed_dim: 128
10
+ depth: 6
11
+ num_heads: 4
12
+ mlp_ratio: 4
13
+ pre_norm: False
14
+
15
+ # optimizer params
16
+ lr: 1e-3
17
+ weight_decay: 1e-5
18
+
19
+ # scheduler params
20
+ factor: 0.1
21
+ patience: 10
22
+ min_lr: 1e-6
configs/model/catdog_classifier_resnet.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: src.models.catdog_model_resnet.ResnetClassifier
2
+
3
+ # model params
4
+ base_model: efficientnet_b0
5
+ pretrained: True
6
+ num_classes: 2
7
+ # optimizer params
8
+ lr: 1e-3
9
+ weight_decay: 1e-5
10
+ # scheduler params
11
+ factor: 0.1
12
+ patience: 10
13
+ min_lr: 1e-6
configs/paths/catdog.yaml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # path to root directory
2
+ # this requires PROJECT_ROOT environment variable to exist
3
+ # you can replace it with "." if you want the root to be the current working directory
4
+ root_dir: ${oc.env:PROJECT_ROOT}
5
+
6
+ # path to data directory
7
+ data_dir: ${paths.root_dir}/data/
8
+
9
+ # path to logging directory
10
+ log_dir: ${paths.root_dir}/logs/
11
+
12
+ # path to checkpoint directory
13
+ ckpt_dir: ${paths.root_dir}/checkpoints
14
+
15
+ # path to artifact directory
16
+ artifact_dir: ${paths.root_dir}/artifacts/
17
+
18
+ # download url for the dataset
19
+ data_url: "https://download.pytorch.org/tutorials/cats_and_dogs_filtered.zip"
20
+
21
+ # path to output directory, created dynamically by hydra
22
+ # path generation pattern is specified in `configs/hydra/default.yaml`
23
+ # use it to store all files generated during the run, like ckpts and metrics
24
+ output_dir: ${hydra:runtime.output_dir}
25
+
26
+ # path to working directory
27
+ work_dir: ${hydra:runtime.cwd}
configs/train.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # specify here default configuration
4
+ # order of defaults determines the order in which configs override each other
5
+ defaults:
6
+ - _self_
7
+ - data: catdog
8
+ - model: catdog_classifier
9
+ - callbacks: default
10
+ - logger: default # set logger here or use command line (e.g. `python train.py logger=tensorboard`)
11
+ - trainer: default
12
+ - paths: catdog
13
+ - hydra: default
14
+
15
+ - experiment: catdog_experiment
16
+ # debugging config (enable through command line, e.g. `python train.py debug=default)
17
+ - debug: null
18
+
19
+ # task name, determines output directory path
20
+ task_name: "train"
21
+
22
+ # tags to help you identify your experiments
23
+ # you can overwrite this in experiment configs
24
+ # overwrite from command line with `python train.py tags="[first_tag, second_tag]"`
25
+ tags: ["dev"]
26
+
27
+ # set False to skip model training
28
+ train: True
29
+
30
+ # evaluate on test set, using best model weights achieved during training
31
+ # lightning chooses best weights based on the metric specified in checkpoint callback
32
+ test: False
33
+
34
+ # simply provide checkpoint path to resume training
35
+ ckpt_path: ${paths.ckpt_dir}/best-checkpoint.ckpt
36
+
37
+ # seed for random number generators in pytorch, numpy and python.random
38
+ seed: 42
39
+
40
+ # name of the experiment
41
+ name: "catdog_experiment"
42
+
43
+ # optimization metric
44
+ optimization_metric: "val_acc"
45
+
46
+ # optuna hyperparameter optimization
47
+ n_trials: 2
configs/trainer/default.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ default_root_dir: ${paths.output_dir}
3
+ min_epochs: 1
4
+ max_epochs: 6
5
+
6
+ accelerator: auto
7
+ devices: auto
8
+
9
+ # mixed precision for extra speed-up
10
+ # precision: 16
11
+
12
+ # set True to to ensure deterministic results makes training slower but gives more reproducibility than just setting seeds
13
+ deterministic: True
14
+
15
+ # Log every N steps in training and validation
16
+ log_every_n_steps: 10
17
+ fast_dev_run: False
18
+
19
+ gradient_clip_val: 1.0
20
+ gradient_clip_algorithm: 'norm'
data.dvc ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ outs:
2
+ - md5: 1a2429ba45778743c46917f7e6b9b542.dir
3
+ size: 97446370
4
+ nfiles: 3002
5
+ hash: md5
6
+ path: data
docker-compose-old.yaml ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.8'
2
+
3
+ services:
4
+ train:
5
+ build:
6
+ context: .
7
+ command: |
8
+ python -m src.train_optuna_callbacks experiment=catdog_experiment ++task_name=train ++train=True ++test=False && \
9
+ python -m src.create_artifacts && \
10
+ touch ./checkpoints/train_done.flag
11
+ volumes:
12
+ - ./data:/app/data
13
+ - ./checkpoints:/app/checkpoints
14
+ - ./artifacts:/app/artifacts
15
+ - ./logs:/app/logs
16
+ environment:
17
+ - PYTHONUNBUFFERED=1
18
+ - PYTHONPATH=/app
19
+ shm_size: '4g'
20
+ networks:
21
+ - default
22
+ env_file:
23
+ - .env
24
+
25
+ eval:
26
+ build:
27
+ context: .
28
+ command: |
29
+ sh -c 'while [ ! -f /app/checkpoints/train_done.flag ]; do sleep 10; done && python -m src.train_optuna_callbacks experiment=catdog_experiment ++task_name=test ++train=False ++test=True'
30
+ volumes:
31
+ - ./data:/app/data
32
+ - ./checkpoints:/app/checkpoints
33
+ - ./artifacts:/app/artifacts
34
+ - ./logs:/app/logs
35
+ environment:
36
+ - PYTHONUNBUFFERED=1
37
+ - PYTHONPATH=/app
38
+ shm_size: '4g'
39
+ networks:
40
+ - default
41
+ env_file:
42
+ - .env
43
+ depends_on:
44
+ - train
45
+
46
+ inference:
47
+ build:
48
+ context: .
49
+ command: |
50
+ sh -c 'while [ ! -f /app/checkpoints/train_done.flag ]; do sleep 10; done && python -m src.infer experiment=catdog_experiment'
51
+ volumes:
52
+ - ./data:/app/data
53
+ - ./checkpoints:/app/checkpoints
54
+ - ./artifacts:/app/artifacts
55
+ - ./logs:/app/logs
56
+ environment:
57
+ - PYTHONUNBUFFERED=1
58
+ - PYTHONPATH=/app
59
+ shm_size: '4g'
60
+ networks:
61
+ - default
62
+ env_file:
63
+ - .env
64
+ depends_on:
65
+ - train
66
+
67
+ volumes:
68
+ data:
69
+ checkpoints:
70
+ artifacts:
71
+ logs:
72
+
73
+ networks:
74
+ default:
docker-compose.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ train:
3
+ build:
4
+ context: .
5
+ command: |
6
+ python -m src.train_optuna_callbacks experiment=catdog_experiment_resnet ++task_name=train ++train=True ++test=False && \
7
+ python -m src.create_artifacts && \
8
+ touch ./checkpoints/train_done.flag
9
+ volumes:
10
+ - ./data:/app/data
11
+ - ./checkpoints:/app/checkpoints
12
+ - ./artifacts:/app/artifacts
13
+ - ./logs:/app/logs
14
+ environment:
15
+ - PYTHONUNBUFFERED=1
16
+ - PYTHONPATH=/app
17
+ shm_size: '4g'
18
+ networks:
19
+ - default
20
+ env_file:
21
+ - .env
22
+ deploy:
23
+ resources:
24
+ reservations:
25
+ devices:
26
+ - driver: nvidia
27
+ count: 1
28
+ capabilities: [gpu]
29
+
30
+ eval:
31
+ build:
32
+ context: .
33
+ command: |
34
+ sh -c 'while [ ! -f /app/checkpoints/train_done.flag ]; do sleep 10; done && python -m src.train_optuna_callbacks experiment=catdog_experiment_resnet ++task_name=test ++train=False ++test=True'
35
+ volumes:
36
+ - ./data:/app/data
37
+ - ./checkpoints:/app/checkpoints
38
+ - ./artifacts:/app/artifacts
39
+ - ./logs:/app/logs
40
+ environment:
41
+ - PYTHONUNBUFFERED=1
42
+ - PYTHONPATH=/app
43
+ shm_size: '4g'
44
+ networks:
45
+ - default
46
+ env_file:
47
+ - .env
48
+ deploy:
49
+ resources:
50
+ reservations:
51
+ devices:
52
+ - driver: nvidia
53
+ count: 1
54
+ capabilities: [gpu]
55
+
56
+ inference:
57
+ build:
58
+ context: .
59
+ command: |
60
+ sh -c 'while [ ! -f /app/checkpoints/train_done.flag ]; do sleep 10; done && python -m src.infer experiment=catdog_experiment_resnet'
61
+ volumes:
62
+ - ./data:/app/data
63
+ - ./checkpoints:/app/checkpoints
64
+ - ./artifacts:/app/artifacts
65
+ - ./logs:/app/logs
66
+ environment:
67
+ - PYTHONUNBUFFERED=1
68
+ - PYTHONPATH=/app
69
+ shm_size: '4g'
70
+ networks:
71
+ - default
72
+ env_file:
73
+ - .env
74
+ deploy:
75
+ resources:
76
+ reservations:
77
+ devices:
78
+ - driver: nvidia
79
+ count: 1
80
+ capabilities: [gpu]
81
+
82
+
83
+ volumes:
84
+ data:
85
+ checkpoints:
86
+ artifacts:
87
+ logs:
88
+
89
+ networks:
90
+ default:
docker_compose_exec.sh ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Exit on any error
4
+ set -e
5
+
6
+ # Helper function to wait for a condition
7
+ wait_for_condition() {
8
+ local condition=$1
9
+ local description=$2
10
+ echo "Waiting for $description..."
11
+ while ! eval "$condition"; do
12
+ echo "$description not ready. Retrying in 5 seconds..."
13
+ sleep 5
14
+ done
15
+ echo "$description is ready!"
16
+ }
17
+
18
+ # Step 1: Stop and rebuild all containers
19
+ echo "Stopping all running services..."
20
+ docker-compose stop
21
+
22
+ echo "Building all services..."
23
+ docker-compose build
24
+
25
+ # Step 2: Start the train service
26
+ echo "Starting 'train' service..."
27
+ docker-compose up -d train
28
+
29
+ # Step 3: Wait for train to complete
30
+ wait_for_condition "[ -f ./checkpoints/train_done.flag ]" "'train' service to complete"
31
+
32
+ # Step 4: Start the eval service
33
+ echo "Starting 'eval' service..."
34
+ docker-compose up -d eval
35
+
36
+ # Step 5: Start the server service
37
+ echo "Starting 'server' service..."
38
+ docker-compose up -d server
39
+
40
+ # Step 6: Wait for the server to be healthy
41
+ wait_for_condition "curl -s http://localhost:8080/health" "'server' service to be ready"
42
+
43
+ # Step 7: Start the client service
44
+ echo "Starting 'client' service..."
45
+ docker-compose up -d client
46
+
47
+ # Step 8: Show all running services
48
+ echo "All services are up and running:"
49
+ docker-compose ps
50
+
51
+ # Step 9: Stop and remove all containers after completion
52
+ echo "Stopping all services..."
53
+ docker-compose stop
54
+
55
+ echo "Removing all stopped containers..."
56
+ docker-compose rm -f
57
+
58
+ echo "Workflow complete!"
dvc.lock ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ schema: '2.0'
2
+ stages:
3
+ train:
4
+ cmd: docker-compose run --rm train
5
+ deps:
6
+ - path: data
7
+ hash: md5
8
+ md5: a372d6faac374b9f988d530864d0d7d5.dir
9
+ size: 97446370
10
+ nfiles: 3002
11
+ - path: docker-compose.yaml
12
+ hash: md5
13
+ md5: 85a64185c917ce60ae28e32c20c70164
14
+ size: 1735
15
+ isexec: true
16
+ - path: src/train.py
17
+ hash: md5
18
+ md5: 86b3871600a12f311e71dc171a2a37b9
19
+ size: 5972
20
+ isexec: true
21
+ outs:
22
+ - path: checkpoints/best-checkpoint.ckpt
23
+ hash: md5
24
+ md5: 6b6dcaa677324992489edaa51fc8b24f
25
+ size: 3755038
26
+ isexec: true
27
+ - path: checkpoints/train_done.flag
28
+ hash: md5
29
+ md5: bfc5d6f6817daa48ad7ae164aa621dbf
30
+ size: 20
31
+ isexec: true
dvc.yaml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ stages:
2
+ train:
3
+ cmd: docker-compose run --rm train
4
+ deps:
5
+ - docker-compose.yaml
6
+ - src/train_optuna_callbacks.py
7
+ - src/create_artifacts.py
8
+ - data
9
+ outs:
10
+ - checkpoints/best-checkpoint.ckpt
11
+ - checkpoints/train_done.flag
12
+ # eval:
13
+ # cmd: docker-compose run --rm eval
14
+ # deps:
15
+ # - docker-compose.yaml
16
+ # - src/train.py
17
+ # - checkpoints/best-checkpoint.ckpt
18
+ # - checkpoints/train_done.flag
19
+
20
+ # inference:
21
+ # cmd: docker-compose run --rm inference
22
+ # deps:
23
+ # - docker-compose.yaml
24
+ # - src/infer.py
25
+ # - checkpoints/best-checkpoint.ckpt
26
+ # - checkpoints/train_done.flag
27
+ # outs:
28
+ # - artifacts/image_prediction.png
ec2_runner_setup.md ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ **Install docker and docker-compose on Ubuntu 22.04**
2
+ __Prerequisites__:
3
+
4
+ * Have an aws account with a user that has the necessary permissions
5
+ * Have the access key either on env variables or in the github actions secrets
6
+ * Have an ec2 runner instance running/created in the aws account
7
+ * Have a s3 bucket created in the aws account
8
+ * Have aws container registry created in the aws account
9
+ __Local VM setup__:
10
+ * Install the AWS CLI and run `aws configure` to set up the access key, secret key, and the correct region
11
+ ```bash
12
+ curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
13
+ unzip awscliv2.zip
14
+ sudo ./aws/install
15
+ aws configure
16
+ ```
17
+
18
+
19
+ __Install docker__:
20
+ ```bash
21
+ sudo apt update
22
+ sudo apt install -y apt-transport-https ca-certificates curl software-properties-common
23
+ curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
24
+ echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
25
+ sudo apt update
26
+ sudo apt install -y docker-ce
27
+ sudo systemctl start docker
28
+ sudo systemctl enable docker
29
+ sudo usermod -aG docker $USER
30
+ sudo systemctl restart docker
31
+ sudo reboot
32
+ docker --version
33
+ docker ps
34
+ ```
35
+ __Install docker-compose__:
36
+ ```bash
37
+ sudo rm /usr/local/bin/docker-compose
38
+ sudo curl -L "https://github.com/docker/compose/releases/download/v2.30.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
39
+ sudo chmod +x /usr/local/bin/docker-compose
40
+ docker-compose --version
41
+ ```
42
+
43
+ __Github actions self-hosted runner__:
44
+ ```bash
45
+ mkdir actions-runner && cd actions-runner
46
+ curl -o actions-runner-linux-x64-2.320.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.320.0/actions-runner-linux-x64-2.320.0.tar.gz
47
+ echo "93ac1b7ce743ee85b5d386f5c1787385ef07b3d7c728ff66ce0d3813d5f46900 actions-runner-linux-x64-2.320.0.tar.gz" | shasum -a 256 -c
48
+ tar xzf ./actions-runner-linux-x64-2.320.0.tar.gz
49
+ ./config.sh --url https://github.com/soutrik71/pytorch-template-aws --token <Latest>
50
+ # cd actions-runner/
51
+ ./run.sh
52
+ ./config.sh remove --token <> # To remove the runner
53
+ # https://github.com/soutrik71/pytorch-template-aws/settings/actions/runners/new?arch=x64&os=linux
54
+ ```
55
+ __Activate aws cli__:
56
+ ```bash
57
+ curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
58
+ sudo apt install unzip
59
+ unzip awscliv2.zip
60
+ sudo ./aws/install
61
+ aws --version
62
+ aws configure
63
+
64
+ ```
65
+ __S3 bucket operations__:
66
+ ```bash
67
+ aws s3 cp data s3://deep-bucket-s3/data --recursive
68
+ aws s3 ls s3://deep-bucket-s3
69
+ aws s3 rm s3://deep-bucket-s3/data --recursive
70
+ ```
71
+
72
+ __Cuda Update Setup__:
73
+ ```bash
74
+ # if you already have nvidia drivers installed and you have a Tesla T4 GPU
75
+ sudo apt update
76
+ sudo apt upgrade
77
+ sudo reboot
78
+
79
+ sudo apt --fix-broken install
80
+ sudo apt install ubuntu-drivers-common
81
+ sudo apt autoremove
82
+
83
+ nvidia-smi
84
+ lsmod | grep nvidia
85
+
86
+ sudo apt install nvidia-cuda-toolkit
87
+ nvcc --version
88
+
89
+ ls /usr/local/ | grep cuda
90
+ ldconfig -p | grep cudnn
91
+ lspci | grep -i nvidia
92
+
93
+ Based on the provided details, here is the breakdown of the information about your GPU, CUDA, and environment setup:
94
+
95
+ ---
96
+
97
+ ### **1. GPU Details**
98
+ - **Model**: Tesla T4
99
+ - A popular NVIDIA GPU for deep learning and AI workloads.
100
+ - It belongs to the Turing architecture (TU104GL).
101
+
102
+ - **Memory**: 16 GB
103
+ - Only **2 MiB is currently in use**, indicating minimal GPU activity.
104
+
105
+ - **Temperature**: 25°C
106
+ - The GPU is operating at a low temperature, suggesting no heavy utilization currently.
107
+
108
+ - **Power Usage**: 11W / 70W
109
+ - The GPU is in idle or low-performance mode (P8).
110
+
111
+ - **MIG Mode**: Not enabled.
112
+ - MIG (Multi-Instance GPU) mode is specific to NVIDIA A100 and other GPUs, so it is not applicable here.
113
+
114
+ ---
115
+
116
+ ### **2. Driver and CUDA Version**
117
+ - **Driver Version**: 535.216.03
118
+ - Installed NVIDIA driver supports CUDA 12.x.
119
+
120
+ - **CUDA Runtime Version**: 12.2
121
+ - This is the active runtime version compatible with the driver.
122
+
123
+ ---
124
+
125
+ ### **3. CUDA Toolkit Versions**
126
+ From your `nvcc` and file system checks:
127
+ - **Default `nvcc` Version**: CUDA 10.1
128
+ - The system's default `nvcc` is pointing to an older CUDA 10.1 installation (`nvcc --version` output shows CUDA 10.1).
129
+
130
+ - **Installed CUDA Toolkits**:
131
+ - `cuda-12`
132
+ - `cuda-12.2`
133
+ - `cuda` (likely symlinked to `cuda-12.2`)
134
+
135
+ Multiple CUDA versions are installed. However, the runtime and drivers align with **CUDA 12.2**, while the default compiler (`nvcc`) is still from CUDA 10.1.
136
+
137
+ ---
138
+
139
+ ### **4. cuDNN Version**
140
+ From `cudnn_version.h` and `ldconfig`:
141
+ - **cuDNN Version**: 9.5.1
142
+ - This cuDNN version is compatible with **CUDA 12.x**.
143
+ - **cuDNN Runtime**: The libraries for cuDNN 9 are present under `/lib/x86_64-linux-gnu`.
144
+
145
+ ---
146
+
147
+ ### **5. NVIDIA Software Packages**
148
+ From `dpkg`:
149
+ - **NVIDIA Drivers**: Driver version 535 is installed.
150
+ - **CUDA Toolkit**: Multiple versions installed (`10.1`, `12`, `12.2`).
151
+ - **cuDNN**: Versions for CUDA 12 and CUDA 12.6 are installed (`cudnn9-cuda-12`, `cudnn9-cuda-12-6`).
152
+
153
+ ---
154
+
155
+ ### **6. Other Observations**
156
+ - **Graphics Settings Issue**:
157
+ - `nvidia-settings` failed due to the lack of a display server connection (`Connection refused`). Likely, this is a headless server without a GUI environment.
158
+
159
+ - **OpenGL Tools Missing**:
160
+ - `glxinfo` command is missing, indicating the `mesa-utils` package needs to be installed.
161
+
162
+ ---
163
+
164
+ ### **Summary of Setup**
165
+ - **GPU**: Tesla T4
166
+ - **Driver Version**: 535.216.03
167
+ - **CUDA Runtime Version**: 12.2
168
+ - **CUDA Toolkit Versions**: 10.1 (default `nvcc`), 12, 12.2
169
+ - **cuDNN Version**: 9.5.1 (compatible with CUDA 12.x)
170
+ - **Software Packages**: NVIDIA drivers, CUDA, cuDNN installed
171
+ ```
172
+
173
+ __CUDA New Installation__:
174
+ ```bash
175
+ # if you don't have nvidia drivers installed and you have a Tesla T4 GPU
176
+ lspci | grep -i nvidia # Check if the GPU is detected
177
+ To set up the T4 GPU from scratch, starting with no drivers or CUDA tools, and replicating the above configurations and drivers, follow these reverse-engineered steps:
178
+
179
+ ---
180
+
181
+ ### **1. Update System**
182
+ Ensure the system is updated:
183
+ ```bash
184
+ sudo apt update && sudo apt upgrade -y
185
+ sudo reboot
186
+ ```
187
+
188
+ ---
189
+
190
+ ### **2. Install NVIDIA Driver**
191
+ #### **a. Identify Required Driver**
192
+ The T4 GPU requires a compatible NVIDIA driver version. Based on your configurations, we will install **Driver 535**.
193
+
194
+ #### **b. Add NVIDIA Repository**
195
+ Add the official NVIDIA driver repository:
196
+ ```bash
197
+ sudo apt install -y software-properties-common
198
+ sudo add-apt-repository -y ppa:graphics-drivers/ppa
199
+ sudo apt update
200
+ ```
201
+
202
+ #### **c. Install Driver**
203
+ Install the driver for the T4 GPU:
204
+ ```bash
205
+ sudo apt install -y nvidia-driver-535
206
+ ```
207
+
208
+ #### **d. Verify Driver Installation**
209
+ Reboot the system and check the driver:
210
+ ```bash
211
+ sudo reboot
212
+ nvidia-smi
213
+ ```
214
+ This should display the GPU model and driver version.
215
+
216
+ ---
217
+
218
+ ### **3. Install CUDA Toolkit**
219
+ #### **a. Add CUDA Repository**
220
+ Download and install the CUDA 12.2 repository for Ubuntu 20.04:
221
+ ```bash
222
+ wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin
223
+ sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600
224
+ wget https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda-repo-ubuntu2004-12-2-local_12.2.0-535.86.10-1_amd64.deb
225
+ sudo dpkg -i cuda-repo-ubuntu2004-12-2-local_12.2.0-535.86.10-1_amd64.deb
226
+ sudo cp /var/cuda-repo-ubuntu2004-12-2-local/cuda-*-keyring.gpg /usr/share/keyrings/
227
+ sudo apt update
228
+ ```
229
+
230
+ #### **b. Install CUDA Toolkit**
231
+ Install CUDA 12.2:
232
+ ```bash
233
+ sudo apt install -y cuda
234
+ ```
235
+
236
+ #### **c. Set Up Environment Variables**
237
+ Add CUDA binaries to the PATH and library paths:
238
+ ```bash
239
+ echo 'export PATH=/usr/local/cuda-12.2/bin:$PATH' >> ~/.bashrc
240
+ echo 'export LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64:$LD_LIBRARY_PATH' >> ~/.bashrc
241
+ source ~/.bashrc
242
+ ```
243
+
244
+ #### **d. Verify CUDA Installation**
245
+ Check CUDA installation:
246
+ ```bash
247
+ nvcc --version
248
+ nvidia-smi
249
+ ```
250
+
251
+ ---
252
+
253
+ ### **4. Install cuDNN**
254
+ #### **a. Download cuDNN**
255
+ Download cuDNN 9.5.1 (compatible with CUDA 12.x) from the [NVIDIA cuDNN page](https://developer.nvidia.com/cudnn). You’ll need to log in and download the appropriate `.deb` files for Ubuntu 20.04.
256
+
257
+ #### **b. Install cuDNN**
258
+ Install the downloaded `.deb` files:
259
+ ```bash
260
+ sudo dpkg -i libcudnn9*.deb
261
+ ```
262
+
263
+ #### **c. Verify cuDNN**
264
+ Check the installed version:
265
+ ```bash
266
+ cat /usr/include/cudnn_version.h | grep CUDNN_MAJOR -A 2
267
+ ```
268
+
269
+ ---
270
+
271
+ ### **5. Install NCCL and Other Libraries**
272
+ Install additional NVIDIA libraries (like NCCL) required for distributed deep learning:
273
+ ```bash
274
+ sudo apt install -y libnccl2 libnccl-dev
275
+ ```
276
+
277
+ ---
278
+
279
+ ### **6. Install PyTorch**
280
+ #### **a. Install Python Environment**
281
+ Install Python and `pip` if not already present:
282
+ ```bash
283
+ sudo apt install -y python3 python3-pip
284
+ ```
285
+
286
+ #### **b. Install PyTorch with CUDA 12.2**
287
+ Install PyTorch with the appropriate CUDA runtime:
288
+ ```bash
289
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
290
+ ```
291
+
292
+ #### **c. Test PyTorch**
293
+ Run a quick test:
294
+ ```python
295
+ import torch
296
+ print(torch.cuda.is_available()) # Should return True
297
+ print(torch.cuda.get_device_name(0)) # Should return "Tesla T4"
298
+ ```
299
+
300
+ ---
301
+
302
+ ### **7. Optional: Install Nsight Tools**
303
+ For debugging and profiling:
304
+ ```bash
305
+ sudo apt install -y nsight-compute nsight-systems
306
+ ```
307
+
308
+ ---
309
+
310
+ ### **8. Check for OpenGL**
311
+ If you need OpenGL utilities (like `glxinfo`):
312
+ ```bash
313
+ sudo apt install -y mesa-utils
314
+ glxinfo | grep "OpenGL version"
315
+ ```
316
+
317
+ ---
318
+
319
+ ### **9. Validate Entire Setup**
320
+ Run the NVIDIA sample tests to confirm the configuration:
321
+ ```bash
322
+ cd /usr/local/cuda-12.2/samples/1_Utilities/deviceQuery
323
+ make
324
+ ./deviceQuery
325
+ ```
326
+ If successful, it should show details of the T4 GPU.
327
+
328
+ ---
329
+
330
+ ### **Summary of Installed Components**
331
+ - **GPU**: Tesla T4
332
+ - **Driver**: 535
333
+ - **CUDA Toolkit**: 12.2
334
+ - **cuDNN**: 9.5.1
335
+ - **PyTorch**: Installed with CUDA 12.2 support
336
+
337
+ This setup ensures your system is ready for deep learning workloads with the T4 GPU.
338
+
339
+ Install conda and create a new environment for the project
340
+ Install pytorch and torchvision in the new environment
341
+ Install other dependencies like numpy, pandas, matplotlib, etc.
342
+ Run the project code in the new environment
343
+ >>> import torch
344
+ >>> print(torch.cuda.is_available())
345
+ >>> print(torch.cuda.get_device_name(0))
346
+ >>> print(torch.version.cuda)
347
+ ```
348
+ __CUDA Docker Setup__:
349
+ ```bash
350
+ # If you are using docker and want to run a container with CUDA support
351
+ sudo apt install -y nvidia-container-toolkit
352
+ nvidia-ctk --version
353
+ sudo systemctl restart docker
354
+ sudo systemctl status docker
355
+ docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu20.04 nvidia-smi
356
+ docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu20.04 nvcc --version
357
+ ```
image.jpg ADDED
main.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ import torch
2
+
3
+ print(torch.cuda.is_available()) # Should return True if our GPU is enabled
4
+ print(torch.cuda.get_device_name(0)) # Should return "Tesla T4" if our GPU is enabled
5
+ print(torch.version.cuda)  # Prints the CUDA version PyTorch was built with (e.g. "12.1") if our GPU is enabled
notebooks/datamodule_lightning.ipynb ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "vscode": {
7
+ "languageId": "plaintext"
8
+ }
9
+ },
10
+ "source": [
11
+ "In this notebook, we will be discussing the pytorch lightning datamodule library with images in a folder structure with folders as class labels. We will be using the cats and dogs dataset from kaggle. The dataset can be downloaded from [here](https://www.kaggle.com/c/dogs-vs-cats/data). The dataset contains 25000 images of cats and dogs. We will be using 20000 images for training and 5000 images for validation. The images are in a folder structure with folders as class labels."
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 1,
17
+ "metadata": {},
18
+ "outputs": [
19
+ {
20
+ "data": {
21
+ "application/javascript": "IPython.notebook.set_autosave_interval(300000)"
22
+ },
23
+ "metadata": {},
24
+ "output_type": "display_data"
25
+ },
26
+ {
27
+ "name": "stdout",
28
+ "output_type": "stream",
29
+ "text": [
30
+ "Autosaving every 300 seconds\n"
31
+ ]
32
+ }
33
+ ],
34
+ "source": [
35
+ "%autosave 300\n",
36
+ "%load_ext autoreload\n",
37
+ "%autoreload 2\n",
38
+ "%reload_ext autoreload\n",
39
+ "%config Completer.use_jedi = False"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 2,
45
+ "metadata": {},
46
+ "outputs": [
47
+ {
48
+ "name": "stdout",
49
+ "output_type": "stream",
50
+ "text": [
51
+ "/mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws\n"
52
+ ]
53
+ }
54
+ ],
55
+ "source": [
56
+ "import os\n",
57
+ "\n",
58
+ "os.chdir(\"..\")\n",
59
+ "print(os.getcwd())"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": 3,
65
+ "metadata": {},
66
+ "outputs": [
67
+ {
68
+ "name": "stderr",
69
+ "output_type": "stream",
70
+ "text": [
71
+ "/anaconda/envs/emlo_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
72
+ " from .autonotebook import tqdm as notebook_tqdm\n"
73
+ ]
74
+ }
75
+ ],
76
+ "source": [
77
+ "from pathlib import Path\n",
78
+ "from typing import Union, Tuple, Optional, List\n",
79
+ "import os\n",
80
+ "import lightning as L\n",
81
+ "from torch.utils.data import DataLoader, random_split\n",
82
+ "from torchvision import transforms\n",
83
+ "from torchvision.datasets import ImageFolder\n",
84
+ "from torchvision.datasets.utils import download_and_extract_archive\n",
85
+ "from loguru import logger"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": 32,
91
+ "metadata": {},
92
+ "outputs": [],
93
+ "source": [
94
+ "class CatDogImageDataModule(L.LightningDataModule):\n",
95
+ " \"\"\"DataModule for Cat and Dog Image Classification using ImageFolder.\"\"\"\n",
96
+ "\n",
97
+ " def __init__(\n",
98
+ " self,\n",
99
+ " data_root: Union[str, Path] = \"data\",\n",
100
+ " data_dir: Union[str, Path] = \"cats_and_dogs_filtered\",\n",
101
+ " batch_size: int = 32,\n",
102
+ " num_workers: int = 4,\n",
103
+ " train_val_split: List[float] = [0.8, 0.2],\n",
104
+ " pin_memory: bool = False,\n",
105
+ " image_size: int = 224,\n",
106
+ " url: str = \"https://download.pytorch.org/tutorials/cats_and_dogs_filtered.zip\",\n",
107
+ " ):\n",
108
+ " super().__init__()\n",
109
+ " self.data_root = Path(data_root)\n",
110
+ " self.data_dir = data_dir\n",
111
+ " self.batch_size = batch_size\n",
112
+ " self.num_workers = num_workers\n",
113
+ " self.train_val_split = train_val_split\n",
114
+ " self.pin_memory = pin_memory\n",
115
+ " self.image_size = image_size\n",
116
+ " self.url = url\n",
117
+ "\n",
118
+ " # Initialize variables for datasets\n",
119
+ " self.train_dataset = None\n",
120
+ " self.val_dataset = None\n",
121
+ " self.test_dataset = None\n",
122
+ "\n",
123
+ " def prepare_data(self):\n",
124
+ " \"\"\"Download the dataset if it doesn't exist.\"\"\"\n",
125
+ " self.dataset_path = self.data_root / self.data_dir\n",
126
+ " if not self.dataset_path.exists():\n",
127
+ " logger.info(\"Downloading and extracting dataset.\")\n",
128
+ " download_and_extract_archive(\n",
129
+ " url=self.url, download_root=self.data_root, remove_finished=True\n",
130
+ " )\n",
131
+ " logger.info(\"Download completed.\")\n",
132
+ "\n",
133
+ " def setup(self, stage: Optional[str] = None):\n",
134
+ " \"\"\"Set up the train, validation, and test datasets.\"\"\"\n",
135
+ "\n",
136
+ " train_transform = transforms.Compose(\n",
137
+ " [\n",
138
+ " transforms.Resize((self.image_size, self.image_size)),\n",
139
+ " transforms.RandomHorizontalFlip(0.1),\n",
140
+ " transforms.RandomRotation(10),\n",
141
+ " transforms.RandomAffine(0, shear=10, scale=(0.8, 1.2)),\n",
142
+ " transforms.RandomAutocontrast(0.1),\n",
143
+ " transforms.RandomAdjustSharpness(2, 0.1),\n",
144
+ " transforms.ToTensor(),\n",
145
+ " transforms.Normalize(\n",
146
+ " mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]\n",
147
+ " ),\n",
148
+ " ]\n",
149
+ " )\n",
150
+ "\n",
151
+ " test_transform = transforms.Compose(\n",
152
+ " [\n",
153
+ " transforms.Resize((self.image_size, self.image_size)),\n",
154
+ " transforms.ToTensor(),\n",
155
+ " transforms.Normalize(\n",
156
+ " mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]\n",
157
+ " ),\n",
158
+ " ]\n",
159
+ " )\n",
160
+ "\n",
161
+ " train_path = self.dataset_path / \"train\"\n",
162
+ " test_path = self.dataset_path / \"test\"\n",
163
+ "\n",
164
+ " self.prepare_data()\n",
165
+ "\n",
166
+ " if stage == \"fit\" or stage is None:\n",
167
+ " full_train_dataset = ImageFolder(root=train_path, transform=train_transform)\n",
168
+ " self.class_names = full_train_dataset.classes\n",
169
+ " train_size = int(self.train_val_split[0] * len(full_train_dataset))\n",
170
+ " val_size = len(full_train_dataset) - train_size\n",
171
+ " self.train_dataset, self.val_dataset = random_split(\n",
172
+ " full_train_dataset, [train_size, val_size]\n",
173
+ " )\n",
174
+ " logger.info(\n",
175
+ " f\"Train/Validation split: {len(self.train_dataset)} train, {len(self.val_dataset)} validation images.\"\n",
176
+ " )\n",
177
+ "\n",
178
+ " if stage == \"test\" or stage is None:\n",
179
+ " self.test_dataset = ImageFolder(root=test_path, transform=test_transform)\n",
180
+ " logger.info(f\"Test dataset size: {len(self.test_dataset)} images.\")\n",
181
+ "\n",
182
+ " def _create_dataloader(self, dataset, shuffle: bool = False) -> DataLoader:\n",
183
+ " \"\"\"Helper function to create a DataLoader.\"\"\"\n",
184
+ " return DataLoader(\n",
185
+ " dataset=dataset,\n",
186
+ " batch_size=self.batch_size,\n",
187
+ " num_workers=self.num_workers,\n",
188
+ " pin_memory=self.pin_memory,\n",
189
+ " shuffle=shuffle,\n",
190
+ " )\n",
191
+ "\n",
192
+ " def train_dataloader(self) -> DataLoader:\n",
193
+ " return self._create_dataloader(self.train_dataset, shuffle=True)\n",
194
+ "\n",
195
+ " def val_dataloader(self) -> DataLoader:\n",
196
+ " return self._create_dataloader(self.val_dataset)\n",
197
+ "\n",
198
+ " def test_dataloader(self) -> DataLoader:\n",
199
+ " return self._create_dataloader(self.test_dataset)\n",
200
+ "\n",
201
+ " def get_class_names(self) -> List[str]:\n",
202
+ " return self.class_names"
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "code",
207
+ "execution_count": 33,
208
+ "metadata": {},
209
+ "outputs": [],
210
+ "source": [
211
+ "datamodule = CatDogImageDataModule(\n",
212
+ " data_root=\"data\",\n",
213
+ " data_dir=\"cats_and_dogs_filtered\",\n",
214
+ " batch_size=32,\n",
215
+ " num_workers=4,\n",
216
+ " train_val_split=[0.8, 0.2],\n",
217
+ " pin_memory=True,\n",
218
+ " image_size=224,\n",
219
+ " url=\"https://download.pytorch.org/tutorials/cats_and_dogs_filtered.zip\",\n",
220
+ ")"
221
+ ]
222
+ },
223
+ {
224
+ "cell_type": "code",
225
+ "execution_count": 35,
226
+ "metadata": {},
227
+ "outputs": [
228
+ {
229
+ "name": "stderr",
230
+ "output_type": "stream",
231
+ "text": [
232
+ "\u001b[32m2024-11-10 05:37:17.840\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36msetup\u001b[0m:\u001b[36m81\u001b[0m - \u001b[1mTrain/Validation split: 2241 train, 561 validation images.\u001b[0m\n"
233
+ ]
234
+ },
235
+ {
236
+ "name": "stderr",
237
+ "output_type": "stream",
238
+ "text": [
239
+ "\u001b[32m2024-11-10 05:37:17.910\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36msetup\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTest dataset size: 198 images.\u001b[0m\n"
240
+ ]
241
+ }
242
+ ],
243
+ "source": [
244
+ "datamodule.prepare_data()\n",
245
+ "datamodule.setup()\n",
246
+ "class_names = datamodule.get_class_names()\n",
247
+ "train_dataloader = datamodule.train_dataloader()\n",
248
+ "val_dataloader= datamodule.val_dataloader()\n",
249
+ "test_dataloader= datamodule.test_dataloader()"
250
+ ]
251
+ },
252
+ {
253
+ "cell_type": "code",
254
+ "execution_count": 36,
255
+ "metadata": {},
256
+ "outputs": [
257
+ {
258
+ "data": {
259
+ "text/plain": [
260
+ "['cats', 'dogs']"
261
+ ]
262
+ },
263
+ "execution_count": 36,
264
+ "metadata": {},
265
+ "output_type": "execute_result"
266
+ }
267
+ ],
268
+ "source": [
269
+ "class_names"
270
+ ]
271
+ },
272
+ {
273
+ "cell_type": "code",
274
+ "execution_count": null,
275
+ "metadata": {},
276
+ "outputs": [],
277
+ "source": []
278
+ }
279
+ ],
280
+ "metadata": {
281
+ "kernelspec": {
282
+ "display_name": "emlo_env",
283
+ "language": "python",
284
+ "name": "python3"
285
+ },
286
+ "language_info": {
287
+ "codemirror_mode": {
288
+ "name": "ipython",
289
+ "version": 3
290
+ },
291
+ "file_extension": ".py",
292
+ "mimetype": "text/x-python",
293
+ "name": "python",
294
+ "nbconvert_exporter": "python",
295
+ "pygments_lexer": "ipython3",
296
+ "version": "3.10.15"
297
+ }
298
+ },
299
+ "nbformat": 4,
300
+ "nbformat_minor": 2
301
+ }
notebooks/training_lightning_tests.ipynb ADDED
@@ -0,0 +1,1011 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "data": {
10
+ "application/javascript": "IPython.notebook.set_autosave_interval(300000)"
11
+ },
12
+ "metadata": {},
13
+ "output_type": "display_data"
14
+ },
15
+ {
16
+ "name": "stdout",
17
+ "output_type": "stream",
18
+ "text": [
19
+ "Autosaving every 300 seconds\n"
20
+ ]
21
+ }
22
+ ],
23
+ "source": [
24
+ "%autosave 300\n",
25
+ "%load_ext autoreload\n",
26
+ "%autoreload 2\n",
27
+ "%reload_ext autoreload\n",
28
+ "%config Completer.use_jedi = False"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 2,
34
+ "metadata": {},
35
+ "outputs": [
36
+ {
37
+ "name": "stdout",
38
+ "output_type": "stream",
39
+ "text": [
40
+ "/mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws\n"
41
+ ]
42
+ }
43
+ ],
44
+ "source": [
45
+ "\n",
46
+ "import os\n",
47
+ "\n",
48
+ "os.chdir(\"..\")\n",
49
+ "print(os.getcwd())"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 3,
55
+ "metadata": {},
56
+ "outputs": [
57
+ {
58
+ "name": "stderr",
59
+ "output_type": "stream",
60
+ "text": [
61
+ "/anaconda/envs/emlo_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
62
+ " from .autonotebook import tqdm as notebook_tqdm\n"
63
+ ]
64
+ }
65
+ ],
66
+ "source": [
67
+ "import os\n",
68
+ "import shutil\n",
69
+ "from pathlib import Path\n",
70
+ "import torch\n",
71
+ "import lightning as L\n",
72
+ "from lightning.pytorch.loggers import Logger\n",
73
+ "from typing import List\n",
74
+ "from src.datamodules.catdog_datamodule import CatDogImageDataModule\n",
75
+ "from src.utils.logging_utils import setup_logger, task_wrapper\n",
76
+ "from loguru import logger\n",
77
+ "from dotenv import load_dotenv, find_dotenv\n",
78
+ "import rootutils\n",
79
+ "import hydra\n",
80
+ "from omegaconf import DictConfig, OmegaConf\n",
81
+ "from lightning.pytorch.callbacks import (\n",
82
+ " ModelCheckpoint,\n",
83
+ " EarlyStopping,\n",
84
+ " RichModelSummary,\n",
85
+ " RichProgressBar,\n",
86
+ ")\n",
87
+ "from lightning.pytorch.loggers import TensorBoardLogger, CSVLogger"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "code",
92
+ "execution_count": 4,
93
+ "metadata": {},
94
+ "outputs": [
95
+ {
96
+ "name": "stderr",
97
+ "output_type": "stream",
98
+ "text": [
99
+ "\u001b[32m2024-11-08 18:25:17.572\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m9\u001b[0m - \u001b[31m\u001b[1mname '__file__' is not defined\u001b[0m\n"
100
+ ]
101
+ }
102
+ ],
103
+ "source": [
104
+ "# Load environment variables\n",
105
+ "load_dotenv(find_dotenv(\".env\"))\n",
106
+ "\n",
107
+ "# Setup root directory\n",
108
+ "try:\n",
109
+ " root = rootutils.setup_root(__file__, indicator=\".project-root\")\n",
110
+ "\n",
111
+ "except Exception as e:\n",
112
+ " logger.error(e)\n",
113
+ " root = Path(os.getcwd())\n",
114
+ " os.environ[\"PROJECT_ROOT\"] = str(root)"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": 5,
120
+ "metadata": {},
121
+ "outputs": [],
122
+ "source": [
123
+ "def load_checkpoint_if_available(ckpt_path: str) -> str:\n",
124
+ " \"\"\"Check if the specified checkpoint exists and return the valid checkpoint path.\"\"\"\n",
125
+ " if ckpt_path and Path(ckpt_path).exists():\n",
126
+ " logger.info(f\"Checkpoint found: {ckpt_path}\")\n",
127
+ " return ckpt_path\n",
128
+ " else:\n",
129
+ " logger.warning(\n",
130
+ " f\"No checkpoint found at {ckpt_path}. Using current model weights.\"\n",
131
+ " )\n",
132
+ " return None\n",
133
+ "\n",
134
+ "\n",
135
+ "def clear_checkpoint_directory(ckpt_dir: str):\n",
136
+ " \"\"\"Clear all contents of the checkpoint directory without deleting the directory itself.\"\"\"\n",
137
+ " ckpt_dir_path = Path(ckpt_dir)\n",
138
+ " if ckpt_dir_path.exists() and ckpt_dir_path.is_dir():\n",
139
+ " logger.info(f\"Clearing checkpoint directory: {ckpt_dir}\")\n",
140
+ " # Iterate over all files and directories in the checkpoint directory and remove them\n",
141
+ " for item in ckpt_dir_path.iterdir():\n",
142
+ " try:\n",
143
+ " if item.is_file() or item.is_symlink():\n",
144
+ " item.unlink() # Remove file or symlink\n",
145
+ " elif item.is_dir():\n",
146
+ " shutil.rmtree(item) # Remove directory\n",
147
+ " except Exception as e:\n",
148
+ " logger.error(f\"Failed to delete {item}: {e}\")\n",
149
+ " logger.info(f\"Checkpoint directory cleared: {ckpt_dir}\")\n",
150
+ " else:\n",
151
+ " logger.info(\n",
152
+ " f\"Checkpoint directory does not exist. Creating directory: {ckpt_dir}\"\n",
153
+ " )\n",
154
+ " os.makedirs(ckpt_dir_path, exist_ok=True)\n",
155
+ "\n",
156
+ "\n",
157
+ "@task_wrapper\n",
158
+ "def train_module(\n",
159
+ " cfg: DictConfig,\n",
160
+ " data_module: L.LightningDataModule,\n",
161
+ " model: L.LightningModule,\n",
162
+ " trainer: L.Trainer,\n",
163
+ "):\n",
164
+ " \"\"\"Train the model using the provided Trainer and DataModule.\"\"\"\n",
165
+ " logger.info(\"Training the model\")\n",
166
+ " trainer.fit(model, data_module)\n",
167
+ " train_metrics = trainer.callback_metrics\n",
168
+ " try:\n",
169
+ " logger.info(\n",
170
+ " f\"Training completed with the following metrics- train_acc: {train_metrics['train_acc'].item()} and val_acc: {train_metrics['val_acc'].item()}\"\n",
171
+ " )\n",
172
+ " except KeyError:\n",
173
+ " logger.info(f\"Training completed with the following metrics:{train_metrics}\")\n",
174
+ "\n",
175
+ " return train_metrics\n",
176
+ "\n",
177
+ "\n",
178
+ "@task_wrapper\n",
179
+ "def run_test_module(\n",
180
+ " cfg: DictConfig,\n",
181
+ " datamodule: L.LightningDataModule,\n",
182
+ " model: L.LightningModule,\n",
183
+ " trainer: L.Trainer,\n",
184
+ "):\n",
185
+ " \"\"\"Test the model using the best checkpoint or the current model weights.\"\"\"\n",
186
+ " logger.info(\"Testing the model\")\n",
187
+ " datamodule.setup(stage=\"test\")\n",
188
+ "\n",
189
+ " ckpt_path = load_checkpoint_if_available(cfg.ckpt_path)\n",
190
+ "\n",
191
+ " # If no checkpoint is available, Lightning will use current model weights\n",
192
+ " test_metrics = trainer.test(model, datamodule, ckpt_path=ckpt_path)\n",
193
+ " logger.info(f\"Test metrics:\\n{test_metrics}\")\n",
194
+ "\n",
195
+ " return test_metrics[0] if test_metrics else {}"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": 6,
201
+ "metadata": {},
202
+ "outputs": [
203
+ {
204
+ "name": "stderr",
205
+ "output_type": "stream",
206
+ "text": [
207
+ "/tmp/ipykernel_487789/541470590.py:8: UserWarning: \n",
208
+ "The version_base parameter is not specified.\n",
209
+ "Please specify a compatability version level, or None.\n",
210
+ "Will assume defaults for version 1.1\n",
211
+ " with hydra.initialize(config_path=\"../configs\"):\n"
212
+ ]
213
+ },
214
+ {
215
+ "name": "stdout",
216
+ "output_type": "stream",
217
+ "text": [
218
+ "Full Configuration:\n",
219
+ "task_name: train\n",
220
+ "tags:\n",
221
+ "- dev\n",
222
+ "train: true\n",
223
+ "test: false\n",
224
+ "ckpt_path: ${paths.ckpt_dir}/best-checkpoint.ckpt\n",
225
+ "seed: 42\n",
226
+ "name: catdog_experiment\n",
227
+ "data:\n",
228
+ " _target_: src.datamodules.catdog_datamodule.CatDogImageDataModule\n",
229
+ " data_dir: ${paths.data_dir}\n",
230
+ " url: ${paths.data_url}\n",
231
+ " num_workers: 8\n",
232
+ " batch_size: 64\n",
233
+ " train_val_split:\n",
234
+ " - 0.8\n",
235
+ " - 0.2\n",
236
+ " pin_memory: true\n",
237
+ " image_size: 160\n",
238
+ "model:\n",
239
+ " _target_: src.models.catdog_model.ViTTinyClassifier\n",
240
+ " img_size: 160\n",
241
+ " patch_size: 16\n",
242
+ " num_classes: 2\n",
243
+ " embed_dim: 64\n",
244
+ " depth: 6\n",
245
+ " num_heads: 2\n",
246
+ " mlp_ratio: 3\n",
247
+ " pre_norm: false\n",
248
+ " lr: 0.001\n",
249
+ " weight_decay: 1.0e-05\n",
250
+ " factor: 0.1\n",
251
+ " patience: 10\n",
252
+ " min_lr: 1.0e-06\n",
253
+ "callbacks:\n",
254
+ " model_checkpoint:\n",
255
+ " dirpath: ${paths.ckpt_dir}\n",
256
+ " filename: best-checkpoint\n",
257
+ " monitor: val_acc\n",
258
+ " verbose: true\n",
259
+ " save_last: true\n",
260
+ " save_top_k: 1\n",
261
+ " mode: max\n",
262
+ " auto_insert_metric_name: false\n",
263
+ " save_weights_only: false\n",
264
+ " every_n_train_steps: null\n",
265
+ " train_time_interval: null\n",
266
+ " every_n_epochs: null\n",
267
+ " save_on_train_epoch_end: null\n",
268
+ " early_stopping:\n",
269
+ " monitor: val_acc\n",
270
+ " min_delta: 0.0\n",
271
+ " patience: 10\n",
272
+ " verbose: true\n",
273
+ " mode: max\n",
274
+ " strict: true\n",
275
+ " check_finite: true\n",
276
+ " stopping_threshold: null\n",
277
+ " divergence_threshold: null\n",
278
+ " check_on_train_epoch_end: null\n",
279
+ " rich_model_summary:\n",
280
+ " max_depth: 1\n",
281
+ " rich_progress_bar:\n",
282
+ " refresh_rate: 1\n",
283
+ "logger:\n",
284
+ " csv:\n",
285
+ " save_dir: ${paths.output_dir}\n",
286
+ " name: csv/\n",
287
+ " prefix: ''\n",
288
+ " tensorboard:\n",
289
+ " save_dir: ${paths.output_dir}/tensorboard/\n",
290
+ " name: null\n",
291
+ " log_graph: false\n",
292
+ " default_hp_metric: true\n",
293
+ " prefix: ''\n",
294
+ "trainer:\n",
295
+ " _target_: lightning.Trainer\n",
296
+ " default_root_dir: ${paths.output_dir}\n",
297
+ " min_epochs: 1\n",
298
+ " max_epochs: 6\n",
299
+ " accelerator: auto\n",
300
+ " devices: auto\n",
301
+ " deterministic: true\n",
302
+ " log_every_n_steps: 10\n",
303
+ " fast_dev_run: false\n",
304
+ "paths:\n",
305
+ " root_dir: ${oc.env:PROJECT_ROOT}\n",
306
+ " data_dir: ${paths.root_dir}/data/\n",
307
+ " log_dir: ${paths.root_dir}/logs/\n",
308
+ " ckpt_dir: ${paths.root_dir}/checkpoints\n",
309
+ " artifact_dir: ${paths.root_dir}/artifacts/\n",
310
+ " data_url: https://download.pytorch.org/tutorials/cats_and_dogs_filtered.zip\n",
311
+ " output_dir: ${hydra:runtime.output_dir}\n",
312
+ " work_dir: ${hydra:runtime.cwd}\n",
313
+ "\n"
314
+ ]
315
+ }
316
+ ],
317
+ "source": [
318
+ "import hydra\n",
319
+ "from omegaconf import DictConfig, OmegaConf\n",
320
+ "\n",
321
+ "\n",
322
+ "# Function to load the configuration as an object without using the @hydra.main decorator\n",
323
+ "def load_config() -> DictConfig:\n",
324
+ " # Initialize the configuration context (e.g., \"../configs\" directory)\n",
325
+ " with hydra.initialize(config_path=\"../configs\"):\n",
326
+ " # Compose the configuration object with a specific config name (e.g., \"train\")\n",
327
+ " cfg = hydra.compose(config_name=\"train\")\n",
328
+ " return cfg\n",
329
+ "\n",
330
+ "\n",
331
+ "# Load the configuration\n",
332
+ "cfg = load_config()\n",
333
+ "\n",
334
+ "# Print the entire configuration for reference\n",
335
+ "print(\"Full Configuration:\")\n",
336
+ "print(OmegaConf.to_yaml(cfg))"
337
+ ]
338
+ },
339
+ {
340
+ "cell_type": "code",
341
+ "execution_count": 7,
342
+ "metadata": {},
343
+ "outputs": [
344
+ {
345
+ "name": "stderr",
346
+ "output_type": "stream",
347
+ "text": [
348
+ "\u001b[32m2024-11-08 18:25:23\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m8\u001b[0m - \u001b[1mWhole Config:\n",
349
+ "task_name: train\n",
350
+ "tags:\n",
351
+ "- dev\n",
352
+ "train: true\n",
353
+ "test: false\n",
354
+ "ckpt_path: ${paths.ckpt_dir}/best-checkpoint.ckpt\n",
355
+ "seed: 42\n",
356
+ "name: catdog_experiment\n",
357
+ "data:\n",
358
+ " _target_: src.datamodules.catdog_datamodule.CatDogImageDataModule\n",
359
+ " data_dir: ${paths.data_dir}\n",
360
+ " url: ${paths.data_url}\n",
361
+ " num_workers: 8\n",
362
+ " batch_size: 64\n",
363
+ " train_val_split:\n",
364
+ " - 0.8\n",
365
+ " - 0.2\n",
366
+ " pin_memory: true\n",
367
+ " image_size: 160\n",
368
+ "model:\n",
369
+ " _target_: src.models.catdog_model.ViTTinyClassifier\n",
370
+ " img_size: 160\n",
371
+ " patch_size: 16\n",
372
+ " num_classes: 2\n",
373
+ " embed_dim: 64\n",
374
+ " depth: 6\n",
375
+ " num_heads: 2\n",
376
+ " mlp_ratio: 3\n",
377
+ " pre_norm: false\n",
378
+ " lr: 0.001\n",
379
+ " weight_decay: 1.0e-05\n",
380
+ " factor: 0.1\n",
381
+ " patience: 10\n",
382
+ " min_lr: 1.0e-06\n",
383
+ "callbacks:\n",
384
+ " model_checkpoint:\n",
385
+ " dirpath: ${paths.ckpt_dir}\n",
386
+ " filename: best-checkpoint\n",
387
+ " monitor: val_acc\n",
388
+ " verbose: true\n",
389
+ " save_last: true\n",
390
+ " save_top_k: 1\n",
391
+ " mode: max\n",
392
+ " auto_insert_metric_name: false\n",
393
+ " save_weights_only: false\n",
394
+ " every_n_train_steps: null\n",
395
+ " train_time_interval: null\n",
396
+ " every_n_epochs: null\n",
397
+ " save_on_train_epoch_end: null\n",
398
+ " early_stopping:\n",
399
+ " monitor: val_acc\n",
400
+ " min_delta: 0.0\n",
401
+ " patience: 10\n",
402
+ " verbose: true\n",
403
+ " mode: max\n",
404
+ " strict: true\n",
405
+ " check_finite: true\n",
406
+ " stopping_threshold: null\n",
407
+ " divergence_threshold: null\n",
408
+ " check_on_train_epoch_end: null\n",
409
+ " rich_model_summary:\n",
410
+ " max_depth: 1\n",
411
+ " rich_progress_bar:\n",
412
+ " refresh_rate: 1\n",
413
+ "logger:\n",
414
+ " csv:\n",
415
+ " save_dir: ${paths.output_dir}\n",
416
+ " name: csv/\n",
417
+ " prefix: ''\n",
418
+ " tensorboard:\n",
419
+ " save_dir: ${paths.output_dir}/tensorboard/\n",
420
+ " name: null\n",
421
+ " log_graph: false\n",
422
+ " default_hp_metric: true\n",
423
+ " prefix: ''\n",
424
+ "trainer:\n",
425
+ " _target_: lightning.Trainer\n",
426
+ " default_root_dir: ${paths.output_dir}\n",
427
+ " min_epochs: 1\n",
428
+ " max_epochs: 6\n",
429
+ " accelerator: auto\n",
430
+ " devices: auto\n",
431
+ " deterministic: true\n",
432
+ " log_every_n_steps: 10\n",
433
+ " fast_dev_run: false\n",
434
+ "paths:\n",
435
+ " root_dir: ${oc.env:PROJECT_ROOT}\n",
436
+ " data_dir: ${paths.root_dir}/data/\n",
437
+ " log_dir: ${paths.root_dir}/logs/\n",
438
+ " ckpt_dir: ${paths.root_dir}/checkpoints\n",
439
+ " artifact_dir: ${paths.root_dir}/artifacts/\n",
440
+ " data_url: https://download.pytorch.org/tutorials/cats_and_dogs_filtered.zip\n",
441
+ " output_dir: ${hydra:runtime.output_dir}\n",
442
+ " work_dir: ${hydra:runtime.cwd}\n",
443
+ "\u001b[0m\n"
444
+ ]
445
+ }
446
+ ],
447
+ "source": [
448
+ "# Initialize logger\n",
449
+ "if cfg.task_name == \"train\":\n",
450
+ " log_path = Path(cfg.paths.log_dir) / \"train.log\"\n",
451
+ "else:\n",
452
+ " log_path = Path(cfg.paths.log_dir) / \"eval.log\"\n",
453
+ "setup_logger(log_path)\n",
454
+ "\n",
455
+ "logger.info(f\"Whole Config:\\n{OmegaConf.to_yaml(cfg)}\")"
456
+ ]
457
+ },
458
+ {
459
+ "cell_type": "code",
460
+ "execution_count": 8,
461
+ "metadata": {},
462
+ "outputs": [
463
+ {
464
+ "name": "stderr",
465
+ "output_type": "stream",
466
+ "text": [
467
+ "\u001b[32m2024-11-08 18:25:25\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m3\u001b[0m - \u001b[1mRoot directory: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws\u001b[0m\n",
468
+ "\u001b[32m2024-11-08 18:25:25\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m5\u001b[0m - \u001b[1mCurrent working directory: ['.dvc', '.dvcignore', '.env', '.git', '.github', '.gitignore', '.project-root', 'aws', 'basic_setup.md', 'configs', 'data', 'data.dvc', 'docker-compose.yaml', 'Dockerfile', 'ec2_runner_setup.md', 'logs', 'main.py', 'notebooks', 'poetry.lock', 'pyproject.toml', 'README.md', 'setup_aws_ci.md', 'src', 'tests', 'todo.md']\u001b[0m\n",
469
+ "\u001b[32m2024-11-08 18:25:25\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m8\u001b[0m - \u001b[1mCheckpoint directory: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/checkpoints\u001b[0m\n",
470
+ "\u001b[32m2024-11-08 18:25:25\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m12\u001b[0m - \u001b[1mData directory: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/data/\u001b[0m\n",
471
+ "\u001b[32m2024-11-08 18:25:25\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m16\u001b[0m - \u001b[1mLog directory: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/logs/\u001b[0m\n",
472
+ "\u001b[32m2024-11-08 18:25:25\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m20\u001b[0m - \u001b[1mArtifact directory: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/artifacts/\u001b[0m\n",
473
+ "\u001b[32m2024-11-08 18:25:25\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m28\u001b[0m - \u001b[1mExperiment name: catdog_experiment\u001b[0m\n"
474
+ ]
475
+ }
476
+ ],
477
+ "source": [
478
+ "# the path to the checkpoint directory\n",
479
+ "root_dir = cfg.paths.root_dir\n",
480
+ "logger.info(f\"Root directory: {root_dir}\")\n",
481
+ "\n",
482
+ "logger.info(f\"Current working directory: {os.listdir(root_dir)}\")\n",
483
+ "\n",
484
+ "ckpt_dir = cfg.paths.ckpt_dir\n",
485
+ "logger.info(f\"Checkpoint directory: {ckpt_dir}\")\n",
486
+ "\n",
487
+ "# the path to the data directory\n",
488
+ "data_dir = cfg.paths.data_dir\n",
489
+ "logger.info(f\"Data directory: {data_dir}\")\n",
490
+ "\n",
491
+ "# the path to the log directory\n",
492
+ "log_dir = cfg.paths.log_dir\n",
493
+ "logger.info(f\"Log directory: {log_dir}\")\n",
494
+ "\n",
495
+ "# the path to the artifact directory\n",
496
+ "artifact_dir = cfg.paths.artifact_dir\n",
497
+ "logger.info(f\"Artifact directory: {artifact_dir}\")\n",
498
+ "\n",
499
+ "# output directory\n",
500
+ "# output_dir = cfg.paths.output_dir\n",
501
+ "# logger.info(f\"Output directory: {output_dir}\")\n",
502
+ "\n",
503
+ "# name of the experiment\n",
504
+ "experiment_name = cfg.name\n",
505
+ "logger.info(f\"Experiment name: {experiment_name}\")\n"
506
+ ]
507
+ },
508
+ {
509
+ "cell_type": "code",
510
+ "execution_count": 9,
511
+ "metadata": {},
512
+ "outputs": [
513
+ {
514
+ "name": "stderr",
515
+ "output_type": "stream",
516
+ "text": [
517
+ "\u001b[32m2024-11-08 18:25:28\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m2\u001b[0m - \u001b[1mInstantiating datamodule <src.datamodules.catdog_datamodule.CatDogImageDataModule>\u001b[0m\n"
518
+ ]
519
+ }
520
+ ],
521
+ "source": [
522
+ "# Initialize DataModule\n",
523
+ "logger.info(f\"Instantiating datamodule <{cfg.data._target_}>\")\n",
524
+ "datamodule: L.LightningDataModule = hydra.utils.instantiate(cfg.data)"
525
+ ]
526
+ },
527
+ {
528
+ "cell_type": "code",
529
+ "execution_count": 10,
530
+ "metadata": {},
531
+ "outputs": [
532
+ {
533
+ "name": "stderr",
534
+ "output_type": "stream",
535
+ "text": [
536
+ "\u001b[32m2024-11-08 18:25:28\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m2\u001b[0m - \u001b[1mNo GPU available\u001b[0m\n",
537
+ "Seed set to 42\n"
538
+ ]
539
+ },
540
+ {
541
+ "data": {
542
+ "text/plain": [
543
+ "42"
544
+ ]
545
+ },
546
+ "execution_count": 10,
547
+ "metadata": {},
548
+ "output_type": "execute_result"
549
+ }
550
+ ],
551
+ "source": [
552
+ "# Check for GPU availability\n",
553
+ "logger.info(\"GPU available\" if torch.cuda.is_available() else \"No GPU available\")\n",
554
+ "\n",
555
+ "# Set seed for reproducibility\n",
556
+ "L.seed_everything(cfg.seed, workers=True)"
557
+ ]
558
+ },
559
+ {
560
+ "cell_type": "code",
561
+ "execution_count": 11,
562
+ "metadata": {},
563
+ "outputs": [
564
+ {
565
+ "name": "stderr",
566
+ "output_type": "stream",
567
+ "text": [
568
+ "\u001b[32m2024-11-08 18:25:29\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m2\u001b[0m - \u001b[1mInstantiating model <src.models.catdog_model.ViTTinyClassifier>\u001b[0m\n"
569
+ ]
570
+ }
571
+ ],
572
+ "source": [
573
+ "# Initialize model\n",
574
+ "logger.info(f\"Instantiating model <{cfg.model._target_}>\")\n",
575
+ "model: L.LightningModule = hydra.utils.instantiate(cfg.model)"
576
+ ]
577
+ },
578
+ {
579
+ "cell_type": "code",
580
+ "execution_count": 12,
581
+ "metadata": {},
582
+ "outputs": [
583
+ {
584
+ "name": "stderr",
585
+ "output_type": "stream",
586
+ "text": [
587
+ "\u001b[32m2024-11-08 18:25:30\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m1\u001b[0m - \u001b[1mModel summary:\n",
588
+ "ViTTinyClassifier(\n",
589
+ " (model): VisionTransformer(\n",
590
+ " (patch_embed): PatchEmbed(\n",
591
+ " (proj): Conv2d(3, 64, kernel_size=(16, 16), stride=(16, 16))\n",
592
+ " (norm): Identity()\n",
593
+ " )\n",
594
+ " (pos_drop): Dropout(p=0.0, inplace=False)\n",
595
+ " (patch_drop): Identity()\n",
596
+ " (norm_pre): Identity()\n",
597
+ " (blocks): Sequential(\n",
598
+ " (0): Block(\n",
599
+ " (norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
600
+ " (attn): Attention(\n",
601
+ " (qkv): Linear(in_features=64, out_features=192, bias=False)\n",
602
+ " (q_norm): Identity()\n",
603
+ " (k_norm): Identity()\n",
604
+ " (attn_drop): Dropout(p=0.0, inplace=False)\n",
605
+ " (proj): Linear(in_features=64, out_features=64, bias=True)\n",
606
+ " (proj_drop): Dropout(p=0.0, inplace=False)\n",
607
+ " )\n",
608
+ " (ls1): Identity()\n",
609
+ " (drop_path1): Identity()\n",
610
+ " (norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
611
+ " (mlp): Mlp(\n",
612
+ " (fc1): Linear(in_features=64, out_features=192, bias=True)\n",
613
+ " (act): GELU(approximate='none')\n",
614
+ " (drop1): Dropout(p=0.0, inplace=False)\n",
615
+ " (norm): Identity()\n",
616
+ " (fc2): Linear(in_features=192, out_features=64, bias=True)\n",
617
+ " (drop2): Dropout(p=0.0, inplace=False)\n",
618
+ " )\n",
619
+ " (ls2): Identity()\n",
620
+ " (drop_path2): Identity()\n",
621
+ " )\n",
622
+ " (1): Block(\n",
623
+ " (norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
624
+ " (attn): Attention(\n",
625
+ " (qkv): Linear(in_features=64, out_features=192, bias=False)\n",
626
+ " (q_norm): Identity()\n",
627
+ " (k_norm): Identity()\n",
628
+ " (attn_drop): Dropout(p=0.0, inplace=False)\n",
629
+ " (proj): Linear(in_features=64, out_features=64, bias=True)\n",
630
+ " (proj_drop): Dropout(p=0.0, inplace=False)\n",
631
+ " )\n",
632
+ " (ls1): Identity()\n",
633
+ " (drop_path1): Identity()\n",
634
+ " (norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
635
+ " (mlp): Mlp(\n",
636
+ " (fc1): Linear(in_features=64, out_features=192, bias=True)\n",
637
+ " (act): GELU(approximate='none')\n",
638
+ " (drop1): Dropout(p=0.0, inplace=False)\n",
639
+ " (norm): Identity()\n",
640
+ " (fc2): Linear(in_features=192, out_features=64, bias=True)\n",
641
+ " (drop2): Dropout(p=0.0, inplace=False)\n",
642
+ " )\n",
643
+ " (ls2): Identity()\n",
644
+ " (drop_path2): Identity()\n",
645
+ " )\n",
646
+ " (2): Block(\n",
647
+ " (norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
648
+ " (attn): Attention(\n",
649
+ " (qkv): Linear(in_features=64, out_features=192, bias=False)\n",
650
+ " (q_norm): Identity()\n",
651
+ " (k_norm): Identity()\n",
652
+ " (attn_drop): Dropout(p=0.0, inplace=False)\n",
653
+ " (proj): Linear(in_features=64, out_features=64, bias=True)\n",
654
+ " (proj_drop): Dropout(p=0.0, inplace=False)\n",
655
+ " )\n",
656
+ " (ls1): Identity()\n",
657
+ " (drop_path1): Identity()\n",
658
+ " (norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
659
+ " (mlp): Mlp(\n",
660
+ " (fc1): Linear(in_features=64, out_features=192, bias=True)\n",
661
+ " (act): GELU(approximate='none')\n",
662
+ " (drop1): Dropout(p=0.0, inplace=False)\n",
663
+ " (norm): Identity()\n",
664
+ " (fc2): Linear(in_features=192, out_features=64, bias=True)\n",
665
+ " (drop2): Dropout(p=0.0, inplace=False)\n",
666
+ " )\n",
667
+ " (ls2): Identity()\n",
668
+ " (drop_path2): Identity()\n",
669
+ " )\n",
670
+ " (3): Block(\n",
671
+ " (norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
672
+ " (attn): Attention(\n",
673
+ " (qkv): Linear(in_features=64, out_features=192, bias=False)\n",
674
+ " (q_norm): Identity()\n",
675
+ " (k_norm): Identity()\n",
676
+ " (attn_drop): Dropout(p=0.0, inplace=False)\n",
677
+ " (proj): Linear(in_features=64, out_features=64, bias=True)\n",
678
+ " (proj_drop): Dropout(p=0.0, inplace=False)\n",
679
+ " )\n",
680
+ " (ls1): Identity()\n",
681
+ " (drop_path1): Identity()\n",
682
+ " (norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
683
+ " (mlp): Mlp(\n",
684
+ " (fc1): Linear(in_features=64, out_features=192, bias=True)\n",
685
+ " (act): GELU(approximate='none')\n",
686
+ " (drop1): Dropout(p=0.0, inplace=False)\n",
687
+ " (norm): Identity()\n",
688
+ " (fc2): Linear(in_features=192, out_features=64, bias=True)\n",
689
+ " (drop2): Dropout(p=0.0, inplace=False)\n",
690
+ " )\n",
691
+ " (ls2): Identity()\n",
692
+ " (drop_path2): Identity()\n",
693
+ " )\n",
694
+ " (4): Block(\n",
695
+ " (norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
696
+ " (attn): Attention(\n",
697
+ " (qkv): Linear(in_features=64, out_features=192, bias=False)\n",
698
+ " (q_norm): Identity()\n",
699
+ " (k_norm): Identity()\n",
700
+ " (attn_drop): Dropout(p=0.0, inplace=False)\n",
701
+ " (proj): Linear(in_features=64, out_features=64, bias=True)\n",
702
+ " (proj_drop): Dropout(p=0.0, inplace=False)\n",
703
+ " )\n",
704
+ " (ls1): Identity()\n",
705
+ " (drop_path1): Identity()\n",
706
+ " (norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
707
+ " (mlp): Mlp(\n",
708
+ " (fc1): Linear(in_features=64, out_features=192, bias=True)\n",
709
+ " (act): GELU(approximate='none')\n",
710
+ " (drop1): Dropout(p=0.0, inplace=False)\n",
711
+ " (norm): Identity()\n",
712
+ " (fc2): Linear(in_features=192, out_features=64, bias=True)\n",
713
+ " (drop2): Dropout(p=0.0, inplace=False)\n",
714
+ " )\n",
715
+ " (ls2): Identity()\n",
716
+ " (drop_path2): Identity()\n",
717
+ " )\n",
718
+ " (5): Block(\n",
719
+ " (norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
720
+ " (attn): Attention(\n",
721
+ " (qkv): Linear(in_features=64, out_features=192, bias=False)\n",
722
+ " (q_norm): Identity()\n",
723
+ " (k_norm): Identity()\n",
724
+ " (attn_drop): Dropout(p=0.0, inplace=False)\n",
725
+ " (proj): Linear(in_features=64, out_features=64, bias=True)\n",
726
+ " (proj_drop): Dropout(p=0.0, inplace=False)\n",
727
+ " )\n",
728
+ " (ls1): Identity()\n",
729
+ " (drop_path1): Identity()\n",
730
+ " (norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
731
+ " (mlp): Mlp(\n",
732
+ " (fc1): Linear(in_features=64, out_features=192, bias=True)\n",
733
+ " (act): GELU(approximate='none')\n",
734
+ " (drop1): Dropout(p=0.0, inplace=False)\n",
735
+ " (norm): Identity()\n",
736
+ " (fc2): Linear(in_features=192, out_features=64, bias=True)\n",
737
+ " (drop2): Dropout(p=0.0, inplace=False)\n",
738
+ " )\n",
739
+ " (ls2): Identity()\n",
740
+ " (drop_path2): Identity()\n",
741
+ " )\n",
742
+ " )\n",
743
+ " (norm): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
744
+ " (fc_norm): Identity()\n",
745
+ " (head_drop): Dropout(p=0.0, inplace=False)\n",
746
+ " (head): Linear(in_features=64, out_features=2, bias=True)\n",
747
+ " )\n",
748
+ " (train_metrics): ModuleDict(\n",
749
+ " (accuracy): MulticlassAccuracy()\n",
750
+ " (precision): MulticlassPrecision()\n",
751
+ " (recall): MulticlassRecall()\n",
752
+ " (f1): MulticlassF1Score()\n",
753
+ " )\n",
754
+ " (val_metrics): ModuleDict(\n",
755
+ " (accuracy): MulticlassAccuracy()\n",
756
+ " (precision): MulticlassPrecision()\n",
757
+ " (recall): MulticlassRecall()\n",
758
+ " (f1): MulticlassF1Score()\n",
759
+ " )\n",
760
+ " (test_metrics): ModuleDict(\n",
761
+ " (accuracy): MulticlassAccuracy()\n",
762
+ " (precision): MulticlassPrecision()\n",
763
+ " (recall): MulticlassRecall()\n",
764
+ " (f1): MulticlassF1Score()\n",
765
+ " )\n",
766
+ " (criterion): CrossEntropyLoss()\n",
767
+ ")\u001b[0m\n"
768
+ ]
769
+ }
770
+ ],
771
+ "source": [
772
+ "logger.info(f\"Model summary:\\n{model}\")"
773
+ ]
774
+ },
775
+ {
776
+ "cell_type": "code",
777
+ "execution_count": 13,
778
+ "metadata": {},
779
+ "outputs": [],
780
+ "source": [
781
+ "def initialize_callbacks(cfg: DictConfig) -> List[L.Callback]:\n",
782
+ " \"\"\"Initialize the callbacks based on the configuration.\"\"\"\n",
783
+ " if not cfg:\n",
784
+ " logger.warning(\"No callback configs found! Skipping..\")\n",
785
+ " return callbacks\n",
786
+ "\n",
787
+ " if not isinstance(cfg, DictConfig):\n",
788
+ " raise TypeError(\"Callbacks config must be a DictConfig!\")\n",
789
+ " callbacks = []\n",
790
+ "\n",
791
+ " # Initialize the model checkpoint callback\n",
792
+ " model_checkpoint = ModelCheckpoint(**cfg.callbacks.model_checkpoint)\n",
793
+ " callbacks.append(model_checkpoint)\n",
794
+ "\n",
795
+ " # Initialize the early stopping callback\n",
796
+ " early_stopping = EarlyStopping(**cfg.callbacks.early_stopping)\n",
797
+ " callbacks.append(early_stopping)\n",
798
+ "\n",
799
+ " # Initialize the rich model summary callback\n",
800
+ " model_summary = RichModelSummary(**cfg.callbacks.rich_model_summary)\n",
801
+ " callbacks.append(model_summary)\n",
802
+ "\n",
803
+ " # Initialize the rich progress bar callback\n",
804
+ " progress_bar = RichProgressBar(**cfg.callbacks.rich_progress_bar)\n",
805
+ " callbacks.append(progress_bar)\n",
806
+ "\n",
807
+ " return callbacks\n",
808
+ "\n",
809
+ "\n",
810
+ "def initialize_logger(cfg: DictConfig) -> Logger:\n",
811
+ " \"\"\"Initialize the logger based on the configuration.\"\"\"\n",
812
+ " if not cfg:\n",
813
+ " logger.warning(\"No logger configs found! Skipping..\")\n",
814
+ " return None\n",
815
+ "\n",
816
+ " if not isinstance(cfg, DictConfig):\n",
817
+ " raise TypeError(\"Logger config must be a DictConfig!\")\n",
818
+ "\n",
819
+ " loggers = []\n",
820
+ "\n",
821
+ " # Initialize the TensorBoard logger\n",
822
+ " tensorboard_logger = TensorBoardLogger(**cfg.loggers.tensorboard)\n",
823
+ " loggers.append(tensorboard_logger)\n",
824
+ "\n",
825
+ " # Initialize the CSV logger\n",
826
+ " csv_logger = CSVLogger(**cfg.loggers.csv)\n",
827
+ " loggers.append(csv_logger)\n",
828
+ "\n",
829
+ " return loggers"
830
+ ]
831
+ },
832
+ {
833
+ "cell_type": "code",
834
+ "execution_count": 1,
835
+ "metadata": {},
836
+ "outputs": [
837
+ {
838
+ "name": "stderr",
839
+ "output_type": "stream",
840
+ "text": [
841
+ "/anaconda/envs/emlo_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
842
+ " from .autonotebook import tqdm as notebook_tqdm\n"
843
+ ]
844
+ },
845
+ {
846
+ "name": "stdout",
847
+ "output_type": "stream",
848
+ "text": [
849
+ "['bat_resnext26ts', 'beit_base_patch16_224', 'beit_base_patch16_384', 'beit_large_patch16_224', 'beit_large_patch16_384', 'beit_large_patch16_512', 'beitv2_base_patch16_224', 'beitv2_large_patch16_224', 'botnet26t_256', 'botnet50ts_256', 'caformer_b36', 'caformer_m36', 'caformer_s18', 'caformer_s36', 'cait_m36_384', 'cait_m48_448', 'cait_s24_224', 'cait_s24_384', 'cait_s36_384', 'cait_xs24_384', 'cait_xxs24_224', 'cait_xxs24_384', 'cait_xxs36_224', 'cait_xxs36_384', 'coat_lite_medium', 'coat_lite_medium_384', 'coat_lite_mini', 'coat_lite_small', 'coat_lite_tiny', 'coat_mini', 'coat_small', 'coat_tiny', 'coatnet_0_224', 'coatnet_0_rw_224', 'coatnet_1_224', 'coatnet_1_rw_224', 'coatnet_2_224', 'coatnet_2_rw_224', 'coatnet_3_224', 'coatnet_3_rw_224', 'coatnet_4_224', 'coatnet_5_224', 'coatnet_bn_0_rw_224', 'coatnet_nano_cc_224', 'coatnet_nano_rw_224', 'coatnet_pico_rw_224', 'coatnet_rmlp_0_rw_224', 'coatnet_rmlp_1_rw2_224', 'coatnet_rmlp_1_rw_224', 'coatnet_rmlp_2_rw_224', 'coatnet_rmlp_2_rw_384', 'coatnet_rmlp_3_rw_224', 'coatnet_rmlp_nano_rw_224', 'coatnext_nano_rw_224', 'convformer_b36', 'convformer_m36', 'convformer_s18', 'convformer_s36', 'convit_base', 'convit_small', 'convit_tiny', 'convmixer_768_32', 'convmixer_1024_20_ks9_p14', 'convmixer_1536_20', 'convnext_atto', 'convnext_atto_ols', 'convnext_base', 'convnext_femto', 'convnext_femto_ols', 'convnext_large', 'convnext_large_mlp', 'convnext_nano', 'convnext_nano_ols', 'convnext_pico', 'convnext_pico_ols', 'convnext_small', 'convnext_tiny', 'convnext_tiny_hnf', 'convnext_xlarge', 'convnext_xxlarge', 'convnextv2_atto', 'convnextv2_base', 'convnextv2_femto', 'convnextv2_huge', 'convnextv2_large', 'convnextv2_nano', 'convnextv2_pico', 'convnextv2_small', 'convnextv2_tiny', 'crossvit_9_240', 'crossvit_9_dagger_240', 'crossvit_15_240', 'crossvit_15_dagger_240', 'crossvit_15_dagger_408', 'crossvit_18_240', 'crossvit_18_dagger_240', 'crossvit_18_dagger_408', 'crossvit_base_240', 'crossvit_small_240', 
'crossvit_tiny_240', 'cs3darknet_focus_l', 'cs3darknet_focus_m', 'cs3darknet_focus_s', 'cs3darknet_focus_x', 'cs3darknet_l', 'cs3darknet_m', 'cs3darknet_s', 'cs3darknet_x', 'cs3edgenet_x', 'cs3se_edgenet_x', 'cs3sedarknet_l', 'cs3sedarknet_x', 'cs3sedarknet_xdw', 'cspdarknet53', 'cspresnet50', 'cspresnet50d', 'cspresnet50w', 'cspresnext50', 'darknet17', 'darknet21', 'darknet53', 'darknetaa53', 'davit_base', 'davit_base_fl', 'davit_giant', 'davit_huge', 'davit_huge_fl', 'davit_large', 'davit_small', 'davit_tiny', 'deit3_base_patch16_224', 'deit3_base_patch16_384', 'deit3_huge_patch14_224', 'deit3_large_patch16_224', 'deit3_large_patch16_384', 'deit3_medium_patch16_224', 'deit3_small_patch16_224', 'deit3_small_patch16_384', 'deit_base_distilled_patch16_224', 'deit_base_distilled_patch16_384', 'deit_base_patch16_224', 'deit_base_patch16_384', 'deit_small_distilled_patch16_224', 'deit_small_patch16_224', 'deit_tiny_distilled_patch16_224', 'deit_tiny_patch16_224', 'densenet121', 'densenet161', 'densenet169', 'densenet201', 'densenet264d', 'densenetblur121d', 'dla34', 'dla46_c', 'dla46x_c', 'dla60', 'dla60_res2net', 'dla60_res2next', 'dla60x', 'dla60x_c', 'dla102', 'dla102x', 'dla102x2', 'dla169', 'dm_nfnet_f0', 'dm_nfnet_f1', 'dm_nfnet_f2', 'dm_nfnet_f3', 'dm_nfnet_f4', 'dm_nfnet_f5', 'dm_nfnet_f6', 'dpn48b', 'dpn68', 'dpn68b', 'dpn92', 'dpn98', 'dpn107', 'dpn131', 'eca_botnext26ts_256', 'eca_halonext26ts', 'eca_nfnet_l0', 'eca_nfnet_l1', 'eca_nfnet_l2', 'eca_nfnet_l3', 'eca_resnet33ts', 'eca_resnext26ts', 'eca_vovnet39b', 'ecaresnet26t', 'ecaresnet50d', 'ecaresnet50d_pruned', 'ecaresnet50t', 'ecaresnet101d', 'ecaresnet101d_pruned', 'ecaresnet200d', 'ecaresnet269d', 'ecaresnetlight', 'ecaresnext26t_32x4d', 'ecaresnext50t_32x4d', 'edgenext_base', 'edgenext_small', 'edgenext_small_rw', 'edgenext_x_small', 'edgenext_xx_small', 'efficientformer_l1', 'efficientformer_l3', 'efficientformer_l7', 'efficientformerv2_l', 'efficientformerv2_s0', 'efficientformerv2_s1', 
'efficientformerv2_s2', 'efficientnet_b0', 'efficientnet_b0_g8_gn', 'efficientnet_b0_g16_evos', 'efficientnet_b0_gn', 'efficientnet_b1', 'efficientnet_b1_pruned', 'efficientnet_b2', 'efficientnet_b2_pruned', 'efficientnet_b3', 'efficientnet_b3_g8_gn', 'efficientnet_b3_gn', 'efficientnet_b3_pruned', 'efficientnet_b4', 'efficientnet_b5', 'efficientnet_b6', 'efficientnet_b7', 'efficientnet_b8', 'efficientnet_blur_b0', 'efficientnet_cc_b0_4e', 'efficientnet_cc_b0_8e', 'efficientnet_cc_b1_8e', 'efficientnet_el', 'efficientnet_el_pruned', 'efficientnet_em', 'efficientnet_es', 'efficientnet_es_pruned', 'efficientnet_h_b5', 'efficientnet_l2', 'efficientnet_lite0', 'efficientnet_lite1', 'efficientnet_lite2', 'efficientnet_lite3', 'efficientnet_lite4', 'efficientnet_x_b3', 'efficientnet_x_b5', 'efficientnetv2_l', 'efficientnetv2_m', 'efficientnetv2_rw_m', 'efficientnetv2_rw_s', 'efficientnetv2_rw_t', 'efficientnetv2_s', 'efficientnetv2_xl', 'efficientvit_b0', 'efficientvit_b1', 'efficientvit_b2', 'efficientvit_b3', 'efficientvit_l1', 'efficientvit_l2', 'efficientvit_l3', 'efficientvit_m0', 'efficientvit_m1', 'efficientvit_m2', 'efficientvit_m3', 'efficientvit_m4', 'efficientvit_m5', 'ese_vovnet19b_dw', 'ese_vovnet19b_slim', 'ese_vovnet19b_slim_dw', 'ese_vovnet39b', 'ese_vovnet39b_evos', 'ese_vovnet57b', 'ese_vovnet99b', 'eva02_base_patch14_224', 'eva02_base_patch14_448', 'eva02_base_patch16_clip_224', 'eva02_enormous_patch14_clip_224', 'eva02_large_patch14_224', 'eva02_large_patch14_448', 'eva02_large_patch14_clip_224', 'eva02_large_patch14_clip_336', 'eva02_small_patch14_224', 'eva02_small_patch14_336', 'eva02_tiny_patch14_224', 'eva02_tiny_patch14_336', 'eva_giant_patch14_224', 'eva_giant_patch14_336', 'eva_giant_patch14_560', 'eva_giant_patch14_clip_224', 'eva_large_patch14_196', 'eva_large_patch14_336', 'fastvit_ma36', 'fastvit_mci0', 'fastvit_mci1', 'fastvit_mci2', 'fastvit_s12', 'fastvit_sa12', 'fastvit_sa24', 'fastvit_sa36', 'fastvit_t8', 'fastvit_t12', 'fbnetc_100', 
'fbnetv3_b', 'fbnetv3_d', 'fbnetv3_g', 'flexivit_base', 'flexivit_large', 'flexivit_small', 'focalnet_base_lrf', 'focalnet_base_srf', 'focalnet_huge_fl3', 'focalnet_huge_fl4', 'focalnet_large_fl3', 'focalnet_large_fl4', 'focalnet_small_lrf', 'focalnet_small_srf', 'focalnet_tiny_lrf', 'focalnet_tiny_srf', 'focalnet_xlarge_fl3', 'focalnet_xlarge_fl4', 'gc_efficientnetv2_rw_t', 'gcresnet33ts', 'gcresnet50t', 'gcresnext26ts', 'gcresnext50ts', 'gcvit_base', 'gcvit_small', 'gcvit_tiny', 'gcvit_xtiny', 'gcvit_xxtiny', 'gernet_l', 'gernet_m', 'gernet_s', 'ghostnet_050', 'ghostnet_100', 'ghostnet_130', 'ghostnetv2_100', 'ghostnetv2_130', 'ghostnetv2_160', 'gmixer_12_224', 'gmixer_24_224', 'gmlp_b16_224', 'gmlp_s16_224', 'gmlp_ti16_224', 'halo2botnet50ts_256', 'halonet26t', 'halonet50ts', 'halonet_h1', 'haloregnetz_b', 'hardcorenas_a', 'hardcorenas_b', 'hardcorenas_c', 'hardcorenas_d', 'hardcorenas_e', 'hardcorenas_f', 'hgnet_base', 'hgnet_small', 'hgnet_tiny', 'hgnetv2_b0', 'hgnetv2_b1', 'hgnetv2_b2', 'hgnetv2_b3', 'hgnetv2_b4', 'hgnetv2_b5', 'hgnetv2_b6', 'hiera_base_224', 'hiera_base_abswin_256', 'hiera_base_plus_224', 'hiera_huge_224', 'hiera_large_224', 'hiera_small_224', 'hiera_small_abswin_256', 'hiera_tiny_224', 'hieradet_small', 'hrnet_w18', 'hrnet_w18_small', 'hrnet_w18_small_v2', 'hrnet_w18_ssld', 'hrnet_w30', 'hrnet_w32', 'hrnet_w40', 'hrnet_w44', 'hrnet_w48', 'hrnet_w48_ssld', 'hrnet_w64', 'inception_next_base', 'inception_next_small', 'inception_next_tiny', 'inception_resnet_v2', 'inception_v3', 'inception_v4', 'lambda_resnet26rpt_256', 'lambda_resnet26t', 'lambda_resnet50ts', 'lamhalobotnet50ts_256', 'lcnet_035', 'lcnet_050', 'lcnet_075', 'lcnet_100', 'lcnet_150', 'legacy_senet154', 'legacy_seresnet18', 'legacy_seresnet34', 'legacy_seresnet50', 'legacy_seresnet101', 'legacy_seresnet152', 'legacy_seresnext26_32x4d', 'legacy_seresnext50_32x4d', 'legacy_seresnext101_32x4d', 'legacy_xception', 'levit_128', 'levit_128s', 'levit_192', 'levit_256', 'levit_256d', 
'levit_384', 'levit_384_s8', 'levit_512', 'levit_512_s8', 'levit_512d', 'levit_conv_128', 'levit_conv_128s', 'levit_conv_192', 'levit_conv_256', 'levit_conv_256d', 'levit_conv_384', 'levit_conv_384_s8', 'levit_conv_512', 'levit_conv_512_s8', 'levit_conv_512d', 'maxvit_base_tf_224', 'maxvit_base_tf_384', 'maxvit_base_tf_512', 'maxvit_large_tf_224', 'maxvit_large_tf_384', 'maxvit_large_tf_512', 'maxvit_nano_rw_256', 'maxvit_pico_rw_256', 'maxvit_rmlp_base_rw_224', 'maxvit_rmlp_base_rw_384', 'maxvit_rmlp_nano_rw_256', 'maxvit_rmlp_pico_rw_256', 'maxvit_rmlp_small_rw_224', 'maxvit_rmlp_small_rw_256', 'maxvit_rmlp_tiny_rw_256', 'maxvit_small_tf_224', 'maxvit_small_tf_384', 'maxvit_small_tf_512', 'maxvit_tiny_pm_256', 'maxvit_tiny_rw_224', 'maxvit_tiny_rw_256', 'maxvit_tiny_tf_224', 'maxvit_tiny_tf_384', 'maxvit_tiny_tf_512', 'maxvit_xlarge_tf_224', 'maxvit_xlarge_tf_384', 'maxvit_xlarge_tf_512', 'maxxvit_rmlp_nano_rw_256', 'maxxvit_rmlp_small_rw_256', 'maxxvit_rmlp_tiny_rw_256', 'maxxvitv2_nano_rw_256', 'maxxvitv2_rmlp_base_rw_224', 'maxxvitv2_rmlp_base_rw_384', 'maxxvitv2_rmlp_large_rw_224', 'mixer_b16_224', 'mixer_b32_224', 'mixer_l16_224', 'mixer_l32_224', 'mixer_s16_224', 'mixer_s32_224', 'mixnet_l', 'mixnet_m', 'mixnet_s', 'mixnet_xl', 'mixnet_xxl', 'mnasnet_050', 'mnasnet_075', 'mnasnet_100', 'mnasnet_140', 'mnasnet_small', 'mobilenet_edgetpu_100', 'mobilenet_edgetpu_v2_l', 'mobilenet_edgetpu_v2_m', 'mobilenet_edgetpu_v2_s', 'mobilenet_edgetpu_v2_xs', 'mobilenetv1_100', 'mobilenetv1_100h', 'mobilenetv1_125', 'mobilenetv2_035', 'mobilenetv2_050', 'mobilenetv2_075', 'mobilenetv2_100', 'mobilenetv2_110d', 'mobilenetv2_120d', 'mobilenetv2_140', 'mobilenetv3_large_075', 'mobilenetv3_large_100', 'mobilenetv3_large_150d', 'mobilenetv3_rw', 'mobilenetv3_small_050', 'mobilenetv3_small_075', 'mobilenetv3_small_100', 'mobilenetv4_conv_aa_large', 'mobilenetv4_conv_aa_medium', 'mobilenetv4_conv_blur_medium', 'mobilenetv4_conv_large', 'mobilenetv4_conv_medium', 
'mobilenetv4_conv_small', 'mobilenetv4_hybrid_large', 'mobilenetv4_hybrid_large_075', 'mobilenetv4_hybrid_medium', 'mobilenetv4_hybrid_medium_075', 'mobileone_s0', 'mobileone_s1', 'mobileone_s2', 'mobileone_s3', 'mobileone_s4', 'mobilevit_s', 'mobilevit_xs', 'mobilevit_xxs', 'mobilevitv2_050', 'mobilevitv2_075', 'mobilevitv2_100', 'mobilevitv2_125', 'mobilevitv2_150', 'mobilevitv2_175', 'mobilevitv2_200', 'mvitv2_base', 'mvitv2_base_cls', 'mvitv2_huge_cls', 'mvitv2_large', 'mvitv2_large_cls', 'mvitv2_small', 'mvitv2_small_cls', 'mvitv2_tiny', 'nasnetalarge', 'nest_base', 'nest_base_jx', 'nest_small', 'nest_small_jx', 'nest_tiny', 'nest_tiny_jx', 'nextvit_base', 'nextvit_large', 'nextvit_small', 'nf_ecaresnet26', 'nf_ecaresnet50', 'nf_ecaresnet101', 'nf_regnet_b0', 'nf_regnet_b1', 'nf_regnet_b2', 'nf_regnet_b3', 'nf_regnet_b4', 'nf_regnet_b5', 'nf_resnet26', 'nf_resnet50', 'nf_resnet101', 'nf_seresnet26', 'nf_seresnet50', 'nf_seresnet101', 'nfnet_f0', 'nfnet_f1', 'nfnet_f2', 'nfnet_f3', 'nfnet_f4', 'nfnet_f5', 'nfnet_f6', 'nfnet_f7', 'nfnet_l0', 'pit_b_224', 'pit_b_distilled_224', 'pit_s_224', 'pit_s_distilled_224', 'pit_ti_224', 'pit_ti_distilled_224', 'pit_xs_224', 'pit_xs_distilled_224', 'pnasnet5large', 'poolformer_m36', 'poolformer_m48', 'poolformer_s12', 'poolformer_s24', 'poolformer_s36', 'poolformerv2_m36', 'poolformerv2_m48', 'poolformerv2_s12', 'poolformerv2_s24', 'poolformerv2_s36', 'pvt_v2_b0', 'pvt_v2_b1', 'pvt_v2_b2', 'pvt_v2_b2_li', 'pvt_v2_b3', 'pvt_v2_b4', 'pvt_v2_b5', 'rdnet_base', 'rdnet_large', 'rdnet_small', 'rdnet_tiny', 'regnetv_040', 'regnetv_064', 'regnetx_002', 'regnetx_004', 'regnetx_004_tv', 'regnetx_006', 'regnetx_008', 'regnetx_016', 'regnetx_032', 'regnetx_040', 'regnetx_064', 'regnetx_080', 'regnetx_120', 'regnetx_160', 'regnetx_320', 'regnety_002', 'regnety_004', 'regnety_006', 'regnety_008', 'regnety_008_tv', 'regnety_016', 'regnety_032', 'regnety_040', 'regnety_040_sgn', 'regnety_064', 'regnety_080', 'regnety_080_tv', 
'regnety_120', 'regnety_160', 'regnety_320', 'regnety_640', 'regnety_1280', 'regnety_2560', 'regnetz_005', 'regnetz_040', 'regnetz_040_h', 'regnetz_b16', 'regnetz_b16_evos', 'regnetz_c16', 'regnetz_c16_evos', 'regnetz_d8', 'regnetz_d8_evos', 'regnetz_d32', 'regnetz_e8', 'repghostnet_050', 'repghostnet_058', 'repghostnet_080', 'repghostnet_100', 'repghostnet_111', 'repghostnet_130', 'repghostnet_150', 'repghostnet_200', 'repvgg_a0', 'repvgg_a1', 'repvgg_a2', 'repvgg_b0', 'repvgg_b1', 'repvgg_b1g4', 'repvgg_b2', 'repvgg_b2g4', 'repvgg_b3', 'repvgg_b3g4', 'repvgg_d2se', 'repvit_m0_9', 'repvit_m1', 'repvit_m1_0', 'repvit_m1_1', 'repvit_m1_5', 'repvit_m2', 'repvit_m2_3', 'repvit_m3', 'res2net50_14w_8s', 'res2net50_26w_4s', 'res2net50_26w_6s', 'res2net50_26w_8s', 'res2net50_48w_2s', 'res2net50d', 'res2net101_26w_4s', 'res2net101d', 'res2next50', 'resmlp_12_224', 'resmlp_24_224', 'resmlp_36_224', 'resmlp_big_24_224', 'resnest14d', 'resnest26d', 'resnest50d', 'resnest50d_1s4x24d', 'resnest50d_4s2x40d', 'resnest101e', 'resnest200e', 'resnest269e', 'resnet10t', 'resnet14t', 'resnet18', 'resnet18d', 'resnet26', 'resnet26d', 'resnet26t', 'resnet32ts', 'resnet33ts', 'resnet34', 'resnet34d', 'resnet50', 'resnet50_clip', 'resnet50_clip_gap', 'resnet50_gn', 'resnet50_mlp', 'resnet50c', 'resnet50d', 'resnet50s', 'resnet50t', 'resnet50x4_clip', 'resnet50x4_clip_gap', 'resnet50x16_clip', 'resnet50x16_clip_gap', 'resnet50x64_clip', 'resnet50x64_clip_gap', 'resnet51q', 'resnet61q', 'resnet101', 'resnet101_clip', 'resnet101_clip_gap', 'resnet101c', 'resnet101d', 'resnet101s', 'resnet152', 'resnet152c', 'resnet152d', 'resnet152s', 'resnet200', 'resnet200d', 'resnetaa34d', 'resnetaa50', 'resnetaa50d', 'resnetaa101d', 'resnetblur18', 'resnetblur50', 'resnetblur50d', 'resnetblur101d', 'resnetrs50', 'resnetrs101', 'resnetrs152', 'resnetrs200', 'resnetrs270', 'resnetrs350', 'resnetrs420', 'resnetv2_50', 'resnetv2_50d', 'resnetv2_50d_evos', 'resnetv2_50d_frn', 'resnetv2_50d_gn', 
'resnetv2_50t', 'resnetv2_50x1_bit', 'resnetv2_50x3_bit', 'resnetv2_101', 'resnetv2_101d', 'resnetv2_101x1_bit', 'resnetv2_101x3_bit', 'resnetv2_152', 'resnetv2_152d', 'resnetv2_152x2_bit', 'resnetv2_152x4_bit', 'resnext26ts', 'resnext50_32x4d', 'resnext50d_32x4d', 'resnext101_32x4d', 'resnext101_32x8d', 'resnext101_32x16d', 'resnext101_32x32d', 'resnext101_64x4d', 'rexnet_100', 'rexnet_130', 'rexnet_150', 'rexnet_200', 'rexnet_300', 'rexnetr_100', 'rexnetr_130', 'rexnetr_150', 'rexnetr_200', 'rexnetr_300', 'sam2_hiera_base_plus', 'sam2_hiera_large', 'sam2_hiera_small', 'sam2_hiera_tiny', 'samvit_base_patch16', 'samvit_base_patch16_224', 'samvit_huge_patch16', 'samvit_large_patch16', 'sebotnet33ts_256', 'sedarknet21', 'sehalonet33ts', 'selecsls42', 'selecsls42b', 'selecsls60', 'selecsls60b', 'selecsls84', 'semnasnet_050', 'semnasnet_075', 'semnasnet_100', 'semnasnet_140', 'senet154', 'sequencer2d_l', 'sequencer2d_m', 'sequencer2d_s', 'seresnet18', 'seresnet33ts', 'seresnet34', 'seresnet50', 'seresnet50t', 'seresnet101', 'seresnet152', 'seresnet152d', 'seresnet200d', 'seresnet269d', 'seresnetaa50d', 'seresnext26d_32x4d', 'seresnext26t_32x4d', 'seresnext26ts', 'seresnext50_32x4d', 'seresnext101_32x4d', 'seresnext101_32x8d', 'seresnext101_64x4d', 'seresnext101d_32x8d', 'seresnextaa101d_32x8d', 'seresnextaa201d_32x8d', 'skresnet18', 'skresnet34', 'skresnet50', 'skresnet50d', 'skresnext50_32x4d', 'spnasnet_100', 'swin_base_patch4_window7_224', 'swin_base_patch4_window12_384', 'swin_large_patch4_window7_224', 'swin_large_patch4_window12_384', 'swin_s3_base_224', 'swin_s3_small_224', 'swin_s3_tiny_224', 'swin_small_patch4_window7_224', 'swin_tiny_patch4_window7_224', 'swinv2_base_window8_256', 'swinv2_base_window12_192', 'swinv2_base_window12to16_192to256', 'swinv2_base_window12to24_192to384', 'swinv2_base_window16_256', 'swinv2_cr_base_224', 'swinv2_cr_base_384', 'swinv2_cr_base_ns_224', 'swinv2_cr_giant_224', 'swinv2_cr_giant_384', 'swinv2_cr_huge_224', 
'swinv2_cr_huge_384', 'swinv2_cr_large_224', 'swinv2_cr_large_384', 'swinv2_cr_small_224', 'swinv2_cr_small_384', 'swinv2_cr_small_ns_224', 'swinv2_cr_small_ns_256', 'swinv2_cr_tiny_224', 'swinv2_cr_tiny_384', 'swinv2_cr_tiny_ns_224', 'swinv2_large_window12_192', 'swinv2_large_window12to16_192to256', 'swinv2_large_window12to24_192to384', 'swinv2_small_window8_256', 'swinv2_small_window16_256', 'swinv2_tiny_window8_256', 'swinv2_tiny_window16_256', 'test_byobnet', 'test_efficientnet', 'test_vit', 'tf_efficientnet_b0', 'tf_efficientnet_b1', 'tf_efficientnet_b2', 'tf_efficientnet_b3', 'tf_efficientnet_b4', 'tf_efficientnet_b5', 'tf_efficientnet_b6', 'tf_efficientnet_b7', 'tf_efficientnet_b8', 'tf_efficientnet_cc_b0_4e', 'tf_efficientnet_cc_b0_8e', 'tf_efficientnet_cc_b1_8e', 'tf_efficientnet_el', 'tf_efficientnet_em', 'tf_efficientnet_es', 'tf_efficientnet_l2', 'tf_efficientnet_lite0', 'tf_efficientnet_lite1', 'tf_efficientnet_lite2', 'tf_efficientnet_lite3', 'tf_efficientnet_lite4', 'tf_efficientnetv2_b0', 'tf_efficientnetv2_b1', 'tf_efficientnetv2_b2', 'tf_efficientnetv2_b3', 'tf_efficientnetv2_l', 'tf_efficientnetv2_m', 'tf_efficientnetv2_s', 'tf_efficientnetv2_xl', 'tf_mixnet_l', 'tf_mixnet_m', 'tf_mixnet_s', 'tf_mobilenetv3_large_075', 'tf_mobilenetv3_large_100', 'tf_mobilenetv3_large_minimal_100', 'tf_mobilenetv3_small_075', 'tf_mobilenetv3_small_100', 'tf_mobilenetv3_small_minimal_100', 'tiny_vit_5m_224', 'tiny_vit_11m_224', 'tiny_vit_21m_224', 'tiny_vit_21m_384', 'tiny_vit_21m_512', 'tinynet_a', 'tinynet_b', 'tinynet_c', 'tinynet_d', 'tinynet_e', 'tnt_b_patch16_224', 'tnt_s_patch16_224', 'tresnet_l', 'tresnet_m', 'tresnet_v2_l', 'tresnet_xl', 'twins_pcpvt_base', 'twins_pcpvt_large', 'twins_pcpvt_small', 'twins_svt_base', 'twins_svt_large', 'twins_svt_small', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19', 'vgg19_bn', 'visformer_small', 'visformer_tiny', 'vit_base_mci_224', 'vit_base_patch8_224', 'vit_base_patch14_dinov2', 
'vit_base_patch14_reg4_dinov2', 'vit_base_patch16_18x2_224', 'vit_base_patch16_224', 'vit_base_patch16_224_miil', 'vit_base_patch16_384', 'vit_base_patch16_clip_224', 'vit_base_patch16_clip_384', 'vit_base_patch16_clip_quickgelu_224', 'vit_base_patch16_gap_224', 'vit_base_patch16_plus_240', 'vit_base_patch16_reg4_gap_256', 'vit_base_patch16_rope_reg1_gap_256', 'vit_base_patch16_rpn_224', 'vit_base_patch16_siglip_224', 'vit_base_patch16_siglip_256', 'vit_base_patch16_siglip_384', 'vit_base_patch16_siglip_512', 'vit_base_patch16_siglip_gap_224', 'vit_base_patch16_siglip_gap_256', 'vit_base_patch16_siglip_gap_384', 'vit_base_patch16_siglip_gap_512', 'vit_base_patch16_xp_224', 'vit_base_patch32_224', 'vit_base_patch32_384', 'vit_base_patch32_clip_224', 'vit_base_patch32_clip_256', 'vit_base_patch32_clip_384', 'vit_base_patch32_clip_448', 'vit_base_patch32_clip_quickgelu_224', 'vit_base_patch32_plus_256', 'vit_base_r26_s32_224', 'vit_base_r50_s16_224', 'vit_base_r50_s16_384', 'vit_base_resnet26d_224', 'vit_base_resnet50d_224', 'vit_betwixt_patch16_gap_256', 'vit_betwixt_patch16_reg1_gap_256', 'vit_betwixt_patch16_reg4_gap_256', 'vit_betwixt_patch16_reg4_gap_384', 'vit_betwixt_patch16_rope_reg4_gap_256', 'vit_betwixt_patch32_clip_224', 'vit_giant_patch14_224', 'vit_giant_patch14_clip_224', 'vit_giant_patch14_dinov2', 'vit_giant_patch14_reg4_dinov2', 'vit_giant_patch16_gap_224', 'vit_gigantic_patch14_224', 'vit_gigantic_patch14_clip_224', 'vit_huge_patch14_224', 'vit_huge_patch14_clip_224', 'vit_huge_patch14_clip_336', 'vit_huge_patch14_clip_378', 'vit_huge_patch14_clip_quickgelu_224', 'vit_huge_patch14_clip_quickgelu_378', 'vit_huge_patch14_gap_224', 'vit_huge_patch14_xp_224', 'vit_huge_patch16_gap_448', 'vit_large_patch14_224', 'vit_large_patch14_clip_224', 'vit_large_patch14_clip_336', 'vit_large_patch14_clip_quickgelu_224', 'vit_large_patch14_clip_quickgelu_336', 'vit_large_patch14_dinov2', 'vit_large_patch14_reg4_dinov2', 'vit_large_patch14_xp_224', 
'vit_large_patch16_224', 'vit_large_patch16_384', 'vit_large_patch16_siglip_256', 'vit_large_patch16_siglip_384', 'vit_large_patch16_siglip_gap_256', 'vit_large_patch16_siglip_gap_384', 'vit_large_patch32_224', 'vit_large_patch32_384', 'vit_large_r50_s32_224', 'vit_large_r50_s32_384', 'vit_little_patch16_reg1_gap_256', 'vit_little_patch16_reg4_gap_256', 'vit_medium_patch16_clip_224', 'vit_medium_patch16_gap_240', 'vit_medium_patch16_gap_256', 'vit_medium_patch16_gap_384', 'vit_medium_patch16_reg1_gap_256', 'vit_medium_patch16_reg4_gap_256', 'vit_medium_patch16_rope_reg1_gap_256', 'vit_medium_patch32_clip_224', 'vit_mediumd_patch16_reg4_gap_256', 'vit_mediumd_patch16_reg4_gap_384', 'vit_mediumd_patch16_rope_reg1_gap_256', 'vit_pwee_patch16_reg1_gap_256', 'vit_relpos_base_patch16_224', 'vit_relpos_base_patch16_cls_224', 'vit_relpos_base_patch16_clsgap_224', 'vit_relpos_base_patch16_plus_240', 'vit_relpos_base_patch16_rpn_224', 'vit_relpos_base_patch32_plus_rpn_256', 'vit_relpos_medium_patch16_224', 'vit_relpos_medium_patch16_cls_224', 'vit_relpos_medium_patch16_rpn_224', 'vit_relpos_small_patch16_224', 'vit_relpos_small_patch16_rpn_224', 'vit_small_patch8_224', 'vit_small_patch14_dinov2', 'vit_small_patch14_reg4_dinov2', 'vit_small_patch16_18x2_224', 'vit_small_patch16_36x1_224', 'vit_small_patch16_224', 'vit_small_patch16_384', 'vit_small_patch32_224', 'vit_small_patch32_384', 'vit_small_r26_s32_224', 'vit_small_r26_s32_384', 'vit_small_resnet26d_224', 'vit_small_resnet50d_s16_224', 'vit_so150m_patch16_reg4_gap_256', 'vit_so150m_patch16_reg4_map_256', 'vit_so400m_patch14_siglip_224', 'vit_so400m_patch14_siglip_384', 'vit_so400m_patch14_siglip_gap_224', 'vit_so400m_patch14_siglip_gap_384', 'vit_so400m_patch14_siglip_gap_448', 'vit_so400m_patch14_siglip_gap_896', 'vit_srelpos_medium_patch16_224', 'vit_srelpos_small_patch16_224', 'vit_tiny_patch16_224', 'vit_tiny_patch16_384', 'vit_tiny_r_s16_p8_224', 'vit_tiny_r_s16_p8_384', 'vit_wee_patch16_reg1_gap_256', 
'vit_xsmall_patch16_clip_224', 'vitamin_base_224', 'vitamin_large2_224', 'vitamin_large2_256', 'vitamin_large2_336', 'vitamin_large2_384', 'vitamin_large_224', 'vitamin_large_256', 'vitamin_large_336', 'vitamin_large_384', 'vitamin_small_224', 'vitamin_xlarge_256', 'vitamin_xlarge_336', 'vitamin_xlarge_384', 'volo_d1_224', 'volo_d1_384', 'volo_d2_224', 'volo_d2_384', 'volo_d3_224', 'volo_d3_448', 'volo_d4_224', 'volo_d4_448', 'volo_d5_224', 'volo_d5_448', 'volo_d5_512', 'vovnet39a', 'vovnet57a', 'wide_resnet50_2', 'wide_resnet101_2', 'xception41', 'xception41p', 'xception65', 'xception65p', 'xception71', 'xcit_large_24_p8_224', 'xcit_large_24_p8_384', 'xcit_large_24_p16_224', 'xcit_large_24_p16_384', 'xcit_medium_24_p8_224', 'xcit_medium_24_p8_384', 'xcit_medium_24_p16_224', 'xcit_medium_24_p16_384', 'xcit_nano_12_p8_224', 'xcit_nano_12_p8_384', 'xcit_nano_12_p16_224', 'xcit_nano_12_p16_384', 'xcit_small_12_p8_224', 'xcit_small_12_p8_384', 'xcit_small_12_p16_224', 'xcit_small_12_p16_384', 'xcit_small_24_p8_224', 'xcit_small_24_p8_384', 'xcit_small_24_p16_224', 'xcit_small_24_p16_384', 'xcit_tiny_12_p8_224', 'xcit_tiny_12_p8_384', 'xcit_tiny_12_p16_224', 'xcit_tiny_12_p16_384', 'xcit_tiny_24_p8_224', 'xcit_tiny_24_p8_384', 'xcit_tiny_24_p16_224', 'xcit_tiny_24_p16_384']\n"
850
+ ]
851
+ }
852
+ ],
853
+ "source": [
854
+ "import timm\n",
855
+ "print(timm.list_models())"
856
+ ]
857
+ },
858
+ {
859
+ "cell_type": "markdown",
860
+ "metadata": {},
861
+ "source": [
862
+ "##### testing the litserve model"
863
+ ]
864
+ },
865
+ {
866
+ "cell_type": "code",
867
+ "execution_count": 2,
868
+ "metadata": {},
869
+ "outputs": [],
870
+ "source": [
871
+ "import requests\n",
872
+ "from urllib.request import urlopen\n",
873
+ "import base64"
874
+ ]
875
+ },
876
+ {
877
+ "cell_type": "code",
878
+ "execution_count": 33,
879
+ "metadata": {},
880
+ "outputs": [
881
+ {
882
+ "name": "stdout",
883
+ "output_type": "stream",
884
+ "text": [
885
+ "<class 'bytes'>\n"
886
+ ]
887
+ }
888
+ ],
889
+ "source": [
890
+ "url = \"https://media.istockphoto.com/id/541844008/photo/portland-grand-floral-parade-2016.jpg?s=2048x2048&w=is&k=20&c=ZuvR6oDv5WxwL5dhXKAbevysEXhXV47shJdpzkqen5Y=\"\n",
891
+ "img_data = urlopen(url).read()\n",
892
+ "print(type(img_data))"
893
+ ]
894
+ },
895
+ {
896
+ "cell_type": "code",
897
+ "execution_count": 34,
898
+ "metadata": {},
899
+ "outputs": [
900
+ {
901
+ "name": "stdout",
902
+ "output_type": "stream",
903
+ "text": [
904
+ "<class 'str'>\n"
905
+ ]
906
+ }
907
+ ],
908
+ "source": [
909
+ "# Convert to base64 string\n",
910
+ "img_bytes = base64.b64encode(img_data).decode('utf-8')\n",
911
+ "print(type(img_bytes))"
912
+ ]
913
+ },
914
+ {
915
+ "cell_type": "code",
916
+ "execution_count": 35,
917
+ "metadata": {},
918
+ "outputs": [],
919
+ "source": [
920
+ "response = requests.post(\n",
921
+ " \"http://localhost:8080/predict\", json={\"image\": img_bytes} # image is the key\n",
922
+ ")"
923
+ ]
924
+ },
925
+ {
926
+ "cell_type": "code",
927
+ "execution_count": 36,
928
+ "metadata": {},
929
+ "outputs": [
930
+ {
931
+ "name": "stdout",
932
+ "output_type": "stream",
933
+ "text": [
934
+ "\\nTop 5 Predictions:\n",
935
+ "mountain_bike, all-terrain_bike, off-roader: 82.13%\n",
936
+ "maillot: 5.09%\n",
937
+ "crash_helmet: 1.84%\n",
938
+ "bicycle-built-for-two, tandem_bicycle, tandem: 1.83%\n",
939
+ "alp: 0.69%\n"
940
+ ]
941
+ }
942
+ ],
943
+ "source": [
944
+ "if response.status_code == 200:\n",
945
+ " predictions = response.json()[\"predictions\"]\n",
946
+ " print(\"\\\\nTop 5 Predictions:\")\n",
947
+ " for pred in predictions:\n",
948
+ " print(f\"{pred['label']}: {pred['probability']:.2%}\")\n",
949
+ "else:\n",
950
+ " print(f\"Error: {response.status_code}\")\n",
951
+ " print(response.text)"
952
+ ]
953
+ },
954
+ {
955
+ "cell_type": "code",
956
+ "execution_count": null,
957
+ "metadata": {},
958
+ "outputs": [],
959
+ "source": []
960
+ },
961
+ {
962
+ "cell_type": "code",
963
+ "execution_count": null,
964
+ "metadata": {},
965
+ "outputs": [],
966
+ "source": []
967
+ },
968
+ {
969
+ "cell_type": "code",
970
+ "execution_count": null,
971
+ "metadata": {},
972
+ "outputs": [],
973
+ "source": []
974
+ },
975
+ {
976
+ "cell_type": "code",
977
+ "execution_count": null,
978
+ "metadata": {},
979
+ "outputs": [],
980
+ "source": []
981
+ },
982
+ {
983
+ "cell_type": "markdown",
984
+ "metadata": {},
985
+ "source": [
986
+ "########################################## End of the script ##########################################"
987
+ ]
988
+ }
989
+ ],
990
+ "metadata": {
991
+ "kernelspec": {
992
+ "display_name": "emlo_env",
993
+ "language": "python",
994
+ "name": "python3"
995
+ },
996
+ "language_info": {
997
+ "codemirror_mode": {
998
+ "name": "ipython",
999
+ "version": 3
1000
+ },
1001
+ "file_extension": ".py",
1002
+ "mimetype": "text/x-python",
1003
+ "name": "python",
1004
+ "nbconvert_exporter": "python",
1005
+ "pygments_lexer": "ipython3",
1006
+ "version": "3.10.15"
1007
+ }
1008
+ },
1009
+ "nbformat": 4,
1010
+ "nbformat_minor": 2
1011
+ }
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "pytorch_fastapi_project"
3
+ version = "0.1.0"
4
+ description = "Consolidated PyTorch and FastAPI project for AWS deployment and GHA testing"
5
+ authors = ["soutrik71 <[email protected]>"]
6
+ license = "Apache-2.0"
7
+ readme = "README.md"
8
+
9
+ [tool.poetry.dependencies]
10
+ python = ">=3.10,<3.11"
11
+ black = "24.8.0"
12
+ coverage = ">=7.6.1"
13
+ hydra-colorlog = "1.2.0"
14
+ hydra-core = "1.3.2"
15
+ lightning = {version = "2.4.0", extras = ["extra"]}
16
+ loguru = "0.7.2"
17
+ pytest = "^8.3.3"
18
+ rich = "13.8.1"
19
+ rootutils = "1.0.7"
20
+ tensorboard = "2.17.1"
21
+ timm = "1.0.9"
22
+ pandas = "^2.2.3"
23
+ numpy = "^1.26.0"
24
+ ruff = "*"
25
+ torch = {version = "^2.4.1", source = "pytorch_cuda"}
26
+ torchvision = {version = "^0.19.1", source = "pytorch_cuda"}
27
+ torchaudio = {version = "^2.4.1", source = "pytorch_cuda"}
28
+ seaborn = "^0.13.2"
29
+ pydantic = "^2.9.2"
30
+ kaggle = "^1.6.17"
31
+ pytest-cov = "^5.0.0"
32
+ pytest-mock = "^3.14.0"
33
+ flake8 = "^7.1.1"
34
+ dvc-gdrive = "^3.0.1"
35
+ dvc-azure = "^3.1.0"
36
+ transformers = "^4.45.2"
37
+ fastapi = "^0.115.4"
38
+ pydantic-settings = "^2.6.1"
39
+ uvicorn = "^0.32.0"
40
+ tenacity = "^9.0.0"
41
+ gunicorn = "^23.0.0"
42
+ aim = "^3.25.0"
43
+ mlflow = "^2.17.1"
44
+ hydra-optuna-sweeper = "^1.2.0"
45
+ dvc = "^3.56.0"
46
+ platformdirs = "3.10"
47
+ fastapi-utils = "^0.7.0"
48
+ httpx = "^0.27.2"
49
+ typing-inspect = "^0.9.0"
50
+ requests = "^2.32.3"
51
+ fastapi-restful = {extras = ["all"], version = "^0.6.0"}
52
+ aioredis = "^2.0.1"
53
+ psycopg2-binary = "^2.9.10"
54
+ asyncpg = "^0.30.0"
55
+ confluent-kafka = "^2.6.0"
56
+ aiokafka = "^0.12.0"
57
+ azure-servicebus = "^7.12.3"
58
+ aiohttp = "^3.10.10"
59
+ aiofiles = "*"
60
+ aiologger = "^0.7.0"
61
+ pyyaml = "^6.0.2"
62
+ sqlalchemy-utils = "^0.41.2"
63
+ sqlalchemy = "^2.0.36"
64
+ alembic = "^1.13.3"
65
+ fastapi-limiter = "^0.1.6"
66
+ redis = "5.0.8"
67
+ redisearch = "2.0.0"
68
+ python-multipart = "*"
69
+ python-dotenv = "^1.0.1"
70
+ celery = "^5.4.0"
71
+ fastapi-cache2 = "^0.2.2"
72
+ aiocache = "^0.12.3"
73
+ dvc-s3 = "^3.2.0"
74
+ litserve = "^0.2.4"
75
+ gpustat = "^1.1.1"
76
+ nvitop = "^1.3.2"
77
+ gradio = "5.7.1"
78
+ gradio-client = "^1.5.0"
79
+ accelerate = "^1.1.1"
80
+ cryptography = "^44.0.0"
81
+ boto3 = "*"
82
+ pyopenssl = "^24.3.0"
83
+
84
+ [tool.poetry.dev-dependencies]
85
+ pytest-asyncio = "^0.20.3"
86
+
87
+ [[tool.poetry.source]]
88
+ name = "pytorch_cuda"
89
+ url = "https://download.pytorch.org/whl/cu124"
90
+ priority = "explicit"
91
+
92
+ [build-system]
93
+ requires = ["poetry-core"]
94
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch==2.4.1
2
+ torchvision==0.19.1
3
+ hydra-colorlog==1.2.0
4
+ hydra-core==1.3.2
5
+ lightning[extra]==2.4.0
6
+ loguru==0.7.2
7
+ rich==13.8.1
8
+ rootutils==1.0.7
9
+ tensorboard==2.17.1
10
+ timm==1.0.9
11
+ pandas>=2.2.3
12
+ numpy>=1.26.0
13
+ transformers>=4.45.2
14
+ aim>=3.25.0
15
+ mlflow>=2.17.1
16
+ hydra-optuna-sweeper>=1.2.0
17
+ aiologger>=0.7.0
18
+ pyyaml>=6.0.2
19
+ dvc-s3>=3.2.0
20
+ litserve>=0.2.4
21
+ gpustat>=1.1.1
22
+ nvitop>=1.3.2
23
+ gradio==5.7.1
24
+ gradio-client>=1.5.0
25
+ accelerate>=1.1.1
26
+ cryptography>=44.0.0
27
+ boto3
28
+ pyopenssl>=24.3.0