run PR e2e docker CI tests in Modal (#1217) [skip ci]
Browse files
* wip modal for ci
* handle falcon layernorms better
* update
* rebuild the template each time with the pseudo-ARGS
* fix ref
* update tests to use modal
* cleanup ci script
* make sure to install jinja2 also
* kickoff the gh action on gh hosted runners and specify num gpus
- .github/workflows/tests.yml +21 -30
- cicd/Dockerfile.jinja +38 -0
- cicd/tests.py +69 -0
- docker/{Dockerfile-tests → Dockerfile-modal} +8 -11
- requirements.txt +1 -0
- src/axolotl/utils/models.py +5 -2
.github/workflows/tests.yml
CHANGED
|
@@ -58,10 +58,15 @@ jobs:
|
|
| 58 |
docker-e2e-tests:
|
| 59 |
if: github.repository_owner == 'OpenAccess-AI-Collective'
|
| 60 |
# this job needs to be run on self-hosted GPU runners...
|
| 61 |
-
runs-on:
|
| 62 |
timeout-minutes: 30
|
| 63 |
needs: [pre-commit, pytest]
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
strategy:
|
| 66 |
fail-fast: false
|
| 67 |
matrix:
|
|
@@ -70,43 +75,29 @@ jobs:
|
|
| 70 |
cuda_version: 11.8.0
|
| 71 |
python_version: "3.10"
|
| 72 |
pytorch: 2.0.1
|
|
|
|
| 73 |
- cuda: 121
|
| 74 |
cuda_version: 12.1.0
|
| 75 |
python_version: "3.10"
|
| 76 |
pytorch: 2.1.2
|
|
|
|
| 77 |
steps:
|
| 78 |
- name: Checkout
|
| 79 |
uses: actions/checkout@v4
|
| 80 |
-
- name:
|
| 81 |
-
|
| 82 |
-
uses: docker/metadata-action@v5
|
| 83 |
with:
|
| 84 |
-
|
| 85 |
-
- name:
|
| 86 |
-
run: |
|
| 87 |
-
# Set up build arguments
|
| 88 |
-
BASE_TAG="main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}"
|
| 89 |
-
CUDA="${{ matrix.cuda }}"
|
| 90 |
-
PYTORCH_VERSION="${{ matrix.pytorch }}"
|
| 91 |
-
# Build the Docker image
|
| 92 |
-
docker build . \
|
| 93 |
-
--file ./docker/Dockerfile-tests \
|
| 94 |
-
--build-arg BASE_TAG=$BASE_TAG \
|
| 95 |
-
--build-arg CUDA=$CUDA \
|
| 96 |
-
--build-arg GITHUB_REF=$GITHUB_REF \
|
| 97 |
-
--build-arg PYTORCH_VERSION=$PYTORCH_VERSION \
|
| 98 |
-
--tag ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} \
|
| 99 |
-
--no-cache
|
| 100 |
-
- name: Unit Tests w docker image
|
| 101 |
-
run: |
|
| 102 |
-
docker run --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
|
| 103 |
-
- name: GPU Unit Tests w docker image
|
| 104 |
run: |
|
| 105 |
-
|
| 106 |
-
|
|
|
|
| 107 |
run: |
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
|
|
|
|
|
|
| 111 |
run: |
|
| 112 |
-
|
|
|
|
| 58 |
docker-e2e-tests:
|
| 59 |
if: github.repository_owner == 'OpenAccess-AI-Collective'
|
| 60 |
# this job needs to be run on self-hosted GPU runners...
|
| 61 |
+
runs-on: ubuntu-latest
|
| 62 |
timeout-minutes: 30
|
| 63 |
needs: [pre-commit, pytest]
|
| 64 |
|
| 65 |
+
env:
|
| 66 |
+
MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
|
| 67 |
+
MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
|
| 68 |
+
MODAL_ENVIRONMENT: axolotl-ci-cd
|
| 69 |
+
|
| 70 |
strategy:
|
| 71 |
fail-fast: false
|
| 72 |
matrix:
|
|
|
|
| 75 |
cuda_version: 11.8.0
|
| 76 |
python_version: "3.10"
|
| 77 |
pytorch: 2.0.1
|
| 78 |
+
num_gpus: 1
|
| 79 |
- cuda: 121
|
| 80 |
cuda_version: 12.1.0
|
| 81 |
python_version: "3.10"
|
| 82 |
pytorch: 2.1.2
|
| 83 |
+
num_gpus: 1
|
| 84 |
steps:
|
| 85 |
- name: Checkout
|
| 86 |
uses: actions/checkout@v4
|
| 87 |
+
- name: Install Python
|
| 88 |
+
uses: actions/setup-python@v5
|
|
|
|
| 89 |
with:
|
| 90 |
+
python-version: "3.10"
|
| 91 |
+
- name: Install Modal
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
run: |
|
| 93 |
+
python -m pip install --upgrade pip
|
| 94 |
+
pip install modal jinja2
|
| 95 |
+
- name: Update env vars
|
| 96 |
run: |
|
| 97 |
+
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
| 98 |
+
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
|
| 99 |
+
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
|
| 100 |
+
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
| 101 |
+
- name: Run training job on Modal
|
| 102 |
run: |
|
| 103 |
+
modal run cicd.tests
|
cicd/Dockerfile.jinja
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM winglian/axolotl-base:{{ BASE_TAG }}
|
| 2 |
+
|
| 3 |
+
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
|
| 4 |
+
ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
|
| 5 |
+
ENV CUDA="{{ CUDA }}"
|
| 6 |
+
ENV BNB_CUDA_VERSION="{{ CUDA }}"
|
| 7 |
+
ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
|
| 8 |
+
ENV GITHUB_REF="{{ GITHUB_REF }}"
|
| 9 |
+
ENV GITHUB_SHA="{{ GITHUB_SHA }}"
|
| 10 |
+
|
| 11 |
+
RUN apt-get update && \
|
| 12 |
+
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
|
| 13 |
+
|
| 14 |
+
WORKDIR /workspace
|
| 15 |
+
|
| 16 |
+
RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
|
| 17 |
+
|
| 18 |
+
WORKDIR /workspace/axolotl
|
| 19 |
+
|
| 20 |
+
RUN git fetch origin +$GITHUB_REF && \
|
| 21 |
+
git checkout FETCH_HEAD
|
| 22 |
+
|
| 23 |
+
# If AXOLOTL_EXTRAS is set, append it in brackets
|
| 24 |
+
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
| 25 |
+
pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS]; \
|
| 26 |
+
else \
|
| 27 |
+
pip install -e .[deepspeed,flash-attn,mamba-ssm]; \
|
| 28 |
+
fi
|
| 29 |
+
|
| 30 |
+
# So we can test the Docker image
|
| 31 |
+
RUN pip install pytest
|
| 32 |
+
|
| 33 |
+
# fix so that git fetch/pull from remote works
|
| 34 |
+
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
|
| 35 |
+
git config --get remote.origin.fetch
|
| 36 |
+
|
| 37 |
+
# helper for huggingface-login cli
|
| 38 |
+
RUN git config --global credential.helper store
|
cicd/tests.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
modal application to run axolotl gpu tests in Modal
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
import pathlib
|
| 6 |
+
import tempfile
|
| 7 |
+
|
| 8 |
+
import jinja2
|
| 9 |
+
import modal
|
| 10 |
+
from jinja2 import select_autoescape
|
| 11 |
+
from modal import Image, Stub
|
| 12 |
+
|
| 13 |
+
cicd_path = pathlib.Path(__file__).parent.resolve()
|
| 14 |
+
|
| 15 |
+
template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
|
| 16 |
+
template_env = jinja2.Environment(
|
| 17 |
+
loader=template_loader, autoescape=select_autoescape()
|
| 18 |
+
)
|
| 19 |
+
df_template = template_env.get_template("Dockerfile.jinja")
|
| 20 |
+
|
| 21 |
+
df_args = {
|
| 22 |
+
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
|
| 23 |
+
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.0.1"),
|
| 24 |
+
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.10-cu118-2.0.1"),
|
| 25 |
+
"CUDA": os.environ.get("CUDA", "118"),
|
| 26 |
+
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
|
| 27 |
+
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
dockerfile_contents = df_template.render(**df_args)
|
| 31 |
+
|
| 32 |
+
temp_dir = tempfile.mkdtemp()
|
| 33 |
+
with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
|
| 34 |
+
f.write(dockerfile_contents)
|
| 35 |
+
|
| 36 |
+
cicd_image = Image.from_dockerfile(
|
| 37 |
+
pathlib.Path(temp_dir) / "Dockerfile",
|
| 38 |
+
force_build=True,
|
| 39 |
+
gpu="A10G",
|
| 40 |
+
).env(df_args)
|
| 41 |
+
|
| 42 |
+
stub = Stub("Axolotl CI/CD", secrets=[])
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
N_GPUS = int(os.environ.get("N_GPUS", 1))
|
| 46 |
+
GPU_CONFIG = modal.gpu.A10G(count=N_GPUS)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def run_cmd(cmd: str, run_folder: str):
|
| 50 |
+
import subprocess # nosec
|
| 51 |
+
|
| 52 |
+
# Propagate errors from subprocess.
|
| 53 |
+
if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec
|
| 54 |
+
exit(exit_code) # pylint: disable=consider-using-sys-exit
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
@stub.function(
|
| 58 |
+
image=cicd_image,
|
| 59 |
+
gpu=GPU_CONFIG,
|
| 60 |
+
timeout=60 * 30,
|
| 61 |
+
)
|
| 62 |
+
def cicd_pytest():
|
| 63 |
+
cmd = "pytest /workspace/axolotl/tests/e2e/patched/"
|
| 64 |
+
run_cmd(cmd, "/workspace/axolotl")
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@stub.local_entrypoint()
|
| 68 |
+
def main():
|
| 69 |
+
cicd_pytest.remote()
|
docker/{Dockerfile-tests → Dockerfile-modal}
RENAMED
|
@@ -1,14 +1,11 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
ENV
|
| 8 |
-
|
| 9 |
-
ARG GITHUB_REF="main"
|
| 10 |
-
|
| 11 |
-
ENV PYTORCH_VERSION=$PYTORCH_VERSION
|
| 12 |
|
| 13 |
RUN apt-get update && \
|
| 14 |
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
|
|
|
|
| 1 |
+
FROM winglian/axolotl-base:main-base
|
| 2 |
+
|
| 3 |
+
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
|
| 4 |
+
ENV AXOLOTL_EXTRAS=""
|
| 5 |
+
ENV CUDA="118"
|
| 6 |
+
ENV BNB_CUDA_VERSION="118"
|
| 7 |
+
ENV PYTORCH_VERSION="2.0.1"
|
| 8 |
+
ENV GITHUB_REF="main"
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
RUN apt-get update && \
|
| 11 |
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
|
requirements.txt
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
|
|
|
|
| 2 |
packaging==23.2
|
| 3 |
peft==0.7.1
|
| 4 |
transformers==4.37.0
|
|
|
|
| 1 |
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
|
| 2 |
+
jinja2
|
| 3 |
packaging==23.2
|
| 4 |
peft==0.7.1
|
| 5 |
transformers==4.37.0
|
src/axolotl/utils/models.py
CHANGED
|
@@ -645,7 +645,10 @@ def load_model(
|
|
| 645 |
if not cfg.fsdp:
|
| 646 |
# FSDP doesn't like mixed Float and BFloat16
|
| 647 |
for name, module in model.named_modules():
|
| 648 |
-
if
|
|
|
|
|
|
|
|
|
|
| 649 |
module.to(torch.float32)
|
| 650 |
if model_config.model_type == "btlm":
|
| 651 |
# don't upcast lm_head for btlm
|
|
@@ -684,7 +687,7 @@ def load_model(
|
|
| 684 |
if needs_fa2_dtype or cfg.flash_attention:
|
| 685 |
LOG.info("converting modules to %s for flash attention", cfg.torch_dtype)
|
| 686 |
for name, module in model.named_modules():
|
| 687 |
-
if "norm" in name:
|
| 688 |
module.to(cfg.torch_dtype)
|
| 689 |
if any(m in name for m in embedding_modules):
|
| 690 |
if hasattr(module, "weight"):
|
|
|
|
| 645 |
if not cfg.fsdp:
|
| 646 |
# FSDP doesn't like mixed Float and BFloat16
|
| 647 |
for name, module in model.named_modules():
|
| 648 |
+
if (
|
| 649 |
+
any(m in name for m in ["norm", "gate"])
|
| 650 |
+
or "LayerNorm" in module.__class__.__name__
|
| 651 |
+
):
|
| 652 |
module.to(torch.float32)
|
| 653 |
if model_config.model_type == "btlm":
|
| 654 |
# don't upcast lm_head for btlm
|
|
|
|
| 687 |
if needs_fa2_dtype or cfg.flash_attention:
|
| 688 |
LOG.info("converting modules to %s for flash attention", cfg.torch_dtype)
|
| 689 |
for name, module in model.named_modules():
|
| 690 |
+
if "norm" in name or "LayerNorm" in module.__class__.__name__:
|
| 691 |
module.to(cfg.torch_dtype)
|
| 692 |
if any(m in name for m in embedding_modules):
|
| 693 |
if hasattr(module, "weight"):
|