Skip to content

Enable ROCM in CI #6504

Enable ROCM in CI

Enable ROCM in CI #6504

# Display name of this workflow.
name: Run Regression Tests

# Run on pushes to `main` or any `gh/**` branch, and on pull requests
# whose base branch matches the same patterns.
on:
  push:
    branches: [main, "gh/**"]
  pull_request:
    branches: [main, "gh/**"]
# Concurrency group key: on `main` the run number is used, so separate runs
# land in separate groups; on any other ref the ref itself is used, so a newer
# run for the same ref cancels the one still in progress.
concurrency:
  group: "regression_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}"
  cancel-in-progress: true

# NOTE(review): every job below calls a reusable workflow via `uses:`. GitHub's
# reusable-workflow documentation states that workflow-level `env` is not
# propagated to called workflows — confirm HF_TOKEN actually reaches the tests.
env:
  HF_TOKEN: "${{ secrets.HF_TOKEN }}"

# Token scopes granted to the jobs in this workflow.
permissions:
  contents: read
  id-token: write
jobs:
  # Nightly matrix: installs a pre-release torch wheel for each accelerator
  # type (CUDA 12.4, CPU, ROCm 6.3) and runs the pytest suite against it.
  test-nightly:
    strategy:
      # Keep the remaining matrix entries running when one of them fails.
      fail-fast: false
      matrix:
        include:
          - name: CUDA Nightly
            runs-on: linux.g5.12xlarge.nvidia.gpu
            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu124'
            gpu-arch-type: "cuda"
            gpu-arch-version: "12.4"
          - name: CPU Nightly
            runs-on: linux.4xlarge
            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cpu'
            gpu-arch-type: "cpu"
            gpu-arch-version: ""
          - name: ROCM Nightly
            runs-on: linux.rocm.gpu
            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3'
            gpu-arch-type: "rocm"
            gpu-arch-version: "6.3"
    # NOTE(review): this job calls the reusable workflow at the
    # `rocm_experiment` ref, whereas the `test` job below uses `main` —
    # presumably temporary for ROCm bring-up; confirm before relying on it.
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@rocm_experiment
    with:
      timeout: 120
      # These two expressions evaluate to true only for the ROCm matrix entry.
      no-sudo: ${{ matrix.gpu-arch-type == 'rocm' }}
      continue-on-error: ${{ matrix.gpu-arch-type == 'rocm' }}
      test-infra-ref: rocm_experiment
      runner: ${{ matrix.runs-on }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      # Create a Python 3.9 conda env, install the matrix's torch build plus
      # the project and its dev requirements, put the conda installation's
      # lib directory first on LD_LIBRARY_PATH, then run the tests.
      script: |
        conda create -n venv python=3.9 -y
        conda activate venv
        python -m pip install --upgrade pip
        pip install ${{ matrix.torch-spec }}
        pip install -r dev-requirements.txt
        pip install .
        export CONDA=$(dirname $(dirname $(which conda)))
        export LD_LIBRARY_PATH=$CONDA/lib/:$LD_LIBRARY_PATH
        pytest test --verbose -s

  # Release matrix: runs the same pytest suite against pinned torch releases
  # (2.3.0, 2.4.0, 2.5.1) on CUDA 12.1 and CPU runners.
  test:
    strategy:
      # Keep the remaining matrix entries running when one of them fails.
      fail-fast: false
      matrix:
        include:
          # The 2.3 and 2.4 CUDA entries pass no --index-url to pip.
          - name: CUDA 2.3
            runs-on: linux.g5.12xlarge.nvidia.gpu
            torch-spec: 'torch==2.3.0'
            gpu-arch-type: "cuda"
            gpu-arch-version: "12.1"
          - name: CUDA 2.4
            runs-on: linux.g5.12xlarge.nvidia.gpu
            torch-spec: 'torch==2.4.0'
            gpu-arch-type: "cuda"
            gpu-arch-version: "12.1"
          - name: CUDA 2.5.1
            runs-on: linux.g5.12xlarge.nvidia.gpu
            torch-spec: 'torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121'
            gpu-arch-type: "cuda"
            gpu-arch-version: "12.1"
          - name: CPU 2.3
            runs-on: linux.4xlarge
            torch-spec: 'torch==2.3.0 --index-url https://download.pytorch.org/whl/cpu'
            gpu-arch-type: "cpu"
            gpu-arch-version: ""
          - name: CPU 2.4
            runs-on: linux.4xlarge
            torch-spec: 'torch==2.4.0 --index-url https://download.pytorch.org/whl/cpu'
            gpu-arch-type: "cpu"
            gpu-arch-version: ""
          - name: CPU 2.5.1
            runs-on: linux.4xlarge
            torch-spec: 'torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu'
            gpu-arch-type: "cpu"
            gpu-arch-version: ""
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      timeout: 120
      runner: ${{ matrix.runs-on }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      # Same steps as the nightly job, with the pinned torch release from the
      # matrix installed instead of a nightly wheel.
      script: |
        conda create -n venv python=3.9 -y
        conda activate venv
        python -m pip install --upgrade pip
        pip install ${{ matrix.torch-spec }}
        pip install -r dev-requirements.txt
        pip install .
        export CONDA=$(dirname $(dirname $(which conda)))
        export LD_LIBRARY_PATH=$CONDA/lib/:$LD_LIBRARY_PATH
        pytest test --verbose -s