Merge branch 'branch-25.02' into remove-legacy-support
rjzamora authored Jan 6, 2025
2 parents 524aed4 + d9c6996 commit fc6cf87
Showing 8 changed files with 73 additions and 8 deletions.
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-114_arch-x86_64.yaml
@@ -20,7 +20,7 @@ dependencies:
- numpydoc>=1.1.0
- pandas>=1.3
- pre-commit
- pynvml>=11.0.0,<12.0.0a0
- pynvml>=12.0.0,<13.0.0a0
- pytest
- pytest-cov
- python>=3.10,<3.13
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -20,7 +20,7 @@ dependencies:
- numpydoc>=1.1.0
- pandas>=1.3
- pre-commit
- pynvml>=11.0.0,<12.0.0a0
- pynvml>=12.0.0,<13.0.0a0
- pytest
- pytest-cov
- python>=3.10,<3.13
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -21,7 +21,7 @@ dependencies:
- numpydoc>=1.1.0
- pandas>=1.3
- pre-commit
- pynvml>=11.0.0,<12.0.0a0
- pynvml>=12.0.0,<13.0.0a0
- pytest
- pytest-cov
- python>=3.10,<3.13
11 changes: 10 additions & 1 deletion dask_cuda/plugins.py
@@ -1,4 +1,5 @@
import importlib
import logging
import os
from typing import Callable, Dict

@@ -12,7 +13,15 @@ def __init__(self, cores):
        self.cores = cores

    def setup(self, worker=None):
        os.sched_setaffinity(0, self.cores)
        try:
            os.sched_setaffinity(0, self.cores)
        except Exception:
            logger = logging.getLogger("distributed.worker")
            logger.warning(
                "Setting CPU affinity for GPU failed. Please refer to the following "
                "link for troubleshooting information: "
                "https://docs.rapids.ai/api/dask-cuda/nightly/troubleshooting/#setting-cpu-affinity-failure"  # noqa: E501
            )


class CUDFSetup(WorkerPlugin):
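
The guarded call above matters because os.sched_setaffinity raises an OSError when none of the requested CPUs are
permitted for the process (for example, under a restrictive job allocation). The snippet below is a standalone
illustration of that failure mode with hypothetical CPU ids; it is not part of this diff:

    import os

    allowed = os.sched_getaffinity(0)  # CPUs this process is currently permitted to use
    wanted = {0, 1, 2, 3}              # hypothetical CPUs reported as "near" the GPU
    try:
        os.sched_setaffinity(0, wanted)
    except OSError:
        # Raised when no requested CPU is in the process's allowed set; the plugin
        # above now catches this and logs a warning instead of propagating the error.
        print(f"Could not pin to {wanted}; allowed CPUs are {sorted(allowed)}")
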
3 changes: 1 addition & 2 deletions dask_cuda/tests/test_utils.py
@@ -1,6 +1,7 @@
import os
from unittest.mock import patch

import pynvml
import pytest
from numba import cuda

@@ -197,7 +198,6 @@ def test_get_ucx_config(enable_tcp_over_ucx, enable_infiniband, enable_nvlink):


def test_parse_visible_devices():
    pynvml = pytest.importorskip("pynvml")
    pynvml.nvmlInit()
    indices = []
    uuids = []
@@ -250,7 +250,6 @@ def test_parse_device_memory_limit():


def test_parse_visible_mig_devices():
    pynvml = pytest.importorskip("pynvml")
    pynvml.nvmlInit()
    for index in range(get_gpu_count()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
2 changes: 1 addition & 1 deletion dependencies.yaml
@@ -157,7 +157,7 @@ dependencies:
- numba>=0.57
- numpy>=1.23,<3.0a0
- pandas>=1.3
- pynvml>=11.0.0,<12.0.0a0
- pynvml>=12.0.0,<13.0.0a0
- rapids-dask-dependency==25.2.*,>=0.0.0a0
- zict>=2.0.0
test_python:
57 changes: 57 additions & 0 deletions docs/source/troubleshooting.rst
@@ -30,3 +30,60 @@ For the DGX Station A100, the display GPU is commonly the fourth in the PCI Bus
>>> from dask_cuda import LocalCUDACluster
>>> cluster = LocalCUDACluster(CUDA_VISIBLE_DEVICES=[0, 1, 2, 4])

Setting CPU Affinity Failure
----------------------------

Setting the proper CPU affinity for a Dask-CUDA worker is important to ensure optimal performance, particularly when
memory transfers to/from system memory are necessary. Dask-CUDA handles this automatically, attempting to determine
the appropriate CPU affinity for each worker according to the GPU that worker is targeting.
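
In essence, this automatic step asks NVML which CPUs are physically close to the worker's GPU and then pins the worker
process to them. The following is only a rough sketch of that logic (assuming ``pynvml`` is installed and the process
is free to pin itself), not the exact code Dask-CUDA runs:

.. code-block:: python

    import math
    import os
    from multiprocessing import cpu_count

    import pynvml

    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # the GPU targeted by this worker

    # NVML reports the affinity as an array of 64-bit mask words.
    nwords = math.ceil(cpu_count() / 64)
    mask_words = pynvml.nvmlDeviceGetCpuAffinity(handle, nwords)

    # Unpack the bitmask into a flat list of CPU indices.
    cores = [
        64 * word + bit
        for word, mask in enumerate(mask_words)
        for bit in range(64)
        if mask & (1 << bit)
    ]

    # Pin the current process; this is the call that may fail on restricted allocations.
    os.sched_setaffinity(0, cores)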

There are situations where setting the CPU affinity may fail. The most common cases involve workload managers and job
schedulers used by large compute clusters, such as Slurm.

Within a node with multiple physical CPUs (i.e., multiple CPU sockets) and multiple GPUs, it is common for each GPU
to be directly connected to a specific physical CPU to balance resources. Consider, for example, a node with 4 GPUs
and 40 CPU cores where the cores are split between two physical CPUs: GPUs 0 and 1 may be connected to CPUs 0-19, and
GPUs 2 and 3 to CPUs 20-39. If the node is entirely assigned to the Dask-CUDA job, setting CPU affinity will most
likely succeed. However, it is still possible for the job to be assigned the wrong CPUs, e.g., CPUs 20-39 for GPUs 0
and 1, or CPUs 0-19 for GPUs 2 and 3; in that case setting the CPU affinity is impossible, since the correct CPU/GPU
resources are not available to the job. When this happens, the best Dask-CUDA can do is raise a warning that redirects
you to this section and skip setting any CPU affinity, letting the operating system handle all transfers as it sees
fit, even if they may follow a suboptimal path.

If the problem persists after following the instructions in this section, including consulting your cluster's manual
and administrators, please `file an issue in the Dask-CUDA repository <https://github.com/rapidsai/dask-cuda/issues>`_,
including the output of all the commands below, executed from within the allocated cluster job:

- ``conda list``, if the environment was installed with conda or uses a RAPIDS-provided Docker image;
- ``pip list``, if the environment was installed with pip;
- ``nvidia-smi``;
- ``nvidia-smi topo -m``;
- ``python print_affinity.py``, using the ``print_affinity.py`` code that immediately follows.

.. code-block:: python

    # print_affinity.py
    import math
    from multiprocessing import cpu_count

    import pynvml

    pynvml.nvmlInit()
    for i in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        cpu_affinity = pynvml.nvmlDeviceGetCpuAffinity(handle, math.ceil(cpu_count() / 64))
        print(f"GPU {i}: {list(cpu_affinity)}")
Slurm
~~~~~

The most commonly observed cases of this issue have been reported on Slurm clusters. Resolving it normally involves
providing a specific set of CPUs to the job with one of the following arguments (a quick way to verify the resulting
allocation is sketched after the list):

- ``--cpus-per-task=N``: the number of CPUs allocated to the job; you may need to request all of the node's CPUs to
  ensure each GPU has its relevant CPUs available;
- ``--exclusive``: ensure exclusive allocation of CPUs to the job.
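
As a quick sanity check from inside the job allocation, you can compare the CPUs the job actually received with the
CPUs each GPU prefers. This is only a sketch built on the same NVML call used by ``print_affinity.py``:

.. code-block:: python

    import math
    import os
    from multiprocessing import cpu_count

    import pynvml

    allowed = os.sched_getaffinity(0)  # CPUs granted to this job/process
    pynvml.nvmlInit()
    for i in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        mask_words = pynvml.nvmlDeviceGetCpuAffinity(handle, math.ceil(cpu_count() / 64))
        preferred = {
            64 * word + bit
            for word, mask in enumerate(mask_words)
            for bit in range(64)
            if mask & (1 << bit)
        }
        overlap = sorted(allowed & preferred)
        print(f"GPU {i}: preferred CPUs granted to this job: {overlap or 'none'}")

If a GPU reports no overlap at all, the allocation does not contain the CPUs that GPU is attached to, and setting its
CPU affinity cannot succeed.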

Unfortunately, providing exact solutions for all existing cluster configurations is not possible; therefore, make
sure to consult your cluster's manual and administrators for detailed information and further troubleshooting.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -20,7 +20,7 @@ dependencies = [
"numba>=0.57",
"numpy>=1.23,<3.0a0",
"pandas>=1.3",
"pynvml>=11.0.0,<12.0.0a0",
"pynvml>=12.0.0,<13.0.0a0",
"rapids-dask-dependency==25.2.*,>=0.0.0a0",
"zict>=2.0.0",
] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.
