Commit d017e22

Merge branch 'branch-24.12' into sync-scatter-stream
alexbarghi-nv authored Nov 22, 2024
2 parents 8ba6cdf + 4807986 commit d017e22
Showing 21 changed files with 223 additions and 86 deletions.
6 changes: 3 additions & 3 deletions ci/test_python.sh
@@ -73,7 +73,7 @@ if [[ "${RUNNER_ARCH}" != "ARM64" ]]; then
--channel nvidia \
"pylibwholegraph=${RAPIDS_VERSION}" \
"cugraph-dgl=${RAPIDS_VERSION}" \
'pytorch::pytorch>=2.3,<2.4' \
'pytorch>=2.3' \
"ogb"

rapids-print-env
@@ -111,7 +111,7 @@ if [[ "${RUNNER_ARCH}" != "ARM64" ]]; then
--channel pytorch \
"pylibwholegraph=${RAPIDS_VERSION}" \
"cugraph-pyg=${RAPIDS_VERSION}" \
'pytorch::pytorch>=2.3,<2.4' \
'pytorch>=2.3' \
'ogb'

rapids-print-env
@@ -149,7 +149,7 @@ if [[ "${RUNNER_ARCH}" != "ARM64" ]]; then
--channel pytorch \
'mkl<2024.1.0' \
"pylibwholegraph=${RAPIDS_VERSION}" \
'pytorch::pytorch>=2.3,<2.4' \
'pytorch>=2.3' \
'pytest-forked' \
'ogb'

2 changes: 1 addition & 1 deletion ci/test_wheel_cugraph-dgl.sh
@@ -31,7 +31,7 @@ python -m pip install \
"$(echo ./local-deps/pylibwholegraph_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
"$(echo ./dist/cugraph_dgl_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
'dgl==2.4.0' \
'torch>=2.0,<2.4.0a0'
'torch>=2.3'

# RAPIDS_DATASET_ROOT_DIR is used by test scripts
export RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)"
2 changes: 1 addition & 1 deletion ci/test_wheel_pylibwholegraph.sh
@@ -26,7 +26,7 @@ rapids-logger "Installing Packages"
rapids-retry python -m pip install \
--extra-index-url ${INDEX_URL} \
"$(echo ./dist/pylibwholegraph*.whl)[test]" \
'torch>=2.0,<2.4.0a0'
'torch>=2.3'

rapids-logger "pytest pylibwholegraph"
cd python/pylibwholegraph/pylibwholegraph/tests
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -42,7 +42,7 @@ dependencies:
- pytest-forked
- pytest-xdist
- pytorch-cuda=11.8
- pytorch::pytorch>=2.3,<2.4.0a0
- pytorch>=2.3
- pytorch_geometric>=2.5,<2.6
- raft-dask==24.12.*,>=0.0.0a0
- rapids-build-backend>=0.3.0,<0.4.0.dev0
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-121_arch-x86_64.yaml
@@ -48,7 +48,7 @@ dependencies:
- pytest-forked
- pytest-xdist
- pytorch-cuda=12.1
- pytorch::pytorch>=2.3,<2.4.0a0
- pytorch>=2.3
- pytorch_geometric>=2.5,<2.6
- raft-dask==24.12.*,>=0.0.0a0
- rapids-build-backend>=0.3.0,<0.4.0.dev0
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-124_arch-x86_64.yaml
@@ -48,7 +48,7 @@ dependencies:
- pytest-forked
- pytest-xdist
- pytorch-cuda=12.4
- pytorch::pytorch>=2.3,<2.4.0a0
- pytorch>=2.3
- pytorch_geometric>=2.5,<2.6
- raft-dask==24.12.*,>=0.0.0a0
- rapids-build-backend>=0.3.0,<0.4.0.dev0
2 changes: 1 addition & 1 deletion conda/recipes/cugraph-dgl/meta.yaml
@@ -32,7 +32,7 @@ requirements:
- pylibcugraphops ={{ minor_version }}
- tensordict >=0.1.2
- python
- pytorch >=2.3,<2.4.0a0
- pytorch >=2.3
- cupy >=12.0.0

tests:
2 changes: 1 addition & 1 deletion conda/recipes/cugraph-pyg/meta.yaml
@@ -32,7 +32,7 @@ requirements:
- numpy >=1.23,<3.0a0
- pandas
- python
- pytorch >=2.3,<2.4.0a0
- pytorch >=2.3
- cupy >=12.0.0
- cugraph ={{ minor_version }}
- pylibcugraphops ={{ minor_version }}
77 changes: 77 additions & 0 deletions cpp/src/nvml_wrap.cpp
@@ -0,0 +1,77 @@
// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "nvml_wrap.h"

#if CUDA_VERSION >= 12030
#include <dlfcn.h>
#include <mutex>
#include <stdio.h>

namespace {

void* nvml_handle = nullptr;
std::mutex nvml_mutex;
bool nvml_loaded = false;

bool LoadNvmlLibrary()
{
nvml_handle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
if (!nvml_handle) {
nvml_handle = dlopen("libnvidia-ml.so", RTLD_NOW);
if (!nvml_handle) {
fprintf(stderr, "Failed to load NVML library: %s\n", dlerror());
return false;
}
}
return true;
}

template <typename T>
T LoadNvmlSymbol(const char* name)
{
void* symbol = dlsym(nvml_handle, name);
if (!symbol) { return nullptr; }
return reinterpret_cast<T>(symbol);
}

} // namespace

// Global function pointers
nvmlDeviceGetHandleByIndexFunc nvmlDeviceGetHandleByIndexPtr = nullptr;
nvmlDeviceGetGpuFabricInfoFunc nvmlDeviceGetGpuFabricInfoPtr = nullptr;

// Ensure NVML is loaded and symbols are initialized
bool NvmlFabricSymbolLoaded()
{
std::lock_guard<std::mutex> lock(nvml_mutex);
if (nvml_loaded) {
return true; // Already loaded
}

if (LoadNvmlLibrary()) {
nvmlDeviceGetHandleByIndexPtr =
LoadNvmlSymbol<nvmlDeviceGetHandleByIndexFunc>("nvmlDeviceGetHandleByIndex");
nvmlDeviceGetGpuFabricInfoPtr =
LoadNvmlSymbol<nvmlDeviceGetGpuFabricInfoFunc>("nvmlDeviceGetGpuFabricInfo");

if (!nvmlDeviceGetHandleByIndexPtr || !nvmlDeviceGetGpuFabricInfoPtr) {
dlclose(nvml_handle);
nvml_handle = nullptr;
} else {
nvml_loaded = true;
}
}
return nvml_loaded;
}
#endif // CUDA_VERSION >= 12030
27 changes: 27 additions & 0 deletions cpp/src/nvml_wrap.h
@@ -0,0 +1,27 @@
// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda.h>

#if CUDA_VERSION >= 12030
#include <nvml.h>

bool NvmlFabricSymbolLoaded();

typedef nvmlReturn_t (*nvmlDeviceGetHandleByIndexFunc)(unsigned int, nvmlDevice_t*);
typedef nvmlReturn_t (*nvmlDeviceGetGpuFabricInfoFunc)(nvmlDevice_t, nvmlGpuFabricInfo_t*);

extern nvmlDeviceGetHandleByIndexFunc nvmlDeviceGetHandleByIndexPtr;
extern nvmlDeviceGetGpuFabricInfoFunc nvmlDeviceGetGpuFabricInfoPtr;
#endif // CUDA_VERSION >= 12030
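
For context, a minimal usage sketch of the wrapper API declared above (not part of this commit): gate every fabric query on NvmlFabricSymbolLoaded() and then call through the dynamically resolved pointers. The helper name query_fabric_info is hypothetical, and the sketch assumes NVML has already been initialized (e.g. via nvmlInit_v2, as the NvmlEnsureInitialized helper in system_info.cpp later in this diff does) and that CUDA_VERSION >= 12030.

// Hypothetical sketch only; not part of this commit.
#include <nvml.h>
#include "nvml_wrap.h"

bool query_fabric_info(unsigned int dev_index, nvmlGpuFabricInfo_t* info)
{
  // Old display drivers may not export the fabric-info symbols at all.
  if (!NvmlFabricSymbolLoaded()) { return false; }

  nvmlDevice_t device;
  if (nvmlDeviceGetHandleByIndexPtr(dev_index, &device) != NVML_SUCCESS) { return false; }
  return nvmlDeviceGetGpuFabricInfoPtr(device, info) == NVML_SUCCESS;
}
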
74 changes: 42 additions & 32 deletions cpp/src/wholememory/communicator.cpp
@@ -497,6 +497,7 @@ void get_host_info(host_info* phi)
bool comm_support_mnnvl(wholememory_comm_t wm_comm, const std::unique_ptr<rank_info[]>& p_rank_info)
{
#if CUDA_VERSION >= 12030
if (!nvmlFabricSymbolLoaded) return 0;
int flag = 0;
CUdevice currentDev;
WM_CU_CHECK_NO_THROW(cuDeviceGet(&currentDev, wm_comm->dev_id));
@@ -534,16 +535,22 @@ void exchange_rank_info(wholememory_comm_t wm_comm)
wm_comm->clique_info.is_in_clique = 0;

#if CUDA_VERSION >= 12030
memset(&ri.fabric_info, 0, sizeof(ri.fabric_info));
WHOLEMEMORY_CHECK_NOTHROW(GetGpuFabricInfo(wm_comm->dev_id, &ri.fabric_info) ==
WHOLEMEMORY_SUCCESS);
if (nvmlFabricSymbolLoaded) {
memset(&ri.fabric_info, 0, sizeof(ri.fabric_info));
WHOLEMEMORY_CHECK_NOTHROW(GetGpuFabricInfo(wm_comm->dev_id, &ri.fabric_info) ==
WHOLEMEMORY_SUCCESS);

// // A zero UUID means we don't have MNNVL fabric info
if (((((long*)ri.fabric_info.clusterUuid)[0] | ((long*)ri.fabric_info.clusterUuid)[1]) == 0)) {
wm_comm->clique_info.is_in_clique = 0;
// // A zero UUID means we don't have MNNVL fabric info
if (((((long*)ri.fabric_info.clusterUuid)[0] | ((long*)ri.fabric_info.clusterUuid)[1]) == 0)) {
wm_comm->clique_info.is_in_clique = 0;

} else {
wm_comm->clique_info.is_in_clique = 1;
}
} else {
wm_comm->clique_info.is_in_clique = 1;
WHOLEMEMORY_WARN(
"Some required NVML symbols are missing, likely due to an outdated GPU display driver. MNNVL "
"support will be disabled.");
}

#endif
@@ -573,38 +580,41 @@ void exchange_rank_info(wholememory_comm_t wm_comm)
}

#if CUDA_VERSION >= 12030

if ((memcmp(ri.fabric_info.clusterUuid,
p_rank_info.get()[r].fabric_info.clusterUuid,
NVML_GPU_FABRIC_UUID_LEN) == 0) &&
(ri.fabric_info.cliqueId == p_rank_info.get()[r].fabric_info.cliqueId)) {
if (r == wm_comm->world_rank) {
wm_comm->clique_info.clique_rank = wm_comm->clique_info.clique_rank_num;
if (nvmlFabricSymbolLoaded) {
if ((memcmp(ri.fabric_info.clusterUuid,
p_rank_info.get()[r].fabric_info.clusterUuid,
NVML_GPU_FABRIC_UUID_LEN) == 0) &&
(ri.fabric_info.cliqueId == p_rank_info.get()[r].fabric_info.cliqueId)) {
if (r == wm_comm->world_rank) {
wm_comm->clique_info.clique_rank = wm_comm->clique_info.clique_rank_num;
}
if (wm_comm->clique_info.clique_rank_num == 0) {
wm_comm->clique_info.clique_first_rank = r;
}
wm_comm->clique_info.clique_rank_num++;
}
if (wm_comm->clique_info.clique_rank_num == 0) { wm_comm->clique_info.clique_first_rank = r; }
wm_comm->clique_info.clique_rank_num++;
clique_uuids.insert(
std::string(reinterpret_cast<const char*>(p_rank_info.get()[r].fabric_info.clusterUuid),
NVML_GPU_FABRIC_UUID_LEN));
}
clique_uuids.insert(
std::string(reinterpret_cast<const char*>(p_rank_info.get()[r].fabric_info.clusterUuid),
NVML_GPU_FABRIC_UUID_LEN));

#endif
}

#if CUDA_VERSION >= 12030
wm_comm->clique_info.clique_num = clique_uuids.size();

std::string uuid = std::string(reinterpret_cast<const char*>(ri.fabric_info.clusterUuid),
NVML_GPU_FABRIC_UUID_LEN);
int id = 0;
for (auto clique_uuid : clique_uuids) {
if (clique_uuid == uuid) { wm_comm->clique_info.clique_id = id; }
id++;
}

wm_comm->support_mnnvl = (comm_support_mnnvl(wm_comm, p_rank_info)) &&
(wm_comm->clique_info.clique_rank_num == wm_comm->world_size);
if (nvmlFabricSymbolLoaded) {
wm_comm->clique_info.clique_num = clique_uuids.size();

std::string uuid = std::string(reinterpret_cast<const char*>(ri.fabric_info.clusterUuid),
NVML_GPU_FABRIC_UUID_LEN);
int id = 0;
for (auto clique_uuid : clique_uuids) {
if (clique_uuid == uuid) { wm_comm->clique_info.clique_id = id; }
id++;
}

wm_comm->support_mnnvl = (comm_support_mnnvl(wm_comm, p_rank_info)) &&
(wm_comm->clique_info.clique_rank_num == wm_comm->world_size);
}
#endif
}

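
A non-obvious detail in exchange_rank_info above is the zero-UUID test: clusterUuid is read as two 64-bit words whose bitwise OR is compared against zero, since an all-zero UUID means the driver reported no MNNVL fabric. A small standalone illustration (a hypothetical sketch, not part of this commit, using memcpy instead of the (long*) cast in the diff):

// Hypothetical sketch of the "all-zero clusterUuid means no fabric info" check.
#include <cstdint>
#include <cstring>

bool has_fabric_uuid(const unsigned char* uuid)  // NVML_GPU_FABRIC_UUID_LEN (16) bytes
{
  std::uint64_t lo = 0, hi = 0;
  std::memcpy(&lo, uuid, sizeof(lo));
  std::memcpy(&hi, uuid + sizeof(lo), sizeof(hi));
  return (lo | hi) != 0;  // any set bit: the GPU belongs to an MNNVL fabric cluster
}
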
26 changes: 13 additions & 13 deletions cpp/src/wholememory/system_info.cpp
@@ -13,8 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "system_info.hpp"

#include <string>

#include "cuda_macros.hpp"
@@ -140,17 +138,19 @@ wholememory_error_code_t NvmlEnsureInitialized()
wholememory_error_code_t GetGpuFabricInfo(int dev, nvmlGpuFabricInfo_t* gpuFabricInfo)
{
WHOLEMEMORY_CHECK_NOTHROW(NvmlEnsureInitialized() == WHOLEMEMORY_SUCCESS);
std::lock_guard<std::mutex> locked(lock);
// gpuFabricInfo->version = nvmlGpuFabricInfo_v2;
nvmlDevice_t nvml_device;
nvmlReturn_t ret = nvmlDeviceGetHandleByIndex(dev, &nvml_device);
WHOLEMEMORY_EXPECTS_NOTHROW(
ret == NVML_SUCCESS, "nvmlDeviceGetHandleByIndex error:%s", nvmlErrorString(ret));
ret = nvmlDeviceGetGpuFabricInfo(nvml_device, gpuFabricInfo);
WHOLEMEMORY_EXPECTS_NOTHROW(
ret == NVML_SUCCESS, "nvmlDeviceGetGpuFabricInfo error:%s", nvmlErrorString(ret));

return WHOLEMEMORY_SUCCESS;
if (wholememory::nvmlFabricSymbolLoaded) {
std::lock_guard<std::mutex> locked(lock);
// gpuFabricInfo->version = nvmlGpuFabricInfo_v2;
nvmlDevice_t nvml_device;
nvmlReturn_t ret = nvmlDeviceGetHandleByIndexPtr(dev, &nvml_device);
WHOLEMEMORY_EXPECTS_NOTHROW(
ret == NVML_SUCCESS, "nvmlDeviceGetHandleByIndex error:%s", nvmlErrorString(ret));
ret = nvmlDeviceGetGpuFabricInfoPtr(nvml_device, gpuFabricInfo);
WHOLEMEMORY_EXPECTS_NOTHROW(
ret == NVML_SUCCESS, "nvmlDeviceGetGpuFabricInfo error:%s", nvmlErrorString(ret));
return WHOLEMEMORY_SUCCESS;
}
return WHOLEMEMORY_SYSTEM_ERROR;
}

}; // namespace wholememory
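
With this change GetGpuFabricInfo() reports missing fabric symbols as WHOLEMEMORY_SYSTEM_ERROR instead of asserting, so callers can fall back to non-MNNVL behaviour rather than abort. A caller-side sketch (the wrapper function try_get_fabric_info is hypothetical; type and function names follow the headers shown in this diff):

// Hypothetical sketch only; not part of this commit.
#include <nvml.h>
#include "system_info.hpp"            // wholememory::GetGpuFabricInfo
#include "wholememory/wholememory.h"  // wholememory_error_code_t, WHOLEMEMORY_SUCCESS

bool try_get_fabric_info(int dev_id, nvmlGpuFabricInfo_t* info)
{
  // Missing NVML symbols (old driver) or an NVML failure: treat MNNVL as unavailable.
  return wholememory::GetGpuFabricInfo(dev_id, info) == WHOLEMEMORY_SUCCESS;
}
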
5 changes: 4 additions & 1 deletion cpp/src/wholememory/system_info.hpp
@@ -18,6 +18,7 @@
#include "wholememory/wholememory.h"

#if CUDA_VERSION >= 12030
#include "nvml_wrap.h"
#include <nvml.h>
#endif
bool DevAttrPagebleMemoryAccess();
@@ -37,7 +38,9 @@ bool SupportEGM();
// bool SupportMNNVLForEGM();
#if CUDA_VERSION >= 12030
namespace wholememory {

inline bool nvmlFabricSymbolLoaded = NvmlFabricSymbolLoaded();
wholememory_error_code_t GetGpuFabricInfo(int dev, nvmlGpuFabricInfo_t* gpuFabricInfo);
}
} // namespace wholememory

#endif
4 changes: 2 additions & 2 deletions dependencies.yaml
@@ -388,7 +388,7 @@ dependencies:
common:
- output_types: [conda]
packages:
- pytorch::pytorch>=2.3,<2.4.0a0
- pytorch>=2.3
- torchdata
- pydantic
specific:
@@ -405,7 +405,7 @@
matrices:
- matrix: {cuda: "12.*"}
packages:
- &pytorch_pip torch>=2.3,<2.4.0a0
- &pytorch_pip torch>=2.3
- *tensordict
- matrix: {cuda: "11.*"}
packages:
2 changes: 1 addition & 1 deletion python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml
@@ -18,7 +18,7 @@ dependencies:
- pytest-cov
- pytest-xdist
- pytorch-cuda=11.8
- pytorch::pytorch>=2.3,<2.4.0a0
- pytorch>=2.3
- tensordict>=0.1.2
- torchdata
name: cugraph_dgl_dev_cuda-118
2 changes: 1 addition & 1 deletion python/cugraph-dgl/pyproject.toml
@@ -38,7 +38,7 @@ test = [
"pytest-cov",
"pytest-xdist",
"tensordict>=0.1.2",
"torch>=2.3,<2.4.0a0",
"torch>=2.3",
] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.

[project.urls]
