Commit d017e22

Merge branch 'branch-24.12' into sync-scatter-stream
alexbarghi-nv authored Nov 22, 2024
2 parents 8ba6cdf + 4807986 commit d017e22
Showing 21 changed files with 223 additions and 86 deletions.
6 changes: 3 additions & 3 deletions ci/test_python.sh
@@ -73,7 +73,7 @@ if [[ "${RUNNER_ARCH}" != "ARM64" ]]; then
--channel nvidia \
"pylibwholegraph=${RAPIDS_VERSION}" \
"cugraph-dgl=${RAPIDS_VERSION}" \
'pytorch::pytorch>=2.3,<2.4' \
'pytorch>=2.3' \
"ogb"

rapids-print-env
@@ -111,7 +111,7 @@ if [[ "${RUNNER_ARCH}" != "ARM64" ]]; then
--channel pytorch \
"pylibwholegraph=${RAPIDS_VERSION}" \
"cugraph-pyg=${RAPIDS_VERSION}" \
'pytorch::pytorch>=2.3,<2.4' \
'pytorch>=2.3' \
'ogb'

rapids-print-env
@@ -149,7 +149,7 @@ if [[ "${RUNNER_ARCH}" != "ARM64" ]]; then
--channel pytorch \
'mkl<2024.1.0' \
"pylibwholegraph=${RAPIDS_VERSION}" \
'pytorch::pytorch>=2.3,<2.4' \
'pytorch>=2.3' \
'pytest-forked' \
'ogb'

2 changes: 1 addition & 1 deletion ci/test_wheel_cugraph-dgl.sh
@@ -31,7 +31,7 @@ python -m pip install \
"$(echo ./local-deps/pylibwholegraph_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
"$(echo ./dist/cugraph_dgl_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
'dgl==2.4.0' \
'torch>=2.0,<2.4.0a0'
'torch>=2.3'

# RAPIDS_DATASET_ROOT_DIR is used by test scripts
export RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)"
2 changes: 1 addition & 1 deletion ci/test_wheel_pylibwholegraph.sh
@@ -26,7 +26,7 @@ rapids-logger "Installing Packages"
rapids-retry python -m pip install \
--extra-index-url ${INDEX_URL} \
"$(echo ./dist/pylibwholegraph*.whl)[test]" \
'torch>=2.0,<2.4.0a0'
'torch>=2.3'

rapids-logger "pytest pylibwholegraph"
cd python/pylibwholegraph/pylibwholegraph/tests
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -42,7 +42,7 @@ dependencies:
- pytest-forked
- pytest-xdist
- pytorch-cuda=11.8
- pytorch::pytorch>=2.3,<2.4.0a0
- pytorch>=2.3
- pytorch_geometric>=2.5,<2.6
- raft-dask==24.12.*,>=0.0.0a0
- rapids-build-backend>=0.3.0,<0.4.0.dev0
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-121_arch-x86_64.yaml
@@ -48,7 +48,7 @@ dependencies:
- pytest-forked
- pytest-xdist
- pytorch-cuda=12.1
- pytorch::pytorch>=2.3,<2.4.0a0
- pytorch>=2.3
- pytorch_geometric>=2.5,<2.6
- raft-dask==24.12.*,>=0.0.0a0
- rapids-build-backend>=0.3.0,<0.4.0.dev0
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-124_arch-x86_64.yaml
@@ -48,7 +48,7 @@ dependencies:
- pytest-forked
- pytest-xdist
- pytorch-cuda=12.4
- pytorch::pytorch>=2.3,<2.4.0a0
- pytorch>=2.3
- pytorch_geometric>=2.5,<2.6
- raft-dask==24.12.*,>=0.0.0a0
- rapids-build-backend>=0.3.0,<0.4.0.dev0
2 changes: 1 addition & 1 deletion conda/recipes/cugraph-dgl/meta.yaml
@@ -32,7 +32,7 @@ requirements:
- pylibcugraphops ={{ minor_version }}
- tensordict >=0.1.2
- python
- pytorch >=2.3,<2.4.0a0
- pytorch >=2.3
- cupy >=12.0.0

tests:
2 changes: 1 addition & 1 deletion conda/recipes/cugraph-pyg/meta.yaml
@@ -32,7 +32,7 @@ requirements:
- numpy >=1.23,<3.0a0
- pandas
- python
- pytorch >=2.3,<2.4.0a0
- pytorch >=2.3
- cupy >=12.0.0
- cugraph ={{ minor_version }}
- pylibcugraphops ={{ minor_version }}
77 changes: 77 additions & 0 deletions cpp/src/nvml_wrap.cpp
@@ -0,0 +1,77 @@
// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "nvml_wrap.h"

#if CUDA_VERSION >= 12030
#include <dlfcn.h>
#include <mutex>
#include <stdio.h>

namespace {

void* nvml_handle = nullptr;
std::mutex nvml_mutex;
bool nvml_loaded = false;

bool LoadNvmlLibrary()
{
nvml_handle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
if (!nvml_handle) {
nvml_handle = dlopen("libnvidia-ml.so", RTLD_NOW);
if (!nvml_handle) {
fprintf(stderr, "Failed to load NVML library: %s\n", dlerror());
return false;
}
}
return true;
}

template <typename T>
T LoadNvmlSymbol(const char* name)
{
void* symbol = dlsym(nvml_handle, name);
if (!symbol) { return nullptr; }
return reinterpret_cast<T>(symbol);
}

} // namespace

// Global function pointers
nvmlDeviceGetHandleByIndexFunc nvmlDeviceGetHandleByIndexPtr = nullptr;
nvmlDeviceGetGpuFabricInfoFunc nvmlDeviceGetGpuFabricInfoPtr = nullptr;

// Ensure NVML is loaded and symbols are initialized
bool NvmlFabricSymbolLoaded()
{
std::lock_guard<std::mutex> lock(nvml_mutex);
if (nvml_loaded) {
return true; // Already loaded
}

if (LoadNvmlLibrary()) {
nvmlDeviceGetHandleByIndexPtr =
LoadNvmlSymbol<nvmlDeviceGetHandleByIndexFunc>("nvmlDeviceGetHandleByIndex");
nvmlDeviceGetGpuFabricInfoPtr =
LoadNvmlSymbol<nvmlDeviceGetGpuFabricInfoFunc>("nvmlDeviceGetGpuFabricInfo");

if (!nvmlDeviceGetHandleByIndexPtr || !nvmlDeviceGetGpuFabricInfoPtr) {
dlclose(nvml_handle);
nvml_handle = nullptr;
} else {
nvml_loaded = true;
}
}
return nvml_loaded;
}
#endif // CUDA_VERSION >= 12030
27 changes: 27 additions & 0 deletions cpp/src/nvml_wrap.h
@@ -0,0 +1,27 @@
// Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda.h>

#if CUDA_VERSION >= 12030
#include <nvml.h>

bool NvmlFabricSymbolLoaded();

typedef nvmlReturn_t (*nvmlDeviceGetHandleByIndexFunc)(unsigned int, nvmlDevice_t*);
typedef nvmlReturn_t (*nvmlDeviceGetGpuFabricInfoFunc)(nvmlDevice_t, nvmlGpuFabricInfo_t*);

extern nvmlDeviceGetHandleByIndexFunc nvmlDeviceGetHandleByIndexPtr;
extern nvmlDeviceGetGpuFabricInfoFunc nvmlDeviceGetGpuFabricInfoPtr;
#endif // CUDA_VERSION >= 12030
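
For context, a minimal usage sketch of the wrapper API declared above (not part of this commit): gate every fabric query on NvmlFabricSymbolLoaded() and then call through the dynamically resolved pointers. The helper name query_fabric_info is hypothetical, and the sketch assumes NVML has already been initialized (e.g. via nvmlInit_v2, as the NvmlEnsureInitialized helper in system_info.cpp later in this diff does) and that CUDA_VERSION >= 12030.

// Hypothetical sketch only; not part of this commit.
#include <nvml.h>
#include "nvml_wrap.h"

bool query_fabric_info(unsigned int dev_index, nvmlGpuFabricInfo_t* info)
{
  // Old display drivers may not export the fabric-info symbols at all.
  if (!NvmlFabricSymbolLoaded()) { return false; }

  nvmlDevice_t device;
  if (nvmlDeviceGetHandleByIndexPtr(dev_index, &device) != NVML_SUCCESS) { return false; }
  return nvmlDeviceGetGpuFabricInfoPtr(device, info) == NVML_SUCCESS;
}
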
74 changes: 42 additions & 32 deletions cpp/src/wholememory/communicator.cpp
@@ -497,6 +497,7 @@ void get_host_info(host_info* phi)
bool comm_support_mnnvl(wholememory_comm_t wm_comm, const std::unique_ptr<rank_info[]>& p_rank_info)
{
#if CUDA_VERSION >= 12030
if (!nvmlFabricSymbolLoaded) return 0;
int flag = 0;
CUdevice currentDev;
WM_CU_CHECK_NO_THROW(cuDeviceGet(&currentDev, wm_comm->dev_id));
@@ -534,16 +535,22 @@ void exchange_rank_info(wholememory_comm_t wm_comm)
wm_comm->clique_info.is_in_clique = 0;

#if CUDA_VERSION >= 12030
memset(&ri.fabric_info, 0, sizeof(ri.fabric_info));
WHOLEMEMORY_CHECK_NOTHROW(GetGpuFabricInfo(wm_comm->dev_id, &ri.fabric_info) ==
WHOLEMEMORY_SUCCESS);
if (nvmlFabricSymbolLoaded) {
memset(&ri.fabric_info, 0, sizeof(ri.fabric_info));
WHOLEMEMORY_CHECK_NOTHROW(GetGpuFabricInfo(wm_comm->dev_id, &ri.fabric_info) ==
WHOLEMEMORY_SUCCESS);

// // A zero UUID means we don't have MNNVL fabric info
if (((((long*)ri.fabric_info.clusterUuid)[0] | ((long*)ri.fabric_info.clusterUuid)[1]) == 0)) {
wm_comm->clique_info.is_in_clique = 0;
// // A zero UUID means we don't have MNNVL fabric info
if (((((long*)ri.fabric_info.clusterUuid)[0] | ((long*)ri.fabric_info.clusterUuid)[1]) == 0)) {
wm_comm->clique_info.is_in_clique = 0;

} else {
wm_comm->clique_info.is_in_clique = 1;
}
} else {
wm_comm->clique_info.is_in_clique = 1;
WHOLEMEMORY_WARN(
"Some required NVML symbols are missing, likely due to an outdated GPU display driver. MNNVL "
"support will be disabled.");
}

#endif
@@ -573,38 +580,41 @@ void exchange_rank_info(wholememory_comm_t wm_comm)
}

#if CUDA_VERSION >= 12030

if ((memcmp(ri.fabric_info.clusterUuid,
p_rank_info.get()[r].fabric_info.clusterUuid,
NVML_GPU_FABRIC_UUID_LEN) == 0) &&
(ri.fabric_info.cliqueId == p_rank_info.get()[r].fabric_info.cliqueId)) {
if (r == wm_comm->world_rank) {
wm_comm->clique_info.clique_rank = wm_comm->clique_info.clique_rank_num;
if (nvmlFabricSymbolLoaded) {
if ((memcmp(ri.fabric_info.clusterUuid,
p_rank_info.get()[r].fabric_info.clusterUuid,
NVML_GPU_FABRIC_UUID_LEN) == 0) &&
(ri.fabric_info.cliqueId == p_rank_info.get()[r].fabric_info.cliqueId)) {
if (r == wm_comm->world_rank) {
wm_comm->clique_info.clique_rank = wm_comm->clique_info.clique_rank_num;
}
if (wm_comm->clique_info.clique_rank_num == 0) {
wm_comm->clique_info.clique_first_rank = r;
}
wm_comm->clique_info.clique_rank_num++;
}
if (wm_comm->clique_info.clique_rank_num == 0) { wm_comm->clique_info.clique_first_rank = r; }
wm_comm->clique_info.clique_rank_num++;
clique_uuids.insert(
std::string(reinterpret_cast<const char*>(p_rank_info.get()[r].fabric_info.clusterUuid),
NVML_GPU_FABRIC_UUID_LEN));
}
clique_uuids.insert(
std::string(reinterpret_cast<const char*>(p_rank_info.get()[r].fabric_info.clusterUuid),
NVML_GPU_FABRIC_UUID_LEN));

#endif
}

#if CUDA_VERSION >= 12030
wm_comm->clique_info.clique_num = clique_uuids.size();

std::string uuid = std::string(reinterpret_cast<const char*>(ri.fabric_info.clusterUuid),
NVML_GPU_FABRIC_UUID_LEN);
int id = 0;
for (auto clique_uuid : clique_uuids) {
if (clique_uuid == uuid) { wm_comm->clique_info.clique_id = id; }
id++;
}

wm_comm->support_mnnvl = (comm_support_mnnvl(wm_comm, p_rank_info)) &&
(wm_comm->clique_info.clique_rank_num == wm_comm->world_size);
if (nvmlFabricSymbolLoaded) {
wm_comm->clique_info.clique_num = clique_uuids.size();

std::string uuid = std::string(reinterpret_cast<const char*>(ri.fabric_info.clusterUuid),
NVML_GPU_FABRIC_UUID_LEN);
int id = 0;
for (auto clique_uuid : clique_uuids) {
if (clique_uuid == uuid) { wm_comm->clique_info.clique_id = id; }
id++;
}

wm_comm->support_mnnvl = (comm_support_mnnvl(wm_comm, p_rank_info)) &&
(wm_comm->clique_info.clique_rank_num == wm_comm->world_size);
}
#endif
}

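
A non-obvious detail in exchange_rank_info above is the zero-UUID test: clusterUuid is read as two 64-bit words whose bitwise OR is compared against zero, since an all-zero UUID means the driver reported no MNNVL fabric. A small standalone illustration (a hypothetical sketch, not part of this commit, using memcpy instead of the (long*) cast in the diff):

// Hypothetical sketch of the "all-zero clusterUuid means no fabric info" check.
#include <cstdint>
#include <cstring>

bool has_fabric_uuid(const unsigned char* uuid)  // NVML_GPU_FABRIC_UUID_LEN (16) bytes
{
  std::uint64_t lo = 0, hi = 0;
  std::memcpy(&lo, uuid, sizeof(lo));
  std::memcpy(&hi, uuid + sizeof(lo), sizeof(hi));
  return (lo | hi) != 0;  // any set bit: the GPU belongs to an MNNVL fabric cluster
}
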
26 changes: 13 additions & 13 deletions cpp/src/wholememory/system_info.cpp
@@ -13,8 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "system_info.hpp"

#include <string>

#include "cuda_macros.hpp"
@@ -140,17 +138,19 @@ wholememory_error_code_t NvmlEnsureInitialized()
wholememory_error_code_t GetGpuFabricInfo(int dev, nvmlGpuFabricInfo_t* gpuFabricInfo)
{
WHOLEMEMORY_CHECK_NOTHROW(NvmlEnsureInitialized() == WHOLEMEMORY_SUCCESS);
std::lock_guard<std::mutex> locked(lock);
// gpuFabricInfo->version = nvmlGpuFabricInfo_v2;
nvmlDevice_t nvml_device;
nvmlReturn_t ret = nvmlDeviceGetHandleByIndex(dev, &nvml_device);
WHOLEMEMORY_EXPECTS_NOTHROW(
ret == NVML_SUCCESS, "nvmlDeviceGetHandleByIndex error:%s", nvmlErrorString(ret));
ret = nvmlDeviceGetGpuFabricInfo(nvml_device, gpuFabricInfo);
WHOLEMEMORY_EXPECTS_NOTHROW(
ret == NVML_SUCCESS, "nvmlDeviceGetGpuFabricInfo error:%s", nvmlErrorString(ret));

return WHOLEMEMORY_SUCCESS;
if (wholememory::nvmlFabricSymbolLoaded) {
std::lock_guard<std::mutex> locked(lock);
// gpuFabricInfo->version = nvmlGpuFabricInfo_v2;
nvmlDevice_t nvml_device;
nvmlReturn_t ret = nvmlDeviceGetHandleByIndexPtr(dev, &nvml_device);
WHOLEMEMORY_EXPECTS_NOTHROW(
ret == NVML_SUCCESS, "nvmlDeviceGetHandleByIndex error:%s", nvmlErrorString(ret));
ret = nvmlDeviceGetGpuFabricInfoPtr(nvml_device, gpuFabricInfo);
WHOLEMEMORY_EXPECTS_NOTHROW(
ret == NVML_SUCCESS, "nvmlDeviceGetGpuFabricInfo error:%s", nvmlErrorString(ret));
return WHOLEMEMORY_SUCCESS;
}
return WHOLEMEMORY_SYSTEM_ERROR;
}

}; // namespace wholememory
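
With this change GetGpuFabricInfo() reports missing fabric symbols as WHOLEMEMORY_SYSTEM_ERROR instead of asserting, so callers can fall back to non-MNNVL behaviour rather than abort. A caller-side sketch (the wrapper function try_get_fabric_info is hypothetical; type and function names follow the headers shown in this diff):

// Hypothetical sketch only; not part of this commit.
#include <nvml.h>
#include "system_info.hpp"            // wholememory::GetGpuFabricInfo
#include "wholememory/wholememory.h"  // wholememory_error_code_t, WHOLEMEMORY_SUCCESS

bool try_get_fabric_info(int dev_id, nvmlGpuFabricInfo_t* info)
{
  // Missing NVML symbols (old driver) or an NVML failure: treat MNNVL as unavailable.
  return wholememory::GetGpuFabricInfo(dev_id, info) == WHOLEMEMORY_SUCCESS;
}
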
5 changes: 4 additions & 1 deletion cpp/src/wholememory/system_info.hpp
@@ -18,6 +18,7 @@
#include "wholememory/wholememory.h"

#if CUDA_VERSION >= 12030
#include "nvml_wrap.h"
#include <nvml.h>
#endif
bool DevAttrPagebleMemoryAccess();
@@ -37,7 +38,9 @@ bool SupportEGM();
// bool SupportMNNVLForEGM();
#if CUDA_VERSION >= 12030
namespace wholememory {

inline bool nvmlFabricSymbolLoaded = NvmlFabricSymbolLoaded();
wholememory_error_code_t GetGpuFabricInfo(int dev, nvmlGpuFabricInfo_t* gpuFabricInfo);
}
} // namespace wholememory

#endif
4 changes: 2 additions & 2 deletions dependencies.yaml
@@ -388,7 +388,7 @@ dependencies:
common:
- output_types: [conda]
packages:
- pytorch::pytorch>=2.3,<2.4.0a0
- pytorch>=2.3
- torchdata
- pydantic
specific:
@@ -405,7 +405,7 @@
matrices:
- matrix: {cuda: "12.*"}
packages:
- &pytorch_pip torch>=2.3,<2.4.0a0
- &pytorch_pip torch>=2.3
- *tensordict
- matrix: {cuda: "11.*"}
packages:
2 changes: 1 addition & 1 deletion python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml
@@ -18,7 +18,7 @@ dependencies:
- pytest-cov
- pytest-xdist
- pytorch-cuda=11.8
- pytorch::pytorch>=2.3,<2.4.0a0
- pytorch>=2.3
- tensordict>=0.1.2
- torchdata
name: cugraph_dgl_dev_cuda-118
2 changes: 1 addition & 1 deletion python/cugraph-dgl/pyproject.toml
@@ -38,7 +38,7 @@ test = [
"pytest-cov",
"pytest-xdist",
"tensordict>=0.1.2",
"torch>=2.3,<2.4.0a0",
"torch>=2.3",
] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.

[project.urls]
