Skip to content

Commit

Permalink
Merge pull request #2395 from rapidsai/branch-24.08
Browse files Browse the repository at this point in the history
Forward-merge branch-24.08 into branch-24.10
  • Loading branch information
raydouglass authored Jul 25, 2024
2 parents 1682700 + 7bffdac commit 401935b
Show file tree
Hide file tree
Showing 11 changed files with 62 additions and 18 deletions.
9 changes: 8 additions & 1 deletion cpp/bench/prims/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,14 @@ if(BUILD_PRIMS_BENCH)
PATH
core/bitset.cu
core/copy.cu
core/popc.cu
main.cpp
)

ConfigureBench(
NAME
UTIL_BENCH
PATH
util/popc.cu
main.cpp
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#include <common/benchmark.hpp>

#include <raft/core/popc.hpp>
#include <raft/util/popc.cuh>

namespace raft::bench::core {

Expand Down
9 changes: 6 additions & 3 deletions cpp/include/raft/comms/detail/std_comms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -307,13 +307,16 @@ class std_comms : public comms_iface {
bool restart = false; // resets the timeout when any progress was made

if (worker->isProgressThreadRunning()) {
// Wait for a UCXX progress thread roundtrip
// Wait for a UCXX progress thread roundtrip, prevent waiting for longer
// than 10ms for each operation, will retry in next iteration.
ucxx::utils::CallbackNotifier callbackNotifierPre{};
worker->registerGenericPre([&callbackNotifierPre]() { callbackNotifierPre.set(); });
worker->registerGenericPre([&callbackNotifierPre]() { callbackNotifierPre.set(); },
10000000 /* 10ms */);
callbackNotifierPre.wait();

ucxx::utils::CallbackNotifier callbackNotifierPost{};
worker->registerGenericPost([&callbackNotifierPost]() { callbackNotifierPost.set(); });
worker->registerGenericPost([&callbackNotifierPost]() { callbackNotifierPost.set(); },
10000000 /* 10ms */);
callbackNotifierPost.wait();
} else {
// Causes UCXX to progress through the send/recv message queue
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/raft/core/bitset.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,12 @@
#include <raft/core/bitset.hpp>
#include <raft/core/device_container_policy.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/popc.hpp>
#include <raft/core/resource/thrust_policy.hpp>
#include <raft/core/resources.hpp>
#include <raft/linalg/map.cuh>
#include <raft/linalg/reduce.cuh>
#include <raft/util/device_atomics.cuh>
#include <raft/util/popc.cuh>

#include <thrust/for_each.h>

Expand Down
33 changes: 25 additions & 8 deletions cpp/include/raft/linalg/detail/eig.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@
#include "cusolver_wrappers.hpp"

#include <raft/core/resource/cusolver_dn_handle.hpp>
#include <raft/core/resource/detail/stream_sync_event.hpp>
#include <raft/core/resources.hpp>
#include <raft/matrix/copy.cuh>
#include <raft/util/cudart_utils.hpp>

#include <rmm/cuda_stream.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>

Expand Down Expand Up @@ -90,7 +92,19 @@ void eigDC(raft::resources const& handle,
{
#if CUDART_VERSION < 11010
eigDC_legacy(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream);
return;
#endif

#if CUDART_VERSION <= 12040
// Use a new stream instead of `cudaStreamPerThread` to avoid cusolver bug # 4580093.
rmm::cuda_stream stream_new_wrapper;
cudaStream_t stream_new = stream_new_wrapper.value();
cudaEvent_t sync_event = resource::detail::get_cuda_stream_sync_event(handle);
RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream));
RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_new, sync_event));
#else
cudaStream_t stream_new = stream;
#endif
cusolverDnHandle_t cusolverH = resource::get_cusolver_dn_handle(handle);

cusolverDnParams_t dn_params = nullptr;
Expand All @@ -108,15 +122,13 @@ void eigDC(raft::resources const& handle,
eig_vals,
&workspaceDevice,
&workspaceHost,
stream));
stream_new));

rmm::device_uvector<math_t> d_work(workspaceDevice / sizeof(math_t), stream);
rmm::device_scalar<int> d_dev_info(stream);
rmm::device_uvector<math_t> d_work(workspaceDevice / sizeof(math_t), stream_new);
rmm::device_scalar<int> d_dev_info(stream_new);
std::vector<math_t> h_work(workspaceHost / sizeof(math_t));

raft::matrix::copy(handle,
make_device_matrix_view<const math_t>(in, n_rows, n_cols),
make_device_matrix_view<math_t>(eig_vectors, n_rows, n_cols));
raft::copy(eig_vectors, in, n_rows * n_cols, stream_new);

RAFT_CUSOLVER_TRY(cusolverDnxsyevd(cusolverH,
dn_params,
Expand All @@ -131,14 +143,19 @@ void eigDC(raft::resources const& handle,
h_work.data(),
workspaceHost,
d_dev_info.data(),
stream));
stream_new));

RAFT_CUDA_TRY(cudaGetLastError());
RAFT_CUSOLVER_TRY(cusolverDnDestroyParams(dn_params));
int dev_info = d_dev_info.value(stream);
int dev_info = d_dev_info.value(stream_new);
ASSERT(dev_info == 0,
"eig.cuh: eigensolver couldn't converge to a solution. "
"This usually occurs when some of the features do not vary enough.");

#if CUDART_VERSION <= 12040
// Synchronize the created stream with the original stream before return
RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream_new));
RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_event));
#endif
}

Expand Down
1 change: 0 additions & 1 deletion cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
#pragma once

#include <raft/core/bitmap.cuh>
#include <raft/core/detail/popc.cuh>
#include <raft/core/device_csr_matrix.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_mdspan.hpp>
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
*/

#pragma once
#include <raft/core/detail/popc.cuh>
#include <raft/util/detail/popc.cuh>
namespace raft {

/**
Expand Down
2 changes: 1 addition & 1 deletion cpp/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,6 @@ if(BUILD_TESTS)
core/math_host.cpp
core/operators_device.cu
core/operators_host.cpp
core/popc.cu
core/handle.cpp
core/interruptible.cu
core/nvtx.cpp
Expand Down Expand Up @@ -509,6 +508,7 @@ if(BUILD_TESTS)
util/integer_utils.cpp
util/integer_utils.cu
util/memory_type_dispatcher.cu
util/popc.cu
util/pow2_utils.cu
util/reduction.cu
)
Expand Down
18 changes: 18 additions & 0 deletions cpp/test/linalg/eig.cu
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,24 @@ class EigTest : public ::testing::TestWithParam<EigInputs<T>> {
eig_vals_large, eig_vals_jacobi_large;
};

TEST(Raft, EigStream)
{
// Separate test to check eig_dc stream workaround for CUDA 12+
raft::resources handle;
auto n_rows = 5000;
auto cov_matrix_stream =
raft::make_device_matrix<float, std::uint32_t, raft::col_major>(handle, n_rows, n_rows);
auto eig_vectors_stream =
raft::make_device_matrix<float, std::uint32_t, raft::col_major>(handle, n_rows, n_rows);
auto eig_vals_stream = raft::make_device_vector<float, std::uint32_t>(handle, n_rows);

raft::linalg::eig_dc(handle,
raft::make_const_mdspan(cov_matrix_stream.view()),
eig_vectors_stream.view(),
eig_vals_stream.view());
raft::resource::sync_stream(handle, raft::resource::get_cuda_stream(handle));
}

const std::vector<EigInputs<float>> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}};

const std::vector<EigInputs<double>> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}};
Expand Down
2 changes: 1 addition & 1 deletion cpp/test/core/popc.cu → cpp/test/util/popc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@
#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_mdspan.hpp>
#include <raft/core/host_mdspan.hpp>
#include <raft/core/popc.hpp>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/core/resources.hpp>
#include <raft/util/cuda_utils.cuh>
#include <raft/util/popc.cuh>

#include <gtest/gtest.h>

Expand Down

0 comments on commit 401935b

Please sign in to comment.