Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: change calls to deprecated CUB functions and fix compiler warnings about return values #15

Merged
merged 1 commit into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 16 additions & 21 deletions src/bellman-cuda-cub.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,32 +7,28 @@ namespace common {
using namespace cub;

cudaError_t sort_keys(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out, int num_items, int begin_bit,
int end_bit, cudaStream_t stream, bool debug_synchronous) {
return DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, begin_bit, end_bit, stream, debug_synchronous);
int end_bit, cudaStream_t stream) {
return DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, begin_bit, end_bit, stream);
}

cudaError_t sort_pairs(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out, const unsigned *d_values_in,
unsigned *d_values_out, int num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) {
return DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, begin_bit, end_bit, stream,
debug_synchronous);
unsigned *d_values_out, int num_items, int begin_bit, int end_bit, cudaStream_t stream) {
return DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, begin_bit, end_bit, stream);
}

cudaError_t sort_pairs_descending(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out,
const unsigned *d_values_in, unsigned *d_values_out, int num_items, int begin_bit, int end_bit, cudaStream_t stream,
bool debug_synchronous) {
const unsigned *d_values_in, unsigned *d_values_out, int num_items, int begin_bit, int end_bit, cudaStream_t stream) {
return DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, begin_bit,
end_bit, stream, debug_synchronous);
end_bit, stream);
}

cudaError_t run_length_encode(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_unique_out, unsigned *d_counts_out,
unsigned *d_num_runs_out, int num_items, cudaStream_t stream, bool debug_synchronous) {
return DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream,
debug_synchronous);
unsigned *d_num_runs_out, int num_items, cudaStream_t stream) {
return DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream);
}

cudaError_t exclusive_sum(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_out, int num_items, cudaStream_t stream,
bool debug_synchronous) {
return DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
cudaError_t exclusive_sum(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_out, int num_items, cudaStream_t stream) {
return DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
}

} // namespace common
Expand All @@ -51,21 +47,20 @@ struct fq_mul {
__device__ __forceinline__ storage operator()(const storage &a, const storage &b) const { return fd_q::mul(a, b); }
};

cudaError_t sum(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items, cudaStream_t stream,
bool debug_synchronous) {
return DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, fq_add(), fd_q::storage(), stream, debug_synchronous);
cudaError_t sum(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items, cudaStream_t stream) {
return DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, fq_add(), fd_q::storage(), stream);
}

cudaError_t inclusive_prefix_product(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
cudaStream_t stream, bool debug_synchronous) {
return DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, fq_mul(), num_items, stream, debug_synchronous);
cudaStream_t stream) {
return DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, fq_mul(), num_items, stream);
}

cudaError_t inclusive_prefix_product_reverse(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
cudaStream_t stream, bool debug_synchronous) {
cudaStream_t stream) {
auto i_in = std::reverse_iterator(d_in + num_items);
auto i_out = std::reverse_iterator(d_out + num_items);
return DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, i_in, i_out, fq_mul(), num_items, stream, debug_synchronous);
return DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, i_in, i_out, fq_mul(), num_items, stream);
}

} // namespace ff
21 changes: 10 additions & 11 deletions src/bellman-cuda-cub.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -4,33 +4,32 @@
namespace common {

cudaError_t sort_keys(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out, int num_items, int begin_bit = 0,
int end_bit = sizeof(unsigned) * 8, cudaStream_t stream = nullptr, bool debug_synchronous = false);
int end_bit = sizeof(unsigned) * 8, cudaStream_t stream = nullptr);

cudaError_t sort_pairs(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out, const unsigned *d_values_in,
unsigned *d_values_out, int num_items, int begin_bit = 0, int end_bit = sizeof(unsigned) * 8, cudaStream_t stream = nullptr,
bool debug_synchronous = false);
unsigned *d_values_out, int num_items, int begin_bit = 0, int end_bit = sizeof(unsigned) * 8, cudaStream_t stream = nullptr);

cudaError_t sort_pairs_descending(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out,
const unsigned *d_values_in, unsigned *d_values_out, int num_items, int begin_bit = 0, int end_bit = sizeof(unsigned) * 8,
cudaStream_t stream = nullptr, bool debug_synchronous = false);
cudaStream_t stream = nullptr);

cudaError_t run_length_encode(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_unique_out, unsigned *d_counts_out,
unsigned *d_num_runs_out, int num_items, cudaStream_t stream = nullptr, bool debug_synchronous = false);
unsigned *d_num_runs_out, int num_items, cudaStream_t stream = nullptr);

cudaError_t exclusive_sum(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_out, int num_items, cudaStream_t stream = nullptr,
bool debug_synchronous = false);
cudaError_t exclusive_sum(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_out, int num_items,
cudaStream_t stream = nullptr);

} // namespace common

namespace ff {

cudaError_t sum(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items, cudaStream_t stream = nullptr,
bool debug_synchronous = false);
cudaError_t sum(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
cudaStream_t stream = nullptr);

cudaError_t inclusive_prefix_product(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
cudaStream_t stream = nullptr, bool debug_synchronous = false);
cudaStream_t stream = nullptr);

cudaError_t inclusive_prefix_product_reverse(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
cudaStream_t stream = nullptr, bool debug_synchronous = false);
cudaStream_t stream = nullptr);

} // namespace ff
1 change: 1 addition & 0 deletions src/memory.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ template <typename T, ld_modifier MODIFIER> static constexpr __device__ __forcei
case ld_modifier::cv:
return __ldcv(ptr);
}
return *ptr;
}

enum class st_modifier { none, wb, cg, cs, wt };
Expand Down
3 changes: 1 addition & 2 deletions src/ntt_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,8 @@ __device__ __forceinline__ typename FD::storage *index_to_addr(const per_device_
// "addrs" passed from ntt_smem_stages_kernel should be in constant memory, which is dynamically indexable.
// I guess nvcc moved ntt_smem_stages_kernel "inputs" and "outputs" to registers then tried to dynamically
// index addr.data here in index_to_addr. Smart :eyeroll: Whatever, switch statement works.
} else {
return addrs.data[0] + idx;
}
return addrs.data[0] + idx;
}

// Carries out up to MAX_SMEM_STAGES - log_tile_sz C-T stages in shared memory.
Expand Down
Loading