matter-labs · robik75 · Sep 25, 2024 · Sep 25, 2024
@@ -7,32 +7,28 @@ namespace common {
 using namespace cub;
 
 cudaError_t sort_keys(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out, int num_items, int begin_bit,
-                      int end_bit, cudaStream_t stream, bool debug_synchronous) {
-  return DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, begin_bit, end_bit, stream, debug_synchronous);
+                      int end_bit, cudaStream_t stream) {
+  return DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, begin_bit, end_bit, stream);
 }
 
 cudaError_t sort_pairs(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out, const unsigned *d_values_in,
-                       unsigned *d_values_out, int num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) {
-  return DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, begin_bit, end_bit, stream,
-                                    debug_synchronous);
+                       unsigned *d_values_out, int num_items, int begin_bit, int end_bit, cudaStream_t stream) {
+  return DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, begin_bit, end_bit, stream);
 }
 
 cudaError_t sort_pairs_descending(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out,
-                                  const unsigned *d_values_in, unsigned *d_values_out, int num_items, int begin_bit, int end_bit, cudaStream_t stream,
-                                  bool debug_synchronous) {
+                                  const unsigned *d_values_in, unsigned *d_values_out, int num_items, int begin_bit, int end_bit, cudaStream_t stream) {
   return DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, begin_bit,
-                                              end_bit, stream, debug_synchronous);
+                                              end_bit, stream);
 }
 
 cudaError_t run_length_encode(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_unique_out, unsigned *d_counts_out,
-                              unsigned *d_num_runs_out, int num_items, cudaStream_t stream, bool debug_synchronous) {
-  return DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream,
-                                       debug_synchronous);
+                              unsigned *d_num_runs_out, int num_items, cudaStream_t stream) {
+  return DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream);
 }
 
-cudaError_t exclusive_sum(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_out, int num_items, cudaStream_t stream,
-                          bool debug_synchronous) {
-  return DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
+cudaError_t exclusive_sum(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_out, int num_items, cudaStream_t stream) {
+  return DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
 }
 
 } // namespace common
@@ -51,21 +47,20 @@ struct fq_mul {
   __device__ __forceinline__ storage operator()(const storage &a, const storage &b) const { return fd_q::mul(a, b); }
 };
 
-cudaError_t sum(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items, cudaStream_t stream,
-                bool debug_synchronous) {
-  return DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, fq_add(), fd_q::storage(), stream, debug_synchronous);
+cudaError_t sum(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items, cudaStream_t stream) {
+  return DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, fq_add(), fd_q::storage(), stream);
 }
 
 cudaError_t inclusive_prefix_product(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
-                                     cudaStream_t stream, bool debug_synchronous) {
-  return DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, fq_mul(), num_items, stream, debug_synchronous);
+                                     cudaStream_t stream) {
+  return DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, fq_mul(), num_items, stream);
 }
 
 cudaError_t inclusive_prefix_product_reverse(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
-                                             cudaStream_t stream, bool debug_synchronous) {
+                                             cudaStream_t stream) {
   auto i_in = std::reverse_iterator(d_in + num_items);
   auto i_out = std::reverse_iterator(d_out + num_items);
-  return DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, i_in, i_out, fq_mul(), num_items, stream, debug_synchronous);
+  return DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, i_in, i_out, fq_mul(), num_items, stream);
 }
 
 } // namespace ff
@@ -4,33 +4,32 @@
 namespace common {
 
 cudaError_t sort_keys(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out, int num_items, int begin_bit = 0,
-                      int end_bit = sizeof(unsigned) * 8, cudaStream_t stream = nullptr, bool debug_synchronous = false);
+                      int end_bit = sizeof(unsigned) * 8, cudaStream_t stream = nullptr);
 
 cudaError_t sort_pairs(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out, const unsigned *d_values_in,
-                       unsigned *d_values_out, int num_items, int begin_bit = 0, int end_bit = sizeof(unsigned) * 8, cudaStream_t stream = nullptr,
-                       bool debug_synchronous = false);
+                       unsigned *d_values_out, int num_items, int begin_bit = 0, int end_bit = sizeof(unsigned) * 8, cudaStream_t stream = nullptr);
 
 cudaError_t sort_pairs_descending(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out,
                                   const unsigned *d_values_in, unsigned *d_values_out, int num_items, int begin_bit = 0, int end_bit = sizeof(unsigned) * 8,
-                                  cudaStream_t stream = nullptr, bool debug_synchronous = false);
+                                  cudaStream_t stream = nullptr);
 
 cudaError_t run_length_encode(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_unique_out, unsigned *d_counts_out,
-                              unsigned *d_num_runs_out, int num_items, cudaStream_t stream = nullptr, bool debug_synchronous = false);
+                              unsigned *d_num_runs_out, int num_items, cudaStream_t stream = nullptr);
 
-cudaError_t exclusive_sum(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_out, int num_items, cudaStream_t stream = nullptr,
-                          bool debug_synchronous = false);
+cudaError_t exclusive_sum(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_out, int num_items,
+                          cudaStream_t stream = nullptr);
 
 } // namespace common
 
 namespace ff {
 
-cudaError_t sum(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items, cudaStream_t stream = nullptr,
-                bool debug_synchronous = false);
+cudaError_t sum(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
+                cudaStream_t stream = nullptr);
 
 cudaError_t inclusive_prefix_product(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
-                                     cudaStream_t stream = nullptr, bool debug_synchronous = false);
+                                     cudaStream_t stream = nullptr);
 
 cudaError_t inclusive_prefix_product_reverse(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
-                                             cudaStream_t stream = nullptr, bool debug_synchronous = false);
+                                             cudaStream_t stream = nullptr);
 
 } // namespace ff
@@ -23,6 +23,7 @@ template <typename T, ld_modifier MODIFIER> static constexpr __device__ __forcei
   case ld_modifier::cv:
     return __ldcv(ptr);
   }
+  return *ptr;
 }
 
 enum class st_modifier { none, wb, cg, cs, wt };

@@ -109,9 +109,8 @@ __device__ __forceinline__ typename FD::storage *index_to_addr(const per_device_
     // "addrs" passed from ntt_smem_stages_kernel should be in constant memory, which is dynamically indexable.
     // I guess nvcc moved ntt_smem_stages_kernel "inputs" and "outputs" to registers then tried to dynamically
     // index addr.data here in index_to_addr. Smart :eyeroll: Whatever, switch statement works.
-  } else {
-    return addrs.data[0] + idx;
   }
+  return addrs.data[0] + idx;
 }
 
 // Carries out up to MAX_SMEM_STAGES - log_tile_sz C-T stages in shared memory.