diff --git a/src/runtime/HalideBuffer.h b/src/runtime/HalideBuffer.h index 768b5df89fc9..ba37c59e0ca0 100644 --- a/src/runtime/HalideBuffer.h +++ b/src/runtime/HalideBuffer.h @@ -2055,9 +2055,7 @@ class Buffer { } template - HALIDE_ALWAYS_INLINE - storage_T * - address_of(Args... args) const { + HALIDE_ALWAYS_INLINE storage_T *address_of(Args... args) const { if (T_is_void) { return (storage_T *)(this->buf.host) + offset_of(0, args...) * type().bytes(); } else { @@ -2112,8 +2110,7 @@ class Buffer { } HALIDE_ALWAYS_INLINE - const not_void_T & - operator()() const { + const not_void_T &operator()() const { static_assert(!T_is_void, "Cannot use operator() on Buffer types"); constexpr int expected_dims = 0; @@ -2133,9 +2130,8 @@ class Buffer { template::value>::type> - HALIDE_ALWAYS_INLINE - not_void_T & - operator()(int first, Args... rest) { + HALIDE_ALWAYS_INLINE not_void_T & + operator()(int first, Args... rest) { static_assert(!T_is_void, "Cannot use operator() on Buffer types"); constexpr int expected_dims = 1 + (int)(sizeof...(rest)); diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp index a5170c55d256..90a4b9d47cfd 100644 --- a/src/runtime/cuda.cpp +++ b/src/runtime/cuda.cpp @@ -1010,7 +1010,9 @@ WEAK int halide_cuda_buffer_copy(void *user_context, struct halide_buffer_t *src } } - auto result = cuda_do_multidimensional_copy(user_context, c, c.src + c.src_begin, c.dst, dst->dimensions, from_host, to_host, stream); + auto result = cuda_do_multidimensional_copy( + user_context, c, c.src + c.src_begin, c.dst + c.dst_begin, + dst->dimensions, from_host, to_host, stream); if (result) { return result; } diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp index 8d472e22a0ba..c7f85038e0d8 100644 --- a/src/runtime/d3d12compute.cpp +++ b/src/runtime/d3d12compute.cpp @@ -3263,7 +3263,7 @@ WEAK int halide_d3d12compute_buffer_copy(void *user_context, struct halide_buffe d3d12_buffer *dsrc = peel_buffer(src); d3d12_buffer *ddst = peel_buffer(dst); size_t src_offset = dsrc->offsetInBytes + c.src_begin; - size_t dst_offset = ddst->offsetInBytes; + size_t dst_offset = ddst->offsetInBytes + c.dst_begin; D3D12ContextHolder d3d12_context(user_context, true); if (d3d12_context.error()) { return d3d12_context.error(); diff --git a/src/runtime/device_buffer_utils.h b/src/runtime/device_buffer_utils.h index 15cf76776d92..36d1bcf98058 100644 --- a/src/runtime/device_buffer_utils.h +++ b/src/runtime/device_buffer_utils.h @@ -31,10 +31,10 @@ namespace Internal { // The struct that describes a host <-> dev copy to perform. #define MAX_COPY_DIMS 16 struct device_copy { - // opaque handles for source and device memory. + // opaque handles (host or device) for source and destination memory. uint64_t src, dst; - // The offset in the source memory to start - uint64_t src_begin; + // The offset in the source and destination memory to start + uint64_t src_begin, dst_begin; // The multidimensional array of contiguous copy tasks that need to be done. uint64_t extent[MAX_COPY_DIMS]; // The strides (in bytes) that separate adjacent copy tasks in each dimension. @@ -70,13 +70,18 @@ WEAK void copy_memory_helper(const device_copy ©, int d, int64_t src_off, in WEAK void copy_memory(const device_copy ©, void *user_context) { // If this is a zero copy buffer, these pointers will be the same. if (copy.src != copy.dst) { - copy_memory_helper(copy, MAX_COPY_DIMS - 1, copy.src_begin, 0); + copy_memory_helper(copy, MAX_COPY_DIMS - 1, copy.src_begin, copy.dst_begin); } else { debug(user_context) << "copy_memory: no copy needed as pointers are the same.\n"; } } -// Fills the entire dst buffer, which must be contained within src +// All crops are supported. It copies the maximum amount of pixels from src to dst. +// That maximum number of pixels is determined by the overlapping region of the two +// buffers. This means that you can use it in scenarios: +// 1) Fill the entire dst buffer, when the dst buffer bounds are contained within src. +// 2) Copy the entire src buffer, when the src buffer bounds are contained within dst, to dst. +// 3) Copy only the overlapping region between two buffers, from src to dst. WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host, const halide_buffer_t *dst, bool dst_host) { // Make a copy job representing copying the first pixel only. @@ -90,12 +95,19 @@ WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host, c.dst_stride_bytes[i] = 0; } - // Offset the src base pointer to the right point in its buffer. + // Offset the src and dst base pointer to the right point in their buffer. c.src_begin = 0; + c.dst_begin = 0; for (int i = 0; i < src->dimensions; i++) { - c.src_begin += (int64_t)src->dim[i].stride * (int64_t)(dst->dim[i].min - src->dim[i].min); + int64_t dim_diff = int64_t(dst->dim[i].min - src->dim[i].min); + if (dim_diff > 0) { + c.src_begin += (int64_t)src->dim[i].stride * dim_diff; + } else { + c.dst_begin += (int64_t)dst->dim[i].stride * (-dim_diff); + } } c.src_begin *= c.chunk_size; + c.dst_begin *= c.chunk_size; if (src->dimensions != dst->dimensions || src->type.bytes() != dst->type.bytes() || @@ -134,7 +146,7 @@ WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host, c.dst_stride_bytes[j] = c.dst_stride_bytes[j - 1]; c.src_stride_bytes[j] = c.src_stride_bytes[j - 1]; } - c.extent[insert] = dst->dim[i].extent; + c.extent[insert] = min(src->dim[i].extent, dst->dim[i].extent); // debug(nullptr) << "c.extent[" << insert << "] = " << (int)(c.extent[insert]) << "\n"; c.dst_stride_bytes[insert] = dst_stride_bytes; c.src_stride_bytes[insert] = src_stride_bytes; diff --git a/src/runtime/metal.cpp b/src/runtime/metal.cpp index 1fe7d895561b..ab18ba82e318 100644 --- a/src/runtime/metal.cpp +++ b/src/runtime/metal.cpp @@ -301,7 +301,7 @@ namespace { void do_device_to_device_copy(void *user_context, mtl_blit_command_encoder *encoder, const device_copy &c, uint64_t src_offset, uint64_t dst_offset, int d) { if (d == 0) { - buffer_to_buffer_1d_copy(encoder, ((device_handle *)c.src)->buf, c.src_begin + src_offset, + buffer_to_buffer_1d_copy(encoder, ((device_handle *)c.src)->buf, src_offset, ((device_handle *)c.dst)->buf, dst_offset, c.chunk_size); } else { // TODO: deal with negative strides. Currently the code in @@ -1108,8 +1108,9 @@ WEAK int halide_metal_buffer_copy(void *user_context, struct halide_buffer_t *sr const char *buffer_label = "halide_metal_buffer_copy"; mtl_command_buffer *blit_command_buffer = new_command_buffer(metal_context.queue, buffer_label, strlen(buffer_label)); mtl_blit_command_encoder *blit_encoder = new_blit_command_encoder(blit_command_buffer); - do_device_to_device_copy(user_context, blit_encoder, c, ((device_handle *)c.src)->offset, - ((device_handle *)c.dst)->offset, dst->dimensions); + do_device_to_device_copy(user_context, blit_encoder, c, + ((device_handle *)c.src)->offset + c.src_begin, + ((device_handle *)c.dst)->offset + c.dst_begin, dst->dimensions); end_encoding(blit_encoder); commit_command_buffer(blit_command_buffer); } else { diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp index 8aaba7f6a707..f4b17d3030e7 100644 --- a/src/runtime/opencl.cpp +++ b/src/runtime/opencl.cpp @@ -503,6 +503,7 @@ WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_q size_t max_work_item_sizes[4] = { 0, }; + cl_uint mem_base_addr_align = 0; struct { void *dst; @@ -521,6 +522,7 @@ WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_q {&max_work_group_size, sizeof(max_work_group_size), CL_DEVICE_MAX_WORK_GROUP_SIZE}, {&max_work_item_dimensions, sizeof(max_work_item_dimensions), CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS}, {&max_work_item_sizes[0], sizeof(max_work_item_sizes), CL_DEVICE_MAX_WORK_ITEM_SIZES}, + {&mem_base_addr_align, sizeof(mem_base_addr_align), CL_DEVICE_MEM_BASE_ADDR_ALIGN}, {nullptr}}; // Do all the queries. @@ -544,7 +546,8 @@ WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_q << " max work item sizes: " << (uint64_t)max_work_item_sizes[0] << "x" << (uint64_t)max_work_item_sizes[1] << "x" << (uint64_t)max_work_item_sizes[2] - << "x" << (uint64_t)max_work_item_sizes[3] << "\n"; + << "x" << (uint64_t)max_work_item_sizes[3] << "\n" + << " mem base addr align: " << mem_base_addr_align << "\n"; #endif // Create context and command queue. @@ -1035,7 +1038,9 @@ WEAK int halide_opencl_buffer_copy(void *user_context, struct halide_buffer_t *s } #endif - auto result = opencl_do_multidimensional_copy(user_context, ctx, c, c.src_begin, 0, dst->dimensions, from_host, to_host); + auto result = opencl_do_multidimensional_copy( + user_context, ctx, c, c.src_begin, c.dst_begin, + dst->dimensions, from_host, to_host); if (result) { return result; } @@ -1155,6 +1160,8 @@ WEAK int halide_opencl_run(void *user_context, // span the crop. mem = clCreateSubBuffer(mem, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); sub_buffers[sub_buffers_saved++] = mem; + debug(user_context) << "Create subbuffer " << (void *)mem << ": " + << "offset=" << region.origin << ", size=" << region.size << "\n"; } if (err == CL_SUCCESS) { debug(user_context) << "Mapped dev handle is: " << (void *)mem << "\n"; diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp index cb79950f7421..5057be65886a 100644 --- a/src/runtime/vulkan.cpp +++ b/src/runtime/vulkan.cpp @@ -462,7 +462,7 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid copy_helper.src = (uint64_t)(staging_buffer); copy_helper.dst = (uint64_t)(device_buffer); uint64_t src_offset = copy_helper.src_begin; - uint64_t dst_offset = device_region->range.head_offset; + uint64_t dst_offset = copy_helper.dst_begin + device_region->range.head_offset; // enqueue the copy operation, using the allocated buffers error_code = vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, @@ -639,7 +639,7 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_ copy_helper.src = (uint64_t)(device_buffer); copy_helper.dst = (uint64_t)(staging_buffer); uint64_t src_offset = copy_helper.src_begin + device_region->range.head_offset; - uint64_t dst_offset = 0; + uint64_t dst_offset = copy_helper.dst_begin; // enqueue the copy operation, using the allocated buffers error_code = vk_do_multidimensional_copy(user_context, command_buffer, copy_helper, @@ -918,11 +918,7 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s copy_helper.src = (uint64_t)(src_device_buffer); copy_helper.dst = (uint64_t)(dst_device_buffer); uint64_t src_offset = copy_helper.src_begin + src_buffer_region->range.head_offset; - uint64_t dst_offset = dst_buffer_region->range.head_offset; - if (!from_host && !to_host) { - src_offset = src_buffer_region->range.head_offset; - dst_offset = dst_buffer_region->range.head_offset; - } + uint64_t dst_offset = copy_helper.dst_begin + dst_buffer_region->range.head_offset; debug(user_context) << " src region=" << (void *)src_memory_region << " buffer=" << (void *)src_device_buffer << " crop_offset=" << (uint64_t)src_buffer_region->range.head_offset << " copy_offset=" << src_offset << "\n"; debug(user_context) << " dst region=" << (void *)dst_memory_region << " buffer=" << (void *)dst_device_buffer << " crop_offset=" << (uint64_t)dst_buffer_region->range.head_offset << " copy_offset=" << dst_offset << "\n"; diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index f28548700457..0dfa975a91d7 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -1548,9 +1548,9 @@ int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buff (!from_host && !to_host)) { VkBufferCopy buffer_copy = { - c.src_begin + src_offset, // srcOffset - dst_offset, // dstOffset - c.chunk_size // size + src_offset, // srcOffset + dst_offset, // dstOffset + c.chunk_size // size }; VkBuffer *src_buffer = reinterpret_cast(c.src); diff --git a/src/runtime/webgpu.cpp b/src/runtime/webgpu.cpp index 0a0ae240eade..b57113db3d0e 100644 --- a/src/runtime/webgpu.cpp +++ b/src/runtime/webgpu.cpp @@ -720,7 +720,7 @@ WEAK int halide_webgpu_buffer_copy(void *user_context, ErrorScope error_scope(user_context, context.device); err = do_multidimensional_copy(user_context, &context, c, - c.src_begin, 0, dst->dimensions, + c.src_begin, c.dst_begin, dst->dimensions, from_host, to_host); if (err == halide_error_code_success) { err = error_scope.wait(); diff --git a/test/correctness/device_buffer_copy.cpp b/test/correctness/device_buffer_copy.cpp index 31ff92b4ae85..23513883d738 100644 --- a/test/correctness/device_buffer_copy.cpp +++ b/test/correctness/device_buffer_copy.cpp @@ -98,11 +98,11 @@ int main(int argc, char **argv) { for (int i = 0; i < 128; i++) { for (int j = 0; j < 128; j++) { - bool in_gpu3 = (i >= gpu_buf2.dim(0).min()) && - (i < (gpu_buf2.dim(0).min() + gpu_buf2.dim(0).extent())) && + bool in_gpu2 = (i >= gpu_buf2.dim(0).min()) && + (i <= gpu_buf2.dim(0).max()) && (j >= gpu_buf2.dim(1).min()) && - (j < (gpu_buf2.dim(1).min() + gpu_buf2.dim(1).extent())); - assert(gpu_buf1(i, j) == (in_gpu3 ? 0 : (i + j * 256))); + (j <= gpu_buf2.dim(1).max()); + assert(gpu_buf1(i, j) == (in_gpu2 ? 0 : (i + j * 256))); } } } @@ -121,15 +121,15 @@ int main(int argc, char **argv) { for (int i = 0; i < 128; i++) { for (int j = 0; j < 128; j++) { bool in_cpu1 = (i >= cpu_buf1.dim(0).min()) && - (i < (cpu_buf1.dim(0).min() + cpu_buf1.dim(0).extent())) && + (i <= cpu_buf1.dim(0).max()) && (j >= cpu_buf1.dim(1).min()) && - (j < (cpu_buf1.dim(1).min() + cpu_buf1.dim(1).extent())); + (j <= cpu_buf1.dim(1).max()); assert(cpu_buf(i, j) == (in_cpu1 ? (i + j * 256) : 0)); } } } - printf("Test copy device to device -- subset area.\n"); + printf("Test copy device to device -- subset area (from bigger to smaller buffer).\n"); { Halide::Runtime::Buffer gpu_buf1 = make_gpu_buffer(hexagon_rpc); assert(gpu_buf1.raw_buffer()->device_interface != nullptr); @@ -147,14 +147,75 @@ int main(int argc, char **argv) { for (int i = 0; i < 128; i++) { for (int j = 0; j < 128; j++) { bool in_gpu3 = (i >= gpu_buf3.dim(0).min()) && - (i < (gpu_buf3.dim(0).min() + gpu_buf3.dim(0).extent())) && + (i <= gpu_buf3.dim(0).max()) && (j >= gpu_buf3.dim(1).min()) && - (j < (gpu_buf3.dim(1).min() + gpu_buf3.dim(1).extent())); + (j <= gpu_buf3.dim(1).max()); assert(gpu_buf2(i, j) == (i + j * 256 + (in_gpu3 ? 0 : 256000))); } } } + printf("Test copy device to device -- subset area (from smaller to bigger buffer).\n"); + { + Halide::Runtime::Buffer gpu_buf1 = make_gpu_buffer(hexagon_rpc); + assert(gpu_buf1.raw_buffer()->device_interface != nullptr); + + Halide::Runtime::Buffer gpu_buf2 = make_gpu_buffer(hexagon_rpc, 256000); + assert(gpu_buf2.raw_buffer()->device_interface != nullptr); + + Halide::Runtime::Buffer gpu_buf3 = gpu_buf2.cropped({{32, 64}, {32, 64}}); + assert(gpu_buf3.raw_buffer()->device_interface != nullptr); + + assert(gpu_buf3.raw_buffer()->device_interface->buffer_copy(nullptr, gpu_buf3, gpu_buf1.raw_buffer()->device_interface, gpu_buf1) == 0); + gpu_buf1.set_device_dirty(); + gpu_buf1.copy_to_host(); + + for (int i = 0; i < 128; i++) { + for (int j = 0; j < 128; j++) { + bool in_gpu3 = (i >= gpu_buf3.dim(0).min()) && + (i <= gpu_buf3.dim(0).max()) && + (j >= gpu_buf3.dim(1).min()) && + (j <= gpu_buf3.dim(1).max()); + assert(gpu_buf1(i, j) == (i + j * 256 + (in_gpu3 ? 256000 : 0))); + } + } + } + + printf("Test copy device to device -- subset area (from and to not-contained-within-each-other crops).\n"); + { + Halide::Runtime::Buffer gpu_buf1 = make_gpu_buffer(hexagon_rpc); + assert(gpu_buf1.raw_buffer()->device_interface != nullptr); + + Halide::Runtime::Buffer gpu_buf2 = make_gpu_buffer(hexagon_rpc, 256000); + assert(gpu_buf2.raw_buffer()->device_interface != nullptr); + + // Two crops: one horizontal, and one vertical rectangle. + Halide::Runtime::Buffer gpu_buf1_crop = gpu_buf1.cropped({{32, 64}, {32, 16}}); + Halide::Runtime::Buffer gpu_buf2_crop = gpu_buf2.cropped({{32, 16}, {32, 64}}); + assert(gpu_buf1_crop.raw_buffer()->device_interface != nullptr); + assert(gpu_buf2_crop.raw_buffer()->device_interface != nullptr); + + // Copy from the crop to another crop. + assert(gpu_buf2_crop.raw_buffer()->device_interface->buffer_copy(nullptr, gpu_buf2_crop, gpu_buf1_crop.raw_buffer()->device_interface, gpu_buf1_crop) == 0); + gpu_buf1.set_device_dirty(); + gpu_buf1.copy_to_host(); + + for (int i = 0; i < 128; i++) { + for (int j = 0; j < 128; j++) { + bool in_gpu1_crop = (i >= gpu_buf1_crop.dim(0).min()) && + (i <= gpu_buf1_crop.dim(0).max()) && + (j >= gpu_buf1_crop.dim(1).min()) && + (j <= gpu_buf1_crop.dim(1).max()); + bool in_gpu2_crop = (i >= gpu_buf2_crop.dim(0).min()) && + (i <= gpu_buf2_crop.dim(0).max()) && + (j >= gpu_buf2_crop.dim(1).min()) && + (j <= gpu_buf2_crop.dim(1).max()); + // printf("gpu_buf1(%d, %d) = %d (in_gpu1_crop=%d in_gpu2_crop=%d)\n", i, j, gpu_buf1(i, j), in_gpu1_crop, in_gpu2_crop); + assert(gpu_buf1(i, j) == (i + j * 256 + (in_gpu1_crop && in_gpu2_crop ? 256000 : 0))); + } + } + } + printf("Test copy from device no src host.\n"); { Halide::Runtime::Buffer gpu_buf = make_gpu_buffer(hexagon_rpc);