diff --git a/src/runtime/HalideBuffer.h b/src/runtime/HalideBuffer.h
index 768b5df89fc9..ba37c59e0ca0 100644
--- a/src/runtime/HalideBuffer.h
+++ b/src/runtime/HalideBuffer.h
@@ -2055,9 +2055,7 @@ class Buffer {
     }
 
     template<typename... Args>
-    HALIDE_ALWAYS_INLINE
-        storage_T *
-        address_of(Args... args) const {
+    HALIDE_ALWAYS_INLINE storage_T *address_of(Args... args) const {
         if (T_is_void) {
             return (storage_T *)(this->buf.host) + offset_of(0, args...) * type().bytes();
         } else {
@@ -2112,8 +2110,7 @@ class Buffer {
     }
 
     HALIDE_ALWAYS_INLINE
-    const not_void_T &
-    operator()() const {
+    const not_void_T &operator()() const {
         static_assert(!T_is_void,
                       "Cannot use operator() on Buffer<void> types");
         constexpr int expected_dims = 0;
@@ -2133,9 +2130,8 @@ class Buffer {
 
     template<typename... Args,
              typename = typename std::enable_if<AllInts<Args...>::value>::type>
-    HALIDE_ALWAYS_INLINE
-        not_void_T &
-        operator()(int first, Args... rest) {
+    HALIDE_ALWAYS_INLINE not_void_T &
+    operator()(int first, Args... rest) {
         static_assert(!T_is_void,
                       "Cannot use operator() on Buffer<void> types");
         constexpr int expected_dims = 1 + (int)(sizeof...(rest));
diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index a5170c55d256..90a4b9d47cfd 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -1010,7 +1010,9 @@ WEAK int halide_cuda_buffer_copy(void *user_context, struct halide_buffer_t *src
             }
         }
 
-        auto result = cuda_do_multidimensional_copy(user_context, c, c.src + c.src_begin, c.dst, dst->dimensions, from_host, to_host, stream);
+        auto result = cuda_do_multidimensional_copy(
+            user_context, c, c.src + c.src_begin, c.dst + c.dst_begin,
+            dst->dimensions, from_host, to_host, stream);
         if (result) {
             return result;
         }
diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp
index 8d472e22a0ba..c7f85038e0d8 100644
--- a/src/runtime/d3d12compute.cpp
+++ b/src/runtime/d3d12compute.cpp
@@ -3263,7 +3263,7 @@ WEAK int halide_d3d12compute_buffer_copy(void *user_context, struct halide_buffe
             d3d12_buffer *dsrc = peel_buffer(src);
             d3d12_buffer *ddst = peel_buffer(dst);
             size_t src_offset = dsrc->offsetInBytes + c.src_begin;
-            size_t dst_offset = ddst->offsetInBytes;
+            size_t dst_offset = ddst->offsetInBytes + c.dst_begin;
             D3D12ContextHolder d3d12_context(user_context, true);
             if (d3d12_context.error()) {
                 return d3d12_context.error();
diff --git a/src/runtime/device_buffer_utils.h b/src/runtime/device_buffer_utils.h
index 15cf76776d92..36d1bcf98058 100644
--- a/src/runtime/device_buffer_utils.h
+++ b/src/runtime/device_buffer_utils.h
@@ -31,10 +31,10 @@ namespace Internal {
 // The struct that describes a host <-> dev copy to perform.
 #define MAX_COPY_DIMS 16
 struct device_copy {
-    // opaque handles for source and device memory.
+    // opaque handles (host or device) for source and destination memory.
     uint64_t src, dst;
-    // The offset in the source memory to start
-    uint64_t src_begin;
+    // The offsets into the source and destination memory at which the copy starts.
+    uint64_t src_begin, dst_begin;
     // The multidimensional array of contiguous copy tasks that need to be done.
     uint64_t extent[MAX_COPY_DIMS];
     // The strides (in bytes) that separate adjacent copy tasks in each dimension.
@@ -70,13 +70,18 @@ WEAK void copy_memory_helper(const device_copy &copy, int d, int64_t src_off, in
 WEAK void copy_memory(const device_copy &copy, void *user_context) {
     // If this is a zero copy buffer, these pointers will be the same.
     if (copy.src != copy.dst) {
-        copy_memory_helper(copy, MAX_COPY_DIMS - 1, copy.src_begin, 0);
+        copy_memory_helper(copy, MAX_COPY_DIMS - 1, copy.src_begin, copy.dst_begin);
     } else {
         debug(user_context) << "copy_memory: no copy needed as pointers are the same.\n";
     }
 }
 
-// Fills the entire dst buffer, which must be contained within src
+// All crops are supported. It copies the maximum number of pixels from src to dst.
+// That maximum number of pixels is determined by the overlapping region of the two
+// buffers. This means that it can be used in the following scenarios:
+//  1) Fill the entire dst buffer, when the dst buffer bounds are contained within src.
+//  2) Copy the entire src buffer to dst, when the src buffer bounds are contained within dst.
+//  3) Copy only the overlapping region between the two buffers, from src to dst.
 WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host,
                                   const halide_buffer_t *dst, bool dst_host) {
     // Make a copy job representing copying the first pixel only.
@@ -90,12 +95,19 @@ WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host,
         c.dst_stride_bytes[i] = 0;
     }
 
-    // Offset the src base pointer to the right point in its buffer.
+    // Offset the src and dst base pointers to the right points in their buffers.
     c.src_begin = 0;
+    c.dst_begin = 0;
     for (int i = 0; i < src->dimensions; i++) {
-        c.src_begin += (int64_t)src->dim[i].stride * (int64_t)(dst->dim[i].min - src->dim[i].min);
+        int64_t dim_diff = int64_t(dst->dim[i].min - src->dim[i].min);
+        if (dim_diff > 0) {
+            c.src_begin += (int64_t)src->dim[i].stride * dim_diff;
+        } else {
+            c.dst_begin += (int64_t)dst->dim[i].stride * (-dim_diff);
+        }
     }
     c.src_begin *= c.chunk_size;
+    c.dst_begin *= c.chunk_size;
 
     if (src->dimensions != dst->dimensions ||
         src->type.bytes() != dst->type.bytes() ||
@@ -134,7 +146,7 @@ WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host,
             c.dst_stride_bytes[j] = c.dst_stride_bytes[j - 1];
             c.src_stride_bytes[j] = c.src_stride_bytes[j - 1];
         }
-        c.extent[insert] = dst->dim[i].extent;
+        c.extent[insert] = min(src->dim[i].extent, dst->dim[i].extent);
         // debug(nullptr) << "c.extent[" << insert << "] = " << (int)(c.extent[insert]) << "\n";
         c.dst_stride_bytes[insert] = dst_stride_bytes;
         c.src_stride_bytes[insert] = src_stride_bytes;
diff --git a/src/runtime/metal.cpp b/src/runtime/metal.cpp
index 1fe7d895561b..ab18ba82e318 100644
--- a/src/runtime/metal.cpp
+++ b/src/runtime/metal.cpp
@@ -301,7 +301,7 @@ namespace {
 void do_device_to_device_copy(void *user_context, mtl_blit_command_encoder *encoder,
                               const device_copy &c, uint64_t src_offset, uint64_t dst_offset, int d) {
     if (d == 0) {
-        buffer_to_buffer_1d_copy(encoder, ((device_handle *)c.src)->buf, c.src_begin + src_offset,
+        buffer_to_buffer_1d_copy(encoder, ((device_handle *)c.src)->buf, src_offset,
                                  ((device_handle *)c.dst)->buf, dst_offset, c.chunk_size);
     } else {
         // TODO: deal with negative strides. Currently the code in
@@ -1108,8 +1108,9 @@ WEAK int halide_metal_buffer_copy(void *user_context, struct halide_buffer_t *sr
             const char *buffer_label = "halide_metal_buffer_copy";
             mtl_command_buffer *blit_command_buffer = new_command_buffer(metal_context.queue, buffer_label, strlen(buffer_label));
             mtl_blit_command_encoder *blit_encoder = new_blit_command_encoder(blit_command_buffer);
-            do_device_to_device_copy(user_context, blit_encoder, c, ((device_handle *)c.src)->offset,
-                                     ((device_handle *)c.dst)->offset, dst->dimensions);
+            do_device_to_device_copy(user_context, blit_encoder, c,
+                                     ((device_handle *)c.src)->offset + c.src_begin,
+                                     ((device_handle *)c.dst)->offset + c.dst_begin, dst->dimensions);
             end_encoding(blit_encoder);
             commit_command_buffer(blit_command_buffer);
         } else {
diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp
index 8aaba7f6a707..f4b17d3030e7 100644
--- a/src/runtime/opencl.cpp
+++ b/src/runtime/opencl.cpp
@@ -503,6 +503,7 @@ WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_q
     size_t max_work_item_sizes[4] = {
         0,
     };
+    cl_uint mem_base_addr_align = 0;
 
     struct {
         void *dst;
@@ -521,6 +522,7 @@ WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_q
         {&max_work_group_size, sizeof(max_work_group_size), CL_DEVICE_MAX_WORK_GROUP_SIZE},
         {&max_work_item_dimensions, sizeof(max_work_item_dimensions), CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS},
         {&max_work_item_sizes[0], sizeof(max_work_item_sizes), CL_DEVICE_MAX_WORK_ITEM_SIZES},
+        {&mem_base_addr_align, sizeof(mem_base_addr_align), CL_DEVICE_MEM_BASE_ADDR_ALIGN},
         {nullptr}};
 
     // Do all the queries.
@@ -544,7 +546,8 @@ WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_q
         << "      max work item sizes: " << (uint64_t)max_work_item_sizes[0]
         << "x" << (uint64_t)max_work_item_sizes[1]
         << "x" << (uint64_t)max_work_item_sizes[2]
-        << "x" << (uint64_t)max_work_item_sizes[3] << "\n";
+        << "x" << (uint64_t)max_work_item_sizes[3] << "\n"
+        << "      mem base addr align: " << mem_base_addr_align << "\n";
 #endif
 
     // Create context and command queue.
@@ -1035,7 +1038,9 @@ WEAK int halide_opencl_buffer_copy(void *user_context, struct halide_buffer_t *s
         }
 #endif
 
-        auto result = opencl_do_multidimensional_copy(user_context, ctx, c, c.src_begin, 0, dst->dimensions, from_host, to_host);
+        auto result = opencl_do_multidimensional_copy(
+            user_context, ctx, c, c.src_begin, c.dst_begin,
+            dst->dimensions, from_host, to_host);
         if (result) {
             return result;
         }
@@ -1155,6 +1160,8 @@ WEAK int halide_opencl_run(void *user_context,
                 // span the crop.
                 mem = clCreateSubBuffer(mem, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
                 sub_buffers[sub_buffers_saved++] = mem;
+                debug(user_context) << "Create subbuffer " << (void *)mem << ": "
+                                    << "offset=" << region.origin << ", size=" << region.size << "\n";
             }
             if (err == CL_SUCCESS) {
                 debug(user_context) << "Mapped dev handle is: " << (void *)mem << "\n";
diff --git a/src/runtime/vulkan.cpp b/src/runtime/vulkan.cpp
index cb79950f7421..5057be65886a 100644
--- a/src/runtime/vulkan.cpp
+++ b/src/runtime/vulkan.cpp
@@ -462,7 +462,7 @@ WEAK int halide_vulkan_copy_to_device(void *user_context, halide_buffer_t *halid
     copy_helper.src = (uint64_t)(staging_buffer);
     copy_helper.dst = (uint64_t)(device_buffer);
     uint64_t src_offset = copy_helper.src_begin;
-    uint64_t dst_offset = device_region->range.head_offset;
+    uint64_t dst_offset = copy_helper.dst_begin + device_region->range.head_offset;
 
     // enqueue the copy operation, using the allocated buffers
     error_code = vk_do_multidimensional_copy(user_context, command_buffer, copy_helper,
@@ -639,7 +639,7 @@ WEAK int halide_vulkan_copy_to_host(void *user_context, halide_buffer_t *halide_
     copy_helper.src = (uint64_t)(device_buffer);
     copy_helper.dst = (uint64_t)(staging_buffer);
     uint64_t src_offset = copy_helper.src_begin + device_region->range.head_offset;
-    uint64_t dst_offset = 0;
+    uint64_t dst_offset = copy_helper.dst_begin;
 
     // enqueue the copy operation, using the allocated buffers
     error_code = vk_do_multidimensional_copy(user_context, command_buffer, copy_helper,
@@ -918,11 +918,7 @@ WEAK int halide_vulkan_buffer_copy(void *user_context, struct halide_buffer_t *s
         copy_helper.src = (uint64_t)(src_device_buffer);
         copy_helper.dst = (uint64_t)(dst_device_buffer);
         uint64_t src_offset = copy_helper.src_begin + src_buffer_region->range.head_offset;
-        uint64_t dst_offset = dst_buffer_region->range.head_offset;
-        if (!from_host && !to_host) {
-            src_offset = src_buffer_region->range.head_offset;
-            dst_offset = dst_buffer_region->range.head_offset;
-        }
+        uint64_t dst_offset = copy_helper.dst_begin + dst_buffer_region->range.head_offset;
 
         debug(user_context) << " src region=" << (void *)src_memory_region << " buffer=" << (void *)src_device_buffer << " crop_offset=" << (uint64_t)src_buffer_region->range.head_offset << " copy_offset=" << src_offset << "\n";
         debug(user_context) << " dst region=" << (void *)dst_memory_region << " buffer=" << (void *)dst_device_buffer << " crop_offset=" << (uint64_t)dst_buffer_region->range.head_offset << " copy_offset=" << dst_offset << "\n";
diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h
index f28548700457..0dfa975a91d7 100644
--- a/src/runtime/vulkan_resources.h
+++ b/src/runtime/vulkan_resources.h
@@ -1548,9 +1548,9 @@ int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buff
             (!from_host && !to_host)) {
 
             VkBufferCopy buffer_copy = {
-                c.src_begin + src_offset,  // srcOffset
-                dst_offset,                // dstOffset
-                c.chunk_size               // size
+                src_offset,   // srcOffset
+                dst_offset,   // dstOffset
+                c.chunk_size  // size
             };
 
             VkBuffer *src_buffer = reinterpret_cast<VkBuffer *>(c.src);
diff --git a/src/runtime/webgpu.cpp b/src/runtime/webgpu.cpp
index 0a0ae240eade..b57113db3d0e 100644
--- a/src/runtime/webgpu.cpp
+++ b/src/runtime/webgpu.cpp
@@ -720,7 +720,7 @@ WEAK int halide_webgpu_buffer_copy(void *user_context,
         ErrorScope error_scope(user_context, context.device);
 
         err = do_multidimensional_copy(user_context, &context, c,
-                                       c.src_begin, 0, dst->dimensions,
+                                       c.src_begin, c.dst_begin, dst->dimensions,
                                        from_host, to_host);
         if (err == halide_error_code_success) {
             err = error_scope.wait();
diff --git a/test/correctness/device_buffer_copy.cpp b/test/correctness/device_buffer_copy.cpp
index 31ff92b4ae85..23513883d738 100644
--- a/test/correctness/device_buffer_copy.cpp
+++ b/test/correctness/device_buffer_copy.cpp
@@ -98,11 +98,11 @@ int main(int argc, char **argv) {
 
         for (int i = 0; i < 128; i++) {
             for (int j = 0; j < 128; j++) {
-                bool in_gpu3 = (i >= gpu_buf2.dim(0).min()) &&
-                               (i < (gpu_buf2.dim(0).min() + gpu_buf2.dim(0).extent())) &&
+                bool in_gpu2 = (i >= gpu_buf2.dim(0).min()) &&
+                               (i <= gpu_buf2.dim(0).max()) &&
                                (j >= gpu_buf2.dim(1).min()) &&
-                               (j < (gpu_buf2.dim(1).min() + gpu_buf2.dim(1).extent()));
-                assert(gpu_buf1(i, j) == (in_gpu3 ? 0 : (i + j * 256)));
+                               (j <= gpu_buf2.dim(1).max());
+                assert(gpu_buf1(i, j) == (in_gpu2 ? 0 : (i + j * 256)));
             }
         }
     }
@@ -121,15 +121,15 @@ int main(int argc, char **argv) {
         for (int i = 0; i < 128; i++) {
             for (int j = 0; j < 128; j++) {
                 bool in_cpu1 = (i >= cpu_buf1.dim(0).min()) &&
-                               (i < (cpu_buf1.dim(0).min() + cpu_buf1.dim(0).extent())) &&
+                               (i <= cpu_buf1.dim(0).max()) &&
                                (j >= cpu_buf1.dim(1).min()) &&
-                               (j < (cpu_buf1.dim(1).min() + cpu_buf1.dim(1).extent()));
+                               (j <= cpu_buf1.dim(1).max());
                 assert(cpu_buf(i, j) == (in_cpu1 ? (i + j * 256) : 0));
             }
         }
     }
 
-    printf("Test copy device to device -- subset area.\n");
+    printf("Test copy device to device -- subset area (from bigger to smaller buffer).\n");
     {
         Halide::Runtime::Buffer<int32_t> gpu_buf1 = make_gpu_buffer(hexagon_rpc);
         assert(gpu_buf1.raw_buffer()->device_interface != nullptr);
@@ -147,14 +147,75 @@ int main(int argc, char **argv) {
         for (int i = 0; i < 128; i++) {
             for (int j = 0; j < 128; j++) {
                 bool in_gpu3 = (i >= gpu_buf3.dim(0).min()) &&
-                               (i < (gpu_buf3.dim(0).min() + gpu_buf3.dim(0).extent())) &&
+                               (i <= gpu_buf3.dim(0).max()) &&
                                (j >= gpu_buf3.dim(1).min()) &&
-                               (j < (gpu_buf3.dim(1).min() + gpu_buf3.dim(1).extent()));
+                               (j <= gpu_buf3.dim(1).max());
                 assert(gpu_buf2(i, j) == (i + j * 256 + (in_gpu3 ? 0 : 256000)));
             }
         }
     }
 
+    printf("Test copy device to device -- subset area (from smaller to bigger buffer).\n");
+    {
+        Halide::Runtime::Buffer<int32_t> gpu_buf1 = make_gpu_buffer(hexagon_rpc);
+        assert(gpu_buf1.raw_buffer()->device_interface != nullptr);
+
+        Halide::Runtime::Buffer<int32_t> gpu_buf2 = make_gpu_buffer(hexagon_rpc, 256000);
+        assert(gpu_buf2.raw_buffer()->device_interface != nullptr);
+
+        Halide::Runtime::Buffer<int32_t> gpu_buf3 = gpu_buf2.cropped({{32, 64}, {32, 64}});
+        assert(gpu_buf3.raw_buffer()->device_interface != nullptr);
+
+        assert(gpu_buf3.raw_buffer()->device_interface->buffer_copy(nullptr, gpu_buf3, gpu_buf1.raw_buffer()->device_interface, gpu_buf1) == 0);
+        gpu_buf1.set_device_dirty();
+        gpu_buf1.copy_to_host();
+
+        for (int i = 0; i < 128; i++) {
+            for (int j = 0; j < 128; j++) {
+                bool in_gpu3 = (i >= gpu_buf3.dim(0).min()) &&
+                               (i <= gpu_buf3.dim(0).max()) &&
+                               (j >= gpu_buf3.dim(1).min()) &&
+                               (j <= gpu_buf3.dim(1).max());
+                assert(gpu_buf1(i, j) == (i + j * 256 + (in_gpu3 ? 256000 : 0)));
+            }
+        }
+    }
+
+    printf("Test copy device to device -- subset area (from and to not-contained-within-each-other crops).\n");
+    {
+        Halide::Runtime::Buffer<int32_t> gpu_buf1 = make_gpu_buffer(hexagon_rpc);
+        assert(gpu_buf1.raw_buffer()->device_interface != nullptr);
+
+        Halide::Runtime::Buffer<int32_t> gpu_buf2 = make_gpu_buffer(hexagon_rpc, 256000);
+        assert(gpu_buf2.raw_buffer()->device_interface != nullptr);
+
+        // Two crops: one horizontal, and one vertical rectangle.
+        Halide::Runtime::Buffer<int32_t> gpu_buf1_crop = gpu_buf1.cropped({{32, 64}, {32, 16}});
+        Halide::Runtime::Buffer<int32_t> gpu_buf2_crop = gpu_buf2.cropped({{32, 16}, {32, 64}});
+        assert(gpu_buf1_crop.raw_buffer()->device_interface != nullptr);
+        assert(gpu_buf2_crop.raw_buffer()->device_interface != nullptr);
+
+        // Copy from one crop (gpu_buf2_crop) into the other crop (gpu_buf1_crop).
+        assert(gpu_buf2_crop.raw_buffer()->device_interface->buffer_copy(nullptr, gpu_buf2_crop, gpu_buf1_crop.raw_buffer()->device_interface, gpu_buf1_crop) == 0);
+        gpu_buf1.set_device_dirty();
+        gpu_buf1.copy_to_host();
+
+        for (int i = 0; i < 128; i++) {
+            for (int j = 0; j < 128; j++) {
+                bool in_gpu1_crop = (i >= gpu_buf1_crop.dim(0).min()) &&
+                                    (i <= gpu_buf1_crop.dim(0).max()) &&
+                                    (j >= gpu_buf1_crop.dim(1).min()) &&
+                                    (j <= gpu_buf1_crop.dim(1).max());
+                bool in_gpu2_crop = (i >= gpu_buf2_crop.dim(0).min()) &&
+                                    (i <= gpu_buf2_crop.dim(0).max()) &&
+                                    (j >= gpu_buf2_crop.dim(1).min()) &&
+                                    (j <= gpu_buf2_crop.dim(1).max());
+                // printf("gpu_buf1(%d, %d) = %d  (in_gpu1_crop=%d in_gpu2_crop=%d)\n", i, j, gpu_buf1(i, j), in_gpu1_crop, in_gpu2_crop);
+                assert(gpu_buf1(i, j) == (i + j * 256 + (in_gpu1_crop && in_gpu2_crop ? 256000 : 0)));
+            }
+        }
+    }
+
     printf("Test copy from device no src host.\n");
     {
         Halide::Runtime::Buffer<int32_t> gpu_buf = make_gpu_buffer(hexagon_rpc);