diff --git a/modules/cudaarithm/src/cuda/polar_cart.cu b/modules/cudaarithm/src/cuda/polar_cart.cu index 2fb1315e619..12980e424ff 100644 --- a/modules/cudaarithm/src/cuda/polar_cart.cu +++ b/modules/cudaarithm/src/cuda/polar_cart.cu @@ -133,23 +133,9 @@ void cv::cuda::cartToPolar(InputArray _x, InputArray _y, OutputArray _mag, Outpu GpuMat_ anglec(angle.reshape(1)); if (angleInDegrees) - { - gridTransformTuple(zipPtr(xc, yc), - tie(magc, anglec), - make_tuple( - binaryTupleAdapter<0, 1>(magnitude_func()), - binaryTupleAdapter<0, 1>(direction_func())), - stream); - } + gridTransformBinary(xc, yc, magc, anglec, magnitude_func(), direction_func(), stream); else - { - gridTransformTuple(zipPtr(xc, yc), - tie(magc, anglec), - make_tuple( - binaryTupleAdapter<0, 1>(magnitude_func()), - binaryTupleAdapter<0, 1>(direction_func())), - stream); - } + gridTransformBinary(xc, yc, magc, anglec, magnitude_func(), direction_func(), stream); syncOutput(mag, _mag, stream); syncOutput(angle, _angle, stream); diff --git a/modules/cudaarithm/src/cuda/split_merge.cu b/modules/cudaarithm/src/cuda/split_merge.cu index 5b3af10775d..f0acb840a9e 100644 --- a/modules/cudaarithm/src/cuda/split_merge.cu +++ b/modules/cudaarithm/src/cuda/split_merge.cu @@ -67,7 +67,8 @@ namespace { static void call(const GpuMat* src, GpuMat& dst, Stream& stream) { - gridMerge(zipPtr(globPtr(src[0]), globPtr(src[1])), + const std::array, 2> d_src = {globPtr(src[0]), globPtr(src[1])}; + gridMerge(d_src, globPtr::type>(dst), stream); } @@ -77,7 +78,8 @@ namespace { static void call(const GpuMat* src, GpuMat& dst, Stream& stream) { - gridMerge(zipPtr(globPtr(src[0]), globPtr(src[1]), globPtr(src[2])), + const std::array, 3> d_src = {globPtr(src[0]), globPtr(src[1]), globPtr(src[2])}; + gridMerge(d_src, globPtr::type>(dst), stream); } @@ -87,7 +89,8 @@ namespace { static void call(const GpuMat* src, GpuMat& dst, Stream& stream) { - gridMerge(zipPtr(globPtr(src[0]), globPtr(src[1]), globPtr(src[2]), globPtr(src[3])), + const std::array, 4 > d_src = {globPtr(src[0]), globPtr(src[1]), globPtr(src[2]), globPtr(src[3])}; + gridMerge(d_src, globPtr::type>(dst), stream); } diff --git a/modules/cudev/include/opencv2/cudev/block/detail/reduce.hpp b/modules/cudev/include/opencv2/cudev/block/detail/reduce.hpp index 151e949a617..5bd1737aa3c 100644 --- a/modules/cudev/include/opencv2/cudev/block/detail/reduce.hpp +++ b/modules/cudev/include/opencv2/cudev/block/detail/reduce.hpp @@ -154,6 +154,17 @@ namespace block_reduce_detail val = smem[tid]; } + + // merge + + template + __device__ __forceinline__ void merge(volatile T* smem, T& val, uint tid, uint delta, const Op& op) + { + T reg = smem[tid + delta]; + smem[tid] = val = op(val, reg); + } + +#if (CUDART_VERSION < 12040) template __device__ __forceinline__ void loadToSmem(const tuple& smem, @@ -172,15 +183,6 @@ namespace block_reduce_detail For<0, tuple_size >::value>::loadFromSmem(smem, val, tid); } - // merge - - template - __device__ __forceinline__ void merge(volatile T* smem, T& val, uint tid, uint delta, const Op& op) - { - T reg = smem[tid + delta]; - smem[tid] = val = op(val, reg); - } - template @@ -214,6 +216,41 @@ namespace block_reduce_detail } #endif +#else + template + __device__ __forceinline__ void loadToSmem(const tuple& smem, const tuple& val, uint tid) + { + For<0, tuple_size >::value>::loadToSmem(smem, val, tid); + } + + template + __device__ __forceinline__ void loadFromSmem(const tuple& smem, const tuple& val, uint tid) + { + For<0, tuple_size >::value>::loadFromSmem(smem, val, tid); + } + + template + __device__ __forceinline__ void merge(const tuple& smem, const tuple& val, uint tid, uint delta, const tuple& op) + { + For<0, tuple_size >::value>::merge(smem, val, tid, delta, op); + } + + // mergeShfl + + template + __device__ __forceinline__ void mergeShfl(T& val, uint delta, uint width, const Op& op) + { + T reg = shfl_down(val, delta, width); + val = op(val, reg); + } + + template + __device__ __forceinline__ void mergeShfl(const tuple& val, uint delta, uint width, const tuple& op) + { + For<0, tuple_size >::value>::mergeShfl(val, delta, width, op); + } +#endif + // Generic template struct Generic diff --git a/modules/cudev/include/opencv2/cudev/block/detail/reduce_key_val.hpp b/modules/cudev/include/opencv2/cudev/block/detail/reduce_key_val.hpp index 4af834a446e..43876decc92 100644 --- a/modules/cudev/include/opencv2/cudev/block/detail/reduce_key_val.hpp +++ b/modules/cudev/include/opencv2/cudev/block/detail/reduce_key_val.hpp @@ -160,6 +160,7 @@ namespace block_reduce_key_val_detail data = smem[tid]; } +#if (CUDART_VERSION < 12040) template __device__ __forceinline__ void loadToSmem(const tuple& smem, @@ -241,6 +242,67 @@ namespace block_reduce_key_val_detail { For<0, tuple_size >::value>::merge(skeys, key, svals, val, cmp, tid, delta); } +#else + template + __device__ __forceinline__ void loadToSmem(const tuple& smem, const tuple& data, uint tid) + { + For<0, tuple_size >::value>::loadToSmem(smem, data, tid); + } + + template + __device__ __forceinline__ void loadFromSmem(const tuple& smem, const tuple& data, uint tid) + { + For<0, tuple_size >::value>::loadFromSmem(smem, data, tid); + } + + // copyVals + + template + __device__ __forceinline__ void copyVals(volatile V* svals, V& val, uint tid, uint delta) + { + svals[tid] = val = svals[tid + delta]; + } + + template + __device__ __forceinline__ void copyVals(const tuple& svals, const tuple& val, uint tid, uint delta) + { + For<0, tuple_size >::value>::copy(svals, val, tid, delta); + } + + // merge + + template + __device__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, uint tid, uint delta) + { + K reg = skeys[tid + delta]; + + if (cmp(reg, key)) + { + skeys[tid] = key = reg; + copyVals(svals, val, tid, delta); + } + } + + template + __device__ void merge(volatile K* skeys, K& key, const tuple& svals, const tuple& val, const Cmp& cmp, uint tid, uint delta) + { + K reg = skeys[tid + delta]; + + if (cmp(reg, key)) + { + skeys[tid] = key = reg; + copyVals(svals, val, tid, delta); + } + } + + template + __device__ __forceinline__ void merge(const tuple& skeys, const tuple& key, + const tuple& svals, const tuple& val, + const tuple& cmp, uint tid, uint delta) + { + For<0, tuple_size >::value>::merge(skeys, key, svals, val, cmp, tid, delta); + } +#endif // Generic diff --git a/modules/cudev/include/opencv2/cudev/block/reduce.hpp b/modules/cudev/include/opencv2/cudev/block/reduce.hpp index 06f59a16ae9..9dde278da84 100644 --- a/modules/cudev/include/opencv2/cudev/block/reduce.hpp +++ b/modules/cudev/include/opencv2/cudev/block/reduce.hpp @@ -51,6 +51,7 @@ #include "../warp/reduce.hpp" #include "detail/reduce.hpp" #include "detail/reduce_key_val.hpp" +#include namespace cv { namespace cudev { @@ -65,6 +66,7 @@ __device__ __forceinline__ void blockReduce(volatile T* smem, T& val, uint tid, block_reduce_detail::Dispatcher::reductor::template reduce(smem, val, tid, op); } +#if (CUDART_VERSION < 12040) template (skeys, key, svals, val, tid, cmp); } +#else + +template +__device__ __forceinline__ void blockReduce(const tuple& smem, + const tuple& val, + uint tid, + const tuple& op) +{ + block_reduce_detail::Dispatcher::reductor::template reduce&, const tuple&, const tuple&>(smem, val, tid, op); +} + +// blockReduceKeyVal + +template +__device__ __forceinline__ void blockReduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, uint tid, const Cmp& cmp) +{ + block_reduce_key_val_detail::Dispatcher::reductor::template reduce(skeys, key, svals, val, tid, cmp); +} + +template +__device__ __forceinline__ void blockReduceKeyVal(volatile K* skeys, K& key, const tuple& svals, const tuple& val, uint tid, const Cmp& cmp) +{ + block_reduce_key_val_detail::Dispatcher::reductor::template reduce&, const tuple&, const Cmp&>(skeys, key, svals, val, tid, cmp); +} + +template +__device__ __forceinline__ void blockReduceKeyVal(const tuple& skeys, const tuple& key, const tuple& svals, const tuple& val, uint tid, const tuple& cmp) +{ + block_reduce_key_val_detail::Dispatcher::reductor::template reduce< const tuple&, const tuple&, const tuple&, const tuple&, const tuple&>(skeys, key, svals, val, tid, cmp); +} + +#endif + //! @} }} diff --git a/modules/cudev/include/opencv2/cudev/grid/detail/split_merge.hpp b/modules/cudev/include/opencv2/cudev/grid/detail/split_merge.hpp index 3f512060165..df8bed3a948 100644 --- a/modules/cudev/include/opencv2/cudev/grid/detail/split_merge.hpp +++ b/modules/cudev/include/opencv2/cudev/grid/detail/split_merge.hpp @@ -157,28 +157,47 @@ namespace grid_split_merge_detail template struct MergeImpl<2, Policy> { template - __host__ static void merge(const SrcPtrTuple& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + __host__ static void mergeTuple(const SrcPtrTuple& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) { mergeC2(get<0>(src), get<1>(src), dst, mask, rows, cols, stream); } + + template + __host__ static void mergeArray(const SrcPtrArray& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + { + mergeC2(src[0], src[1], dst, mask, rows, cols, stream); + } + }; template struct MergeImpl<3, Policy> { template - __host__ static void merge(const SrcPtrTuple& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + __host__ static void mergeTuple(const SrcPtrTuple& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) { mergeC3(get<0>(src), get<1>(src), get<2>(src), dst, mask, rows, cols, stream); } + + template + __host__ static void mergeArray(const SrcPtrArray& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + { + mergeC3(src[0], src[1], src[2], dst, mask, rows, cols, stream); + } }; template struct MergeImpl<4, Policy> { template - __host__ static void merge(const SrcPtrTuple& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + __host__ static void mergeTuple(const SrcPtrTuple& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) { mergeC4(get<0>(src), get<1>(src), get<2>(src), get<3>(src), dst, mask, rows, cols, stream); } + + template + __host__ static void mergeArray(const SrcPtrArray& src, const GlobPtr& dst, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + { + mergeC4(src[0], src[1], src[2], src[3], dst, mask, rows, cols, stream); + } }; // split diff --git a/modules/cudev/include/opencv2/cudev/grid/detail/transform.hpp b/modules/cudev/include/opencv2/cudev/grid/detail/transform.hpp index 557797d7c85..4e901ac751d 100644 --- a/modules/cudev/include/opencv2/cudev/grid/detail/transform.hpp +++ b/modules/cudev/include/opencv2/cudev/grid/detail/transform.hpp @@ -179,6 +179,23 @@ namespace grid_transform_detail dst(y, x) = saturate_cast(op(src1(y, x), src2(y, x))); } + // transformSimple, 2 outputs + // The overloads are added for polar_cart.cu to compute magnitude and phase with single call + // the previous implementation with touple causes cuda namespace clash. See https://github.com/opencv/opencv_contrib/issues/3690 + template + __global__ void transformSimple(const SrcPtr1 src1, const SrcPtr2 src2, GlobPtr dst1, GlobPtr dst2, + const BinOp1 op1, const BinOp2 op2, const MaskPtr mask, const int rows, const int cols) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= cols || y >= rows || !mask(y, x)) + return; + + dst1(y, x) = saturate_cast(op1(src1(y, x), src2(y, x))); + dst2(y, x) = saturate_cast(op2(src1(y, x), src2(y, x))); + } + // transformSmart template @@ -248,6 +265,52 @@ namespace grid_transform_detail } } + // transformSmart, 2 outputs + // The overloads are added for polar_cart.cu to compute magnitude and phase with single call + // the previous implementation with touple causes cuda namespace clash. See https://github.com/opencv/opencv_contrib/issues/3690 + template + __global__ void transformSmart(const GlobPtr src1_, const GlobPtr src2_, + GlobPtr dst1_, GlobPtr dst2_, + const BinOp1 op1, const BinOp2 op2, const MaskPtr mask, const int rows, const int cols) + { + typedef typename MakeVec::type read_type1; + typedef typename MakeVec::type read_type2; + typedef typename MakeVec::type write_type1; + typedef typename MakeVec::type write_type2; + + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x_shifted = x * SHIFT; + + if (y < rows) + { + const SrcType1* src1 = src1_.row(y); + const SrcType2* src2 = src2_.row(y); + DstType1* dst1 = dst1_.row(y); + DstType2* dst2 = dst2_.row(y); + + if (x_shifted + SHIFT - 1 < cols) + { + const read_type1 src1_n_el = ((const read_type1*)src1)[x]; + const read_type2 src2_n_el = ((const read_type2*)src2)[x]; + + OpUnroller::unroll(src1_n_el, src2_n_el, ((write_type1*)dst1)[x], op1, mask, x_shifted, y); + OpUnroller::unroll(src1_n_el, src2_n_el, ((write_type2*)dst2)[x], op2, mask, x_shifted, y); + } + else + { + for (int real_x = x_shifted; real_x < cols; ++real_x) + { + if (mask(y, real_x)) + { + dst1[real_x] = op1(src1[real_x], src2[real_x]); + dst2[real_x] = op2(src1[real_x], src2[real_x]); + } + } + } + } + } + // TransformDispatcher template struct TransformDispatcher; @@ -279,6 +342,20 @@ namespace grid_transform_detail if (stream == 0) CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); } + + template + __host__ static void call(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtr& dst1, const GlobPtr& dst2, + const BinOp1& op1, const BinOp2& op2, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + { + const dim3 block(Policy::block_size_x, Policy::block_size_y); + const dim3 grid(divUp(cols, block.x), divUp(rows, block.y)); + + transformSimple<<>>(src1, src2, dst1, dst2, op1, op2, mask, rows, cols); + CV_CUDEV_SAFE_CALL( cudaGetLastError() ); + + if (stream == 0) + CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); + } }; template struct TransformDispatcher @@ -336,6 +413,33 @@ namespace grid_transform_detail if (stream == 0) CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); } + + template + __host__ static void call(const GlobPtr& src1, const GlobPtr& src2, + const GlobPtr& dst1, const GlobPtr& dst2, + const BinOp1& op1, const BinOp2& op2, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + { + if (Policy::shift == 1 || + !isAligned(src1.data, Policy::shift * sizeof(SrcType1)) || !isAligned(src1.step, Policy::shift * sizeof(SrcType1)) || + !isAligned(src2.data, Policy::shift * sizeof(SrcType2)) || !isAligned(src2.step, Policy::shift * sizeof(SrcType2)) || + !isAligned(dst1.data, Policy::shift * sizeof(DstType1)) || !isAligned(dst1.step, Policy::shift * sizeof(DstType1))|| + !isAligned(dst2.data, Policy::shift * sizeof(DstType2)) || !isAligned(dst2.step, Policy::shift * sizeof(DstType2)) + ) + { + TransformDispatcher::call(src1, src2, dst1, dst2, op1, op2, mask, rows, cols, stream); + return; + } + + const dim3 block(Policy::block_size_x, Policy::block_size_y); + const dim3 grid(divUp(cols, block.x * Policy::shift), divUp(rows, block.y)); + + transformSmart<<>>(src1, src2, dst1, dst2, op1, op2, mask, rows, cols); + CV_CUDEV_SAFE_CALL( cudaGetLastError() ); + + if (stream == 0) + CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); + } + }; template @@ -350,6 +454,13 @@ namespace grid_transform_detail TransformDispatcher::call(src1, src2, dst, op, mask, rows, cols, stream); } + template + __host__ void transform_binary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtr& dst1, const GlobPtr& dst2, + const BinOp1& op1, const BinOp2& op2, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + { + TransformDispatcher::call(src1, src2, dst1, dst2, op1, op2, mask, rows, cols, stream); + } + template __host__ void transform_unary(const GlobPtr& src, const GlobPtr& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) { @@ -362,6 +473,15 @@ namespace grid_transform_detail TransformDispatcher::cn == 1 && VecTraits::cn == 1 && VecTraits::cn == 1 && Policy::shift != 1, Policy>::call(src1, src2, dst, op, mask, rows, cols, stream); } + template + __host__ void transform_binary(const GlobPtr& src1, const GlobPtr& src2, const GlobPtr& dst1, const GlobPtr& dst2, + const BinOp1& op1, const BinOp2& op2, const MaskPtr& mask, int rows, int cols, cudaStream_t stream) + { + TransformDispatcher::cn == 1 && VecTraits::cn == 1 && + VecTraits::cn == 1 && VecTraits::cn == 1 && + Policy::shift != 1, Policy>::call(src1, src2, dst1, dst2, op1, op2, mask, rows, cols, stream); + } + // transform_tuple template struct Unroll diff --git a/modules/cudev/include/opencv2/cudev/grid/split_merge.hpp b/modules/cudev/include/opencv2/cudev/grid/split_merge.hpp index 5c92a813ed8..115d8c55e46 100644 --- a/modules/cudev/include/opencv2/cudev/grid/split_merge.hpp +++ b/modules/cudev/include/opencv2/cudev/grid/split_merge.hpp @@ -72,11 +72,11 @@ __host__ void gridMerge_(const SrcPtrTuple& src, GpuMat_& dst, const Ma dst.create(rows, cols); - grid_split_merge_detail::MergeImpl::cn, Policy>::merge(shrinkPtr(src), - shrinkPtr(dst), - shrinkPtr(mask), - rows, cols, - StreamAccessor::getStream(stream)); + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeTuple(shrinkPtr(src), + shrinkPtr(dst), + shrinkPtr(mask), + rows, cols, + StreamAccessor::getStream(stream)); } template @@ -90,7 +90,7 @@ __host__ void gridMerge_(const SrcPtrTuple& src, const GlobPtrSz& dst, CV_Assert( getRows(dst) == rows && getCols(dst) == cols ); CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); - grid_split_merge_detail::MergeImpl::cn, Policy>::merge(shrinkPtr(src), + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeTuple(shrinkPtr(src), shrinkPtr(dst), shrinkPtr(mask), rows, cols, @@ -107,11 +107,11 @@ __host__ void gridMerge_(const SrcPtrTuple& src, GpuMat_& dst, Stream& dst.create(rows, cols); - grid_split_merge_detail::MergeImpl::cn, Policy>::merge(shrinkPtr(src), - shrinkPtr(dst), - WithOutMask(), - rows, cols, - StreamAccessor::getStream(stream)); + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeTuple(shrinkPtr(src), + shrinkPtr(dst), + WithOutMask(), + rows, cols, + StreamAccessor::getStream(stream)); } template @@ -124,13 +124,87 @@ __host__ void gridMerge_(const SrcPtrTuple& src, const GlobPtrSz& dst, CV_Assert( getRows(dst) == rows && getCols(dst) == cols ); - grid_split_merge_detail::MergeImpl::cn, Policy>::merge(shrinkPtr(src), - shrinkPtr(dst), - WithOutMask(), - rows, cols, - StreamAccessor::getStream(stream)); + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeTuple(shrinkPtr(src), + shrinkPtr(dst), + WithOutMask(), + rows, cols, + StreamAccessor::getStream(stream)); +} + +template +__host__ void gridMergeArray_(const std::array& src, GpuMat_& dst, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + CV_Assert( VecTraits::cn == src.size() ); + + const int rows = getRows(src); + const int cols = getCols(src); + + CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); + + dst.create(rows, cols); + + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeArray(src, + shrinkPtr(dst), + shrinkPtr(mask), + rows, cols, + StreamAccessor::getStream(stream)); +} + +template +__host__ void gridMergeArray_(const std::array& src, const GlobPtrSz& dst, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + CV_Assert( VecTraits::cn == src.size() ); + + const int rows = src[0].rows; + const int cols = src[0].cols; + + CV_Assert( getRows(dst) == rows && getCols(dst) == cols ); + CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); + + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeArray(src, + shrinkPtr(dst), + shrinkPtr(mask), + rows, cols, + StreamAccessor::getStream(stream)); } +template +__host__ void gridMergeArray_(const std::array& src, GpuMat_& dst, Stream& stream = Stream::Null()) +{ + CV_Assert( VecTraits::cn == src.size() ); + + const int rows = src[0].rows; + const int cols = src[0].cols; + + dst.create(rows, cols); + + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeArray(src, + shrinkPtr(dst), + WithOutMask(), + rows, cols, + StreamAccessor::getStream(stream)); +} + +template +__host__ void gridMergeArray_(const std::array& src, const GlobPtrSz& dst, Stream& stream = Stream::Null()) +{ + CV_Assert( VecTraits::cn == src.size() ); + + const int rows = src[0].rows; + const int cols = src[0].cols; + + CV_Assert( getRows(dst) == rows && getCols(dst) == cols ); + + grid_split_merge_detail::MergeImpl::cn, Policy>::mergeArray(src, + shrinkPtr(dst), + WithOutMask(), + rows, cols, + StreamAccessor::getStream(stream)); +} + + +/////////////////////////////////////////////////////////////// + template __host__ void gridSplit_(const SrcPtr& src, const tuple< GpuMat_&, GpuMat_& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null()) { @@ -522,6 +596,30 @@ __host__ void gridMerge(const SrcPtrTuple& src, const GlobPtrSz& dst, S gridMerge_(src, dst, stream); } +template +__host__ void gridMergeArray(const std::array& src, GpuMat_& dst, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + gridMergeArray_(src, dst, mask, stream); +} + +template +__host__ void gridMerge(const std::array& src, const GlobPtrSz& dst, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + gridMergeArray_(src, dst, mask, stream); +} + +template +__host__ void gridMerge(const std::array& src, GpuMat_& dst, Stream& stream = Stream::Null()) +{ + gridMergeArray_(src, dst, stream); +} + +template +__host__ void gridMerge(const std::array& src, const GlobPtrSz& dst, Stream& stream = Stream::Null()) +{ + gridMergeArray_(src, dst, stream); +} + template __host__ void gridSplit(const SrcPtr& src, const tuple< GpuMat_&, GpuMat_& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null()) { diff --git a/modules/cudev/include/opencv2/cudev/grid/transform.hpp b/modules/cudev/include/opencv2/cudev/grid/transform.hpp index 4f7d191e64b..f89cdf5d484 100644 --- a/modules/cudev/include/opencv2/cudev/grid/transform.hpp +++ b/modules/cudev/include/opencv2/cudev/grid/transform.hpp @@ -121,6 +121,22 @@ __host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, Gpu grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream)); } +template +__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_& dst1, GpuMat_& dst2, + const BinOp1& op1, const BinOp2& op2, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + const int rows = getRows(src1); + const int cols = getCols(src1); + + CV_Assert( getRows(src2) == rows && getCols(src2) == cols ); + CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); + + dst1.create(rows, cols); + dst2.create(rows, cols); + + grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst1), shrinkPtr(dst2), op1, op2, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream)); +} + template __host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz& dst, const BinOp& op, const MaskPtr& mask, Stream& stream = Stream::Null()) { @@ -134,6 +150,22 @@ __host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, con grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream)); } +template +__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz& dst1, const GlobPtrSz& dst2, + const BinOp1& op1, const BinOp2& op2, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + const int rows = getRows(src1); + const int cols = getCols(src1); + + CV_Assert( getRows(dst1) == rows && getCols(dst1) == cols ); + CV_Assert( getRows(dst2) == rows && getCols(dst2) == cols ); + CV_Assert( getRows(src2) == rows && getCols(src2) == cols ); + CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); + + grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst1), shrinkPtr(dst2), op1, op2, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream)); +} + + template __host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_& dst, const BinOp& op, Stream& stream = Stream::Null()) { @@ -147,6 +179,21 @@ __host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, Gpu grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream)); } +template +__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_& dst1, GpuMat_& dst2, + const BinOp1& op1, const BinOp2& op2, Stream& stream = Stream::Null()) +{ + const int rows = getRows(src1); + const int cols = getCols(src1); + + CV_Assert( getRows(src2) == rows && getCols(src2) == cols ); + + dst1.create(rows, cols); + dst2.create(rows, cols); + + grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst1), shrinkPtr(dst2), op1, op2, WithOutMask(), rows, cols, StreamAccessor::getStream(stream)); +} + template __host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz& dst, const BinOp& op, Stream& stream = Stream::Null()) { @@ -159,6 +206,20 @@ __host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, con grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream)); } +template +__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz& dst1, const GlobPtrSz& dst2, + const BinOp1& op1, const BinOp2& op2, Stream& stream = Stream::Null()) +{ + const int rows = getRows(src1); + const int cols = getCols(src1); + + CV_Assert( getRows(dst1) == rows && getCols(dst1) == cols ); + CV_Assert( getRows(dst2) == rows && getCols(dst2) == cols ); + CV_Assert( getRows(src2) == rows && getCols(src2) == cols ); + + grid_transform_detail::transform_binary(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst1), shrinkPtr(dst2), op1, op2, WithOutMask(), rows, cols, StreamAccessor::getStream(stream)); +} + template __host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_&, GpuMat_& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null()) { @@ -449,24 +510,54 @@ __host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, GpuM gridTransformBinary_(src1, src2, dst, op, mask, stream); } +template +__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_& dst1, GpuMat_& dst2, + const Op1& op1, const Op2& op2, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + gridTransformBinary_(src1, src2, dst1, dst2, op1, op2, mask, stream); +} + template __host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null()) { gridTransformBinary_(src1, src2, dst, op, mask, stream); } +template +__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz& dst1, const GlobPtrSz& dst2, + const Op1& op1, const Op2& op2, const MaskPtr& mask, Stream& stream = Stream::Null()) +{ + gridTransformBinary_(src1, src2, dst1, dst2, op1, op2, mask, stream); +} + template __host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_& dst, const Op& op, Stream& stream = Stream::Null()) { gridTransformBinary_(src1, src2, dst, op, stream); } +template +__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, + GpuMat_& dst1, GpuMat_& dst2, + const Op1& op1, const Op2& op2, Stream& stream = Stream::Null()) +{ + gridTransformBinary_(src1, src2, dst1, dst2, op1, op2, stream); +} + template __host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz& dst, const Op& op, Stream& stream = Stream::Null()) { gridTransformBinary_(src1, src2, dst, op, stream); } +template +__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, + const GlobPtrSz& dst1, const GlobPtrSz& dst2, + const Op1& op1, const Op2& op2, Stream& stream = Stream::Null()) +{ + gridTransformBinary_(src1, src2, dst1, dst2, op1, op2, stream); +} + template __host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_&, GpuMat_& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null()) { diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp index 2024a7e01a2..98c115fa1bf 100644 --- a/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp +++ b/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp @@ -118,6 +118,18 @@ __host__ GlobPtrSz globPtr(const GpuMat& mat) return p; } +template +__host__ GlobPtrSz globPtr(const GpuMat_& mat) +{ + GlobPtrSz p; + p.data = (T*) mat.data; + p.step = mat.step; + p.rows = mat.rows; + p.cols = mat.cols; + return p; +} + + template struct PtrTraits< GlobPtrSz > : PtrTraitsBase, GlobPtr > { }; diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp index e68f4cf61f5..f5a3f8c85d4 100644 --- a/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp +++ b/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp @@ -49,6 +49,7 @@ #include "../common.hpp" #include "../util/tuple.hpp" #include "traits.hpp" +#include namespace cv { namespace cudev { @@ -175,4 +176,25 @@ template struct PtrTraits< ZipPtrSz > : PtrTraitsBase }} +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template< class... Types > +struct tuple_size< cv::cudev::ZipPtr > > +: tuple_size > { }; + +template< class... Types > +struct tuple_size< cv::cudev::ZipPtrSz > > +: tuple_size > { }; + + +template +struct tuple_element > > +: tuple_element > { }; + +template +struct tuple_element > > +: tuple_element > { }; + +_LIBCUDACXX_END_NAMESPACE_STD + #endif diff --git a/modules/cudev/test/test_split_merge.cu b/modules/cudev/test/test_split_merge.cu index b25c8b96d6f..598b6b80ac2 100644 --- a/modules/cudev/test/test_split_merge.cu +++ b/modules/cudev/test/test_split_merge.cu @@ -70,7 +70,8 @@ public: GpuMat_ d_src2(src2); GpuMat_::type> dst; - gridMerge(zipPtr(d_src1, d_src2), dst); + std::array, 2 > d_src = {globPtr(d_src1), globPtr(d_src2)}; + gridMerge(d_src, dst); Mat dst_gold; Mat srcs[] = {src1, src2}; @@ -93,8 +94,10 @@ public: GpuMat_ d_src2(src2); GpuMat_ d_src3(src3); + std::array, 3 > d_src = {globPtr(d_src1), globPtr(d_src2), globPtr(d_src3)}; + GpuMat_::type> dst; - gridMerge(zipPtr(d_src1, d_src2, d_src3), dst); + gridMerge(d_src, dst); Mat dst_gold; Mat srcs[] = {src1, src2, src3};