diff --git a/modules/cannops/CMakeLists.txt b/modules/cannops/CMakeLists.txt new file mode 100644 index 00000000000..0c16c5eb143 --- /dev/null +++ b/modules/cannops/CMakeLists.txt @@ -0,0 +1,17 @@ + if(IOS OR WINRT OR ANDROID OR APPLE OR WIN32 OR (NOT HAVE_CANN)) + ocv_module_disable(cannops) + endif() + +set(the_description "Ascend-accelerated Operations on Matrices") + +ocv_add_module(cannops opencv_core WRAP python) +ocv_module_include_directories(${CANN_INCLUDE_DIRS}) +ocv_glob_module_sources() +ocv_install_used_external_targets(${CANN_LIBRARIES}) +ocv_create_module(${CANN_LIBRARIES}) + +ocv_include_directories(${CMAKE_SOURCE_DIR}/modules/ts/include) + +ocv_add_accuracy_tests(DEPENDS_ON opencv_cannops) +ocv_add_perf_tests(DEPENDS_ON opencv_cannops) +ocv_add_samples(opencv_cannops) diff --git a/modules/cannops/Dockerfile b/modules/cannops/Dockerfile new file mode 100644 index 00000000000..939999eed4f --- /dev/null +++ b/modules/cannops/Dockerfile @@ -0,0 +1,67 @@ +# User guides +# +# 0. Install Ascend driver on host. +# (https://www.hiascend.com/en/hardware/firmware-drivers) +# +# 1. Run docker container. +# docker run -it \ +# --name opencv \ +# --device /dev/davinci0 \ +# --device /dev/davinci_manager \ +# --device /dev/devmm_svm \ +# --device /dev/hisi_hdc \ +# -v /usr/local/dcmi:/usr/local/dcmi \ +# -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +# -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +# opencv bash +# +# 2. Check environment. +# npu-smi info +# +# 3. Compile opencv with Ascend NPU backend. +# cmake -DWITH_CANN=1 +# +# 4. Run opencv_test_cannops. +# ./bin/opencv_test_cannops + +FROM openeuler/openeuler:22.03-lts-sp2 + +RUN yum install -y \ + git \ + wget \ + gcc \ + g++ \ + cmake \ + make \ + python-pip \ + python3-devel + +RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple \ + numpy \ + sympy \ + decorator \ + scipy \ + attrs \ + psutil + +# Install CANN +RUN wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%207.0.RC1/Ascend-cann-toolkit_7.0.RC1_linux-"$(uname -i)".run && \ + chmod +x Ascend-cann-toolkit_7.0.RC1_linux-"$(uname -i)".run && \ + ./Ascend-cann-toolkit_7.0.RC1_linux-"$(uname -i)".run --quiet --install && \ + rm -f ./Ascend-cann-toolkit_7.0.RC1_linux-"$(uname -i)".run + +# Install kernel +RUN wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%207.0.RC1/Ascend-cann-kernels-310p_7.0.RC1_linux.run && \ + chmod +x Ascend-cann-kernels-310p_7.0.RC1_linux.run && \ + ./Ascend-cann-kernels-310p_7.0.RC1_linux.run --quiet --install && \ + rm -f ./Ascend-cann-kernels-310p_7.0.RC1_linux.run + +ENV LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH:/usr/lib64 +ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest +ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:$LD_LIBRARY_PATH +ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:$PYTHONPATH +ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:$PATH +ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME} +ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp +ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit +ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME} diff --git a/modules/cannops/include/opencv2/cann.hpp 
b/modules/cannops/include/opencv2/cann.hpp
new file mode 100644
index 00000000000..30555dd8257
--- /dev/null
+++ b/modules/cannops/include/opencv2/cann.hpp
@@ -0,0 +1,328 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CANNOPS_CANN_HPP
+#define OPENCV_CANNOPS_CANN_HPP
+
+#include "opencv2/core.hpp"
+
+/**
+  @defgroup cann Ascend-accelerated Computer Vision
+  @{
+    @defgroup canncore Core part
+    @{
+      @defgroup cann_struct Data Structures
+      @defgroup cann_init Initialization and Information
+    @}
+  @}
+ */
+
+namespace cv
+{
+namespace cann
+{
+class AscendStream;
+
+//! @addtogroup cann_struct
+//! @{
+
+//===================================================================================
+// AscendMat
+//===================================================================================
+
+/** @brief Base storage class for NPU memory with reference counting.
+ * AscendMat has an interface similar to Mat and cuda::GpuMat, and works on the [Ascend
+ * NPU](https://www.hiascend.com/) backend.
+ * @sa Mat cuda::GpuMat
+ */
+class AscendStream;
+class CV_EXPORTS_W AscendMat
+{
+public:
+    class CV_EXPORTS_W Allocator
+    {
+    public:
+        virtual ~Allocator() {}
+        // basic allocator
+        virtual std::shared_ptr<uchar> allocate(size_t size) = 0;
+        // allocator must fill data, step and refcount fields
+        virtual bool allocate(AscendMat* mat, int rows, int cols, size_t elemSize) = 0;
+    };
+
+    /**
+     * @brief Create the default allocator for AscendMat. This allocator allocates device memory of
+     * the requested size.
+     */
+    CV_WRAP static AscendMat::Allocator* defaultAllocator();
+
+    /**
+     * @brief Set the default allocator for AscendMat.
+     * @param allocator
+     */
+    CV_WRAP static void setDefaultAllocator(AscendMat::Allocator* allocator);
+
+    //! default constructor
+    CV_WRAP explicit AscendMat(AscendMat::Allocator* allocator_ = AscendMat::defaultAllocator());
+
+    //! constructs AscendMat of the specified size and type
+    CV_WRAP AscendMat(int rows, int cols, int type,
+                      AscendMat::Allocator* allocator = AscendMat::defaultAllocator());
+    //! constructs AscendMat of the specified size and type
+    CV_WRAP AscendMat(Size size, int type,
+                      AscendMat::Allocator* allocator = AscendMat::defaultAllocator());
+
+    //! constructs AscendMat and fills it with the specified value s
+    CV_WRAP AscendMat(int rows, int cols, int type, Scalar& s,
+                      AscendMat::Allocator* allocator = AscendMat::defaultAllocator());
+    //! constructs AscendMat and fills it with the specified value s
+    CV_WRAP AscendMat(Size size, int type, Scalar& s,
+                      AscendMat::Allocator* allocator = AscendMat::defaultAllocator());
+
+    //! copy constructor
+    CV_WRAP AscendMat(const AscendMat& m);
+
+    //! constructs AscendMat by cropping a certain area from another
+    CV_WRAP AscendMat(InputArray _m, const Rect& roi);
+    CV_WRAP AscendMat(InputArray _m, const Rect& roi, AscendStream& stream);
+
+    //! builds AscendMat from host memory (Blocking call)
+    CV_WRAP explicit AscendMat(InputArray arr, AscendStream& stream,
+                               AscendMat::Allocator* allocator = AscendMat::defaultAllocator());
+
+    //! assignment operators
+    AscendMat& operator=(const AscendMat& m);
+
+    //! sets some of the AscendMat elements to s (Blocking call)
+    CV_WRAP AscendMat& setTo(const Scalar& s);
+    //! sets some of the AscendMat elements to s (Non-Blocking call)
+    CV_WRAP AscendMat& setTo(const Scalar& s, AscendStream& stream);
+
+    //! sets all of the AscendMat elements to the given float value (Blocking call)
+    CV_WRAP AscendMat& setTo(float sc);
+
+    //! sets all of the AscendMat elements to the given float value (Non-Blocking call)
+    CV_WRAP AscendMat& setTo(float sc, AscendStream& stream);
+
+    //! swaps with other smart pointer
+    CV_WRAP void swap(AscendMat& mat);
+
+    //! allocates new AscendMat data unless the AscendMat already has the specified size and type
+    CV_WRAP void create(int rows, int cols, int type);
+
+    //! uploads host memory data to AscendMat (Blocking call)
+    CV_WRAP void upload(InputArray arr);
+    //! uploads host memory data to AscendMat (Non-Blocking call)
+    CV_WRAP void upload(InputArray arr, AscendStream& stream);
+
+    //! downloads data from AscendMat to host (Blocking call)
+    CV_WRAP void download(OutputArray dst) const;
+    //! downloads data from AscendMat to host (Non-Blocking call)
+    CV_WRAP void download(OutputArray dst, AscendStream& stream) const;
+
+    //! converts AscendMat to another datatype (Blocking call)
+    CV_WRAP void convertTo(CV_OUT AscendMat& dst, int rtype) const;
+
+    //! converts AscendMat to another datatype (Non-Blocking call)
+    CV_WRAP void convertTo(CV_OUT AscendMat& dst, int rtype, AscendStream& stream) const;
+
+    //! converts AscendMat to another datatype; the dst mat is allocated. (Non-Blocking call)
+    CV_WRAP void convertTo(CV_OUT AscendMat& dst, AscendStream& stream) const;
+
+    //! returns true iff the AscendMat data is continuous
+    //! (i.e. when there are no gaps between successive rows)
+    CV_WRAP bool isContinuous() const;
+
+    //! returns element size in bytes
+    CV_WRAP size_t elemSize() const;
+
+    //! returns the size of an element channel in bytes
+    CV_WRAP size_t elemSize1() const;
+
+    //! returns element type
+    CV_WRAP int type() const;
+
+    //! returns element depth
+    CV_WRAP int depth() const;
+
+    //! returns number of channels
+    CV_WRAP int channels() const;
+
+    //! returns step/elemSize1()
+    CV_WRAP size_t step1() const;
+
+    //! returns AscendMat size: width == number of columns, height == number of rows
+    CV_WRAP Size size() const;
+
+    //! returns true if AscendMat data is NULL
+    CV_WRAP bool empty() const;
+
+    //! internal use method: updates the continuity flag
+    CV_WRAP void updateContinuityFlag();
+
+    /*! includes several bit-fields:
+    - the magic signature
+    - continuity flag
+    - depth
+    - number of channels
+    */
+    int flags;
+
+    //! the number of rows and columns
+    int rows, cols;
+
+    //! a distance between successive rows in bytes; includes the gap if any
+    CV_PROP size_t step;
+
+    //! pointer to the data
+    std::shared_ptr<uchar> data;
+
+    //! helper fields used in locateROI and adjustROI
+    uchar* datastart;
+    const uchar* dataend;
+
+    //! allocator
+    Allocator* allocator;
+};
+
+class AscendStream;
+class AscendStreamAccessor;
+class AscendEvent;
+class AscendEventAccessor;
+class DefaultDeviceInitializer;
+
+//===================================================================================
+// AscendStream
+//===================================================================================
+
+/** @brief In AscendCL, a Stream (AscendStream) is a task queue. Streams are used to manage the
+ * parallelism of tasks: the tasks inside a stream are executed sequentially, in the order they are
+ * submitted, while tasks in different streams can execute in parallel.
+ *
+ * All non-blocking functions take a stream parameter. These functions return immediately after
+ * the task is submitted; the caller should wait on the stream until completion.
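+ *
+ * A minimal usage sketch (illustrative only; it assumes initAcl() and setDevice(), declared below,
+ * have already been called and that `hostMat` is a cv::Mat on the host):
+ * @code
+ *     cv::cann::AscendStream stream;
+ *     cv::cann::AscendMat npuMat;
+ *     npuMat.upload(hostMat, stream);   // non-blocking upload to the NPU
+ *     cv::Mat result;
+ *     npuMat.download(result, stream);  // non-blocking download back to the host
+ *     stream.waitForCompletion();       // synchronize before reading `result`
+ * @endcode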
+ *
+ * Blocking functions implicitly use the default stream and synchronize the stream before the
+ * function returns.
+ * @sa cuda::Stream
+ */
+
+// TODO: Stream is defined in namespace cuda, and the pybind code does not use a namespace for
+// Stream; the name is changed to AscendStream to avoid conflict.
+class CV_EXPORTS_W AscendStream
+{
+public:
+    CV_WRAP AscendStream();
+
+    //! blocks the current CPU thread until all operations in the stream are complete.
+    CV_WRAP void waitForCompletion();
+
+    //! blocks the current CPU thread until the event triggers.
+    CV_WRAP void waitAscendEvent(const cv::cann::AscendEvent& event);
+
+    /**
+     * @brief return the default AscendStream object for the default ACL stream.
+     */
+    CV_WRAP static AscendStream& Null();
+
+    // ACL symbols CANNOT be used in any hpp files. Use an inner class to avoid defining ACL
+    // symbols in this hpp.
+    class Impl;
+
+    void addTensorHolder(const std::shared_ptr<uchar>& holder);
+
+private:
+    Ptr<Impl> impl_;
+    AscendStream(const Ptr<Impl>& impl);
+
+    friend class AscendStreamAccessor;
+    friend class DefaultDeviceInitializer;
+};
+
+/**
+ * @brief AscendEvent to synchronize between different streams.
+ */
+class CV_EXPORTS_W AscendEvent
+{
+public:
+    CV_WRAP AscendEvent();
+
+    //! records an event
+    CV_WRAP void record(AscendStream& stream);
+
+    //! waits for an event to complete
+    CV_WRAP void waitForComplete() const;
+
+    class Impl;
+
+private:
+    Ptr<Impl> impl_;
+    AscendEvent(const Ptr<Impl>& impl);
+
+    friend class AscendEventAccessor;
+};
+
+/** @brief Bindings overload to create a Stream object from the address stored in an existing CANN
+ * Runtime API stream pointer (aclrtStream).
+ * @param AscendStreamAddress Memory address stored in a CANN Runtime API stream pointer
+ * (aclrtStream). The created Stream object does not perform any allocation or deallocation and
+ * simply wraps the existing raw CANN Runtime API stream pointer.
+ * @note Overload for generation of bindings only, not exported or intended for internal use from
+ * C++.
+ */
+CV_EXPORTS_W AscendStream wrapStream(size_t AscendStreamAddress);
+
+//! @} cann_struct
+
+//===================================================================================
+// Initialization & Info
+//===================================================================================
+
+//! @addtogroup cann_init
+//! @{
+
+//! Get an Ascend matrix object from an InputArray, uploading matrix memory if needed. (Non-Blocking call)
+AscendMat getInputMat(InputArray src, AscendStream& stream);
+
+//! Get an Ascend matrix object from an OutputArray, uploading matrix memory if needed.
+AscendMat getOutputMat(OutputArray dst, int rows, int cols, int type, AscendStream& stream);
+
+//! Sync the output matrix to an OutputArray, downloading matrix memory if needed.
+void syncOutput(const AscendMat& dst, OutputArray _dst, AscendStream& stream);
+
+/**
+ * @brief Choose the Ascend NPU device.
+ */
+CV_EXPORTS_W void setDevice(int device);
+
+/**
+ * @brief Clear all contexts created on the current Ascend device.
+ */
+CV_EXPORTS_W void resetDevice();
+
+/**
+ * @brief Get the current Ascend device.
+ */
+CV_EXPORTS_W int32_t getDevice();
+
+/**
+ * @brief init AscendCL.
+ */
+CV_EXPORTS_W void initAcl();
+
+/**
+ * @brief finalize AscendCL.
+ * @note finalizeAcl can only be called once per process. Call this function after all AscendCL
+ * operations have finished.
+ */
+CV_EXPORTS_W void finalizeAcl();
+
+//!
@} cann_init + +} // namespace cann +} // namespace cv + +#include "opencv2/cann.inl.hpp" + +#endif // OPENCV_CANNOPS_CANN_HPP diff --git a/modules/cannops/include/opencv2/cann.inl.hpp b/modules/cannops/include/opencv2/cann.inl.hpp new file mode 100644 index 00000000000..4a97466b375 --- /dev/null +++ b/modules/cannops/include/opencv2/cann.inl.hpp @@ -0,0 +1,97 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_CANNOPS_CANN_INL_HPP +#define OPENCV_CANNOPS_CANN_INL_HPP + +#include "opencv2/cann.hpp" + +namespace cv +{ +namespace cann +{ +inline AscendMat::AscendMat(AscendMat::Allocator* allocator_) + : flags(0), rows(0), cols(0), step(0), datastart(0), dataend(0), + allocator(allocator_) +{ + // Empty mat is also continuous. + flags |= Mat::CONTINUOUS_FLAG; +} + +inline AscendMat::AscendMat(int rows_, int cols_, int type_, AscendMat::Allocator* allocator_) + : flags(0), rows(0), cols(0), step(0), datastart(0), dataend(0), + allocator(allocator_) +{ + if (rows_ > 0 && cols_ > 0) + create(rows_, cols_, type_); +} + +inline AscendMat::AscendMat(Size size_, int type_, AscendMat::Allocator* allocator_) + : flags(0), rows(0), cols(0), step(0), datastart(0), dataend(0), + allocator(allocator_) +{ + if (size_.height > 0 && size_.width > 0) + create(size_.height, size_.width, type_); +} + +inline AscendMat::AscendMat(InputArray arr, AscendStream& stream, AscendMat::Allocator* allocator_) + : flags(0), rows(0), cols(0), step(0), datastart(0), dataend(0), + allocator(allocator_) +{ + upload(arr, stream); +} + +inline AscendMat::AscendMat(const AscendMat& m) + : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), + datastart(m.datastart), dataend(m.dataend), allocator(m.allocator) +{} + +inline AscendMat& AscendMat::operator=(const AscendMat& m) +{ + if (this != &m) + { + AscendMat temp(m); + swap(temp); + } + + return *this; +} + +inline void AscendMat::swap(AscendMat& b) +{ + std::swap(flags, b.flags); + std::swap(rows, b.rows); + std::swap(cols, b.cols); + std::swap(step, b.step); + std::swap(data, b.data); + std::swap(datastart, b.datastart); + std::swap(dataend, b.dataend); + std::swap(allocator, b.allocator); +} + +inline bool AscendMat::isContinuous() const { return (flags & Mat::CONTINUOUS_FLAG) != 0; } + +inline size_t AscendMat::elemSize() const { return CV_ELEM_SIZE(flags); } + +inline size_t AscendMat::elemSize1() const { return CV_ELEM_SIZE1(flags); } + +inline int AscendMat::type() const { return CV_MAT_TYPE(flags); } + +inline int AscendMat::depth() const { return CV_MAT_DEPTH(flags); } + +inline int AscendMat::channels() const { return CV_MAT_CN(flags); } + +inline size_t AscendMat::step1() const { return step / elemSize1(); } + +inline Size AscendMat::size() const { return Size(cols, rows); } + +inline bool AscendMat::empty() const { return data == 0; } + +inline AscendStream::AscendStream(const Ptr& impl) : impl_(impl) {} + +inline AscendEvent::AscendEvent(const Ptr& impl) : impl_(impl) {} +} // namespace cann +} // namespace cv + +#endif // OPENCV_CANNOPS_CANN_INL_HPP diff --git a/modules/cannops/include/opencv2/cann_call.hpp b/modules/cannops/include/opencv2/cann_call.hpp new file mode 100644 index 00000000000..651bff8bba0 --- /dev/null +++ b/modules/cannops/include/opencv2/cann_call.hpp @@ -0,0 +1,157 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CANNOPS_CANN_CALL_HPP
+#define OPENCV_CANNOPS_CANN_CALL_HPP
+
+#include <vector>
+#include <set>
+#include <string>
+#include <acl/acl.h>
+#include "opencv2/cann.hpp"
+
+class aclopAttr;
+
+namespace cv
+{
+namespace cann
+{
+// Wrappers for functions in CANN; callers should not call CANN's API directly, but should use the
+// functions provided in cann_call.
+void aclrtMallocWarpper(void** data, size_t size);
+void aclrtFreeWarpper(void* data);
+void aclrtMemcpyWarpper(std::shared_ptr<uchar>& dst, size_t offset, const void* src, size_t size,
+                        AscendStream& stream);
+void aclrtMemcpyWarpper(void* dst, const std::shared_ptr<uchar>& src, size_t offset, size_t size,
+                        AscendStream& stream);
+void aclrtMemcpyWarpper(std::shared_ptr<uchar>& dst, size_t dstOffset,
+                        const std::shared_ptr<uchar>& src, size_t srcOffset, size_t size,
+                        AscendStream& stream);
+void aclrtMemcpy2dWarpper(std::shared_ptr<uchar>& dst, size_t offset, size_t dpitch,
+                          const void* src, size_t spitch, size_t width, size_t length,
+                          AscendStream& stream);
+void aclrtMemcpy2dWarpper(void* dst, size_t dpitch, const std::shared_ptr<uchar>& src,
+                          size_t offset, size_t spitch, size_t width, size_t length,
+                          AscendStream& stream);
+void aclrtMemsetWarpper(std::shared_ptr<uchar>& ptr, int32_t value, size_t count,
+                        AscendStream& stream);
+//! Type mapping between opencv and cann.
+aclDataType getACLType(int opencvdepth);
+//! Malloc and upload raw data to devices.
+std::shared_ptr<uchar> mallocAndUpload(const void* data, size_t size, AscendStream& stream,
+                                       AscendMat::Allocator* allocator);
+/**
+ * @brief Wrapper of CANN streams.
+ */
+class AscendStream::Impl
+{
+public:
+    aclrtStream stream;
+    bool ownStream;
+    /**
+     * @brief Ascend and CANN use streams to implement asynchronous calls, which means that when a
+     * function returns, the operator may not have finished, or even started. If the caller frees
+     * any tensor that participates in this operation, invalid memory may be accessed.
+     * All tensors should be added to the holder; the holder is cleaned by the waitForCompletion
+     * function, or when the stream is destructed.
+     */
+    std::set<std::shared_ptr<uchar>> tensorHolders;
+    Impl();
+    explicit Impl(aclrtStream stream);
+    void AddTensorHolder(const std::shared_ptr<uchar>& tensorData);
+};
+
+/**
+ * @brief Wrapper of CANN events.
+ */
+class AscendEvent::Impl
+{
+public:
+    aclrtEvent event;
+    bool ownEvent;
+
+    Impl();
+    explicit Impl(aclrtEvent event);
+    ~Impl();
+};
+
+/**
+ * @brief Parameter type for cann_call interfaces.
+ */
+struct AscendTensor
+{
+    const char* name;
+    std::shared_ptr<uchar> data;
+    size_t dataSize;
+    std::vector<int64_t> dims;
+    aclDataType dtype;
+    aclFormat format;
+    AscendTensor(){};
+    AscendTensor(std::shared_ptr<uchar> _data, size_t _dataSize, int64_t* _dims, size_t _dimSize,
+                 aclDataType _dtype, const char* _name = "", aclFormat _format = ACL_FORMAT_ND);
+    AscendTensor(std::shared_ptr<uchar> _data, size_t _dataSize, std::vector<int64_t>& _dims,
+                 aclDataType _dtype, const char* _name = "", aclFormat _format = ACL_FORMAT_ND)
+        : name(_name), data(_data), dataSize(_dataSize), dims(_dims), dtype(_dtype),
+          format(_format){};
+    AscendTensor(const AscendMat& ascendMat, const char* _name = "",
+                 aclFormat format = ACL_FORMAT_ND);
+};
+
+/**
+ * @brief Interface to call operators in the CANN package.
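+ *
+ * A minimal usage sketch (illustrative only; "Add" is assumed here to be the name of a CANN
+ * element-wise operator, and src1, src2, dst are AscendMat objects of matching size and type):
+ * @code
+ *     OperatorRunner runner;
+ *     runner.setOp("Add")
+ *           .addInput(src1)
+ *           .addInput(src2)
+ *           .addOutput(dst)
+ *           .run(AscendStream::Null());  // submit the operator on the default stream
+ * @endcode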
+ */
+class OperatorRunner
+{
+private:
+    std::vector<aclDataBuffer*> inputBuffers_;
+    std::vector<aclDataBuffer*> outputBuffers_;
+    std::vector<aclTensorDesc*> inputDesc_;
+    std::vector<aclTensorDesc*> outputDesc_;
+    aclopAttr* opAttr_;
+    bool opAttrInit;
+    std::string op;
+
+    std::set<std::shared_ptr<uchar>> holder;
+
+    OperatorRunner& addInput(AscendTensor& mat);
+    OperatorRunner& addOutput(AscendTensor& mat);
+
+public:
+    OperatorRunner() : opAttrInit(false) {}
+    virtual ~OperatorRunner() { reset(); }
+    OperatorRunner& setOp(const char* op);
+    OperatorRunner& addInput(const AscendMat& mat);
+    OperatorRunner& addOutput(AscendMat& mat);
+    OperatorRunner& addAttr(float value, const char* name);
+    OperatorRunner& addAttr(const char* value, const char* name);
+    OperatorRunner& addAttr(int value, const char* name);
+    OperatorRunner& addAttr(bool value, const char* name);
+    OperatorRunner& addAttr(const int64_t* value, int size, const char* name);
+    OperatorRunner& addInput(const AscendMat& mat, const char* name);
+    OperatorRunner& addInput(const Scalar& sc, int type, const char* name);
+
+    template <typename T>
+    OperatorRunner& addInput(const T* value, int64_t* dims, size_t dimSize, aclDataType type,
+                             const char* name)
+    {
+        int64_t size = dims[0];
+        for (size_t i = 1; i < dimSize; i++)
+            size *= dims[i];
+
+        size_t dataSize = size * sizeof(T);
+        std::shared_ptr<uchar> ptr =
+            mallocAndUpload(value, dataSize, AscendStream::Null(), AscendMat::defaultAllocator());
+
+        AscendTensor tensor(ptr, dataSize, dims, dimSize, type, name);
+        return addInput(tensor);
+    }
+    OperatorRunner& addOutput(AscendMat& mat, const char* name);
+    OperatorRunner& reset();
+    OperatorRunner& run(AscendStream& stream);
+};
+
+} // namespace cann
+} // namespace cv
+
+#endif // OPENCV_CANNOPS_CANN_CALL_HPP
diff --git a/modules/cannops/include/opencv2/cann_interface.hpp b/modules/cannops/include/opencv2/cann_interface.hpp
new file mode 100644
index 00000000000..6667eb58519
--- /dev/null
+++ b/modules/cannops/include/opencv2/cann_interface.hpp
@@ -0,0 +1,516 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CANNOPS_CANN_INTERFACE_HPP
+#define OPENCV_CANNOPS_CANN_INTERFACE_HPP
+
+#include "opencv2/cann.hpp"
+
+namespace cv
+{
+namespace cann
+{
+
+/**
+  @addtogroup cann
+  @{
+    @defgroup cannops Operations for Ascend Backend.
+    @{
+      @defgroup cannops_elem Per-element Operations
+      @defgroup cannops_core Core Operations on Matrices
+      @defgroup cannimgproc Image Processing
+    @}
+  @}
+ */
+
+//! @addtogroup cannops_elem
+//! @{
+
+/** @brief Computes a matrix-matrix or matrix-scalar sum.
+ * @param src1 First source matrix or scalar.
+ * @param src2 Second source matrix or scalar. Matrix should have the same size and type as src1 .
+ * @param dst Destination matrix that has the same size and number of channels as the input
+ * array(s). The depth is defined by dtype or src1 depth.
+ * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
+ * destination array to be changed. The mask can be used only with single channel images.
+ * @param dtype Optional depth of the output array.
+ * @param stream AscendStream for the asynchronous version.
+ * @sa cv::add cuda::add
+ */
+CV_EXPORTS_W void add(const InputArray src1, const InputArray src2, OutputArray dst,
+                      const InputArray mask = noArray(), int dtype = -1,
+                      AscendStream& stream = AscendStream::Null());
+// This code should not be compiled nor analyzed by doxygen.
This interface only for python binding +// code generation. add(InputArray, InputArray ...) can accept Scalar as its parametr.(Scalar -> Mat +// -> InputArray) +#ifdef NEVER_DEFINED +CV_EXPORTS_W void add(const InputArray src1, const Scalar& src2, OutputArray dst, + const InputArray mask = noArray(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); +CV_EXPORTS_W void add(const Scalar& src1, const InputArray src2, OutputArray dst, + const InputArray mask = noArray(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); +#endif +// More overload functions. In order to decouple from the main opencv repository and simplify +// user calling methods, besides the traditional Input/OutputArray parameters, some +// overloaded functions for the AcendMat parameter is also provided. +/** @overload */ +CV_EXPORTS_W void add(const AscendMat& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void add(const AscendMat& src1, const Scalar& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void add(const Scalar& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); + +/** @brief Computes a matrix-matrix or matrix-scalar difference. + * @param src1 First source matrix or scalar. + * @param src2 Second source matrix or scalar. Matrix should have the same size and type as src1 . + * @param dst Destination matrix that has the same size and number of channels as the input + * array(s). The depth is defined by dtype or src1 depth. + * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the + * destination array to be changed. The mask can be used only with single channel images. + * @param dtype Optional depth of the output array. + * @param stream AscendStream for the asynchronous version. + * @sa cv::subtract cuda::subtract + */ +CV_EXPORTS_W void subtract(const InputArray src1, const InputArray src2, OutputArray dst, + const InputArray mask = noArray(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); +#ifdef NEVER_DEFINED +CV_EXPORTS_W void subtract(const InputArray src1, const Scalar& src2, OutputArray dst, + const InputArray mask = noArray(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); +CV_EXPORTS_W void subtract(const Scalar& src1, const InputArray src2, OutputArray dst, + const InputArray mask = noArray(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); +#endif +/** @overload */ +CV_EXPORTS_W void subtract(const AscendMat& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void subtract(const AscendMat& src1, const Scalar& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void subtract(const Scalar& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); + +/** @brief Computes a matrix-matrix or matrix-scalar per-element product. + * @param src1 First source matrix or scalar. 
+ * @param src2 Second source matrix or scalar. Matrix should have the same size and type as src1 . + * @param dst Destination matrix that has the same size and number of channels as the input + * array(s). The depth is defined by dtype or src1 depth. + * @param scale Optional scale factor. + * @param dtype Optional depth of the output array. + * @param stream AscendStream for the asynchronous version. + * @sa cv::multiply cuda::multiply + */ +CV_EXPORTS_W void multiply(const InputArray src1, const InputArray src2, OutputArray dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +#ifdef NEVER_DEFINED +CV_EXPORTS_W void multiply(const InputArray src1, const Scalar& src2, OutputArray dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +CV_EXPORTS_W void multiply(const Scalar& src1, const InputArray src2, OutputArray dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +#endif +/** @overload */ +CV_EXPORTS_W void multiply(const AscendMat& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void multiply(const AscendMat& src1, const Scalar& src2, CV_OUT AscendMat& dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void multiply(const Scalar& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); + +/** @brief Computes a matrix-matrix or matrix-scalar division. + * @param src1 First source matrix or scalar. + * @param src2 Second source matrix or scalar. Matrix should have the same size and type as src1 . + * @param dst Destination matrix that has the same size and number of channels as the input + * array(s). The depth is defined by dtype or src1 depth. + * @param scale Optional scale factor. + * @param dtype Optional depth of the output array. + * @param stream AscendStream for the asynchronous version. + * @sa cv::divide cuda::divide + */ +CV_EXPORTS_W void divide(const InputArray src1, const InputArray src2, OutputArray dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +#ifdef NEVER_DEFINED +CV_EXPORTS_W void divide(const InputArray src1, const Scalar& src2, OutputArray dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +CV_EXPORTS_W void divide(const Scalar& src1, const InputArray src2, OutputArray dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +#endif +CV_EXPORTS_W void divide(const AscendMat& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void divide(const AscendMat& src1, const Scalar& src2, CV_OUT AscendMat& dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void divide(const Scalar& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); + +/** @brief Performs a per-element bitwise conjunction of two matrices (or of matrix and scalar). + * @param src1 First source matrix or scalar. + * @param src2 Second source matrix or scalar. + * @param dst Destination matrix that has the same size and number of channels as the input + * array(s). 
The depth is defined by dtype or src1 depth. + * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the + * destination array to be changed. The mask can be used only with single channel images. + * @param stream AscendStream for the asynchronous version. + * @sa cv::bitwise_and cuda::bitwise_and + */ +CV_EXPORTS_W void bitwise_and(const InputArray src1, const InputArray src2, OutputArray dst, + const InputArray mask = noArray(), + AscendStream& stream = AscendStream::Null()); +#ifdef NEVER_DEFINED +CV_EXPORTS_W void bitwise_and(const InputArray src1, const Scalar& src2, OutputArray dst, + const InputArray mask = noArray(), + AscendStream& stream = AscendStream::Null()); +CV_EXPORTS_W void bitwise_and(const Scalar& src1, const InputArray src2, OutputArray dst, + const InputArray mask = noArray(), + AscendStream& stream = AscendStream::Null()); +#endif +CV_EXPORTS_W void bitwise_and(const AscendMat& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void bitwise_and(const AscendMat& src1, const Scalar& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void bitwise_and(const Scalar& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), + AscendStream& stream = AscendStream::Null()); + +/** @brief Performs a per-element bitwise disjunction of two matrices (or of matrix and scalar). + * @param src1 First source matrix or scalar. + * @param src2 Second source matrix or scalar. + * @param dst Destination matrix that has the same size and number of channels as the input + * array(s). The depth is defined by dtype or src1 depth. + * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the + * destination array to be changed. The mask can be used only with single channel images. + * @param stream AscendStream for the asynchronous version. + * @sa cv::bitwise_or cuda::bitwise_or + */ +CV_EXPORTS_W void bitwise_or(const InputArray src1, const InputArray src2, OutputArray dst, + const InputArray mask = noArray(), + AscendStream& stream = AscendStream::Null()); +#ifdef NEVER_DEFINED +CV_EXPORTS_W void bitwise_or(const InputArray src1, const Scalar& src2, OutputArray dst, + const InputArray mask = noArray(), + AscendStream& stream = AscendStream::Null()); +CV_EXPORTS_W void bitwise_or(const Scalar& src1, const InputArray src2, OutputArray dst, + const InputArray mask = noArray(), + AscendStream& stream = AscendStream::Null()); +#endif +CV_EXPORTS_W void bitwise_or(const AscendMat& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void bitwise_or(const AscendMat& src1, const Scalar& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void bitwise_or(const Scalar& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), + AscendStream& stream = AscendStream::Null()); + +/** @brief Performs a per-element bitwise exclusive or operation of two matrices (or of matrix and + * scalar). + * @param src1 First source matrix or scalar. + * @param src2 Second source matrix or scalar. 
+ * @param dst Destination matrix that has the same size and number of channels as the input + * array(s). The depth is defined by dtype or src1 depth. + * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the + * destination array to be changed. The mask can be used only with single channel images. + * @param stream AscendStream for the asynchronous version. + * @sa cv::bitwise_xor cuda::bitwise_xor + */ +CV_EXPORTS_W void bitwise_xor(const InputArray src1, const InputArray src2, OutputArray dst, + const InputArray mask = noArray(), + AscendStream& stream = AscendStream::Null()); +#ifdef NEVER_DEFINED +CV_EXPORTS_W void bitwise_xor(const InputArray src1, const Scalar& src2, OutputArray dst, + const InputArray mask = noArray(), + AscendStream& stream = AscendStream::Null()); +CV_EXPORTS_W void bitwise_xor(const Scalar& src1, const InputArray src2, OutputArray dst, + const InputArray mask = noArray(), + AscendStream& stream = AscendStream::Null()); +#endif +CV_EXPORTS_W void bitwise_xor(const AscendMat& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void bitwise_xor(const AscendMat& src1, const Scalar& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void bitwise_xor(const Scalar& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), + AscendStream& stream = AscendStream::Null()); + +/** @brief Performs a per-element bitwise inversion. + * @param src First source matrix. + * @param dst Destination matrix that has the same size and number of channels as the input + * array(s). The depth is defined by dtype or src1 depth. + * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the + * destination array to be changed. The mask can be used only with single channel images. + * @param stream AscendStream for the asynchronous version. + * @sa cv::bitwise_not cuda::bitwise_not + */ +CV_EXPORTS_W void bitwise_not(const InputArray src, OutputArray dst, + const InputArray mask = noArray(), + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void bitwise_not(const AscendMat& src, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), + AscendStream& stream = AscendStream::Null()); + +/** @brief Computes the weighted sum of two arrays. + +@param src1 First source array. +@param alpha Weight for the first array elements. +@param src2 Second source array of the same size and channel number as src1 . +@param beta Weight for the second array elements. +@param dst Destination array that has the same size and number of channels as the input arrays. +@param gamma Scalar added to each sum. +@param dtype Optional depth of the destination array. When both input arrays have the same depth, +dtype can be set to -1, which will be equivalent to src1.depth(). +@param stream Stream for the asynchronous version. + +The function addWeighted calculates the weighted sum of two arrays as follows: + +\f[\texttt{dst} (I)= \texttt{saturate} ( \texttt{src1} (I)* \texttt{alpha} + \texttt{src2} (I)* +\texttt{beta} + \texttt{gamma} )\f] + +where I is a multi-dimensional index of array elements. In case of multi-channel arrays, each +channel is processed independently. 
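+
+A minimal usage sketch (illustrative only; assumes the device has already been set up via
+initAcl()/setDevice() and that the two inputs are 8-bit, three-channel and of equal size):
+@code
+    cv::Mat a(480, 640, CV_8UC3, cv::Scalar::all(10));
+    cv::Mat b(480, 640, CV_8UC3, cv::Scalar::all(20));
+    cv::Mat out;
+    cv::cann::addWeighted(a, 0.5, b, 0.5, 3.0, out); // out(I) = 0.5*a(I) + 0.5*b(I) + 3
+@endcode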
+ +@sa cv::addWeighted cv::cuda::addWeighted + */ +CV_EXPORTS_W void addWeighted(const InputArray src1, double alpha, const InputArray src2, + double beta, double gamma, OutputArray dst, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void addWeighted(const AscendMat& src1, double alpha, const AscendMat& src2, + double beta, double gamma, CV_OUT AscendMat& dst, int dtype = -1, + AscendStream& stream = AscendStream::Null()); + +/** @brief Applies a fixed-level threshold to each array element. + +@param src Source array (single-channel). +@param dst Destination array with the same size and type as src . +@param thresh Threshold value. +@param maxval Maximum value to use with THRESH_BINARY and THRESH_BINARY_INV threshold types. +@param type Threshold type. For details, see threshold . The THRESH_MASK, THRESH_OTSU and +THRESH_TRIANGLE threshold types are not supported. +@param stream AscendStream for the asynchronous version. + +@sa cv::threshold cv::cuda::threshold +*/ +CV_EXPORTS_W double threshold(const InputArray src, OutputArray dst, double thresh, double maxval, + int type, AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W double threshold(const AscendMat& src, CV_OUT AscendMat& dst, double thresh, + double maxval, int type, AscendStream& stream = AscendStream::Null()); + +//! @} cannops_elem + +//! @addtogroup cannops_core +//! @{ + +/** @brief Makes a multi-channel matrix out of several single-channel matrices. + +@param src Array/vector of source matrices. +@param n Number of source matrices. +@param dst Destination matrix. +@param stream AscendStream for the asynchronous version. + +@sa cv::merge cv::cuda::merge + */ +CV_EXPORTS_W void merge(const AscendMat* src, size_t n, CV_OUT AscendMat& dst, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void merge(const std::vector& src, CV_OUT AscendMat& dst, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void merge(const AscendMat* src, size_t n, OutputArray& dst, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void merge(const std::vector& src, OutputArray& dst, + AscendStream& stream = AscendStream::Null()); + +/** @brief Copies each plane of a multi-channel matrix into an array. + +@param src Source matrix. +@param dst Destination array/vector of single-channel matrices. +@param stream AscendStream for the asynchronous version. + +@sa cv::split cv::cuda::split + */ +CV_EXPORTS_W void split(const AscendMat& src, AscendMat* dst, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void split(const AscendMat& src, CV_OUT std::vector& dst, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void split(const InputArray src, AscendMat* dst, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void split(const InputArray src, CV_OUT std::vector& dst, + AscendStream& stream = AscendStream::Null()); + +/** @brief Transposes a matrix. + +@param src Source matrix. +@param dst Destination matrix. +@param stream AscendStream for the asynchronous version. 
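+
+A minimal usage sketch (illustrative only; device setup via initAcl()/setDevice() is assumed to
+have been done beforehand):
+@code
+    cv::Mat src(4, 6, CV_32FC1, cv::Scalar::all(1.0f));
+    cv::Mat dst;
+    cv::cann::transpose(src, dst); // dst is 6x4, same type as src
+@endcode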
+ +@sa cv::transpose cv::cuda::transpose + */ +CV_EXPORTS_W void transpose(InputArray src, OutputArray dst, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void transpose(const AscendMat& src, CV_OUT AscendMat& dst, + AscendStream& stream = AscendStream::Null()); +/** @brief Flips a 2D matrix around vertical, horizontal, or both axes. + +@param src Source matrix. +@param dst Destination matrix. +@param flipCode Flip mode for the source: +- 0 Flips around x-axis. +- \> 0 Flips around y-axis. +- \< 0 Flips around both axes. +@param stream AscendStream for the asynchronous version. + +@sa cv::flip cv::cuda::flip + */ +CV_EXPORTS_W void flip(InputArray src, OutputArray dst, int flipCode, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void flip(const AscendMat& src, CV_OUT AscendMat& dst, int flipCode, + AscendStream& stream = AscendStream::Null()); +/** @brief Rotates a 2D array in multiples of 90 degrees. +The function cv::rotate rotates the array in one of three different ways: +* Rotate by 90 degrees clockwise (rotateCode = ROTATE_90_CLOCKWISE). +* Rotate by 180 degrees clockwise (rotateCode = ROTATE_180). +* Rotate by 270 degrees clockwise (rotateCode = ROTATE_90_COUNTERCLOCKWISE). +@param src input array. +@param dst output array of the same type as src. The size is the same with ROTATE_180, +and the rows and cols are switched for ROTATE_90_CLOCKWISE and ROTATE_90_COUNTERCLOCKWISE. +@param rotateCode an enum to specify how to rotate the array; see the enum #RotateFlags +@param stream AscendStream for the asynchronous version. + +@sa cv::rotate +*/ +CV_EXPORTS_W void rotate(InputArray src, OutputArray dst, int rotateCode, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void rotate(const AscendMat& src, CV_OUT AscendMat& dst, int rotateMode, + AscendStream& stream = AscendStream::Null()); + +/** @brief crop a 2D array. +The function crops the matrix by given cv::Rect. +Output matrix must be of the same depth as input one, size is specified by given rect size. + +@param src input array. +@param rect a rect to crop a array to +@param stream AscendStream for the asynchronous version. + +@sa cv::gapi::crop +*/ +CV_EXPORTS_W AscendMat crop(InputArray src, const Rect& rect, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W AscendMat crop(const AscendMat& src, const Rect& rect, + AscendStream& stream = AscendStream::Null()); +/** @brief Resizes an image src down to or up to the specified size. +@param src input image +@param dst output image; it has the size dsize (when it is non-zero) or the size computed from +src.size(), fx, and fy; the type of dst is the same as of src. +@param dsize output image size; if it equals zero, it is computed as: + \f[𝚍𝚜𝚒𝚣𝚎 = 𝚂𝚒𝚣𝚎(𝚛𝚘𝚞𝚗𝚍(𝚏𝚡*𝚜𝚛𝚌.𝚌𝚘𝚕𝚜), 𝚛𝚘𝚞𝚗𝚍(𝚏𝚢*𝚜𝚛𝚌.𝚛𝚘𝚠𝚜))\f] + Either dsize or both fx and fy must be non-zero. +@param fx scale factor along the horizontal axis; when it equals 0, it is computed as +\f[(𝚍𝚘𝚞𝚋𝚕𝚎)𝚍𝚜𝚒𝚣𝚎.𝚠𝚒𝚍𝚝𝚑/𝚜𝚛𝚌.𝚌𝚘𝚕𝚜\f] + +@param fy scale factor along the vertical axis; when it equals 0, it is computed as +\f[(𝚍𝚘𝚞𝚋𝚕𝚎)𝚍𝚜𝚒𝚣𝚎.𝚑𝚎𝚒𝚐𝚑𝚝/𝚜𝚛𝚌.𝚛𝚘𝚠𝚜\f] +@param interpolation interpolation method(see **cv.cann.InterpolationFlags**) +@sa cv::resize +*/ + +//! interpolation algorithm +enum InterpolationFlags +{ + /** nearest neighbor interpolation */ + INTER_NEAREST = 0, + /** bilinear interpolation */ + INTER_LINEAR = 1, + /** bicubic interpolation */ + INTER_CUBIC = 2, + /** resampling using pixel area relation. 
It may be a preferred method for image decimation, as + it gives moire'-free results. But when the image is zoomed, it is similar to the INTER_NEAREST + method. */ + INTER_AREA = 3, + /** mask for interpolation codes */ + INTER_MAX = 7, +}; + +CV_EXPORTS_W void resize(InputArray _src, OutputArray _dst, Size dsize, double inv_scale_x, + double inv_scale_y, int interpolation, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void resize(const AscendMat& src, CV_OUT AscendMat& dst, Size dsize, double inv_scale_x, + double inv_scale_y, int interpolation, + AscendStream& stream = AscendStream::Null()); + +//! @} cannops_core + +//! @addtogroup cannimgproc +//! @{ + +/** @brief Converts an image from one color space to another. + +@param src Source image with CV_8U , CV_16U , or CV_32F depth and 1, 3, or 4 channels. +@param dst Destination image. +@param code Color space conversion code. For details, see cvtColor . +@param dstCn Number of channels in the destination image. If the parameter is 0, the number of the +channels is derived automatically from src and the code . +@param stream AscendStream for the asynchronous version. + +@sa cv::cvtColor cv::cuda::cvtColor + */ +CV_EXPORTS_W void cvtColor(const InputArray src, OutputArray dst, int code, int dstCn = 0, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void cvtColor(const AscendMat& src, CV_OUT AscendMat& dst, int code, int dstCn = 0, + AscendStream& stream = AscendStream::Null()); + +//! @} cannimgproc + +} // namespace cann +} // namespace cv + +#endif // OPENCV_CANNOPS_CANN_INTERFACE_HPP diff --git a/modules/cannops/include/opencv2/cann_private.hpp b/modules/cannops/include/opencv2/cann_private.hpp new file mode 100644 index 00000000000..bcbe33feb19 --- /dev/null +++ b/modules/cannops/include/opencv2/cann_private.hpp @@ -0,0 +1,33 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#ifndef OPENCV_CANNOPS_CANN_PRIVATE_HPP +#define OPENCV_CANNOPS_CANN_PRIVATE_HPP +#include "opencv2/cann.hpp" + +namespace cv +{ +namespace cann +{ +void arithm_op(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const char* op, + AscendStream& stream); +void arithm_op(const AscendMat& src, const Scalar& sc, AscendMat& dst, const char* op, + AscendStream& stream); +void arithm_op(const Scalar& sc, const AscendMat& src, AscendMat& dst, const char* op, + AscendStream& stream); +void arithm_op(const AscendMat& src, AscendMat& dst, const char* op, AscendStream& stream); +void arithm_op(const AscendMat& src, float scalar, AscendMat& dst, const char* op, + AscendStream& stream); +void transpose(const AscendMat& src, int64_t* perm, AscendMat& dst, AscendStream& stream); +void flip(const AscendMat& src, std::vector& asixs, AscendMat& dst, AscendStream& stream); +void crop(const AscendMat& src, AscendMat& dst, const AscendMat& sizeSrcNpu, int64_t* offset, + AscendStream& stream); +void transData(const AscendMat& src, AscendMat& dst, const char* from, const char* to, + AscendStream& stream); +void resize(const AscendMat& src, AscendMat& dst, int32_t* dstSize, int interpolation, + AscendStream& stream); +} // namespace cann +} // namespace cv + +#endif // OPENCV_CANNOPS_CANN_PRIVATE_HPP diff --git a/modules/cannops/include/opencv2/stream_accessor.hpp b/modules/cannops/include/opencv2/stream_accessor.hpp new file mode 100644 index 00000000000..ff64d7dcbc0 --- /dev/null +++ b/modules/cannops/include/opencv2/stream_accessor.hpp @@ -0,0 +1,39 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_CANNOPS_STREAM_ACCESSOR_HPP +#define OPENCV_CANNOPS_STREAM_ACCESSOR_HPP + +#include +#include "opencv2/cann.hpp" + +namespace cv +{ +namespace cann +{ +//! @addtogroup cann_struct +//! @{ + +/** @brief Class that enables getting aclrtAscendStream from cann::AscendStream + */ +struct AscendStreamAccessor +{ + CV_EXPORTS static aclrtStream getStream(const AscendStream& stream); + CV_EXPORTS static AscendStream wrapStream(aclrtStream stream); +}; + +/** @brief Class that enables getting aclrtAscendEvent from cann::AscendEvent + */ +struct AscendEventAccessor +{ + CV_EXPORTS static aclrtEvent getEvent(const AscendEvent& event); + CV_EXPORTS static AscendEvent wrapEvent(aclrtEvent event); +}; + +//! @} cann_struct + +} // namespace cann +} // namespace cv + +#endif // OPENCV_CANNOPS_STREAM_ACCESSOR_HPP diff --git a/modules/cannops/misc/python/pyopencv_cann.hpp b/modules/cannops/misc/python/pyopencv_cann.hpp new file mode 100644 index 00000000000..02d62487c6a --- /dev/null +++ b/modules/cannops/misc/python/pyopencv_cann.hpp @@ -0,0 +1,28 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#ifndef OPENCV_CANNOPS_PYOPENCV_CANN_HPP +#define OPENCV_CANNOPS_PYOPENCV_CANN_HPP + +#ifdef HAVE_OPENCV_CORE + +#include "opencv2/cann.hpp" + +typedef std::vector vector_AscendMat; +typedef cann::AscendMat::Allocator AscendMat_Allocator; + +CV_PY_TO_CLASS(cann::AscendMat); +CV_PY_TO_CLASS(cann::AscendStream); + +CV_PY_TO_CLASS_PTR(cann::AscendMat); +CV_PY_TO_CLASS_PTR(cann::AscendMat::Allocator); + +CV_PY_FROM_CLASS(cann::AscendMat); +CV_PY_FROM_CLASS(cann::AscendStream); + +CV_PY_FROM_CLASS_PTR(cann::AscendMat::Allocator); + +#endif // HAVE_OPENCV_CORE + +#endif // OPENCV_CANNOPS_PYOPENCV_CANN_HPP diff --git a/modules/cannops/misc/python/test/test_cannops.py b/modules/cannops/misc/python/test/test_cannops.py new file mode 100644 index 00000000000..f1b53bc192c --- /dev/null +++ b/modules/cannops/misc/python/test/test_cannops.py @@ -0,0 +1,281 @@ +# This file is part of OpenCV project. +# It is subject to the license terms in the LICENSE file found in the top-level directory +# of this distribution and at http://opencv.org/license.html. + +import cv2 as cv +from tests_common import NewOpenCVTests +import numpy as np + +def genMask(mask, listx, listy): + for row in range(mask.shape[0]): + for col in range(mask.shape[1]): + if (row in listx and col in listx) or (row in listy and col in listy): + mask[row][col] = 1 + mask = mask.astype(np.uint8) + return mask + + +mask = np.zeros((5, 5)) +listx = [0, 1] +listy = [1, 2] +mask = genMask(mask, listx, listy) + + +class cannop_test(NewOpenCVTests): + def test_ascend(self): + cv.cann.initAcl() + cv.cann.getDevice() + cv.cann.setDevice(0) + stream = cv.cann.AscendStream_Null() + cv.cann.wrapStream(id(stream)) + cv.cann.resetDevice() + + def test_arithmetic(self): + # input data + npMat1 = np.random.random((5, 5, 3)).astype(int) + npMat2 = np.random.random((5, 5, 3)).astype(int) + cv.cann.setDevice(0) + + # ACLMat input data + aclMat1 = cv.cann.AscendMat() + aclMat1.upload(npMat1) + aclMat2 = cv.cann.AscendMat() + aclMat2.upload(npMat2) + aclMask = cv.cann.AscendMat() + aclMask.upload(mask) + aclMatDst = cv.cann.AscendMat(aclMat1.size(), aclMat1.type()) + + # InputArray interface test + self.assertTrue(np.allclose(cv.cann.add( + npMat1, npMat2), cv.add(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.subtract( + npMat1, npMat2), cv.subtract(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.multiply( + npMat1, npMat2, scale=2), cv.multiply(npMat1, npMat2, scale=2))) + self.assertTrue(np.allclose(cv.cann.divide( + npMat1, npMat2, scale=2), cv.divide(npMat1, npMat2, scale=2))) + + # AscendMat interface test + self.assertTrue(np.allclose(cv.cann.add(aclMat1, aclMat2).download(), + cv.add(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.subtract(aclMat1, aclMat2).download(), + cv.subtract(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.multiply(aclMat1, aclMat2, scale=2).download(), + cv.multiply(npMat1, npMat2, scale=2))) + self.assertTrue(np.allclose(cv.cann.divide(aclMat1, aclMat2, scale=2).download(), + cv.divide(npMat1, npMat2, scale=2))) + + # mask + self.assertTrue(np.allclose(cv.cann.add( + npMat1, npMat2, mask=mask), cv.add(npMat1, npMat2, mask=mask))) + self.assertTrue(np.allclose(cv.cann.subtract( + npMat1, npMat2, mask=mask), cv.subtract(npMat1, npMat2, mask=mask))) + self.assertTrue(np.allclose(cv.cann.multiply(npMat1, npMat2, scale=2), + cv.multiply(npMat1, npMat2, scale=2))) + self.assertTrue(np.allclose(cv.cann.divide(npMat1, npMat2, scale=2), + cv.divide(npMat1, npMat2, scale=2))) + 
self.assertTrue(np.allclose(cv.cann.addWeighted(npMat1, 2, npMat2, 4, 3), + cv.addWeighted(npMat1, 2, npMat2, 4, 3))) + + self.assertTrue(np.allclose(cv.cann.add(aclMat1, aclMat2, mask=aclMask).download(), + cv.add(npMat1, npMat2, mask=mask))) + self.assertTrue(np.allclose(cv.cann.subtract(aclMat1, aclMat2, mask=aclMask).download(), + cv.subtract(npMat1, npMat2, mask=mask))) + self.assertTrue(np.allclose(cv.cann.multiply(aclMat1, aclMat2, scale=2).download(), + cv.multiply(npMat1, npMat2, scale=2))) + self.assertTrue(np.allclose(cv.cann.divide(aclMat1, aclMat2, scale=2).download(), + cv.divide(npMat1, npMat2, scale=2))) + self.assertTrue(np.allclose(cv.cann.addWeighted(aclMat1, 2, aclMat2, 4, 3).download(), + cv.addWeighted(npMat1, 2, npMat2, 4, 3))) + + # stream + stream = cv.cann.AscendStream() + matDst = cv.cann.add(npMat1, npMat2, stream=stream) + stream.waitForCompletion() + self.assertTrue(np.allclose(matDst, cv.add(npMat1, npMat2))) + matDst = cv.cann.add(npMat1, npMat2, mask=mask, stream=stream) + stream.waitForCompletion() + self.assertTrue(np.allclose(matDst, cv.add(npMat1, npMat2, mask=mask))) + matDst = cv.cann.subtract(npMat1, npMat2, mask=mask, stream=stream) + stream.waitForCompletion() + self.assertTrue(np.allclose( + matDst, cv.subtract(npMat1, npMat2, mask=mask))) + + # stream AsceendMat + aclMatDst = cv.cann.add(aclMat1, aclMat2, stream=stream) + stream.waitForCompletion() + self.assertTrue(np.allclose(aclMatDst.download(), + cv.add(npMat1, npMat2))) + + aclMatDst = cv.cann.add(aclMat1, aclMat2, mask=aclMask, stream=stream) + stream.waitForCompletion() + self.assertTrue(np.allclose(aclMatDst.download(), + cv.add(npMat1, npMat2, mask=mask))) + + aclMatDst = cv.cann.subtract(aclMat1, aclMat2, mask=aclMask, stream=stream) + stream.waitForCompletion() + self.assertTrue(np.allclose(aclMatDst.download(), + cv.subtract(npMat1, npMat2, mask=mask))) + + cv.cann.resetDevice() + + def test_logical(self): + npMat1 = np.random.random((5, 5, 3)).astype(np.uint16) + npMat2 = np.random.random((5, 5, 3)).astype(np.uint16) + cv.cann.setDevice(0) + + # ACLMat input data + aclMat1 = cv.cann.AscendMat() + aclMat1.upload(npMat1) + aclMat2 = cv.cann.AscendMat() + aclMat2.upload(npMat2) + aclMask = cv.cann.AscendMat() + aclMask.upload(mask) + + self.assertTrue(np.allclose(cv.cann.bitwise_or(npMat1, npMat2), + cv.bitwise_or(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_or( + npMat1, npMat2), cv.bitwise_or(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_and(npMat1, npMat2), + cv.bitwise_and(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_and( + npMat1, npMat2), cv.bitwise_and(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_xor(npMat1, npMat2), + cv.bitwise_xor(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_xor( + npMat1, npMat2), cv.bitwise_xor(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_not(npMat1), + cv.bitwise_not(npMat1))) + self.assertTrue(np.allclose( + cv.cann.bitwise_not(npMat1), cv.bitwise_not(npMat1))) + self.assertTrue(np.allclose(cv.cann.bitwise_and(npMat1, npMat2, mask=mask), + cv.bitwise_and(npMat1, npMat2, mask=mask))) + self.assertTrue(np.allclose(cv.cann.bitwise_or(npMat1, npMat2, mask=mask), + cv.bitwise_or(npMat1, npMat2, mask=mask))) + self.assertTrue(np.allclose(cv.cann.bitwise_not(npMat1, mask=mask), + cv.bitwise_not(npMat1, mask=mask))) + self.assertTrue(np.allclose(cv.cann.bitwise_xor(npMat1, npMat2, mask=mask), + cv.bitwise_xor(npMat1, npMat2, mask=mask))) + + # 
AscendMat interface + self.assertTrue(np.allclose(cv.cann.bitwise_or(aclMat1, aclMat2).download(), + cv.bitwise_or(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_or(aclMat1, aclMat2).download(), + cv.bitwise_or(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_and(aclMat1, aclMat2).download(), + cv.bitwise_and(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_and( + aclMat1, aclMat2).download(), cv.bitwise_and(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_xor(aclMat1, aclMat2).download(), + cv.bitwise_xor(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_xor( + aclMat1, aclMat2).download(), cv.bitwise_xor(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_not(aclMat1).download(), + cv.bitwise_not(npMat1))) + self.assertTrue(np.allclose(cv.cann.bitwise_not(aclMat1).download(), + cv.bitwise_not(npMat1))) + self.assertTrue(np.allclose(cv.cann.bitwise_and(aclMat1, aclMat2, mask=aclMask).download(), + cv.bitwise_and(npMat1, npMat2, mask=mask))) + self.assertTrue(np.allclose(cv.cann.bitwise_or(aclMat1, aclMat2, mask=aclMask).download(), + cv.bitwise_or(npMat1, npMat2, mask=mask))) + self.assertTrue(np.allclose(cv.cann.bitwise_not(aclMat1, mask=aclMask).download(), + cv.bitwise_not(npMat1, mask=mask))) + self.assertTrue(np.allclose(cv.cann.bitwise_xor(aclMat1, aclMat2, mask=aclMask).download(), + cv.bitwise_xor(npMat1, npMat2, mask=mask))) + cv.cann.resetDevice() + + def test_imgproc(self): + npMat = (np.random.random((128, 128, 3)) * 255).astype(np.uint8) + cv.cann.setDevice(0) + aclMat = cv.cann.AscendMat() + aclMatDst = aclMat + aclMat.upload(npMat) + + # TODO try pass out param, not use return value. + # merge & split + self.assertTrue(np.allclose( + cv.cann.merge(cv.cann.split(npMat)).download(), npMat)) + self.assertTrue(np.allclose( + cv.cann.merge(cv.cann.split(aclMat)).download(), npMat)) + + # transpose + self.assertTrue(np.allclose( + cv.cann.transpose(npMat), cv.transpose(npMat))) + self.assertTrue(np.allclose( + cv.cann.transpose(aclMat).download(), cv.transpose(npMat))) + + # crop + w_off, h_off, crop_w, crop_h = 0, 0, 64, 64 + roi = [w_off, h_off, crop_w, crop_h] + self.assertTrue(np.allclose( + cv.cann.crop(npMat, roi).download(), npMat[w_off:crop_w, h_off:crop_h])) + self.assertTrue(np.allclose( + cv.cann.crop(aclMat, roi).download(), npMat[w_off:crop_w, h_off:crop_h])) + + # resize + dstSize = np.array([crop_w, crop_h]) + aclMat32F = cv.cann.AscendMat() + aclMat32F.upload(npMat.astype(np.float32)) + self.assertTrue(np.allclose(cv.cann.resize(npMat.astype(np.float32), dstSize, 0, 0, 3), + cv.resize(npMat.astype(np.float32), dstSize, 0, 0, 3))) + self.assertTrue(np.allclose(cv.cann.resize(aclMat32F, dstSize, 0, 0, 3).download(), + cv.resize(npMat.astype(np.float32), dstSize, 0, 0, 3))) + # flip + flipMode = [0, 1, -1] + for fMode in flipMode: + self.assertTrue(np.allclose(cv.cann.flip( + npMat, fMode), cv.flip(npMat, fMode))) + self.assertTrue(np.allclose(cv.cann.flip( + aclMat, fMode).download(), cv.flip(npMat, fMode))) + + # rotate + rotateMode = [0, 1, 2] + for rMode in rotateMode: + self.assertTrue(np.allclose(cv.cann.rotate( + npMat, rMode), cv.rotate(npMat, rMode))) + self.assertTrue(np.allclose(cv.cann.rotate( + aclMat, rMode).download(), cv.rotate(npMat, rMode))) + + # cvtColcor + cvtModeC1 = [cv.COLOR_GRAY2BGR, cv.COLOR_GRAY2BGRA] + cvtModeC3 = [cv.COLOR_BGR2GRAY, cv.COLOR_BGRA2BGR, cv.COLOR_BGR2RGBA, cv.COLOR_RGBA2BGR, + cv.COLOR_BGR2RGB, cv.COLOR_BGRA2RGBA, cv.COLOR_RGB2GRAY, 
cv.COLOR_BGRA2GRAY, + cv.COLOR_RGBA2GRAY, cv.COLOR_BGR2BGRA, cv.COLOR_BGR2YUV, cv.COLOR_RGB2YUV, + cv.COLOR_YUV2BGR, cv.COLOR_YUV2RGB, cv.COLOR_BGR2YCrCb, cv.COLOR_RGB2YCrCb, + cv.COLOR_YCrCb2BGR, cv.COLOR_YCrCb2RGB, cv.COLOR_BGR2XYZ, cv.COLOR_RGB2XYZ, + cv.COLOR_XYZ2BGR, cv.COLOR_XYZ2RGB,] + for cvtM in cvtModeC3: + self.assertTrue(np.allclose(cv.cann.cvtColor( + npMat, cvtM), cv.cvtColor(npMat, cvtM), 1)) + self.assertTrue(np.allclose(cv.cann.cvtColor( + aclMat, cvtM).download(), cv.cvtColor(npMat, cvtM), 1)) + + npMatC1 = (np.random.random((128, 128, 1)) * 255).astype(np.uint8) + aclMatC1 = cv.cann.AscendMat() + aclMatC1.upload(npMatC1) + for cvtM in cvtModeC1: + self.assertTrue(np.allclose(cv.cann.cvtColor( + npMatC1, cvtM), cv.cvtColor(npMatC1, cvtM), 1)) + self.assertTrue(np.allclose(cv.cann.cvtColor( + aclMatC1, cvtM).download(), cv.cvtColor(npMatC1, cvtM), 1)) + + # threshold + threshType = [cv.THRESH_BINARY, cv.THRESH_BINARY_INV, + cv.THRESH_TRUNC, cv.THRESH_TOZERO, cv.THRESH_TOZERO_INV] + for tType in threshType: + cvRet, cvThresh = cv.threshold( + npMat.astype(np.uint8), 127, 255, tType) + cannRet, cannThresh = cv.cann.threshold( + npMat.astype(np.float32), 127, 255, tType) + self.assertTrue(np.allclose(cvThresh, cannThresh)) + self.assertTrue(np.allclose(cvRet, cannRet)) + + aclMat.upload(npMat.astype(np.float32)) + cannRet, cannThresh = cv.cann.threshold( + aclMat, 127, 255, tType) + self.assertTrue(np.allclose(cvThresh, cannThresh.download())) + self.assertTrue(np.allclose(cvRet, cannRet)) + cv.cann.resetDevice() + +if __name__ == '__main__': + NewOpenCVTests.bootstrap() diff --git a/modules/cannops/perf/perf_core.cpp b/modules/cannops/perf/perf_core.cpp new file mode 100644 index 00000000000..a9d86fca881 --- /dev/null +++ b/modules/cannops/perf/perf_core.cpp @@ -0,0 +1,161 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "perf_precomp.hpp" +#include "opencv2/cann_interface.hpp" + +namespace opencv_test +{ +namespace +{ +#define TYPICAL_ASCEND_MAT_SIZES \ + Values(::perf::sz1080p, ::perf::sz2K, ::perf::sz2160p, ::perf::sz4320p) +#define DEF_PARAM_TEST(name, ...) 
\ + typedef ::perf::TestBaseWithParam> name + +DEF_PARAM_TEST(NPU, Size); +DEF_PARAM_TEST(CPU, Size); + +PERF_TEST_P(NPU, MERGE, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC1); + Mat dst; + declare.in(mat, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + AscendMat ascendMat[3]; + ascendMat[0].upload(mat); + ascendMat[1].upload(mat); + ascendMat[2].upload(mat); + + TEST_CYCLE() { cv::cann::merge(&ascendMat[0], 3, dst); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, MERGE, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC1); + Mat dst; + declare.in(mat, WARMUP_RNG); + Mat mats[3] = {mat, mat, mat}; + TEST_CYCLE() { cv::merge(&mats[0], 3, dst); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, SPLIT, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + declare.in(mat, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + AscendMat ascendMat[3]; + + TEST_CYCLE() { cv::cann::split(mat, &ascendMat[0]); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, SPLIT, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + declare.in(mat, WARMUP_RNG); + Mat mats[3] = {mat, mat, mat}; + TEST_CYCLE() { cv::split(mat, &mats[0]); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, TRANSPOSE, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::transpose(mat, dst); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, TRANSPOSE, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + TEST_CYCLE() { cv::transpose(mat, dst); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, FLIP, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::flip(mat, dst, -1); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, FLIP, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + TEST_CYCLE() { cv::flip(mat, dst, -1); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, ROTATE, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::rotate(mat, dst, 1); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, ROTATE, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + TEST_CYCLE() { cv::rotate(mat, dst, 1); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, CROP, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + Rect b(1, 2, 4, 4); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { AscendMat cropped_cann(mat, b); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, CROP, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + Rect b(1, 2, 4, 4); + TEST_CYCLE() { Mat cropped_cv(mat, b); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, CROP_OVERLOAD, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + Rect b(1, 2, 4, 4); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::crop(mat, b); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} +} // namespace +} // namespace opencv_test 
diff --git a/modules/cannops/perf/perf_cvtcolor.cpp b/modules/cannops/perf/perf_cvtcolor.cpp new file mode 100644 index 00000000000..c868d4fec04 --- /dev/null +++ b/modules/cannops/perf/perf_cvtcolor.cpp @@ -0,0 +1,69 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "perf_precomp.hpp" +#include "opencv2/cann_interface.hpp" + +namespace opencv_test +{ +namespace +{ + +#define CVT_COLORS_3 \ + Values(COLOR_BGR2BGRA, COLOR_BGRA2BGR, COLOR_BGR2RGBA, COLOR_RGBA2BGR, COLOR_BGR2RGB, \ + COLOR_BGRA2RGBA, COLOR_BGR2GRAY, COLOR_BGRA2GRAY, COLOR_RGBA2GRAY, COLOR_BGR2XYZ, \ + COLOR_RGB2XYZ, COLOR_XYZ2BGR, COLOR_XYZ2RGB, COLOR_BGR2YCrCb, COLOR_RGB2YCrCb, \ + COLOR_YCrCb2BGR, COLOR_YCrCb2RGB, COLOR_BGR2YUV, COLOR_RGB2YUV, COLOR_YUV2BGR, \ + COLOR_YUV2RGB) +#define CVT_COLORS_1 Values(COLOR_GRAY2BGR, COLOR_GRAY2BGRA) +#define TYPICAL_ASCEND_MAT_SIZES \ + Values(::perf::sz1080p, ::perf::sz2K) +#define DEF_PARAM_TEST(name, ...) \ + typedef ::perf::TestBaseWithParam> name + +DEF_PARAM_TEST(NPU, Size, ColorConversionCodes); +DEF_PARAM_TEST(CPU, Size, ColorConversionCodes); + +PERF_TEST_P(NPU, CVT_COLOR_3, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, CVT_COLORS_3)) +{ + Mat mat(GET_PARAM(0), CV_32FC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::cvtColor(mat, dst, GET_PARAM(1)); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, CVT_COLOR_3, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, CVT_COLORS_3)) +{ + Mat mat(GET_PARAM(0), CV_32FC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + TEST_CYCLE() { cv::cvtColor(mat, dst, GET_PARAM(1)); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, CVT_COLOR_1, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, CVT_COLORS_1)) +{ + Mat mat(GET_PARAM(0), CV_32FC1); + Mat dst; + declare.in(mat, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::cvtColor(mat, dst, GET_PARAM(1)); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, CVT_COLOR_1, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, CVT_COLORS_1)) +{ + Mat mat(GET_PARAM(0), CV_32FC1); + Mat dst; + declare.in(mat, WARMUP_RNG); + TEST_CYCLE() { cv::cvtColor(mat, dst, GET_PARAM(1)); } + SANITY_CHECK_NOTHING(); +} + +} // namespace +} // namespace opencv_test diff --git a/modules/cannops/perf/perf_element_operations.cpp b/modules/cannops/perf/perf_element_operations.cpp new file mode 100644 index 00000000000..0612abe6085 --- /dev/null +++ b/modules/cannops/perf/perf_element_operations.cpp @@ -0,0 +1,211 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "perf_precomp.hpp" +#include "opencv2/cann_interface.hpp" + +namespace opencv_test +{ +namespace +{ + +#define ARITHM_MAT_DEPTH Values(CV_32S, CV_32SC3) +#define TYPICAL_ASCEND_MAT_SIZES \ + Values(::perf::sz1080p, ::perf::sz2K, ::perf::sz2160p, ::perf::sz4320p) +#define DEF_PARAM_TEST(name, ...) 
\ + typedef ::perf::TestBaseWithParam> name + +DEF_PARAM_TEST(NPU, Size, int); +DEF_PARAM_TEST(CPU, Size, int); + +PERF_TEST_P(NPU, MAT_ADD_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::add(mat1, mat2, dst, noArray(), -1); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, MAT_ADD_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + TEST_CYCLE() { cv::add(mat1, mat2, dst, noArray(), -1); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, MAT_SUB_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::subtract(mat1, mat2, dst, noArray(), -1); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, MAT_SUB_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + TEST_CYCLE() { cv::subtract(mat1, mat2, dst, noArray(), -1); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, MAT_MUL_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::multiply(mat1, mat2, dst, 1, -1); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, MAT_MUL_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + TEST_CYCLE() { cv::multiply(mat1, mat2, dst, 1, -1); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, MAT_DIV_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::divide(mat1, mat2, dst, 1, -1); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, MAT_DIV_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + TEST_CYCLE() { cv::divide(mat1, mat2, dst, 1, -1); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, MAT_BITWISE_AND_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::bitwise_and(mat1, mat2, dst, noArray()); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, 
MAT_BITWISE_AND_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + TEST_CYCLE() { cv::bitwise_and(mat1, mat2, dst, noArray()); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, MAT_BITWISE_OR_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::bitwise_or(mat1, mat2, dst, noArray()); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, MAT_BITWISE_OR_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + TEST_CYCLE() { cv::bitwise_or(mat1, mat2, dst, noArray()); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, MAT_BITWISE_XOR_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::bitwise_xor(mat1, mat2, dst, noArray()); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, MAT_BITWISE_XOR_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + TEST_CYCLE() { cv::bitwise_xor(mat1, mat2, dst, noArray()); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, MAT_BITWISE_NOT_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::bitwise_not(mat, dst, noArray()); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, MAT_BITWISE_NOT_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat, WARMUP_RNG); + TEST_CYCLE() { cv::bitwise_not(mat, dst, noArray()); } + SANITY_CHECK_NOTHING(); +} + +} // namespace +} // namespace opencv_test diff --git a/modules/cannops/perf/perf_main.cpp b/modules/cannops/perf/perf_main.cpp new file mode 100644 index 00000000000..33503ac4158 --- /dev/null +++ b/modules/cannops/perf/perf_main.cpp @@ -0,0 +1,23 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
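+// Perf-test entry point: registers a global gtest environment so ACL is initialized once
+// before the benchmarks run and finalized after they complete.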
+
+#include "perf_precomp.hpp"
+#include "opencv2/cann_interface.hpp"
+using namespace perf;
+
+class CannEnvironment : public ::testing::Environment
+{
+public:
+    virtual ~CannEnvironment() = default;
+    virtual void SetUp() CV_OVERRIDE { cv::cann::initAcl(); }
+    virtual void TearDown() CV_OVERRIDE { cv::cann::finalizeAcl(); }
+};
+
+static void initTests()
+{
+    CannEnvironment* cannEnv = new CannEnvironment();
+    ::testing::AddGlobalTestEnvironment(cannEnv);
+}
+
+CV_PERF_TEST_MAIN("cannops", initTests())
diff --git a/modules/cannops/perf/perf_precomp.hpp b/modules/cannops/perf/perf_precomp.hpp
new file mode 100644
index 00000000000..59e2fa03d7b
--- /dev/null
+++ b/modules/cannops/perf/perf_precomp.hpp
@@ -0,0 +1,19 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef __OPENCV_PERF_PRECOMP_HPP__
+#define __OPENCV_PERF_PRECOMP_HPP__
+
+#include "opencv2/ts.hpp"
+#include "opencv2/ts/ts_perf.hpp"
+#include "opencv2/cann.hpp"
+
+#define DEVICE_ID 0
+
+using namespace perf;
+using namespace testing;
+using namespace cv;
+using namespace cv::cann;
+
+#endif
diff --git a/modules/cannops/samples/image_processing.cpp b/modules/cannops/samples/image_processing.cpp
new file mode 100644
index 00000000000..9dca2176dfd
--- /dev/null
+++ b/modules/cannops/samples/image_processing.cpp
@@ -0,0 +1,60 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <iostream>
+#include <opencv2/imgcodecs.hpp>
+#include <opencv2/cann.hpp>
+#include <opencv2/cann_interface.hpp>
+
+int main(int argc, char* argv[])
+{
+    cv::CommandLineParser parser(argc, argv,
+                                 "{@input|puppy.png|path to input image}"
+                                 "{@output|output.png|path to output image}"
+                                 "{help||show help}");
+    parser.about("This is a sample for image processing with Ascend NPU. \n");
+    if (argc != 3 || parser.has("help"))
+    {
+        parser.printMessage();
+        return 0;
+    }
+
+    std::string imagePath = parser.get<std::string>(0);
+    std::string outputPath = parser.get<std::string>(1);
+
+    // read input image and generate Gaussian noise
+    //! [input_noise]
+    cv::Mat img = cv::imread(imagePath);
+    // Generate Gaussian noise that will be added to the input image
+    cv::Mat gaussNoise(img.rows, img.cols, img.type());
+    cv::RNG rng;
+    rng.fill(gaussNoise, cv::RNG::NORMAL, 0, 25);
+    //! [input_noise]
+
+    // setup cann
+    //! [setup]
+    cv::cann::initAcl();
+    cv::cann::setDevice(0);
+    //! [setup]
+
+    //! [image-process]
+    cv::Mat output;
+    // add Gaussian noise to the image
+    cv::cann::add(img, gaussNoise, output);
+    // rotate the image with a certain mode (0, 1 and 2 correspond to rotations of 90, 180 and 270
+    // degrees clockwise, respectively)
+    cv::cann::rotate(output, output, 0);
+    // flip the image with a certain mode (0, a positive and a negative number correspond to
+    // flipping around the x-axis, y-axis and both axes, respectively)
+    cv::cann::flip(output, output, 0);
+    //! [image-process]
+
+    cv::imwrite(outputPath, output);
+
+    //! [tear-down-cann]
+    cv::cann::resetDevice();
+    cv::cann::finalizeAcl();
+    //! [tear-down-cann]
+    return 0;
+}
diff --git a/modules/cannops/samples/image_processing.py b/modules/cannops/samples/image_processing.py
new file mode 100644
index 00000000000..dc974bdd78c
--- /dev/null
+++ b/modules/cannops/samples/image_processing.py
@@ -0,0 +1,42 @@
+# This file is part of OpenCV project.
+# It is subject to the license terms in the LICENSE file found in the top-level directory
+# of this distribution and at http://opencv.org/license.html.
+
+import numpy as np
+import cv2
+import argparse
+
+parser = argparse.ArgumentParser(description='This is a sample for image processing with Ascend NPU.')
+parser.add_argument('image', help='path to input image')
+parser.add_argument('output', help='path to output image')
+args = parser.parse_args()
+
+# read input image and generate Gaussian noise
+#! [input_noise]
+img = cv2.imread(args.image)
+# Generate Gaussian noise that will be added to the input image
+gaussNoise = np.random.normal(0, 25, (img.shape[0], img.shape[1], img.shape[2])).astype(img.dtype)
+#! [input_noise]
+
+# setup cann
+#! [setup]
+cv2.cann.initAcl()
+cv2.cann.setDevice(0)
+#! [setup]
+
+#! [image-process]
+# add Gaussian noise to the image
+output = cv2.cann.add(img, gaussNoise)
+# rotate the image with a certain mode (0, 1 and 2 correspond to rotations of 90, 180
+# and 270 degrees clockwise, respectively)
+output = cv2.cann.rotate(output, 0)
+# flip the image with a certain mode (0, a positive and a negative number correspond to flipping
+# around the x-axis, y-axis and both axes, respectively)
+output = cv2.cann.flip(output, 0)
+#! [image-process]
+
+cv2.imwrite(args.output, output)
+
+#! [tear-down-cann]
+cv2.cann.finalizeAcl()
+#! [tear-down-cann]
diff --git a/modules/cannops/src/ascend_mat.cpp b/modules/cannops/src/ascend_mat.cpp
new file mode 100644
index 00000000000..ba17a545bb7
--- /dev/null
+++ b/modules/cannops/src/ascend_mat.cpp
@@ -0,0 +1,232 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "precomp.hpp"
+#include <memory>
+
+namespace
+{
+class DefaultAllocator : public cv::cann::AscendMat::Allocator
+{
+public:
+    std::shared_ptr<uchar> allocate(size_t size) CV_OVERRIDE;
+    bool allocate(cv::cann::AscendMat* mat, int rows, int cols, size_t elemSize) CV_OVERRIDE;
+};
+
+std::shared_ptr<uchar> DefaultAllocator::allocate(size_t size)
+{
+    uchar* data;
+    cv::cann::aclrtMallocWarpper((void**)(&data), size);
+    return std::shared_ptr<uchar>(data, [](void* ptr) { cv::cann::aclrtFreeWarpper(ptr); });
+}
+
+bool DefaultAllocator::allocate(cv::cann::AscendMat* mat, int rows, int cols, size_t elemSize)
+{
+    mat->data = allocate(elemSize * cols * rows);
+    mat->step = cols * elemSize;
+
+    return true;
+}
+
+DefaultAllocator cannDefaultAllocator;
+cv::cann::AscendMat::Allocator* g_defaultAllocator = &cannDefaultAllocator;
+} // namespace
+
+namespace cv
+{
+namespace cann
+{
+AscendMat::Allocator* AscendMat::defaultAllocator() { return g_defaultAllocator; }
+
+void AscendMat::setDefaultAllocator(AscendMat::Allocator* allocator)
+{
+    CV_Assert(allocator != 0);
+    g_defaultAllocator = allocator;
+}
+
+// TODO: this function is copied from matrix.cpp, which is a local symbol there and cannot
+// be referenced; consider optimizing.
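+// Recomputes the Mat::CONTINUOUS_FLAG bit for a 2-D matrix from its sizes and steps.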
+static int updateContinuityFlag(int flags, int dims, const int* size, const size_t* step) +{ + int i, j; + for (i = 0; i < dims; i++) + { + if (size[i] > 1) + break; + } + + uint64 t = (uint64)size[std::min(i, dims - 1)] * CV_MAT_CN(flags); + for (j = dims - 1; j > i; j--) + { + t *= size[j]; + if (step[j] * size[j] < step[j - 1]) + break; + } + + if (j <= i && t == (uint64)(int)t) + return flags | Mat::CONTINUOUS_FLAG; + return flags & ~Mat::CONTINUOUS_FLAG; +} + +void AscendMat::updateContinuityFlag() +{ + int sz[] = {rows, cols}; + size_t steps[] = {step, elemSize()}; + flags = cv::cann::updateContinuityFlag(flags, 2, sz, steps); +} + +void AscendMat::create(int _rows, int _cols, int _type) +{ + CV_DbgAssert(_rows >= 0 && _cols >= 0); + + _type &= Mat::TYPE_MASK; + + if (rows == _rows && cols == _cols && type() == _type && data) + return; + + if (_rows > 0 && _cols > 0) + { + flags = Mat::MAGIC_VAL + _type; + rows = _rows; + cols = _cols; + + const size_t esz = elemSize(); + + bool allocSuccess = allocator->allocate(this, rows, cols, esz); + + if (!allocSuccess) + { + // custom allocator fails, try default allocator + allocator = defaultAllocator(); + allocSuccess = allocator->allocate(this, rows, cols, esz); + CV_Assert(allocSuccess); + } + + if (esz * cols == step) + flags |= Mat::CONTINUOUS_FLAG; + + datastart = data.get(); + dataend = data.get() + step * (rows - 1) + cols * esz; + } +} + +void AscendMat::upload(InputArray arr) { upload(arr, AscendStream::Null()); } + +void AscendMat::upload(InputArray arr, AscendStream& stream) +{ + Mat mat = arr.getMat(); + CV_DbgAssert(!mat.empty()); + create(mat.rows, mat.cols, mat.type()); + aclrtMemcpy2dWarpper(data, 0, step, mat.data, mat.step[0], cols * elemSize(), rows, stream); +} + +void AscendMat::download(OutputArray dst) const { download(dst, AscendStream::Null()); } + +void AscendMat::download(OutputArray _dst, AscendStream& stream) const +{ + CV_DbgAssert(!empty()); + + _dst.create(size(), type()); + Mat dst = _dst.getMat(); + aclrtMemcpy2dWarpper(dst.data, dst.step[0], data, 0, step, cols * elemSize(), rows, stream); +} + +AscendMat::AscendMat(int rows_, int cols_, int type_, Scalar& s_, AscendMat::Allocator* allocator_) + : flags(0), rows(rows_), cols(cols_), step(0), datastart(0), dataend(0), allocator(allocator_) +{ + create(rows_, cols_, type_); + setTo(s_); +} + +AscendMat::AscendMat(Size size_, int type_, Scalar& s_, AscendMat::Allocator* allocator_) + : flags(0), rows(size_.height), cols(size_.width), step(0), datastart(0), dataend(0), + allocator(allocator_) +{ + create(size_.height, size_.width, type_); + setTo(s_); +} + +AscendMat::AscendMat(InputArray _m, const Rect& roi) : AscendMat(_m, roi, AscendStream::Null()) {} + +AscendMat::AscendMat(InputArray _m, const Rect& roi, AscendStream& stream) + : rows(roi.height), cols(roi.width), allocator(defaultAllocator()) +{ + AscendMat m; + m.upload(_m, stream); + step = m.step; + data = m.data; + flags = m.flags; + CV_Assert(0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.cols && 0 <= roi.y && + 0 <= roi.height && roi.y + roi.height <= m.rows); + size_t esz = CV_ELEM_SIZE(flags); + size_t sizeMem = esz * roi.width * roi.height * m.channels(); + size_t offset = roi.y * m.step + roi.x * esz; + + void* dst = malloc(sizeMem); + size_t dpitch = roi.width * esz; + std::shared_ptr dstDevice = allocator->allocate(sizeMem); + aclrtMemcpy2dWarpper(dst, dpitch, data, offset, step, dpitch, roi.height, stream); + aclrtMemcpy2dWarpper(dstDevice, 0, dpitch, dst, dpitch, dpitch, 
roi.height, stream); + data = dstDevice; + step = dpitch; + free(dst); + updateContinuityFlag(); +} + +AscendMat& AscendMat::setTo(const Scalar& sc) { return setTo(sc, AscendStream::Null()); } + +AscendMat& AscendMat::setTo(const Scalar& sc, AscendStream& stream) +{ + size_t totalBytes = (size_t)rows * cols * elemSize(); + if (totalBytes == 0) + return *this; + + aclrtMemsetWarpper(data, 0, totalBytes, stream); + AscendMat dst(rows, cols, type()); + arithm_op(*this, sc, dst, "Add", stream); + swap(dst); + + return *this; +} + +AscendMat& AscendMat::setTo(float sc) { return setTo(sc, AscendStream::Null()); } + +AscendMat& AscendMat::setTo(float sc, AscendStream& stream) +{ + size_t totalBytes = (size_t)rows * cols * elemSize(); + if (totalBytes == 0) + return *this; + + aclrtMemsetWarpper(data, 0, totalBytes, stream); + + AscendMat dst(rows, cols, type()); + arithm_op(*this, sc, dst, "Adds", stream); + swap(dst); + + return *this; +} + +void AscendMat::convertTo(AscendMat& dst, int rtype) const +{ + convertTo(dst, rtype, AscendStream::Null()); +} + +void AscendMat::convertTo(AscendMat& dst, int _rtype, AscendStream& stream) const +{ + int cn = channels(); + dst.create(rows, cols, CV_MAKE_TYPE(_rtype, cn)); + convertTo(dst, stream); +} + +void AscendMat::convertTo(AscendMat& dst, AscendStream& stream) const +{ + OperatorRunner runner; + runner.setOp("Cast") + .addInput(*this, "x") + .addOutput(dst, "y") + .addAttr((int32_t)(getACLType(dst.depth())), "dst_type") + .run(stream); +} +} // namespace cann +} // namespace cv diff --git a/modules/cannops/src/cann_call.cpp b/modules/cannops/src/cann_call.cpp new file mode 100644 index 00000000000..3b83052ccbe --- /dev/null +++ b/modules/cannops/src/cann_call.cpp @@ -0,0 +1,524 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include +#include +#include "precomp.hpp" +#include "opencv2/core/private.hpp" +namespace cv +{ +namespace cann +{ +/*******************************Acl Error Checker*****************************/ +static inline void checkAclError(aclError err, const char* file, const int line, const char* func) +{ + if (ACL_SUCCESS != err) + { + const char* errMsg = aclGetRecentErrMsg(); + cv::error(cv::Error::StsError, errMsg == nullptr ? "" : errMsg, func, file, line); + } +} + +static inline void checkAclPtr(void* ptr, const char* file, const int line, const char* func) +{ + if (nullptr == ptr) + { + const char* errMsg = aclGetRecentErrMsg(); + cv::error(cv::Error::StsError, errMsg == nullptr ? 
"" : errMsg, func, file, line); + } +} + +#define CV_ACL_SAFE_CALL(expr) checkAclError((expr), __FILE__, __LINE__, CV_Func) +#define CV_ACL_SAFE_CALL_PTR(expr) \ + ({ \ + auto ptr = (expr); \ + checkAclPtr(ptr, __FILE__, __LINE__, CV_Func); \ + ptr; \ + }) + +/******************************Acl Runtime Warpper****************************/ +void aclrtMallocWarpper(void** data, size_t size) +{ + CV_ACL_SAFE_CALL(aclrtMalloc(data, size, ACL_MEM_MALLOC_HUGE_FIRST)); +} + +void aclrtFreeWarpper(void* data) { CV_ACL_SAFE_CALL(aclrtFree(data)); } + +void aclrtMemcpyWarpper(std::shared_ptr& dst, size_t offset, const void* src, size_t size, + AscendStream& stream) +{ + aclrtStream rawStream = AscendStreamAccessor::getStream(stream); + if (rawStream == nullptr) + CV_ACL_SAFE_CALL( + aclrtMemcpy(dst.get() + offset, size, src, size, ACL_MEMCPY_HOST_TO_DEVICE)); + else + { + CV_ACL_SAFE_CALL(aclrtMemcpyAsync(dst.get() + offset, size, src, size, + ACL_MEMCPY_HOST_TO_DEVICE, rawStream)); + if (offset == 0) + stream.addTensorHolder(dst); + } +} + +void aclrtMemcpyWarpper(void* dst, const std::shared_ptr& src, size_t offset, size_t size, + AscendStream& stream) +{ + aclrtStream rawStream = AscendStreamAccessor::getStream(stream); + if (rawStream == nullptr) + CV_ACL_SAFE_CALL( + aclrtMemcpy(dst, size, src.get() + offset, size, ACL_MEMCPY_DEVICE_TO_HOST)); + else + { + CV_ACL_SAFE_CALL(aclrtMemcpyAsync(dst, size, src.get() + offset, size, + ACL_MEMCPY_DEVICE_TO_HOST, rawStream)); + if (offset == 0) + stream.addTensorHolder(src); + } +} + +void aclrtMemcpyWarpper(std::shared_ptr& dst, size_t dstOffset, + const std::shared_ptr& src, size_t srcOffset, size_t size, + AscendStream& stream) +{ + aclrtStream rawStream = AscendStreamAccessor::getStream(stream); + if (rawStream == nullptr) + CV_ACL_SAFE_CALL(aclrtMemcpy(dst.get() + dstOffset, size, src.get() + srcOffset, size, + ACL_MEMCPY_DEVICE_TO_DEVICE)); + else + { + CV_ACL_SAFE_CALL(aclrtMemcpyAsync(dst.get() + dstOffset, size, src.get() + srcOffset, size, + ACL_MEMCPY_DEVICE_TO_DEVICE, rawStream)); + if (srcOffset == 0) + stream.addTensorHolder(src); + if (dstOffset == 0) + stream.addTensorHolder(dst); + } +} + +void aclrtMemcpy2dWarpper(std::shared_ptr& dst, size_t offset, size_t dpitch, + const void* src, size_t spitch, size_t width, size_t length, + AscendStream& stream) +{ + aclrtStream rawStream = AscendStreamAccessor::getStream(stream); + if (rawStream == nullptr) + CV_ACL_SAFE_CALL(aclrtMemcpy2d(dst.get() + offset, dpitch, src, spitch, width, length, + ACL_MEMCPY_HOST_TO_DEVICE)); + else + { + CV_ACL_SAFE_CALL(aclrtMemcpy2dAsync(dst.get() + offset, dpitch, src, spitch, width, length, + ACL_MEMCPY_HOST_TO_DEVICE, rawStream)); + stream.addTensorHolder(dst); + } +} + +void aclrtMemcpy2dWarpper(void* dst, size_t dpitch, const std::shared_ptr& src, + size_t offset, size_t spitch, size_t width, size_t length, + AscendStream& stream) +{ + aclrtStream rawStream = AscendStreamAccessor::getStream(stream); + if (rawStream == nullptr) + CV_ACL_SAFE_CALL(aclrtMemcpy2d(dst, dpitch, src.get() + offset, spitch, width, length, + ACL_MEMCPY_DEVICE_TO_HOST)); + else + { + CV_ACL_SAFE_CALL(aclrtMemcpy2dAsync(dst, dpitch, src.get() + offset, spitch, width, length, + ACL_MEMCPY_DEVICE_TO_HOST, rawStream)); + stream.addTensorHolder(src); + } +} + +void aclrtMemsetWarpper(std::shared_ptr& ptr, int32_t value, size_t count, + AscendStream& stream) +{ + aclrtStream rawStream = AscendStreamAccessor::getStream(stream); + if (rawStream == nullptr) + 
CV_ACL_SAFE_CALL(aclrtMemset(ptr.get(), count, value, count)); + else + { + CV_ACL_SAFE_CALL(aclrtMemsetAsync(ptr.get(), count, value, count, rawStream)); + stream.addTensorHolder(ptr); + } +} + +aclDataType getACLType(int opencvdepth) +{ + switch (opencvdepth) + { + case CV_8S: + return ACL_INT8; + case CV_16S: + return ACL_INT16; + case CV_8U: + return ACL_UINT8; + case CV_16U: + return ACL_UINT16; + case CV_32S: + return ACL_INT32; + case CV_32F: + return ACL_FLOAT; + case CV_64F: + return ACL_DOUBLE; + case CV_16F: + return ACL_FLOAT16; + default: + return ACL_DT_UNDEFINED; + } +} + +std::shared_ptr mallocAndUpload(const void* data, size_t size, AscendStream& stream, + AscendMat::Allocator* allocator) +{ + std::shared_ptr ptr = allocator->allocate(size); + aclrtStream rawStream = AscendStreamAccessor::getStream(stream); + + if (rawStream == nullptr) + CV_ACL_SAFE_CALL(aclrtMemcpy(ptr.get(), size, data, size, ACL_MEMCPY_HOST_TO_DEVICE)); + else + CV_ACL_SAFE_CALL( + aclrtMemcpyAsync(ptr.get(), size, data, size, ACL_MEMCPY_HOST_TO_DEVICE, rawStream)); + return ptr; +} + +/**************************Acl attribute preparation**************************/ + +OperatorRunner& OperatorRunner::reset() +{ + holder.clear(); + op.clear(); + for (auto desc : inputDesc_) + { + aclDestroyTensorDesc(desc); + } + for (auto desc : outputDesc_) + { + aclDestroyTensorDesc(desc); + } + for (auto buf : inputBuffers_) + { + CV_ACL_SAFE_CALL(aclDestroyDataBuffer(buf)); + } + for (auto buf : outputBuffers_) + { + CV_ACL_SAFE_CALL(aclDestroyDataBuffer(buf)); + } + if (opAttrInit) + aclopDestroyAttr(opAttr_); + inputDesc_.clear(); + outputDesc_.clear(); + inputBuffers_.clear(); + outputBuffers_.clear(); + opAttrInit = false; + return *this; +} + +OperatorRunner& OperatorRunner::setOp(const char* opName) +{ + reset(); + opAttr_ = CV_ACL_SAFE_CALL_PTR(aclopCreateAttr()); + opAttrInit = true; + op = std::string(opName); + return *this; +} + +OperatorRunner& OperatorRunner::addAttr(float value, const char* name) +{ + CV_ACL_SAFE_CALL(aclopSetAttrFloat(opAttr_, name, value)); + return *this; +} + +OperatorRunner& OperatorRunner::addAttr(const char* value, const char* name) +{ + CV_ACL_SAFE_CALL(aclopSetAttrString(opAttr_, name, value)); + return *this; +} + +OperatorRunner& OperatorRunner::addAttr(int value, const char* name) +{ + CV_ACL_SAFE_CALL(aclopSetAttrInt(opAttr_, name, value)); + return *this; +} + +OperatorRunner& OperatorRunner::addAttr(bool value, const char* name) +{ + CV_ACL_SAFE_CALL(aclopSetAttrBool(opAttr_, name, value)); + return *this; +} + +OperatorRunner& OperatorRunner::addAttr(const int64_t* value, int size, const char* name) +{ + CV_ACL_SAFE_CALL(aclopSetAttrListInt(opAttr_, name, size, value)); + return *this; +} + +OperatorRunner& OperatorRunner::addInput(AscendTensor& tensor) +{ + auto descPtr = CV_ACL_SAFE_CALL_PTR( + aclCreateTensorDesc(tensor.dtype, tensor.dims.size(), &tensor.dims[0], tensor.format)); + if (descPtr != nullptr) + { + if (tensor.name != nullptr && strlen(tensor.name) != 0) + aclSetTensorDescName(descPtr, tensor.name); + inputDesc_.push_back(descPtr); + } + auto bufPtr = CV_ACL_SAFE_CALL_PTR(aclCreateDataBuffer(tensor.data.get(), tensor.dataSize)); + if (bufPtr != nullptr) + inputBuffers_.push_back(bufPtr); + holder.insert(tensor.data); + return *this; +} + +OperatorRunner& OperatorRunner::addOutput(AscendTensor& tensor) +{ + auto descPtr = CV_ACL_SAFE_CALL_PTR( + aclCreateTensorDesc(tensor.dtype, tensor.dims.size(), &tensor.dims[0], tensor.format)); + if (descPtr != nullptr) 
+ { + if (tensor.name != nullptr && strlen(tensor.name) != 0) + aclSetTensorDescName(descPtr, tensor.name); + outputDesc_.push_back(descPtr); + } + auto bufPtr = CV_ACL_SAFE_CALL_PTR(aclCreateDataBuffer(tensor.data.get(), tensor.dataSize)); + if (bufPtr != nullptr) + outputBuffers_.push_back(bufPtr); + holder.insert(tensor.data); + return *this; +} + +OperatorRunner& OperatorRunner::addInput(const AscendMat& mat, const char* name) +{ + AscendTensor tensor(mat, name); + return addInput(tensor); +} + +OperatorRunner& OperatorRunner::addOutput(AscendMat& mat, const char* name) +{ + AscendTensor tensor(mat, name); + return addOutput(tensor); +} + +OperatorRunner& OperatorRunner::addInput(const Scalar& sc, int type, const char* name) +{ + uchar rawData[32]; + cv::scalarToRawData(sc, rawData, type, 0); + std::shared_ptr scPtr = mallocAndUpload( + rawData, (CV_ELEM_SIZE(type)), AscendStream::Null(), AscendMat::defaultAllocator()); + + int64_t dims[] = {1, 1, 1, (CV_MAT_CN(type))}; + AscendTensor tensor(scPtr, (CV_ELEM_SIZE(type)), dims, sizeof(dims) / sizeof(dims[0]), + getACLType(CV_MAT_DEPTH(type)), name); + return addInput(tensor); +} + +OperatorRunner& OperatorRunner::run(AscendStream& stream) +{ + aclrtStream rawStream = AscendStreamAccessor::getStream(stream); + CV_ACL_SAFE_CALL(aclopCompileAndExecute(op.c_str(), inputDesc_.size(), inputDesc_.data(), + inputBuffers_.data(), outputDesc_.size(), + outputDesc_.data(), outputBuffers_.data(), opAttr_, + ACL_ENGINE_SYS, ACL_COMPILE_SYS, NULL, rawStream)); + if (rawStream == nullptr) + CV_ACL_SAFE_CALL(aclrtSynchronizeStream(rawStream)); + else + { + for (const auto& ptr : holder) + stream.addTensorHolder(ptr); + } + return *this; +} + +/********************************Ascend Tensor********************************/ + +AscendTensor::AscendTensor(std::shared_ptr _data, size_t _dataSize, int64_t* _dims, + size_t _dimSize, aclDataType _dtype, const char* _name, + aclFormat _format) + : name(_name), data(_data), dataSize(_dataSize), dtype(_dtype), format(_format) +{ + dims.assign(_dims, _dims + _dimSize); +} + +AscendTensor::AscendTensor(const AscendMat& ascendMat, const char* _name, aclFormat _format) + : name(_name), format(_format) +{ + data = ascendMat.data; + // Ascend can't process with gaps in matrix. + CV_Assert(ascendMat.isContinuous()); + dataSize = ascendMat.rows * ascendMat.cols * ascendMat.elemSize(); + + switch (_format) + { + case ACL_FORMAT_NHWC: + case ACL_FORMAT_ND: + dims.resize(4); + // Batch, default = 1. + dims[0] = 1; + // Default OpenCV image format = NHWC. 
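+        // dims = {N, H, W, C} = {1, rows, cols, channels}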
+ dims[1] = ascendMat.rows; + dims[2] = ascendMat.cols; + dims[3] = ascendMat.channels(); + break; + case ACL_FORMAT_NCHW: + dims.resize(4); + dims[0] = 1; + dims[1] = ascendMat.channels(); + dims[2] = ascendMat.rows; + dims[3] = ascendMat.cols; + break; + default: + CV_Error(Error::StsBadArg, "Unknown/unsupported matrix format"); + } + + dtype = getACLType(ascendMat.depth()); +} + +/**********************************Device*************************************/ +void setDevice(int device_id) +{ + aclrtContext context; + CV_ACL_SAFE_CALL(aclrtSetDevice(device_id)); + CV_ACL_SAFE_CALL(aclrtCreateContext(&context, device_id)); +} + +void resetDevice() { CV_ACL_SAFE_CALL(aclrtResetDevice(getDevice())); } + +int32_t getDevice() +{ + int32_t deviceId; + CV_ACL_SAFE_CALL(aclrtGetDevice(&deviceId)); + return deviceId; +} + +void initAcl() { CV_ACL_SAFE_CALL(aclInit(nullptr)); } + +void finalizeAcl() { CV_ACL_SAFE_CALL(aclFinalize()); } + +class DefaultDeviceInitializer +{ +public: + DefaultDeviceInitializer(); + ~DefaultDeviceInitializer(); + + AscendStream& getNullAscendStream(int deviceId); + +private: + std::vector> streams_; + Mutex streams_mtx_; +}; + +DefaultDeviceInitializer::DefaultDeviceInitializer() {} + +DefaultDeviceInitializer::~DefaultDeviceInitializer() { streams_.clear(); } + +AscendStream& DefaultDeviceInitializer::getNullAscendStream(int deviceId) +{ + AutoLock lock(streams_mtx_); + + if (streams_.empty()) + { + uint32_t deviceCount; + CV_ACL_SAFE_CALL(aclrtGetDeviceCount(&deviceCount)); + + if (deviceCount > 0) + streams_.resize(deviceCount); + } + + CV_DbgAssert(deviceId >= 0 && deviceId < static_cast(streams_.size())); + + if (streams_[deviceId].empty()) + { + aclrtStream stream = nullptr; + Ptr impl = makePtr(stream); + streams_[deviceId] = Ptr(new AscendStream(impl)); + } + + return *streams_[deviceId]; +} + +DefaultDeviceInitializer initializer; + +/***********************************Event*************************************/ +AscendEvent::Impl::Impl() : event(nullptr), ownEvent(true) +{ + CV_ACL_SAFE_CALL(aclrtCreateEvent(&event)); +} + +AscendEvent::Impl::Impl(aclrtEvent e) : event(e), ownEvent(false) {} + +AscendEvent::Impl::~Impl() +{ + if (event && ownEvent) + { + CV_ACL_SAFE_CALL(aclrtDestroyEvent(event)); + } +} + +aclrtEvent AscendEventAccessor::getEvent(const AscendEvent& event) { return event.impl_->event; } + +AscendEvent AscendEventAccessor::wrapEvent(aclrtEvent event) +{ + return AscendEvent(makePtr(event)); +} + +AscendEvent::AscendEvent() { impl_ = makePtr(); } + +void AscendEvent::record(AscendStream& stream) +{ + CV_ACL_SAFE_CALL(aclrtRecordEvent(impl_->event, AscendStreamAccessor::getStream(stream))); +} + +void AscendEvent::waitForComplete() const { CV_ACL_SAFE_CALL(aclrtSynchronizeEvent(impl_->event)); } + +/************************************Stream***********************************/ +void AscendStream::Impl::AddTensorHolder(const std::shared_ptr& tensorData) +{ + tensorHolders.insert(tensorData); +} + +AscendStream::Impl::Impl() : stream(nullptr), ownStream(true) +{ + CV_ACL_SAFE_CALL(aclrtCreateStream(&stream)); +} + +AscendStream::Impl::Impl(aclrtStream s) : stream(s), ownStream(false) {} + +aclrtStream AscendStreamAccessor::getStream(const AscendStream& stream) +{ + return stream.impl_->stream; +} + +AscendStream AscendStreamAccessor::wrapStream(aclrtStream stream) +{ + return AscendStream(makePtr(stream)); +} + +AscendStream wrapStream(size_t AscendStreamAddress) +{ + return AscendStreamAccessor::wrapStream(reinterpret_cast(AscendStreamAddress)); 
+} + +AscendStream::AscendStream() { impl_ = makePtr(); } + +void AscendStream::waitForCompletion() +{ + CV_ACL_SAFE_CALL(aclrtSynchronizeStream(impl_->stream)); + impl_->tensorHolders.clear(); +} + +void AscendStream::waitAscendEvent(const AscendEvent& event) +{ + CV_ACL_SAFE_CALL(aclrtStreamWaitEvent(impl_->stream, AscendEventAccessor::getEvent(event))); +} + +AscendStream& AscendStream::Null() +{ + const uint32_t deviceId = getDevice(); + return initializer.getNullAscendStream(deviceId); +} + +void AscendStream::addTensorHolder(const std::shared_ptr& holder) +{ + impl_->AddTensorHolder(holder); +} + +} // namespace cann +} // namespace cv diff --git a/modules/cannops/src/color.cpp b/modules/cannops/src/color.cpp new file mode 100644 index 00000000000..f08a785e576 --- /dev/null +++ b/modules/cannops/src/color.cpp @@ -0,0 +1,777 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "precomp.hpp" + +namespace cv +{ +namespace cann +{ +// Integer type images will have a loss of accuracy during calculation, so they must be converted to +// float before calculation. +static AscendMat convertTo(const AscendMat& src, int dtype, AscendStream& stream) +{ + AscendMat ret; + if (src.depth() != dtype) + src.convertTo(ret, dtype, stream); + else + ret = src; + return ret; +} + +static void convertBack(const AscendMat& src, AscendMat& dst, AscendStream& stream) +{ + if (src.depth() != dst.depth()) + src.convertTo(dst, stream); +} + +//! Set alpha channel to a Mat. +static void matAlphaSet(AscendMat& mat, int dtype, AscendStream& stream) +{ + if (dtype < 0) + dtype = mat.depth(); + + if (mat.depth() == CV_8U || mat.depth() == CV_16U) + { + size_t size = mat.rows * mat.step; + aclrtMemsetWarpper(mat.data, 255, size, stream); + } + else + { + if (dtype == CV_32F) + mat.setTo(1.0f, stream); + else + mat.setTo((dtype == CV_8U ? 
(1 << 8) : (1 << 16)) - 1, stream); + } +} + +inline void checkImg(const AscendMat& mat) +{ + int depth = mat.depth(); + CV_Assert(!mat.empty()); + CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_32F); +} + +inline void cvtBGRtoBGR(const AscendMat& src, AscendMat& dst, int dcn, bool swapBlue, + AscendStream& stream) +{ + checkImg(src); + CV_Assert(src.channels() == 3 || src.channels() == 4); + + AscendMat matChannels[4]; + split(src, matChannels, stream); + + if (swapBlue) + std::swap(matChannels[0], matChannels[2]); + + if (dcn == 4 && src.channels() != 4) + { + AscendMat& alpha = matChannels[3]; + alpha.create(src.rows, src.cols, CV_MAKE_TYPE(src.depth(), 1)); + matAlphaSet(alpha, -1, stream); + } + + merge(matChannels, dcn, dst, stream); +} + +inline void cvtBGRtoBGR(InputArray& _src, OutputArray& _dst, int dcn, bool swapBlue, + AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + cvtBGRtoBGR(src, dst, dcn, swapBlue, stream); + dst.download(_dst, stream); +} + +// TODO duplicated code +static const float B2YF = 0.114f; +static const float G2YF = 0.587f; +static const float R2YF = 0.299f; + +inline void cvtBGRtoGray(const AscendMat& src, AscendMat& dst, int, bool swapBlue, + AscendStream& stream) +{ + checkImg(src); + CV_Assert(src.channels() == 3 || src.channels() == 4); + + float coeffs[] = {B2YF, G2YF, R2YF}; + dst.create(src.rows, src.cols, CV_MAKE_TYPE(src.depth(), 1)); + AscendMat formatedSrc = convertTo(src, CV_32F, stream); + AscendMat formatedDst = convertTo(dst, CV_32F, stream); + + // For RGB + if (swapBlue) + std::swap(coeffs[0], coeffs[2]); + + Scalar sc = {coeffs[0], coeffs[1], coeffs[2], 0}; + AscendMat grayRet(formatedSrc.rows, formatedSrc.cols, formatedSrc.type()); + arithm_op(formatedSrc, sc, grayRet, "Mul", stream); + + AscendMat matChannels[4]; + split(grayRet, matChannels, stream); + + OperatorRunner runner; + runner.setOp("AddN") + .addInput(matChannels[0], "x0") + .addInput(matChannels[1], "x1") + .addInput(matChannels[2], "x2") + .addOutput(formatedDst, "y") + .addAttr(3, "N") + .run(stream); + + convertBack(formatedDst, dst, stream); +} + +inline void cvtBGRtoGray(const InputArray& _src, OutputArray& _dst, int, bool swapBlue, + AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + cvtBGRtoGray(src, dst, 0, swapBlue, stream); + dst.download(_dst, stream); +} + +inline void cvtGraytoBGR(const AscendMat& src, AscendMat& dst, int dcn, bool, AscendStream& stream) +{ + checkImg(src); + CV_Assert(src.channels() == 1); + + AscendMat matChannels[4]; + for (int i = 0; i < 3; i++) + matChannels[i] = src; + + if (dcn == 4) + { + AscendMat& alpha = matChannels[3]; + alpha.create(src.rows, src.cols, CV_MAKE_TYPE(src.depth(), 1)); + matAlphaSet(alpha, -1, stream); + } + + merge(matChannels, dcn, dst, stream); +} + +inline void cvtGraytoBGR(const InputArray& _src, OutputArray& _dst, int dcn, bool, + AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + cvtGraytoBGR(src, dst, dcn, false, stream); + dst.download(_dst, stream); +} + +static const float RGB2XYZ_D65[] = {0.412453, 0.357580, 0.180423, 0.212671, 0.715160, + 0.072169, 0.019334, 0.119193, 0.950227}; + +static const float XYZ2RGB_D65[] = {3.240479, -1.53715, -0.498535, -0.969256, 1.875991, + 0.041556, 0.055648, -0.204043, 1.057311}; + +inline void matMulRGB(const AscendMat& src, AscendMat& dst, float* matrix, AscendStream& stream) +{ + checkImg(src); + CV_Assert(src.channels() == 3); + + dst.create(src.rows, src.cols, src.type()); + AscendMat 
formatedSrc = convertTo(src, CV_32F, stream); + AscendMat formatedDst = convertTo(dst, CV_32F, stream); + + int64_t dims[] = {3, 3}; + OperatorRunner runner; + runner.setOp("BatchMatMulV2") + .addInput(formatedSrc, "x1") + .addInput(matrix, dims, 2, getACLType(CV_32F), "x2") + .addOutput(formatedDst, "y") + .addAttr(false, "adj_x1") + .addAttr(true, "adj_x2") + .run(stream); + + if (src.depth() != CV_32F) + { + AscendMat thresholdTempMat(formatedSrc.size(), formatedSrc.type()); + uint16_t thresh = (src.depth() == CV_8U ? (1 << 8) : (1 << 16)) - 1; + threshold(formatedDst, thresholdTempMat, thresh, 0, 2 /*THRESH_TRUNC*/, stream); + threshold(thresholdTempMat, formatedDst, 0, 0, 3 /*THRESH_TOZERO*/, stream); + } + + convertBack(formatedDst, dst, stream); +} + +// TODO: should deal with overflow. set 255 instead of cut off. +inline void cvtBGRtoXYZ(const AscendMat& src, AscendMat& dst, int, bool swapBlue, + AscendStream& stream) +{ + float coeffs[9]; + memcpy(coeffs, RGB2XYZ_D65, 9 * sizeof(float)); + if (!swapBlue) + { + std::swap(coeffs[0], coeffs[2]); + std::swap(coeffs[3], coeffs[5]); + std::swap(coeffs[6], coeffs[8]); + } + matMulRGB(src, dst, coeffs, stream); +} + +inline void cvtBGRtoXYZ(const InputArray& _src, OutputArray& _dst, int, bool swapBlue, + AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + cvtBGRtoXYZ(src, dst, 0, swapBlue, stream); + dst.download(_dst, stream); +} + +inline void cvtXYZtoBGR(const AscendMat& src, AscendMat& dst, int dcn, bool swapBlue, + AscendStream& stream) +{ + float coeffs[9]; + memcpy(coeffs, XYZ2RGB_D65, 9 * sizeof(float)); + if (!swapBlue) + { + std::swap(coeffs[0], coeffs[6]); + std::swap(coeffs[1], coeffs[7]); + std::swap(coeffs[2], coeffs[8]); + } + + if (dcn == 4) + { + AscendMat tempMat[2]; + matMulRGB(src, tempMat[0], coeffs, stream); + tempMat[1].create(tempMat[0].rows, tempMat[0].cols, CV_MAKE_TYPE(tempMat[0].depth(), 1)); + matAlphaSet(tempMat[1], -1, stream); + merge(tempMat, 2, dst, stream); + } + else + matMulRGB(src, dst, coeffs, stream); +} + +inline void cvtXYZtoBGR(const InputArray& _src, OutputArray& _dst, int dcn, bool swapBlue, + AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + cvtXYZtoBGR(src, dst, dcn, swapBlue, stream); + dst.download(_dst, stream); +} + +// TODO duplicated code +static const float YCRF = 0.713f; +static const float YCBF = 0.564f; +static const float R2VF = 0.877f; +static const float B2UF = 0.492f; +inline void cvtBGRtoYCrCb(const AscendMat& src, AscendMat& dst, float* coeffs, bool swapBlue, + bool yuvOrder, AscendStream& stream) +{ + checkImg(src); + CV_Assert(src.channels() == 3); + + int buleIdx = swapBlue ? 2 : 0; + int depth = src.depth(); + float delta = (depth == CV_8U) ? 128 : ((depth == CV_16U) ? 
32768 : 0.5); + + dst.create(src.rows, src.cols, src.type()); + AscendMat formatedSrc = convertTo(src, CV_32F, stream); + AscendMat formatedDst = convertTo(dst, CV_32F, stream); + + AscendMat YCrCb[3], RGB[3]; + split(formatedSrc, RGB, stream); + cvtBGRtoGray(formatedSrc, YCrCb[0], 1, swapBlue, stream); + YCrCb[1].create(YCrCb[0].rows, YCrCb[0].cols, YCrCb[0].type()); + YCrCb[2].create(YCrCb[0].rows, YCrCb[0].cols, YCrCb[0].type()); + + AscendMat tempMat1(formatedSrc.size(), CV_MAKE_TYPE(formatedSrc.depth(), 1)), + tempMat2(formatedSrc.size(), CV_MAKE_TYPE(formatedSrc.depth(), 1)); + + arithm_op(RGB[buleIdx ^ 2], YCrCb[0], tempMat1, "Sub", stream); + arithm_op(tempMat1, coeffs[0], tempMat2, "Muls", stream); + arithm_op(tempMat2, delta, YCrCb[1], "Adds", stream); + + arithm_op(RGB[buleIdx], YCrCb[0], tempMat1, "Sub", stream); + arithm_op(tempMat1, coeffs[1], tempMat2, "Muls", stream); + arithm_op(tempMat2, delta, YCrCb[2], "Adds", stream); + + if (yuvOrder) + std::swap(YCrCb[1], YCrCb[2]); + + merge(YCrCb, 3, formatedDst, stream); + if (src.depth() != CV_32F) + { + AscendMat thresholdTempMat(formatedSrc.size(), formatedSrc.type()); + uint16_t thresh = (src.depth() == CV_8U ? (1 << 8) : (1 << 16)) - 1; + threshold(formatedDst, thresholdTempMat, thresh, 0, 2 /*THRESH_TRUNC*/, stream); + threshold(thresholdTempMat, formatedDst, 0, 0, 3 /*THRESH_TOZERO*/, stream); + } + + convertBack(formatedDst, dst, stream); +} + +inline void cvtBGRtoYCrCb(const InputArray& _src, OutputArray& _dst, float* coeffs, bool swapBlue, + bool yuvOrder, AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + cvtBGRtoYCrCb(src, dst, coeffs, swapBlue, yuvOrder, stream); + dst.download(_dst, stream); +} + +static const float CR2RF = 1.403f; +static const float CR2GF = -0.714f; +static const float CB2GF = -0.344f; +static const float CB2BF = 1.773f; + +static const float V2RF = 1.140f; +static const float V2GF = -0.581f; +static const float U2GF = -0.395f; +static const float U2BF = 2.032f; + +inline void cvtYCrCbtoBGR(const AscendMat& src, AscendMat& dst, int dcn, float* coeffs, + bool swapBlue, bool yuvOrder, AscendStream& stream) +{ + checkImg(src); + CV_Assert(src.channels() == 3); + + int buleIdx = swapBlue ? 2 : 0; + int depth = src.depth(); + float delta = (depth == CV_8U) ? 128 : ((depth == CV_16U) ? 
32768 : 0.5); + + dst.create(src.rows, src.cols, CV_MAKE_TYPE(src.depth(), dcn)); + AscendMat formatedSrc = convertTo(src, CV_32F, stream); + AscendMat formatedDst = convertTo(dst, CV_32F, stream); + + AscendMat YCrCb[3], RGB[4]; + split(formatedSrc, YCrCb, stream); + if (yuvOrder) + std::swap(YCrCb[1], YCrCb[2]); + + RGB[0].create(formatedSrc.rows, formatedSrc.cols, CV_MAKE_TYPE(formatedSrc.depth(), 1)); + RGB[1].create(formatedSrc.rows, formatedSrc.cols, CV_MAKE_TYPE(formatedSrc.depth(), 1)); + RGB[2].create(formatedSrc.rows, formatedSrc.cols, CV_MAKE_TYPE(formatedSrc.depth(), 1)); + AscendMat tempMat1(formatedSrc.size(), CV_MAKE_TYPE(formatedSrc.depth(), 1)), + tempMat2(formatedSrc.size(), CV_MAKE_TYPE(formatedSrc.depth(), 1)), + CbSubDelta(formatedSrc.size(), CV_MAKE_TYPE(formatedSrc.depth(), 1)), + CrSubDelta(formatedSrc.size(), CV_MAKE_TYPE(formatedSrc.depth(), 1)); + + arithm_op(YCrCb[1], (0.0f - delta), CrSubDelta, "Adds", stream); + arithm_op(YCrCb[2], (0.0f - delta), CbSubDelta, "Adds", stream); + arithm_op(CrSubDelta, coeffs[0], tempMat1, "Muls", stream); + arithm_op(YCrCb[0], tempMat1, RGB[buleIdx ^ 2], "Add", stream); + + arithm_op(CrSubDelta, coeffs[1], tempMat1, "Muls", stream); + arithm_op(YCrCb[0], tempMat1, tempMat2, "Add", stream); + arithm_op(CbSubDelta, coeffs[2], tempMat1, "Muls", stream); + arithm_op(tempMat2, tempMat1, RGB[1], "Add", stream); + + arithm_op(CbSubDelta, coeffs[3], tempMat1, "Muls", stream); + arithm_op(YCrCb[0], tempMat1, RGB[buleIdx], "Add", stream); + + if (dcn == 4) + { + RGB[3].create(RGB[0].rows, RGB[0].cols, RGB[0].type()); + matAlphaSet(RGB[3], src.depth(), stream); + } + + merge(RGB, dcn, formatedDst, stream); + if (src.depth() != CV_32F) + { + AscendMat thresholdTempMat(formatedSrc.size(), CV_MAKE_TYPE(formatedSrc.depth(), dcn)); + uint16_t thresh = (src.depth() == CV_8U ? (1 << 8) : (1 << 16)) - 1; + threshold(formatedDst, thresholdTempMat, thresh, 0, 2 /*THRESH_TRUNC*/, stream); + threshold(thresholdTempMat, formatedDst, 0, 0, 3 /*THRESH_TOZERO*/, stream); + } + + convertBack(formatedDst, dst, stream); +} + +inline void cvtYCrCbtoBGR(const InputArray& _src, OutputArray& _dst, int dcn, float* coeffs, + bool swapBlue, bool yuvOrder, AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + cvtYCrCbtoBGR(src, dst, dcn, coeffs, swapBlue, yuvOrder, stream); + dst.download(_dst, stream); +} + +// The input may be Input/OutputArray or AscendMat. Use templates to reduce duplicate code. 
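+// Each wrapper below pins the destination channel count and the blue/red swap for one
+// conversion code and forwards to the cvt* helpers above; cvtColorDo picks the wrapper
+// from a table indexed by the ColorConversionCodes value.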
+template +inline void BGR2BGRA(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoBGR(src, dst, 4, false, stream); +} + +template +inline void BGRA2BGR(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoBGR(src, dst, 3, false, stream); +} + +template +inline void BGR2RGBA(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoBGR(src, dst, 4, true, stream); +} + +template +inline void RGBA2BGR(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoBGR(src, dst, 3, true, stream); +} + +template +inline void BGR2RGB(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoBGR(src, dst, 3, true, stream); +} + +template +inline void BGRA2RGBA(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoBGR(src, dst, 4, true, stream); +} + +template +inline void BGR2GRAY(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoGray(src, dst, 1, false, stream); +} + +template +inline void RGB2GRAY(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoGray(src, dst, 1, true, stream); +} + +template +inline void GRAY2BGR(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtGraytoBGR(src, dst, 3, false, stream); +} + +template +inline void GRAY2BGRA(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtGraytoBGR(src, dst, 4, false, stream); +} + +template +inline void BGRA2GRAY(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoGray(src, dst, 1, false, stream); +} + +template +inline void RGBA2GRAY(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoGray(src, dst, 1, true, stream); +} + +template +inline void BGR2XYZ(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoXYZ(src, dst, 3, false, stream); +} + +template +inline void RGB2XYZ(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoXYZ(src, dst, 3, true, stream); +} + +template +inline void XYZ2BGR(const SRC& src, DST& dst, int dcn, AscendStream& stream) +{ + if (dcn <= 0) + dcn = 3; + cvtXYZtoBGR(src, dst, dcn, false, stream); +} + +template +inline void XYZ2RGB(const SRC& src, DST& dst, int dcn, AscendStream& stream) +{ + if (dcn <= 0) + dcn = 3; + cvtXYZtoBGR(src, dst, dcn, true, stream); +} + +template +inline void BGR2YCrCb(const SRC& src, DST& dst, int, AscendStream& stream) +{ + float coeffs[2]; + coeffs[0] = YCRF; + coeffs[1] = YCBF; + cvtBGRtoYCrCb(src, dst, coeffs, false, false, stream); +} + +template +inline void RGB2YCrCb(const SRC& src, DST& dst, int, AscendStream& stream) +{ + float coeffs[2]; + coeffs[0] = YCRF; + coeffs[1] = YCBF; + cvtBGRtoYCrCb(src, dst, coeffs, true, false, stream); +} + +template +inline void YCrCb2BGR(const SRC& src, DST& dst, int dcn, AscendStream& stream) +{ + float coeffs[4]; + coeffs[0] = CR2RF; + coeffs[1] = CR2GF; + coeffs[2] = CB2GF; + coeffs[3] = CB2BF; + if (dcn <= 0) + dcn = 3; + cvtYCrCbtoBGR(src, dst, dcn, coeffs, false, false, stream); +} + +template +inline void YCrCb2RGB(const SRC& src, DST& dst, int dcn, AscendStream& stream) +{ + float coeffs[4]; + coeffs[0] = CR2RF; + coeffs[1] = CR2GF; + coeffs[2] = CB2GF; + coeffs[3] = CB2BF; + if (dcn <= 0) + dcn = 3; + cvtYCrCbtoBGR(src, dst, dcn, coeffs, true, false, stream); +} + +template +inline void BGR2YUV(const SRC& src, DST& dst, int, AscendStream& stream) +{ + float coeffs[2]; + coeffs[0] = R2VF; + coeffs[1] = B2UF; + cvtBGRtoYCrCb(src, dst, coeffs, false, true, stream); +} + +template +inline void RGB2YUV(const SRC& src, DST& dst, int, AscendStream& stream) +{ + float coeffs[2]; + 
coeffs[0] = R2VF; + coeffs[1] = B2UF; + cvtBGRtoYCrCb(src, dst, coeffs, true, true, stream); +} + +template +inline void YUV2BGR(const SRC& src, DST& dst, int dcn, AscendStream& stream) +{ + float coeffs[4]; + coeffs[0] = V2RF; + coeffs[1] = V2GF; + coeffs[2] = U2GF; + coeffs[3] = U2BF; + if (dcn <= 0) + dcn = 3; + cvtYCrCbtoBGR(src, dst, dcn, coeffs, false, true, stream); +} + +template +inline void YUV2RGB(const SRC& src, DST& dst, int dcn, AscendStream& stream) +{ + float coeffs[4]; + coeffs[0] = V2RF; + coeffs[1] = V2GF; + coeffs[2] = U2GF; + coeffs[3] = U2BF; + if (dcn <= 0) + dcn = 3; + cvtYCrCbtoBGR(src, dst, dcn, coeffs, true, true, stream); +} + +template +void cvtColorDo(const SRC& src, DST& dst, int code, int dcn, AscendStream& stream) +{ + typedef void (*func_t)(const SRC& src, DST& dst, int dcn, AscendStream& stream); + static const func_t funcs[] = { + BGR2BGRA, // CV_BGR2BGRA =0 + BGRA2BGR, // CV_BGRA2BGR =1 + BGR2RGBA, // CV_BGR2RGBA =2 + RGBA2BGR, // CV_RGBA2BGR =3 + BGR2RGB, // CV_BGR2RGB =4 + BGRA2RGBA, // CV_BGRA2RGBA =5 + + BGR2GRAY, // CV_BGR2GRAY =6 + RGB2GRAY, // CV_RGB2GRAY =7 + GRAY2BGR, // CV_GRAY2BGR =8 + GRAY2BGRA, // CV_GRAY2BGRA =9 + BGRA2GRAY, // CV_BGRA2GRAY =10 + RGBA2GRAY, // CV_RGBA2GRAY =11 + + 0, // CV_BGR2BGR565 =12 + 0, // CV_RGB2BGR565 =13 + 0, // CV_BGR5652BGR =14 + 0, // CV_BGR5652RGB =15 + 0, // CV_BGRA2BGR565 =16 + 0, // CV_RGBA2BGR565 =17 + 0, // CV_BGR5652BGRA =18 + 0, // CV_BGR5652RGBA =19 + + 0, // CV_GRAY2BGR565 =20 + 0, // CV_BGR5652GRAY =21 + + 0, // CV_BGR2BGR555 =22 + 0, // CV_RGB2BGR555 =23 + 0, // CV_BGR5552BGR =24 + 0, // CV_BGR5552RGB =25 + 0, // CV_BGRA2BGR555 =26 + 0, // CV_RGBA2BGR555 =27 + 0, // CV_BGR5552BGRA =28 + 0, // CV_BGR5552RGBA =29 + + 0, // CV_GRAY2BGR555 =30 + 0, // CV_BGR5552GRAY =31 + + BGR2XYZ, // CV_BGR2XYZ =32 + RGB2XYZ, // CV_RGB2XYZ =33 + XYZ2BGR, // CV_XYZ2BGR =34 + XYZ2RGB, // CV_XYZ2RGB =35 + + BGR2YCrCb, // CV_BGR2YCrCb =36 + RGB2YCrCb, // CV_RGB2YCrCb =37 + YCrCb2BGR, // CV_YCrCb2BGR =38 + YCrCb2RGB, // CV_YCrCb2RGB =39 + + 0, // CV_BGR2HSV =40 + 0, // CV_RGB2HSV =41 + + 0, // =42 + 0, // =43 + + 0, // CV_BGR2Lab =44 + 0, // CV_RGB2Lab =45 + + 0, // CV_BayerBG2BGR =46 + 0, // CV_BayeRGB2BGR =47 + 0, // CV_BayerRG2BGR =48 + 0, // CV_BayerGR2BGR =49 + + 0, // CV_BGR2Luv =50 + 0, // CV_RGB2Luv =51 + + 0, // CV_BGR2HLS =52 + 0, // CV_RGB2HLS =53 + + 0, // CV_HSV2BGR =54 + 0, // CV_HSV2RGB =55 + + 0, // CV_Lab2BGR =56 + 0, // CV_Lab2RGB =57 + 0, // CV_Luv2BGR =58 + 0, // CV_Luv2RGB =59 + + 0, // CV_HLS2BGR =60 + 0, // CV_HLS2RGB =61 + + 0, // CV_BayerBG2BGR_VNG =62 + 0, // CV_BayeRGB2BGR_VNG =63 + 0, // CV_BayerRG2BGR_VNG =64 + 0, // CV_BayerGR2BGR_VNG =65 + + 0, // CV_BGR2HSV_FULL = 66 + 0, // CV_RGB2HSV_FULL = 67 + 0, // CV_BGR2HLS_FULL = 68 + 0, // CV_RGB2HLS_FULL = 69 + + 0, // CV_HSV2BGR_FULL = 70 + 0, // CV_HSV2RGB_FULL = 71 + 0, // CV_HLS2BGR_FULL = 72 + 0, // CV_HLS2RGB_FULL = 73 + + 0, // CV_LBGR2Lab = 74 + 0, // CV_LRGB2Lab = 75 + 0, // CV_LBGR2Luv = 76 + 0, // CV_LRGB2Luv = 77 + + 0, // CV_Lab2LBGR = 78 + 0, // CV_Lab2LRGB = 79 + 0, // CV_Luv2LBGR = 80 + 0, // CV_Luv2LRGB = 81 + + BGR2YUV, // CV_BGR2YUV = 82 + RGB2YUV, // CV_RGB2YUV = 83 + YUV2BGR, // CV_YUV2BGR = 84 + YUV2RGB, // CV_YUV2RGB = 85 + + 0, // CV_BayerBG2GRAY = 86 + 0, // CV_BayeRGB2GRAY = 87 + 0, // CV_BayerRG2GRAY = 88 + 0, // CV_BayerGR2GRAY = 89 + + // YUV 4:2:0 formats family + 0, // CV_YUV2RGB_NV12 = 90, + 0, // CV_YUV2BGR_NV12 = 91, + 0, // CV_YUV2RGB_NV21 = 92, + 0, // CV_YUV2BGR_NV21 = 93, + + 0, // CV_YUV2RGBA_NV12 = 94, + 0, 
// CV_YUV2BGRA_NV12 = 95, + 0, // CV_YUV2RGBA_NV21 = 96, + 0, // CV_YUV2BGRA_NV21 = 97, + + 0, // CV_YUV2RGB_YV12 = 98, + 0, // CV_YUV2BGR_YV12 = 99, + 0, // CV_YUV2RGB_IYUV = 100, + 0, // CV_YUV2BGR_IYUV = 101, + + 0, // CV_YUV2RGBA_YV12 = 102, + 0, // CV_YUV2BGRA_YV12 = 103, + 0, // CV_YUV2RGBA_IYUV = 104, + 0, // CV_YUV2BGRA_IYUV = 105, + + 0, // CV_YUV2GRAY_420 = 106, + + // YUV 4:2:2 formats family + 0, // CV_YUV2RGB_UYVY = 107, + 0, // CV_YUV2BGR_UYVY = 108, + 0, // //CV_YUV2RGB_VYUY = 109, + 0, // //CV_YUV2BGR_VYUY = 110, + + 0, // CV_YUV2RGBA_UYVY = 111, + 0, // CV_YUV2BGRA_UYVY = 112, + 0, // //CV_YUV2RGBA_VYUY = 113, + 0, // //CV_YUV2BGRA_VYUY = 114, + + 0, // CV_YUV2RGB_YUY2 = 115, + 0, // CV_YUV2BGR_YUY2 = 116, + 0, // CV_YUV2RGB_YVYU = 117, + 0, // CV_YUV2BGR_YVYU = 118, + + 0, // CV_YUV2RGBA_YUY2 = 119, + 0, // CV_YUV2BGRA_YUY2 = 120, + 0, // CV_YUV2RGBA_YVYU = 121, + 0, // CV_YUV2BGRA_YVYU = 122, + + 0, // CV_YUV2GRAY_UYVY = 123, + 0, // CV_YUV2GRAY_YUY2 = 124, + + // alpha premultiplication + 0, // CV_RGBA2mRGBA = 125, + 0, // CV_mRGBA2RGBA = 126, + + 0, // CV_COLORCVT_MAX = 127 + }; + + CV_Assert(code < 128); + + func_t func = funcs[code]; + + if (func == 0) + CV_Error(Error::StsBadFlag, "Unknown/unsupported color conversion code"); + + func(src, dst, dcn, stream); +} + +// Instantiate templates to avoid confusion in python code generation +void cvtColor(const InputArray src, OutputArray dst, int code, int dcn, AscendStream& stream) +{ + cvtColorDo(src, dst, code, dcn, stream); +} + +void cvtColor(const AscendMat& src, AscendMat& dst, int code, int dcn, AscendStream& stream) +{ + cvtColorDo(src, dst, code, dcn, stream); +} + +} // namespace cann +} // namespace cv \ No newline at end of file diff --git a/modules/cannops/src/core.cpp b/modules/cannops/src/core.cpp new file mode 100644 index 00000000000..7d328915ef9 --- /dev/null +++ b/modules/cannops/src/core.cpp @@ -0,0 +1,310 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "precomp.hpp" + +namespace cv +{ +namespace cann +{ +// Transform data type from one to another. eg. from NCHW to NHWC. 
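+// Illustrative call (assuming an NCHW-formatted source tensor):
+//     transData(src, dst, "NCHW", "NHWC", stream);
+// which simply runs the CANN "TransData" operator with the given src_format/dst_format attributes.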
+void transData(const AscendMat& src, AscendMat& dst, const char* from, const char* to,
+               AscendStream& stream)
+{
+    OperatorRunner runner;
+    runner.setOp("TransData")
+        .addInput(src, "src")
+        .addOutput(dst, "dst")
+        .addAttr(from, "src_format")
+        .addAttr(to, "dst_format")
+        .run(stream);
+}
+
+void merge(const AscendMat* src, size_t n, AscendMat& dst, AscendStream& stream)
+{
+    if (src == nullptr || n < 2)
+        return;
+
+    int depth = src->depth();
+    int rows = src->rows;
+    int cols = src->cols;
+
+    // All matrices must have the same size and type.
+    for (size_t i = 1; i < n; i++)
+    {
+        CV_Assert(src[i].depth() == depth && src[i].channels() == 1);
+        CV_Assert(src[i].rows == rows && src[i].cols == cols);
+    }
+
+    int cns = 0;
+    for (size_t i = 0; i < n; i++)
+        cns += src[i].channels();
+    dst.create(src->rows, src->cols, CV_MAKE_TYPE(src->depth(), cns));
+
+    OperatorRunner runner;
+    runner.setOp("ConcatD");
+
+    for (size_t i = 0; i < n; i++)
+    {
+        runner.addInput(src[i], ("x" + std::to_string(i)).c_str());
+    }
+
+    runner.addOutput(dst, "output_data").addAttr(3, "concat_dim").run(stream);
+}
+
+void merge(const std::vector<AscendMat>& src, AscendMat& dst, AscendStream& stream)
+{
+    merge(&src[0], src.size(), dst, stream);
+}
+
+void merge(const AscendMat* src, size_t n, OutputArray& _dst, AscendStream& stream)
+{
+    AscendMat dst;
+    merge(src, n, dst, stream);
+    dst.download(_dst, stream);
+}
+void merge(const std::vector<AscendMat>& src, OutputArray& dst, AscendStream& stream)
+{
+    merge(&src[0], src.size(), dst, stream);
+}
+
+void split(const AscendMat& src, AscendMat* dst, AscendStream& stream)
+{
+    if (src.empty() || dst == nullptr)
+        return;
+
+    int cn = src.channels();
+
+    OperatorRunner runner;
+    runner.setOp("SplitD").addInput(src, "x");
+    for (int i = 0; i < cn; i++)
+    {
+        dst[i].create(src.rows, src.cols, CV_MAKE_TYPE(src.depth(), 1));
+        runner.addOutput(dst[i], ("y" + std::to_string(i)).c_str());
+    }
+    runner.addAttr(3, "split_dim").addAttr(cn, "num_split").run(stream);
+}
+
+void split(const AscendMat& src, std::vector<AscendMat>& dst, AscendStream& stream)
+{
+    dst.resize(src.channels());
+    split(src, &dst[0], stream);
+}
+
+void split(const InputArray _src, AscendMat* dst, AscendStream& stream)
+{
+    AscendMat src;
+    src.upload(_src, stream);
+    split(src, dst, stream);
+}
+void split(const InputArray _src, std::vector<AscendMat>& dst, AscendStream& stream)
+{
+    AscendMat src;
+    src.upload(_src, stream);
+    dst.resize(src.channels());
+    split(_src, &dst[0], stream);
+}
+
+void transpose(const AscendMat& src, int64_t* perm, AscendMat& dst, AscendStream& stream)
+{
+    OperatorRunner runner;
+    runner.setOp("TransposeD")
+        .addInput(src, "x")
+        .addOutput(dst, "y")
+        .addAttr(perm, 4, "perm")
+        .run(stream);
+}
+
+void transpose(const AscendMat& src, AscendMat& dst, AscendStream& stream)
+{
+    int64_t perm[] = {0, 2, 1, 3};
+    dst.create(src.cols, src.rows, src.type());
+    transpose(src, perm, dst, stream);
+}
+
+void transpose(InputArray _src, OutputArray _dst, AscendStream& stream)
+{
+    AscendMat src, dst;
+    src.upload(_src, stream);
+    transpose(src, dst, stream);
+    dst.download(_dst, stream);
+}
+
+void flip(const AscendMat& src, std::vector<int32_t>& asixs, AscendMat& dst, AscendStream& stream)
+{
+    int64_t dim = asixs.size();
+    OperatorRunner runner;
+    runner.setOp("ReverseV2")
+        .addInput(src, "x")
+        .addInput(&asixs.at(0), &dim, 1, ACL_INT32, "axis")
+        .addOutput(dst, "y")
+        .run(stream);
+}
+
+void flip(const AscendMat& src, AscendMat& dst, int flipCode, AscendStream& stream)
+{
+    std::vector<int32_t> asix;
+    if (flipCode == 0)
+        asix.push_back(1);
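+    // (Assuming the NHWC layout used throughout this module: axis 1 is the height dimension and
+    //  axis 2 the width dimension, so flipCode == 0 flips vertically, flipCode > 0 horizontally,
+    //  and a negative flipCode flips around both axes.)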
+ else if (flipCode > 0) + asix.push_back(2); + else + { + asix.push_back(1); + asix.push_back(2); + } + dst.create(src.rows, src.cols, src.type()); + flip(src, asix, dst, stream); +} + +void flip(const InputArray _src, OutputArray _dst, int flipCode, AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + flip(src, dst, flipCode, stream); + dst.download(_dst, stream); +} + +void rotate(const AscendMat& src, AscendMat& dst, int rotateMode, AscendStream& stream) +{ + AscendMat tempMat; + switch (rotateMode) + { + case ROTATE_90_CLOCKWISE: + { + dst.create(src.cols, src.rows, src.type()); + transpose(src, tempMat, stream); + flip(tempMat, dst, 1, stream); + break; + } + case ROTATE_180: + { + dst.create(src.rows, src.cols, src.type()); + flip(src, dst, -1, stream); + break; + } + case ROTATE_90_COUNTERCLOCKWISE: + { + dst.create(src.cols, src.rows, src.type()); + transpose(src, tempMat, stream); + flip(tempMat, dst, 0, stream); + break; + } + default: + break; + } +} + +void rotate(InputArray _src, OutputArray _dst, int rotateMode, AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + rotate(src, dst, rotateMode, stream); + dst.download(_dst, stream); +} + +void crop(const AscendMat& src, AscendMat& dst, const AscendMat& sizeSrcNpu, int64_t* offset, + AscendStream& stream) +{ + OperatorRunner runner; + runner.setOp("Crop") + .addInput(src, "x") + .addInput(sizeSrcNpu, "size") + .addAttr(1, "axis") + .addAttr(offset, 3, "offsets") + .addOutput(dst, "y") + .run(stream); +} + +AscendMat crop(const AscendMat& src, const Rect& rect, AscendStream& stream) +{ + AscendMat dst, sizeSrcNpu; + // left-up conner + int x = rect.x, y = rect.y, width = rect.width, height = rect.height; + int64_t offset[] = {y, x, 0}; + + CV_Assert(x + width <= src.cols && y + height <= src.rows); + int size1[] = {1, src.channels(), height, width}; + dst.create(height, width, src.type()); + + Mat sizeSrc(height, width, src.type(), size1); + sizeSrcNpu.upload(sizeSrc); + crop(src, dst, sizeSrcNpu, offset, stream); + + return dst; +} +AscendMat crop(InputArray _src, const Rect& rect, AscendStream& stream) +{ + AscendMat src; + src.upload(_src, stream); + return crop(src, rect, stream); +} + +void resize(const AscendMat& src, AscendMat& dst, int32_t* dstSize, int interpolation, + AscendStream& stream) +{ + OperatorRunner runner; + int64_t dims[] = {2}; + char const* mode = ""; + switch (interpolation) + { + case INTER_CUBIC: + mode = "ResizeBicubic"; + break; + case INTER_AREA: + mode = "ResizeArea"; + break; + default: + break; + } + + runner.setOp(mode) + .addInput(src, "images") + .addInput(dstSize, dims, 1, ACL_INT32, "size") + .addAttr(true, "half_pixel_centers") + .addOutput(dst, "y") + .run(stream); +} + +void resize(const AscendMat& src, AscendMat& dst, Size dsize, double inv_scale_x, + double inv_scale_y, int interpolation, AscendStream& stream) +{ + Size ssize = src.size(); + CV_Assert(!ssize.empty()); + float_t scaleX = (float_t)inv_scale_x; + float_t scaleY = (float_t)inv_scale_y; + CV_Assert(interpolation == INTER_CUBIC || interpolation == INTER_AREA); + + if (dsize.empty()) + { + CV_Assert(scaleX > 0); + CV_Assert(scaleY > 0); + dsize = Size(saturate_cast(ssize.width * inv_scale_x), + saturate_cast(ssize.height * inv_scale_y)); + CV_Assert(!dsize.empty()); + } + else + { + scaleX = (float_t)dsize.width / ssize.width; + scaleY = (float_t)dsize.height / ssize.height; + CV_Assert(scaleX > 0); + CV_Assert(scaleY > 0); + } + + int32_t dstSize[] = {dsize.width, dsize.height}; 
+ dst.create(dstSize[0], dstSize[1], src.type()); + resize(src, dst, dstSize, interpolation, stream); +} + +void resize(InputArray _src, OutputArray _dst, Size dsize, double inv_scale_x, double inv_scale_y, + int interpolation, AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + resize(src, dst, dsize, inv_scale_x, inv_scale_y, interpolation, stream); + dst.download(_dst, stream); +} + +} // namespace cann +} // namespace cv diff --git a/modules/cannops/src/element_operations.cpp b/modules/cannops/src/element_operations.cpp new file mode 100644 index 00000000000..402658369b5 --- /dev/null +++ b/modules/cannops/src/element_operations.cpp @@ -0,0 +1,499 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "precomp.hpp" +namespace cv +{ +namespace cann +{ + +static inline void applyMask(const AscendMat& src, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) +{ + int mtype = mask.type(); + CV_Assert((mtype == CV_8UC1 || mtype == CV_8SC1) && mask.size() == src.size()); + AscendMat onesMask, castedMask; + onesMask.create(mask.rows, mask.cols, mask.type()); + + OperatorRunner runner; + runner.setOp("Div") + .addInput(mask, "x1") + .addInput(mask, "x2") + .addOutput(onesMask, "y") + .run(stream); + + onesMask.convertTo(castedMask, dst.depth(), stream); + arithm_op(src, castedMask, dst, "Mul", stream); +} + +static inline void applyScale(const AscendMat& src, AscendMat& dst, float scale, + AscendStream& stream) +{ + OperatorRunner runner; + arithm_op(src, scale, dst, "Muls", stream); +} + +void arithm_op(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const char* op, + AscendStream& stream) +{ + if (src2.empty()) + arithm_op(src1, dst, op, stream); + else + { + OperatorRunner runner; + runner.setOp(op).addInput(src1, "x1").addInput(src2, "x2").addOutput(dst, "y").run(stream); + } +} + +void arithm_op(const AscendMat& src, const Scalar& sc, AscendMat& dst, const char* op, + AscendStream& stream) +{ + OperatorRunner runner; + runner.setOp(op) + .addInput(src, "x1") + .addInput(sc, src.type(), "x2") + .addOutput(dst, "y") + .run(stream); +} + +void arithm_op(const Scalar& sc, const AscendMat& src, AscendMat& dst, const char* op, + AscendStream& stream) +{ + OperatorRunner runner; + runner.setOp(op) + .addInput(sc, src.type(), "x1") + .addInput(src, "x2") + .addOutput(dst, "y") + .run(stream); +} + +void arithm_op(const AscendMat& src, AscendMat& dst, const char* op, AscendStream& stream) +{ + OperatorRunner runner; + runner.setOp(op).addInput(src, "x").addOutput(dst, "y").run(stream); +} + +void arithm_op(const AscendMat& src, float scalar, AscendMat& dst, const char* op, + AscendStream& stream) +{ + OperatorRunner runner; + runner.setOp(op).addInput(src, "x").addAttr(scalar, "value").addOutput(dst, "y").run(stream); +} + +// Helper function for template arithm_op. all function called in template arithm_op should be +// done in both AscendMat and Scalar. 
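+// For instance, the templated arithm_op below may be instantiated as arithm_op<AscendMat, Scalar>
+// or arithm_op<Scalar, AscendMat>, so each helper has an overload for both types; the Scalar
+// overloads simply report "no shape information" (depth, channels and size of -1).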
+static void getInputInfo(const AscendMat& src, int& depth, int& cn, Size& size) +{ + depth = src.depth(); + cn = src.channels(); + size = src.size(); +} + +static void getInputInfo(const Scalar& src, int& depth, int& cn, Size& size) +{ + CV_UNUSED(src); + depth = -1; + cn = -1; + size = {-1, -1}; +} + +static void convert(const AscendMat& src, AscendMat& dst, AscendStream& stream) +{ + src.convertTo(dst, CV_32F, stream); +} + +static void convert(const Scalar& src, Scalar& dst, AscendStream& stream) +{ + CV_UNUSED(stream); + dst = src; +} + +template +static void arithm_op(const T1& src1, const T2& src2, AscendMat& dst, const AscendMat& mask, float scale, + int dtype, const char* op, AscendStream& stream) +{ + T1 castedSrc1; + T2 castedSrc2; + AscendMat castedRet; + + int sdepth1, sdepth2, scn1, scn2; + Size ssize1, ssize2; + getInputInfo(src1, sdepth1, scn1, ssize1); + getInputInfo(src2, sdepth2, scn2, ssize2); + + int sdepth = sdepth1 == -1 ? sdepth2 : sdepth1; + int cn = scn1 == -1 ? scn2 : scn1; + Size size = sdepth1 == -1 ? ssize2 : ssize1; + + if (sdepth1 != -1 && sdepth2 != -1 && !ssize1.empty() && !ssize2.empty()) + CV_Assert(sdepth1 == sdepth2 && scn1 == scn2 && ssize1 == ssize2); + + if (dtype < 0) + dtype = sdepth; + const int ddepth = CV_MAT_DEPTH(dtype); + CV_Assert(sdepth <= CV_16F && ddepth <= CV_16F); + + dst.create(size.height, size.width, CV_MAKE_TYPE(ddepth, cn)); + + // In order to achieve high accuracy, convert integers to float for calculation. + if (scale != 1 && dtype < CV_32F) + { + convert(src1, castedSrc1, stream); + convert(src2, castedSrc2, stream); + castedRet.create(size.height, size.width, CV_MAKE_TYPE(CV_32F, cn)); + } + else + { + castedSrc1 = src1; + castedSrc2 = src2; + castedRet = dst; + } + + // step1, calculate operator. + OperatorRunner runner; + arithm_op(castedSrc1, castedSrc2, castedRet, op, stream); + + // step2, apply mask if need. + if (!mask.empty()) + applyMask(castedRet, castedRet, mask, stream); + + // step3, apply scale if need. + if (scale != 1) + applyScale(castedRet, castedRet, scale, stream); + + // After rounding the result, convert the type to the original type. 
+    if (castedRet.depth() != dst.depth())
+    {
+        runner.setOp("Round").addInput(castedRet, "x").addOutput(castedRet, "y").run(stream);
+        castedRet.convertTo(dst, stream);
+    }
+}
+
+static void arithm_op(const InputArray _src1, const InputArray _src2, OutputArray _dst, const InputArray _mask,
+                      float scale, int dtype, const char* op, AscendStream& stream)
+{
+    const bool isScalar1 = (_src1.kind() == _InputArray::MATX);
+    const bool isScalar2 = (_src2.kind() == _InputArray::MATX);
+
+    if (isScalar1 && isScalar2)
+        CV_Error(Error::StsBadArg, "At least one matrix parameter should be passed.");
+
+    AscendMat src1, src2, dst, mask;
+    Mat scalar;
+
+    if (!isScalar1 && !_src1.empty())
+        src1.upload(_src1, stream);
+    if (!isScalar2 && !_src2.empty())
+        src2.upload(_src2, stream);
+
+    if (!_mask.empty())
+        mask.upload(_mask, stream);
+
+    Scalar val;
+    if (isScalar1)
+        scalar = _src1.getMat();
+    else if (isScalar2)
+        scalar = _src2.getMat();
+
+    if (!scalar.empty())
+    {
+        CV_Assert(scalar.total() <= 4);
+        scalar.convertTo(Mat_<double>(scalar.rows, scalar.cols, &val[0]), CV_64F);
+    }
+
+    if (isScalar1)
+        arithm_op(val, src2, dst, mask, scale, dtype, op, stream);
+    else if (isScalar2)
+        arithm_op(src1, val, dst, mask, scale, dtype, op, stream);
+    else
+        arithm_op(src1, src2, dst, mask, scale, dtype, op, stream);
+
+    dst.download(_dst, stream);
+}
+
+// To provide more interfaces, several overloads with different signatures are declared below.
+void add(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, int dtype,
+         AscendStream& stream)
+{
+    arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream);
+}
+
+void add(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, int dtype,
+         AscendStream& stream)
+{
+    arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream);
+}
+
+void add(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, int dtype,
+         AscendStream& stream)
+{
+    arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream);
+}
+
+void add(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, int dtype,
+         AscendStream& stream)
+{
+    arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream);
+}
+
+
+void subtract(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, int dtype,
+              AscendStream& stream)
+{
+    arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream);
+}
+
+void subtract(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, int dtype,
+              AscendStream& stream)
+{
+    arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream);
+}
+
+void subtract(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, int dtype,
+              AscendStream& stream)
+{
+    arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream);
+}
+
+void subtract(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, int dtype,
+              AscendStream& stream)
+{
+    arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream);
+}
+
+
+void multiply(const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype,
+              AscendStream& stream)
+{
+    arithm_op(src1, src2, dst, noArray(), scale, dtype, "Mul", stream);
+}
+
+void multiply(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, float scale, int dtype,
+              AscendStream& stream)
+{
+    arithm_op(src1, src2, dst, AscendMat(), scale, dtype, "Mul", stream);
+}
+
+void multiply(const AscendMat& src1, const Scalar& src2, AscendMat& dst, float scale, int dtype, +
AscendStream& stream) +{ + arithm_op(src1, src2, dst, AscendMat(), scale, dtype, "Mul", stream); +} + +void multiply(const Scalar& src1, const AscendMat& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, AscendMat(), scale, dtype, "Mul", stream); +} + + +void divide(const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, noArray(), scale, dtype, "RealDiv", stream); +} + +void divide(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, AscendMat(), scale, dtype, "RealDiv", stream); +} + +void divide(const AscendMat& src1, const Scalar& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, AscendMat(), scale, dtype, "RealDiv", stream); +} + +void divide(const Scalar& src1, const AscendMat& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, AscendMat(), scale, dtype, "RealDiv", stream); +} + + +void bitwise_and(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseAnd", stream); +} + +void bitwise_and(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseAnd", stream); +} + +void bitwise_and(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseAnd", stream); +} + +void bitwise_and(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseAnd", stream); +} + + +void bitwise_or(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseOr", stream); +} + +void bitwise_or(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseOr", stream); +} + +void bitwise_or(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseOr", stream); +} + +void bitwise_or(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseOr", stream); +} + + +void bitwise_xor(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseXor", stream); +} + +void bitwise_xor(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseXor", stream); +} + +void bitwise_xor(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseXor", stream); +} + +void bitwise_xor(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseXor", stream); +} + + +void bitwise_not(const InputArray src, 
                 OutputArray dst, const InputArray mask, AscendStream& stream)
+{
+    arithm_op(src, noArray(), dst, mask, 1, -1, "Invert", stream);
+}
+
+void bitwise_not(const AscendMat& src, AscendMat& dst, const AscendMat& mask, AscendStream& stream)
+{
+    arithm_op(src, AscendMat(), dst, mask, 1, -1, "Invert", stream);
+}
+
+
+void addWeighted(const AscendMat& src1, double alpha, const AscendMat& src2, double beta, double gamma,
+                 AscendMat& dst, int dtype, AscendStream& stream)
+{
+    if (dtype < 0)
+        dtype = src1.depth();
+
+    CV_Assert(src2.depth() == src1.depth() && src2.size() == src1.size() &&
+              src1.channels() == src2.channels());
+
+    int type = CV_MAKE_TYPE(dtype, src1.channels());
+    dst.create(src1.rows, src1.cols, type);
+
+    // TODO: Consider overflow, should extend type or not?
+    AscendMat src1Weighted(src1.size(), type), src2Weighted(src1.size(), type),
+        srcWeightedSumRet(src1.size(), type);
+
+    arithm_op(src1, (float)alpha, src1Weighted, "Muls", stream);
+    arithm_op(src2, (float)beta, src2Weighted, "Muls", stream);
+    arithm_op(src1Weighted, src2Weighted, srcWeightedSumRet, "Add", stream);
+    arithm_op(srcWeightedSumRet, (float)gamma, dst, "Adds", stream);
+}
+
+void addWeighted(const InputArray _src1, double alpha, const InputArray _src2, double beta, double gamma,
+                 OutputArray _dst, int dtype, AscendStream& stream)
+{
+    AscendMat src1, src2, dst;
+    src1.upload(_src1, stream);
+    src2.upload(_src2, stream);
+    addWeighted(src1, alpha, src2, beta, gamma, dst, dtype, stream);
+    dst.download(_dst, stream);
+}
+
+double threshold(const AscendMat& src, AscendMat& dst, double thresh, double maxval, int type,
+                 AscendStream& stream)
+{
+    // ThresholdTypes is defined in opencv2/imgproc; it is the only symbol we need from that
+    // module. Adding imgproc as a dependency is too heavy, so magic numbers are used instead.
+    CV_Assert(type <= 4 /*THRESH_TOZERO_INV*/);
+
+    AscendMat threshMat(src.size(), src.type());
+
+    dst.create(src.rows, src.cols, src.type());
+
+    OperatorRunner runner;
+    runner.setOp("Threshold")
+        .addInput(src, "x")
+        .addOutput(threshMat, "y")
+        .addAttr((float)thresh, "threshold")
+        .run(stream);
+
+    // THRESH_*_INV and THRESH_TRUNC need an inverted threshMat.
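+    // Assuming the CANN "Threshold" operator above yields a 0/1 mask (x > thresh ? 1 : 0), the
+    // branches below compose the supported modes roughly as:
+    //   THRESH_BINARY     (0): mask * maxval
+    //   THRESH_BINARY_INV (1): (1 - mask) * maxval
+    //   THRESH_TRUNC      (2): (1 - mask) * src + mask * thresh
+    //   THRESH_TOZERO     (3): mask * src
+    //   THRESH_TOZERO_INV (4): (1 - mask) * src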
+ // THRESH_BINARY_INV = 1, THRESH_TRUNC = 2, THRESH_TOZERO_INV = 4, + if (type == 1 || type == 2 || type == 4) + { + AscendMat threshInvMat(src.size(), src.type()); + AscendMat ones(src.size(), src.type()); + Scalar s(1, 1, 1, 1); + ones.setTo(s, stream); + arithm_op(ones, threshMat, threshInvMat, "Sub", stream); + + if (type == 1) + arithm_op(threshInvMat, (float)maxval, dst, "Muls", stream); + else if (type == 2) + { + AscendMat ToZeroInvMat(src.size(), src.type()); + AscendMat TruncMat(src.size(), src.type()); + arithm_op(threshInvMat, src, ToZeroInvMat, "Mul", stream); + arithm_op(threshMat, (float)thresh, TruncMat, "Muls", stream); + arithm_op(ToZeroInvMat, TruncMat, dst, "Add", stream); + } + else + arithm_op(threshInvMat, src, dst, "Mul", stream); + } + else + { + if (type == 0) /* THRESH_BINARY = 0 */ + arithm_op(threshMat, (float)maxval, dst, "Muls", stream); + else if (type == 3) /* THRESH_TOZERO = 3 */ + arithm_op(threshMat, src, dst, "Mul", stream); + else + CV_Error(Error::StsError, "Unknown/unsupported threshold type"); + } + return thresh; +} + +double threshold(const InputArray _src, OutputArray _dst, double thresh, double maxval, int type, + AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + dst.create(src.rows, src.cols, src.type()); + double ret = threshold(src, dst, thresh, maxval, type, stream); + dst.download(_dst, stream); + return ret; +} + +} // namespace cann +} // namespace cv diff --git a/modules/cannops/src/precomp.hpp b/modules/cannops/src/precomp.hpp new file mode 100644 index 00000000000..8411cc40407 --- /dev/null +++ b/modules/cannops/src/precomp.hpp @@ -0,0 +1,14 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef __OPENCV_PRECOMP_H__ +#define __OPENCV_PRECOMP_H__ + +#include "opencv2/cann.hpp" +#include "opencv2/stream_accessor.hpp" +#include "opencv2/cann_call.hpp" +#include "opencv2/cann_interface.hpp" +#include "opencv2/cann_private.hpp" + +#endif /* __OPENCV_PRECOMP_H__ */ diff --git a/modules/cannops/test/test_core.cpp b/modules/cannops/test/test_core.cpp new file mode 100644 index 00000000000..6b63a8cf061 --- /dev/null +++ b/modules/cannops/test/test_core.cpp @@ -0,0 +1,217 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include "test_precomp.hpp" +#include + +namespace opencv_test +{ +namespace +{ +TEST(CORE, MERGE) +{ + Mat m1 = (Mat_(2, 2) << 1, 4, 7, 10); + Mat m2 = (Mat_(2, 2) << 2, 5, 8, 11); + Mat m3 = (Mat_(2, 2) << 3, 6, 9, 12); + Mat channels[3] = {m1, m2, m3}; + Mat m; + cv::merge(channels, 3, m); + + cv::cann::setDevice(0); + + AscendMat a1, a2, a3; + a1.upload(m1); + a2.upload(m2); + a3.upload(m3); + AscendMat aclChannels[3] = {a1, a2, a3}; + std::vector aclChannelsVector; + aclChannelsVector.push_back(a1); + aclChannelsVector.push_back(a2); + aclChannelsVector.push_back(a3); + + Mat checker1, checker2; + cv::cann::merge(aclChannels, 3, checker1); + cv::cann::merge(aclChannelsVector, checker2); + + EXPECT_MAT_NEAR(m, checker1, 0.0); + EXPECT_MAT_NEAR(m, checker2, 0.0); + + cv::cann::resetDevice(); +} + +TEST(CORE, SPLIT) +{ + char d[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + Mat m(2, 2, CV_8UC3, d); + Mat channels[3]; + cv::split(m, channels); + + cv::cann::setDevice(0); + + AscendMat aclChannels[3]; + std::vector aclChannelsVector; + + cv::cann::split(m, aclChannels); + cv::cann::split(m, aclChannelsVector); + + Mat checker1[3], checker2[3]; + aclChannels[0].download(checker1[0]); + aclChannels[1].download(checker1[1]); + aclChannels[2].download(checker1[2]); + + aclChannelsVector[0].download(checker2[0]); + aclChannelsVector[1].download(checker2[1]); + aclChannelsVector[2].download(checker2[2]); + + EXPECT_MAT_NEAR(channels[0], checker1[0], 0.0); + EXPECT_MAT_NEAR(channels[1], checker1[1], 0.0); + EXPECT_MAT_NEAR(channels[2], checker1[2], 0.0); + + EXPECT_MAT_NEAR(channels[0], checker2[0], 0.0); + EXPECT_MAT_NEAR(channels[1], checker2[1], 0.0); + EXPECT_MAT_NEAR(channels[2], checker2[2], 0.0); + + AscendMat npuM; + npuM.upload(m); + cv::cann::split(npuM, aclChannels); + cv::cann::split(npuM, aclChannelsVector); + + aclChannels[0].download(checker1[0]); + aclChannels[1].download(checker1[1]); + aclChannels[2].download(checker1[2]); + + aclChannelsVector[0].download(checker2[0]); + aclChannelsVector[1].download(checker2[1]); + aclChannelsVector[2].download(checker2[2]); + + EXPECT_MAT_NEAR(channels[0], checker1[0], 0.0); + EXPECT_MAT_NEAR(channels[1], checker1[1], 0.0); + EXPECT_MAT_NEAR(channels[2], checker1[2], 0.0); + + EXPECT_MAT_NEAR(channels[0], checker2[0], 0.0); + EXPECT_MAT_NEAR(channels[1], checker2[1], 0.0); + EXPECT_MAT_NEAR(channels[2], checker2[2], 0.0); + cv::cann::resetDevice(); +} + +TEST(CORE, TRANSPOSE) +{ + Mat cpuMat = randomMat(10, 10, CV_32SC3), cpuRetMat, checker; + cv::transpose(cpuMat, cpuRetMat); + cv::cann::transpose(cpuMat, checker); + EXPECT_MAT_NEAR(cpuRetMat, checker, 0.0); + + AscendMat npuMat, npuChecker; + npuMat.upload(cpuMat); + cv::cann::transpose(npuMat, npuChecker); + npuChecker.download(checker); + EXPECT_MAT_NEAR(cpuRetMat, checker, 0.0); +} + +TEST(CORE, FLIP) +{ + Mat cpuMat = randomMat(10, 10, CV_32SC3), cpuRetMat, checker; + + int flipMode; + + for (flipMode = -1; flipMode < 2; flipMode++) + { + cv::flip(cpuMat, cpuRetMat, flipMode); + cv::cann::flip(cpuMat, checker, flipMode); + EXPECT_MAT_NEAR(cpuRetMat, checker, 0.0); + } + + AscendMat npuMat, npuChecker; + npuMat.upload(cpuMat); + for (flipMode = -1; flipMode < 2; flipMode++) + { + cv::flip(cpuMat, cpuRetMat, flipMode); + cv::cann::flip(npuMat, npuChecker, flipMode); + npuChecker.download(checker); + EXPECT_MAT_NEAR(cpuRetMat, checker, 0.0); + } +} + +TEST(CORE, ROTATE) +{ + Mat cpuRetMat, checker, cpuMat = randomMat(3, 5, CV_16S, 0.0, 255.0); + + int rotateMode; + for (rotateMode = 0; 
rotateMode < 3; rotateMode++) + { + cv::rotate(cpuMat, cpuRetMat, rotateMode); + cv::cann::rotate(cpuMat, checker, rotateMode); + EXPECT_MAT_NEAR(cpuRetMat, checker, 0.0); + } + + AscendMat npuMat, npuChecker; + npuMat.upload(cpuMat); + for (rotateMode = 0; rotateMode < 3; rotateMode++) + { + cv::rotate(cpuMat, cpuRetMat, rotateMode); + cv::cann::rotate(npuMat, npuChecker, rotateMode); + npuChecker.download(checker); + EXPECT_MAT_NEAR(cpuRetMat, checker, 0.0); + } +} + +TEST(CORE, CROP) +{ + Mat cpuOpRet, checker, cpuMat = randomMat(6, 6, CV_32SC3, 0.0, 255.0); + Rect b(1, 2, 4, 4); + Mat cropped_cv(cpuMat, b); + AscendMat cropped_cann(cpuMat, b); + cropped_cann.download(checker); + EXPECT_MAT_NEAR(cropped_cv, checker, 1e-10); +} + +TEST(CORE, CROP_OVERLOAD) +{ + Mat cpuOpRet, checker, cpuMat = randomMat(6, 6, CV_16SC3, 0.0, 255.0); + const Rect b(1, 2, 4, 4); + Mat cropped_cv = cpuMat(b); + AscendMat cropped_cann = cv::cann::crop(cpuMat, b); + cropped_cann.download(checker); + EXPECT_MAT_NEAR(cropped_cv, checker, 1e-10); + + AscendMat npuMat; + npuMat.upload(cpuMat); + cropped_cann = cv::cann::crop(npuMat, b); + cropped_cann.download(checker); + EXPECT_MAT_NEAR(cropped_cv, checker, 1e-10); +} + +TEST(CORE, RESIZE) +{ + Mat resized_cv, checker, cpuMat = randomMat(10, 10, CV_32F, 100.0, 255.0); + Size dsize = Size(6, 6); + // only support {2 INTER_CUBIC} and {3 INTER_AREA} + // only the resize result of INTER_AREA is close to CV's. + int flags = 3; + cv::cann::setDevice(0); + cv::resize(cpuMat, resized_cv, dsize, 0, 0, flags); + cv::cann::resize(cpuMat, checker, dsize, 0, 0, flags); + EXPECT_MAT_NEAR(resized_cv, checker, 1e-4); + + cv::resize(cpuMat, resized_cv, Size(), 0.5, 0.5, flags); + cv::cann::resize(cpuMat, checker, Size(), 0.5, 0.5, flags); + EXPECT_MAT_NEAR(resized_cv, checker, 1e-4); + + AscendMat npuMat, npuChecker; + npuMat.upload(cpuMat); + cv::resize(cpuMat, resized_cv, dsize, 0, 0, flags); + cv::cann::resize(npuMat, npuChecker, dsize, 0, 0, flags); + npuChecker.download(checker); + EXPECT_MAT_NEAR(resized_cv, checker, 1e-4); + + cv::resize(cpuMat, resized_cv, Size(), 0.5, 0.5, flags); + cv::cann::resize(npuMat, npuChecker, Size(), 0.5, 0.5, flags); + npuChecker.download(checker); + EXPECT_MAT_NEAR(resized_cv, checker, 1e-4); + cv::cann::resetDevice(); +} + + +} // namespace +} // namespace opencv_test diff --git a/modules/cannops/test/test_cvtcolor.cpp b/modules/cannops/test/test_cvtcolor.cpp new file mode 100644 index 00000000000..27a92298961 --- /dev/null +++ b/modules/cannops/test/test_cvtcolor.cpp @@ -0,0 +1,89 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include "test_precomp.hpp" + +namespace opencv_test +{ +namespace +{ + +void cvtColorTest(int code, int cn, int dcn = 3, float diff = 0.0f) +{ + cv::cann::setDevice(DEVICE_ID); + Mat cpuRet, npuRet; + + Mat img8U = randomMat(512, 512, CV_MAKETYPE(CV_8U, cn), 0.0f, 255.0f); + Mat img16U = randomMat(512, 512, CV_MAKETYPE(CV_16U, cn), 0.0f, 65535.0f); + Mat img32F = randomMat(512, 512, CV_MAKETYPE(CV_32F, cn), 0.0f, 65535.0f); + + cv::cvtColor(img8U, cpuRet, code, dcn); + cv::cann::cvtColor(img8U, npuRet, code, dcn); + EXPECT_MAT_NEAR(cpuRet, npuRet, diff); + + cv::cvtColor(img16U, cpuRet, code, dcn); + cv::cann::cvtColor(img16U, npuRet, code, dcn); + EXPECT_MAT_NEAR(cpuRet, npuRet, diff); + + cv::cvtColor(img32F, cpuRet, code, dcn); + cv::cann::cvtColor(img32F, npuRet, code, dcn); + EXPECT_MAT_NEAR(cpuRet, npuRet, diff); + cv::cann::resetDevice(); +} + +TEST(CVT_COLOR, BGR2BGRA) { cvtColorTest(COLOR_BGR2BGRA, 3, 4); } +TEST(CVT_COLOR, BGRA2BGR) { cvtColorTest(COLOR_BGRA2BGR, 4); } +TEST(CVT_COLOR, BGR2RGBA) { cvtColorTest(COLOR_BGR2RGBA, 3, 4); } +TEST(CVT_COLOR, RGBA2BGR) { cvtColorTest(COLOR_RGBA2BGR, 4); } +TEST(CVT_COLOR, BGR2RGB) { cvtColorTest(COLOR_BGR2RGB, 3); } +TEST(CVT_COLOR, BGRA2RGBA) { cvtColorTest(COLOR_BGRA2RGBA, 4, 4); } + +// Due to parameter accuracy issues, the calculation results have certain accuracy differences. +TEST(CVT_COLOR, BGR2GRAY) { cvtColorTest(COLOR_BGR2GRAY, 3, 1, 10.0f); } +TEST(CVT_COLOR, RGB2GRAY) { cvtColorTest(COLOR_BGR2GRAY, 3, 1, 10.0f); } +TEST(CVT_COLOR, GRAY2BGR) { cvtColorTest(COLOR_GRAY2BGR, 1); } +TEST(CVT_COLOR, GRAY2BGRA) { cvtColorTest(COLOR_GRAY2BGRA, 1, 4); } +TEST(CVT_COLOR, BGRA2GRAY) { cvtColorTest(COLOR_BGRA2GRAY, 4, 1, 10.0f); } +TEST(CVT_COLOR, RGBA2GRAY) { cvtColorTest(COLOR_RGBA2GRAY, 4, 1, 10.0f); } + +TEST(CVT_COLOR, RGB2XYZ) { cvtColorTest(COLOR_RGB2XYZ, 3, 3, 50.0f); } +TEST(CVT_COLOR, BGR2XYZ) { cvtColorTest(COLOR_BGR2XYZ, 3, 3, 50.0f); } +TEST(CVT_COLOR, XYZ2BGR) { cvtColorTest(COLOR_XYZ2BGR, 3, 3, 150.0f); } +TEST(CVT_COLOR, XYZ2RGB) { cvtColorTest(COLOR_XYZ2RGB, 3, 3, 150.0f); } +TEST(CVT_COLOR, XYZ2BGR_DC4) { cvtColorTest(COLOR_XYZ2BGR, 3, 4, 150.0f); } +TEST(CVT_COLOR, XYZ2RGB_DC4) { cvtColorTest(COLOR_XYZ2RGB, 3, 4, 150.0f); } + +TEST(CVT_COLOR, BGR2YCrCb) { cvtColorTest(COLOR_BGR2YCrCb, 3, 3, 10.0f); } +TEST(CVT_COLOR, RGB2YCrCb) { cvtColorTest(COLOR_RGB2YCrCb, 3, 3, 10.0f); } +TEST(CVT_COLOR, YCrCb2BGR) { cvtColorTest(COLOR_YCrCb2BGR, 3, 3, 10.0f); } +TEST(CVT_COLOR, YCrCb2RGB) { cvtColorTest(COLOR_YCrCb2RGB, 3, 3, 10.0f); } +TEST(CVT_COLOR, YCrCb2BGR_DC4) { cvtColorTest(COLOR_YCrCb2BGR, 3, 4, 10.0f); } +TEST(CVT_COLOR, YCrCb2RGB_DC4) { cvtColorTest(COLOR_YCrCb2RGB, 3, 4, 10.0f); } + +TEST(CVT_COLOR, BGR2YUV) { cvtColorTest(COLOR_BGR2YUV, 3, 3, 10.0f); } +TEST(CVT_COLOR, RGB2YUV) { cvtColorTest(COLOR_RGB2YUV, 3, 3, 10.0f); } +TEST(CVT_COLOR, YUV2BGR) { cvtColorTest(COLOR_YUV2BGR, 3, 3, 10.0f); } +TEST(CVT_COLOR, YUV2RGB) { cvtColorTest(COLOR_YUV2RGB, 3, 3, 10.0f); } +TEST(CVT_COLOR, YUV2BGR_DC4) { cvtColorTest(COLOR_YUV2BGR, 3, 4, 10.0f); } +TEST(CVT_COLOR, YUV2RGB_DC4) { cvtColorTest(COLOR_YUV2RGB, 3, 4, 10.0f); } + +// Test of AscendMat. Since the logic is the same, only interface test is needed. 
+TEST(CVT_COLOR, COLOR_BGR2BGRA_ASCENDMAT) +{ + cv::cann::setDevice(DEVICE_ID); + Mat cpuRet, npuRet; + + Mat img8U = randomMat(512, 512, CV_8UC3, 0.0f, 255.0f); + cv::cvtColor(img8U, cpuRet, COLOR_BGR2BGRA, 4); + + AscendMat npuImg8U, npuChecker; + npuImg8U.upload(img8U); + cv::cann::cvtColor(npuImg8U, npuChecker, COLOR_BGR2BGRA, 4); + npuChecker.download(npuRet); + EXPECT_MAT_NEAR(cpuRet, npuRet, 10.0f); + cv::cann::resetDevice(); +} + +} // namespace +} // namespace opencv_test diff --git a/modules/cannops/test/test_element_operations.cpp b/modules/cannops/test/test_element_operations.cpp new file mode 100644 index 00000000000..76c103a65f4 --- /dev/null +++ b/modules/cannops/test/test_element_operations.cpp @@ -0,0 +1,697 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "test_precomp.hpp" +#include + +namespace opencv_test +{ +namespace +{ +template +void testMatOpMat(FCV cvFunc, FCANN cannFunc, PARAMS... param) +{ + cv::cann::setDevice(DEVICE_ID); + Mat mat1 = randomMat(10, 10, CV_32SC3); + Mat mat2 = randomMat(10, 10, CV_32SC3); + Mat cpuDst, check; + + cvFunc(mat1, mat2, cpuDst, param...); + cannFunc(mat1, mat2, check, param..., AscendStream::Null()); + EXPECT_MAT_NEAR(cpuDst, check, 1.0); + + AscendStream stream; + cannFunc(mat1, mat2, check, param..., stream); + stream.waitForCompletion(); + EXPECT_MAT_NEAR(cpuDst, check, 1.0); + + cv::cann::resetDevice(); +} + +template +void testAscendMatOpAscendMatMask(FCV cvFunc, FCANN cannFunc, DTMASK mask = AscendMat(), + PARAMS... param) +{ + cv::cann::setDevice(DEVICE_ID); + Mat mat1 = randomMat(10, 10, CV_32SC3); + Mat mat2 = randomMat(10, 10, CV_32SC3); + Mat cpuDst, check, cpuMask; + AscendMat npuMat1, npuMat2, npuCheck; + npuMat1.upload(mat1); + npuMat2.upload(mat2); + if (mask.empty()) + { + cvFunc(mat1, mat2, cpuDst, noArray(), param...); + } + else + { + mask.download(cpuMask); + cvFunc(mat1, mat2, cpuDst, cpuMask, param...); + } + + cannFunc(npuMat1, npuMat2, npuCheck, mask, param..., AscendStream::Null()); + npuCheck.download(check); + EXPECT_MAT_NEAR(cpuDst, check, 1.0); + + AscendStream stream; + cannFunc(npuMat1, npuMat2, npuCheck, mask, param..., stream); + npuCheck.download(check); + stream.waitForCompletion(); + EXPECT_MAT_NEAR(cpuDst, check, 1.0); + + cv::cann::resetDevice(); +} + +template +void testAscendMatOpAscendMat(FCV cvFunc, FCANN cannFunc, PARAMS... 
param) +{ + cv::cann::setDevice(DEVICE_ID); + Mat mat1 = randomMat(10, 10, CV_32SC3); + Mat mat2 = randomMat(10, 10, CV_32SC3); + Mat cpuDst, check; + AscendMat npuMat1, npuMat2, npuCheck; + npuMat1.upload(mat1); + npuMat2.upload(mat2); + cvFunc(mat1, mat2, cpuDst, param...); + cannFunc(npuMat1, npuMat2, npuCheck, param..., AscendStream::Null()); + npuCheck.download(check); + EXPECT_MAT_NEAR(cpuDst, check, 1.0); + + AscendStream stream; + cannFunc(npuMat1, npuMat2, npuCheck, param..., stream); + npuCheck.download(check); + stream.waitForCompletion(); + EXPECT_MAT_NEAR(cpuDst, check, 1.0); + + cv::cann::resetDevice(); +} + +TEST(ELEMENTWISE_OP, MAT_ADD_MAT) +{ + testMatOpMat( + cv::add, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + int dtype, AscendStream& stream) + { cv::cann::add(src1, src2, dst, mask, dtype, stream); }, + noArray(), -1); + testAscendMatOpAscendMatMask( + cv::add, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + int dtype, AscendStream& stream) + { cv::cann::add(src1, src2, dst, mask, dtype, stream); }, + AscendMat(), -1); +} + +TEST(ELEMENTWISE_OP, MAT_SUB_MAT) +{ + testMatOpMat( + cv::subtract, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + int dtype, AscendStream& stream) + { cv::cann::subtract(src1, src2, dst, mask, dtype, stream); }, + noArray(), -1); + testAscendMatOpAscendMatMask( + cv::subtract, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + int dtype, AscendStream& stream) + { cv::cann::subtract(src1, src2, dst, mask, dtype, stream); }, + AscendMat(), -1); +} + +TEST(ELEMENTWISE_OP, MAT_MUL_MAT) +{ + testMatOpMat( + cv::multiply, + [](const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype, + AscendStream& stream) { cv::cann::multiply(src1, src2, dst, scale, dtype, stream); }, + 1, -1); + testAscendMatOpAscendMat( + cv::multiply, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) { cv::cann::multiply(src1, src2, dst, scale, dtype, stream); }, + 1, -1); +} + +TEST(ELEMENTWISE_OP, MAT_DIV_MAT) +{ + testMatOpMat([](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale, int dtype) + { cv::divide(src1, src2, dst, scale, dtype); }, + [](const InputArray src1, const InputArray src2, OutputArray dst, float scale, + int dtype, AscendStream& stream) + { cv::cann::divide(src1, src2, dst, scale, dtype, stream); }, + 1, -1); + testAscendMatOpAscendMat( + [](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale, int dtype) + { cv::divide(src1, src2, dst, scale, dtype); }, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) { cv::cann::divide(src1, src2, dst, scale, dtype, stream); }, + 1, -1); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_AND_MAT) +{ + testMatOpMat( + cv::bitwise_and, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_and(src1, src2, dst, mask, stream); }, + noArray()); + testAscendMatOpAscendMatMask( + cv::bitwise_and, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_and(src1, src2, dst, mask, stream); }, + AscendMat()); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_OR_MAT) +{ + testMatOpMat( + cv::bitwise_or, + [](const InputArray src1, const 
InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_or(src1, src2, dst, mask, stream); }, + noArray()); + testAscendMatOpAscendMatMask( + cv::bitwise_or, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_or(src1, src2, dst, mask, stream); }, + AscendMat()); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_XOR_MAT) +{ + testMatOpMat( + cv::bitwise_xor, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_xor(src1, src2, dst, mask, stream); }, + noArray()); + testAscendMatOpAscendMatMask( + cv::bitwise_xor, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_xor(src1, src2, dst, mask, stream); }, + AscendMat()); +} + +TEST(ELEMENTWISE_OP, MAT_ADD_MAT_WITH_MASK_AND_DTYPE) +{ + testMatOpMat( + cv::add, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + int dtype, AscendStream& stream) + { cv::cann::add(src1, src2, dst, mask, dtype, stream); }, + genMask(), CV_32SC3); + testAscendMatOpAscendMatMask( + cv::add, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + int dtype, AscendStream& stream) + { cv::cann::add(src1, src2, dst, mask, dtype, stream); }, + AscendMat(), CV_32SC3); +} + +TEST(ELEMENTWISE_OP, MAT_SUB_MAT_WITH_MASK_AND_DTYPE) +{ + testMatOpMat( + cv::subtract, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + int dtype, AscendStream& stream) + { cv::cann::subtract(src1, src2, dst, mask, dtype, stream); }, + genMask(), CV_32SC3); + testAscendMatOpAscendMatMask( + cv::subtract, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + int dtype, AscendStream& stream) + { cv::cann::subtract(src1, src2, dst, mask, dtype, stream); }, + AscendMat(), CV_32SC3); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_AND_MAT_WITH_MASK) +{ + testMatOpMat( + cv::bitwise_and, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_and(src1, src2, dst, mask, stream); }, + genMask()); + testAscendMatOpAscendMatMask( + cv::bitwise_and, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_and(src1, src2, dst, mask, stream); }, + genNpuMask()); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_OR_MAT_WITH_MASK) +{ + testMatOpMat( + cv::bitwise_or, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_or(src1, src2, dst, mask, stream); }, + genMask()); + testAscendMatOpAscendMatMask( + cv::bitwise_or, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_or(src1, src2, dst, mask, stream); }, + genNpuMask()); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_XOR_MAT_WITH_MASK) +{ + testMatOpMat( + cv::bitwise_xor, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_xor(src1, src2, dst, mask, stream); }, + genMask()); + testAscendMatOpAscendMatMask( + cv::bitwise_xor, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { 
cv::cann::bitwise_xor(src1, src2, dst, mask, stream); }, + genNpuMask()); +} + +float randomScale = randomNum(); +TEST(ELEMENTWISE_OP, MAT_MUL_MAT_WITH_SCALE) +{ + testMatOpMat( + cv::multiply, + [](const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype, + AscendStream& stream) { cv::cann::multiply(src1, src2, dst, scale, dtype, stream); }, + randomScale, -1); + testAscendMatOpAscendMat( + cv::multiply, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) { cv::cann::multiply(src1, src2, dst, scale, dtype, stream); }, + randomScale, -1); +} + +TEST(ELEMENTWISE_OP, MAT_DIV_MAT_WITH_SCALE) +{ + testMatOpMat([](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale, int dtype) + { cv::divide(src1, src2, dst, scale, dtype); }, + [](const InputArray src1, const InputArray src2, OutputArray dst, float scale, + int dtype, AscendStream& stream) + { cv::cann::divide(src1, src2, dst, scale, dtype, stream); }, + randomScale, -1); + testAscendMatOpAscendMat( + [](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale, int dtype) + { cv::divide(src1, src2, dst, scale, dtype); }, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) { cv::cann::divide(src1, src2, dst, scale, dtype, stream); }, + randomScale, -1); +} + +template +void testMatOpScalar(FCV cvFunc, FCANN cannFunc, PARAMS... param) +{ + Scalar scalar = randomScalar(); + Mat mat(10, 10, CV_32SC3, randomScalar()); + Mat cpuDst1, cpuDst2, checker1, checker2; + + cvFunc(Mat(10, 10, CV_32SC3, scalar), mat, cpuDst1, param...); + cvFunc(mat, Mat(10, 10, CV_32SC3, scalar), cpuDst2, param...); + cv::cann::setDevice(DEVICE_ID); + + cannFunc(scalar, mat, checker1, param..., AscendStream::Null()); + cannFunc(mat, scalar, checker2, param..., AscendStream::Null()); + + EXPECT_MAT_NEAR(cpuDst1, checker1, 1.0); + EXPECT_MAT_NEAR(cpuDst2, checker2, 1.0); + + AscendStream stream; + cannFunc(scalar, mat, checker1, param..., stream); + cannFunc(mat, scalar, checker2, param..., stream); + stream.waitForCompletion(); + EXPECT_MAT_NEAR(cpuDst1, checker1, 1.0); + EXPECT_MAT_NEAR(cpuDst2, checker2, 1.0); + + cv::cann::resetDevice(); +} + +template +void testAscendMatOpScalarMask(FCV cvFunc, FCANN cannFunc, DTMASK mask, PARAMS... param) +{ + Scalar scalar = randomScalar(); + Mat mat(10, 10, CV_32SC3, randomScalar()); + Mat cpuDst, checker, cpuMask; + AscendMat npuMat, npuChecker; + npuMat.upload(mat); + if (mask.empty()) + { + cvFunc(mat, Mat(10, 10, CV_32SC3, scalar), cpuDst, noArray(), param...); + } + else + { + mask.download(cpuMask); + cvFunc(mat, Mat(10, 10, CV_32SC3, scalar), cpuDst, cpuMask, param...); + } + cv::cann::setDevice(DEVICE_ID); + + cannFunc(npuMat, scalar, npuChecker, mask, param..., AscendStream::Null()); + npuChecker.download(checker); + EXPECT_MAT_NEAR(cpuDst, checker, 1.0); + + AscendStream stream; + cannFunc(npuMat, scalar, npuChecker, mask, param..., stream); + stream.waitForCompletion(); + npuChecker.download(checker); + EXPECT_MAT_NEAR(cpuDst, checker, 1.0); + + cv::cann::resetDevice(); +} +template +void testScalarOpAscendMatMask(FCV cvFunc, FCANN cannFunc, DTMASK mask, PARAMS... 
param) +{ + Scalar scalar = randomScalar(); + Mat mat(10, 10, CV_32SC3, randomScalar()); + Mat cpuDst, checker, cpuMask; + AscendMat npuMat, npuChecker; + npuMat.upload(mat); + if (mask.empty()) + { + cvFunc(Mat(10, 10, CV_32SC3, scalar), mat, cpuDst, noArray(), param...); + } + else + { + mask.download(cpuMask); + cvFunc(Mat(10, 10, CV_32SC3, scalar), mat, cpuDst, cpuMask, param...); + } + cv::cann::setDevice(DEVICE_ID); + + cannFunc(scalar, npuMat, npuChecker, mask, param..., AscendStream::Null()); + npuChecker.download(checker); + EXPECT_MAT_NEAR(cpuDst, checker, 1.0); + + AscendStream stream; + cannFunc(scalar, npuMat, npuChecker, mask, param..., stream); + stream.waitForCompletion(); + npuChecker.download(checker); + EXPECT_MAT_NEAR(cpuDst, checker, 1.0); + + cv::cann::resetDevice(); +} +template +void testAscendMatOpScalar(FCV cvFunc, FCANN cannFunc, PARAMS... param) +{ + Scalar scalar = randomScalar(); + Mat mat(10, 10, CV_32SC3, randomScalar()); + Mat cpuDst, checker; + AscendMat npuMat, npuChecker; + npuMat.upload(mat); + + cvFunc(mat, Mat(10, 10, CV_32SC3, scalar), cpuDst, param...); + cv::cann::setDevice(DEVICE_ID); + + cannFunc(npuMat, scalar, npuChecker, param..., AscendStream::Null()); + npuChecker.download(checker); + EXPECT_MAT_NEAR(cpuDst, checker, 1.0); + + AscendStream stream; + cannFunc(npuMat, scalar, npuChecker, param..., stream); + stream.waitForCompletion(); + npuChecker.download(checker); + + cv::cann::resetDevice(); +} + +TEST(ELEMENTWISE_OP, MAT_ADD_SCALAR) +{ + testMatOpScalar( + cv::add, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + int dtype, AscendStream& stream) + { cv::cann::add(src1, src2, dst, mask, dtype, stream); }, + noArray(), -1); + testAscendMatOpScalarMask( + cv::add, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + int dtype, AscendStream& stream) + { cv::cann::add(src1, src2, dst, mask, dtype, stream); }, + AscendMat(), -1); + testScalarOpAscendMatMask( + cv::add, + [](const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + int dtype, AscendStream& stream) + { cv::cann::add(src1, src2, dst, mask, dtype, stream); }, + AscendMat(), -1); +} + +TEST(ELEMENTWISE_OP, MAT_SUB_SCALAR) +{ + testMatOpScalar( + cv::subtract, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + int dtype, AscendStream& stream) + { cv::cann::subtract(src1, src2, dst, mask, dtype, stream); }, + noArray(), -1); + testAscendMatOpScalarMask( + cv::subtract, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + int dtype, AscendStream& stream) + { cv::cann::subtract(src1, src2, dst, mask, dtype, stream); }, + AscendMat(), -1); +} + +TEST(ELEMENTWISE_OP, MAT_MUL_SCALAR) +{ + testMatOpScalar( + cv::multiply, + [](const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype, + AscendStream& stream) { cv::cann::multiply(src1, src2, dst, scale, dtype, stream); }, + 1, -1); + testAscendMatOpScalar( + cv::multiply, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) { cv::cann::multiply(src1, src2, dst, scale, dtype, stream); }, + 1, -1); +} + +TEST(ELEMENTWISE_OP, MAT_DIV_SCALAR) +{ + testMatOpScalar( + [](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale, int dtype) + { cv::divide(src1, src2, dst, scale, dtype); }, + [](const InputArray src1, const InputArray src2, OutputArray dst, float scale, 
int dtype, + AscendStream& stream) { cv::cann::divide(src1, src2, dst, scale, dtype, stream); }, + 1, -1); + testAscendMatOpScalar( + [](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale, int dtype) + { cv::divide(src1, src2, dst, scale, dtype); }, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) { cv::cann::divide(src1, src2, dst, scale, dtype, stream); }, + 1, -1); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_AND_SCALAR) +{ + testMatOpScalar( + cv::bitwise_and, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_and(src1, src2, dst, mask, stream); }, + noArray()); + testAscendMatOpScalarMask( + cv::bitwise_and, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_and(src1, src2, dst, mask, stream); }, + AscendMat()); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_OR_SCALAR) +{ + testMatOpScalar( + cv::bitwise_or, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_or(src1, src2, dst, mask, stream); }, + noArray()); + testAscendMatOpScalarMask( + cv::bitwise_or, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_or(src1, src2, dst, mask, stream); }, + AscendMat()); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_XOR_SCALAR) +{ + testMatOpScalar( + cv::bitwise_xor, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_xor(src1, src2, dst, mask, stream); }, + noArray()); + testAscendMatOpScalarMask( + cv::bitwise_xor, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_xor(src1, src2, dst, mask, stream); }, + AscendMat()); +} + +TEST(ELEMENTWISE_OP, MAT_ADD_SCALAR_WITH_MASK_AND_DETYPE) +{ + testMatOpScalar( + cv::add, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + int dtype, AscendStream& stream) + { cv::cann::add(src1, src2, dst, mask, dtype, stream); }, + genMask(), CV_32SC3); + testAscendMatOpScalarMask( + cv::add, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + int dtype, AscendStream& stream) + { cv::cann::add(src1, src2, dst, mask, dtype, stream); }, + genNpuMask(), CV_32SC3); +} + +TEST(ELEMENTWISE_OP, MAT_SUB_SCALAR_WITH_MASK_AND_DETYPE) +{ + testMatOpScalar( + cv::subtract, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + int dtype, AscendStream& stream) + { cv::cann::subtract(src1, src2, dst, mask, dtype, stream); }, + genMask(), CV_32SC3); + testAscendMatOpScalarMask( + cv::subtract, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + int dtype, AscendStream& stream) + { cv::cann::subtract(src1, src2, dst, mask, dtype, stream); }, + genNpuMask(), CV_32SC3); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_AND_SCALAR_WITH_MASK) +{ + testMatOpScalar( + cv::bitwise_and, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_and(src1, src2, dst, mask, stream); }, + genMask()); + testAscendMatOpScalarMask( + cv::bitwise_and, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + 
AscendStream& stream) { cv::cann::bitwise_and(src1, src2, dst, mask, stream); }, + genNpuMask()); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_OR_SCALAR_WITH_MASK) +{ + testMatOpScalar( + cv::bitwise_or, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_or(src1, src2, dst, mask, stream); }, + genMask()); + testAscendMatOpScalarMask( + cv::bitwise_or, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_or(src1, src2, dst, mask, stream); }, + genNpuMask()); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_XOR_SCALAR_WITH_MASK) +{ + testMatOpScalar( + cv::bitwise_xor, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_xor(src1, src2, dst, mask, stream); }, + genMask()); + testAscendMatOpScalarMask( + cv::bitwise_xor, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_xor(src1, src2, dst, mask, stream); }, + genNpuMask()); +} + +// TODO: I think the cv result is wrong, which has truncated middle result. +// Disable these two test case bacause it't not stable. +// TEST(ELEMENTWISE_OP, MAT_MUL_SCALAR_WITH_SCALE) +// { +// testMatOpScalar( +// cv::multiply, +// [](const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype, +// AscendStream& stream) { cv::cann::multiply(src1, src2, dst, scale, dtype, stream); }, +// randomScale, CV_32SC3); +// testAscendMatOpScalar( +// [](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale, int dtype) +// { cv::divide(src1, src2, dst, scale, dtype); }, +// [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, float scale, int dtype, +// AscendStream& stream) { cv::cann::divide(src1, src2, dst, scale, dtype, stream); }, +// randomScale, -1); +// } + +// TEST(ELEMENTWISE_OP, MAT_DIV_SCALAR_WITH_SCALE) +// { +// testMatOpScalar( +// [](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale, int dtype) +// { cv::divide(src1, src2, dst, scale, dtype); }, +// [](const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype, +// AscendStream& stream) { cv::cann::divide(src1, src2, dst, scale, dtype, stream); }, +// randomScale, -1); +// testAscendMatOpScalar( +// [](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale, int dtype) +// { cv::divide(src1, src2, dst, scale, dtype); }, +// [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, float scale, int dtype, +// AscendStream& stream) { cv::cann::divide(src1, src2, dst, scale, dtype, stream); }, +// randomScale, -1); +// } + +TEST(ELEMENTWISE_OP, MAT_BITWISE_NOT) +{ + Mat cpuOpRet, checker, cpuMat = randomMat(10, 10, CV_32SC3); + cv::cann::setDevice(DEVICE_ID); + cv::bitwise_not(cpuMat, cpuOpRet); + cv::cann::bitwise_not(cpuMat, checker); + EXPECT_MAT_NEAR(cpuOpRet, checker, 0.0); + + AscendMat npuMat, npuOpRet; + npuMat.upload(cpuMat); + cv::cann::bitwise_not(npuMat, npuOpRet); + npuOpRet.download(checker); + EXPECT_MAT_NEAR(cpuOpRet, checker, 0.0); + + cv::cann::resetDevice(); +} + +// TODO random test matrix +TEST(ELEMENTWISE_OP, MAT_ADD_WEIGHTED) +{ + Mat cpuOpRet, checker, cpuMat1 = Mat::ones(5, 5, CV_32S), cpuMat2 = Mat::ones(5, 5, CV_32S); + + cv::cann::setDevice(DEVICE_ID); + cv::addWeighted(cpuMat1, 2, cpuMat2, 3, 5, cpuOpRet); + cv::cann::addWeighted(cpuMat1, 2, cpuMat2, 3, 5, checker); + 
EXPECT_MAT_NEAR(cpuOpRet, checker, 0.0); + + AscendMat npuOpRet, npuMat1, npuMat2; + npuMat1.upload(cpuMat1); + npuMat2.upload(cpuMat2); + cv::cann::addWeighted(npuMat1, 2, npuMat2, 3, 5, npuOpRet); + npuOpRet.download(checker); + EXPECT_MAT_NEAR(cpuOpRet, checker, 0.0); + + cv::cann::resetDevice(); +} + +TEST(ELEMENTWISE_OP, MAT_THRESHOLD) +{ + Mat cpuOpRet, checker, cpuMat = randomMat(10, 10, CV_16SC3, 0.0, 255.0); + + AscendMat ascendMat, ascendMat16F, aclOpRet, aclOpRet16S; + cv::cann::setDevice(DEVICE_ID); + ascendMat.upload(cpuMat); + ascendMat.convertTo(ascendMat16F, CV_16F); + + Mat cpuMat16F, checker16F; + cpuMat.convertTo(cpuMat16F, CV_16F); + + for (int i = 0; i <= 4; i++) + { + cv::threshold(cpuMat, cpuOpRet, 128, 250, i); + // TODO find the reason empty AscendMat is not continuous. + cv::cann::threshold(ascendMat16F, aclOpRet, 128, 250, i); + aclOpRet.convertTo(aclOpRet16S, CV_16S); + aclOpRet16S.download(checker); + + EXPECT_MAT_NEAR(cpuOpRet, checker, 1e-10); + + cv::cann::threshold(cpuMat16F, checker16F, 128, 250, i); + checker16F.convertTo(checker, CV_16S); + EXPECT_MAT_NEAR(cpuOpRet, checker, 1e-10); + } + + cv::cann::resetDevice(); +} + +} // namespace +} // namespace opencv_test diff --git a/modules/cannops/test/test_main.cpp b/modules/cannops/test/test_main.cpp new file mode 100644 index 00000000000..202c6af27ee --- /dev/null +++ b/modules/cannops/test/test_main.cpp @@ -0,0 +1,21 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "test_precomp.hpp" + +class CannEnvironment : public ::testing::Environment +{ +public: + virtual ~CannEnvironment() = default; + virtual void SetUp() CV_OVERRIDE { initAcl(); } + virtual void TearDown() CV_OVERRIDE { finalizeAcl(); } +}; + +static void initTests() +{ + CannEnvironment* cannEnv = new CannEnvironment(); + ::testing::AddGlobalTestEnvironment(cannEnv); +} + +CV_TEST_MAIN("cannops", initTests()); diff --git a/modules/cannops/test/test_npumat.cpp b/modules/cannops/test/test_npumat.cpp new file mode 100644 index 00000000000..1ff445399f8 --- /dev/null +++ b/modules/cannops/test/test_npumat.cpp @@ -0,0 +1,146 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "test_precomp.hpp" + +namespace opencv_test +{ +namespace +{ + +class DummyAllocator : public AscendMat::Allocator +{ +public: + std::shared_ptr allocate(size_t size) CV_OVERRIDE + { + CV_UNUSED(size); + return std::shared_ptr(); + } + bool allocate(cv::cann::AscendMat* mat, int rows, int cols, size_t elemSize) CV_OVERRIDE + { + CV_UNUSED(rows); + CV_UNUSED(cols); + CV_UNUSED(elemSize); + mat->data = std::shared_ptr((uchar*)0x12345, [](void* ptr) { CV_UNUSED(ptr); }); + return true; + } +}; + +TEST(AscendMat, Construct) +{ + cv::cann::setDevice(0); + // 1 Default constructor. + AscendMat defaultAscendMat; + AscendMat::Allocator* defaultAllocator = AscendMat::defaultAllocator(); + ASSERT_EQ(defaultAscendMat.allocator, defaultAllocator); + + // 2 get & set allocator. 
+ DummyAllocator dummyAllocator; + AscendMat::setDefaultAllocator(&dummyAllocator); + ASSERT_EQ(defaultAscendMat.defaultAllocator(), &dummyAllocator); + AscendMat::setDefaultAllocator(defaultAllocator); + + // 3 constructs AscendMat of the specified size and type + AscendMat specifiedSizeAscendMat1(5, 6, CV_8UC3); + AscendMat specifiedSizeAscendMat2(Size(300, 200), CV_64F); + + ASSERT_EQ(specifiedSizeAscendMat1.rows, 5); + ASSERT_EQ(specifiedSizeAscendMat1.cols, 6); + ASSERT_EQ(specifiedSizeAscendMat1.depth(), CV_8U); + ASSERT_EQ(specifiedSizeAscendMat1.channels(), 3); + + ASSERT_EQ(specifiedSizeAscendMat2.cols, 300); + ASSERT_EQ(specifiedSizeAscendMat2.rows, 200); + ASSERT_EQ(specifiedSizeAscendMat2.depth(), CV_64F); + ASSERT_EQ(specifiedSizeAscendMat2.channels(), 1); + + // 4 constructs AscendMat and fills it with the specified value s + srand((unsigned int)(time(NULL))); + Scalar sc(rand() % 256, rand() % 256, rand() % 256, rand() % 256); + + Mat scalarToMat(7, 8, CV_8UC3, sc); + AscendMat scalarToAscendMat1(7, 8, CV_8UC3, sc); + Mat scalarToMatChecker; + scalarToAscendMat1.download(scalarToMatChecker); + + EXPECT_MAT_NEAR(scalarToMat, scalarToMatChecker, 0.0); + + AscendMat scalarToAscendMat2(Size(123, 345), CV_32S); + + ASSERT_EQ(scalarToAscendMat1.rows, 7); + ASSERT_EQ(scalarToAscendMat1.cols, 8); + ASSERT_EQ(scalarToAscendMat1.depth(), CV_8U); + ASSERT_EQ(scalarToAscendMat1.channels(), 3); + + ASSERT_EQ(scalarToAscendMat2.cols, 123); + ASSERT_EQ(scalarToAscendMat2.rows, 345); + ASSERT_EQ(scalarToAscendMat2.depth(), CV_32S); + ASSERT_EQ(scalarToAscendMat2.channels(), 1); + + // 6 builds AscendMat from host memory + Scalar sc2(rand() % 256, rand() % 256, rand() % 256, rand() % 256); + Mat randomMat(7, 8, CV_8UC3, sc2); + InputArray arr = randomMat; + + AscendMat fromInputArray(arr, AscendStream::Null()); + Mat randomMatChecker; + fromInputArray.download(randomMatChecker); + EXPECT_MAT_NEAR(randomMat, randomMatChecker, 0.0); + + cv::cann::resetDevice(); +} + +TEST(AscendMat, Assignment) +{ + DummyAllocator dummyAllocator; + AscendMat mat1; + AscendMat mat2(3, 4, CV_8SC1, &dummyAllocator); + mat1 = mat2; + + ASSERT_EQ(mat1.rows, 3); + ASSERT_EQ(mat1.cols, 4); + ASSERT_EQ(mat1.depth(), CV_8S); + ASSERT_EQ(mat1.channels(), 1); + ASSERT_EQ(mat1.data.get(), (uchar*)0x12345); +} + +TEST(AscendMat, SetTo) +{ + cv::cann::setDevice(0); + + srand((unsigned int)(time(NULL))); + Scalar sc(rand() % 256, rand() % 256, rand() % 256, rand() % 256); + + AscendMat ascendMat(2, 2, CV_8UC4); + ascendMat.setTo(sc); + Mat mat(2, 2, CV_8UC4, sc); + Mat checker; + ascendMat.download(checker); + + EXPECT_MAT_NEAR(mat, checker, 0.0); + + cv::cann::resetDevice(); +} + +TEST(AscendMat, ConvertTo) +{ + cv::cann::setDevice(0); + + srand((unsigned int)(time(NULL))); + Scalar sc(rand() % 256, rand() % 256, rand() % 256, rand() % 256); + + AscendMat ascendMat(2, 2, CV_8UC4, sc); + AscendMat convertedAscendMat; + ascendMat.convertTo(convertedAscendMat, CV_16S); + Mat mat(2, 2, CV_16SC4, sc); + Mat checker; + convertedAscendMat.download(checker); + + EXPECT_MAT_NEAR(mat, checker, 0.0); + + cv::cann::resetDevice(); +} + +} // namespace +} // namespace opencv_test diff --git a/modules/cannops/test/test_precomp.hpp b/modules/cannops/test/test_precomp.hpp new file mode 100644 index 00000000000..f7bdbea0b08 --- /dev/null +++ b/modules/cannops/test/test_precomp.hpp @@ -0,0 +1,28 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef __OPENCV_TEST_PRECOMP_HPP__ +#define __OPENCV_TEST_PRECOMP_HPP__ + +#include "opencv2/ts.hpp" +#include "opencv2/cann.hpp" +#include "opencv2/ts/cuda_test.hpp" +#include "opencv2/cann_interface.hpp" + +using namespace cv; +using namespace cv::cann; +#undef EXPECT_MAT_NEAR +#define EXPECT_MAT_NEAR(m1, m2, eps) EXPECT_PRED_FORMAT3(cvtest::assertMatNear, m1, m2, eps) +#define ASSERT_MAT_NEAR(m1, m2, eps) ASSERT_PRED_FORMAT3(cvtest::assertMatNear, m1, m2, eps) + +#define DEVICE_ID 0 + +Mat randomMat(int w, int h, int dtype, float min = 1.0f, float max = 10.0f); +Scalar randomScalar(); +float randomNum(); +int randomInterger(); +Mat genMask(); +AscendMat genNpuMask(); + +#endif //__OPENCV_TEST_PRECOMP_HPP__ diff --git a/modules/cannops/test/test_utils.cpp b/modules/cannops/test/test_utils.cpp new file mode 100644 index 00000000000..d2bd31647b7 --- /dev/null +++ b/modules/cannops/test/test_utils.cpp @@ -0,0 +1,49 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "test_precomp.hpp" + +// Random Generator +Mat randomMat(int w, int h, int dtype, float min, float max) +{ + Mat rnMat(w, h, dtype); + RNG rng(getTickCount()); + rng.fill(rnMat, RNG::UNIFORM, min, max); + return rnMat; +} +Scalar randomScalar() +{ + RNG rng(getTickCount()); + Scalar sc; + rng.fill(sc, RNG::UNIFORM, 1.0, 5.0); + return sc; +} +float randomNum() +{ + RNG rng(getTickCount()); + float rdnNum = float(rng.uniform(1.0, 5.0)); + return rdnNum; +} + +int randomInterger() +{ + RNG rng(getTickCount()); + float rdnNum = float(rng.uniform(1, 5)); + return rdnNum; +} + +Mat genMask() +{ + Mat mask = Mat::zeros(Size(10, 10), CV_8UC1); + rectangle(mask, cv::Rect(5, 5, 3, 3), Scalar(255), -1); + return mask; +} + +AscendMat genNpuMask() +{ + cv::Mat mask = genMask(); + cv::cann::AscendMat npuMask; + npuMask.upload(mask); + return npuMask; +} diff --git a/modules/cannops/tutorials/ascend_npu_image_processing.markdown b/modules/cannops/tutorials/ascend_npu_image_processing.markdown new file mode 100644 index 00000000000..ed905831d31 --- /dev/null +++ b/modules/cannops/tutorials/ascend_npu_image_processing.markdown @@ -0,0 +1,130 @@ +Ascend NPU Image Processing {#tutorial_ascend_npu_image_processing} +========================================================== + +## Goal + +In this guide, you will gain insights into the thread safety of Ascend operators already in use, as well as discover how to effectively employ Ascend operators for image preprocessing and understand their usage limitations. + +## Preface + +We provide a suite of common matrix operation operators that support the [Ascend NPU](https://www.hiascend.com/en/) within OpenCV. For user convenience, the new 'AscendMat' structure and its associated operators maintain compatibility with the 'Mat' interface in OpenCV. These operators encompass a wide range of frequently used functions, including arithmetic operations, image processing operations, and image color space conversion. All of these operators are implemented utilizing [CANN](https://www.hiascend.com/en/software/cann)(Compute Architecture of Neural Networks). The Ascend operator facilitates accelerated operations on the NPU by making use of CANN. 
This acceleration effect is particularly noticeable when working with larger images, such as 2048x2048, 3840x2160, or 7680x4320.
+
+
+## Instructions on Thread Safety
+
+The stream functions are implemented by invoking CANN operators. Within a single stream, tasks are executed sequentially, while tasks in different streams run in parallel. Event mechanisms are used to synchronize tasks across streams; please refer to the [**Stream Management**](https://www.hiascend.com/document/detail/en/CANNCommunityEdition/600alphaX/infacldevg/aclcppdevg/aclcppdevg_000147.html) documentation for details.
+
+
+## Example for Image Preprocessing
+
+In this section, you will discover how to use Ascend operators for image preprocessing, including the functions below:
+
+- Add
+- Rotate
+- Flip
+
+
+### Code
+
+@add_toggle_cpp
+@include opencv_contrib/modules/cannops/samples/image_processing.cpp
+@end_toggle
+
+@add_toggle_python
+@include opencv_contrib/modules/cannops/samples/image_processing.py
+@end_toggle
+
+### Explanation
+
+**Input Image**
+
+@add_toggle_cpp
+@snippet opencv_contrib/modules/cannops/samples/image_processing.cpp input_noise
+@end_toggle
+
+@add_toggle_python
+
+```python
+# Read the input image
+img = cv2.imread("/path/to/img")
+# Generate Gaussian noise that will be added to the input image
+gaussNoise = np.random.normal(loc=0, scale=25, size=(img.shape[0], img.shape[1], img.shape[2])).astype(img.dtype)
+```
+
+@end_toggle
+
+**Setup CANN**
+
+@add_toggle_cpp
+
+@snippet opencv_contrib/modules/cannops/samples/image_processing.cpp setup
+
+@end_toggle
+
+@add_toggle_python
+
+@snippet opencv_contrib/modules/cannops/samples/image_processing.py setup
+
+@end_toggle
+
+**Image Preprocessing Example**
+
+@add_toggle_cpp
+
+@snippet opencv_contrib/modules/cannops/samples/image_processing.cpp image-process
+
+@end_toggle
+
+@add_toggle_python
+
+@snippet opencv_contrib/modules/cannops/samples/image_processing.py image-process
+
+@end_toggle
+
+**Tear down CANN**
+
+@add_toggle_cpp
+@snippet opencv_contrib/modules/cannops/samples/image_processing.cpp tear-down-cann
+
+@end_toggle
+
+@add_toggle_python
+
+@snippet opencv_contrib/modules/cannops/samples/image_processing.py tear-down-cann
+
+@end_toggle
+
+## Results
+
+1. The original RGB input image with dimensions of (480, 640, 3):
+
+   ![puppy](./puppy.jpg)
+
+2. After introducing Gaussian noise, we obtain the following result:
+
+   ![puppy_noisy](./puppy_noisy.jpg)
+
+3. When applying the rotate operation with a rotation code of 0 (90 degrees clockwise), we obtain this result:
+
+   ![puppy_noisy_rotate](./puppy_noisy_rotate.jpg)
+
+4. Upon applying the flip operation with a flip code of 0 (flipping around the x-axis), we achieve the final result:
+
+   ![puppy_processed](./puppy_processed.jpg)
+
+
+## Usage Limitations
+
+While Ascend supports most commonly used operators, there are still some limitations that need to be addressed.
+
+- There is no strict limit on the size of the input image; however, it depends on the amount of RAM available on your device.
+- Please note that not all data types (dtypes) are supported by every operator. The current dtype limitations are outlined in the table below; a conversion workaround is sketched right after this list. We are actively working on addressing these limitations through automatic dtype conversion in an upcoming commit.
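+Until that lands, an unsupported dtype can be handled by converting on the NPU before and after the
+operator call, which is what the accuracy tests do for `threshold`. The sketch below is illustrative
+only (the function name and the `CV_16SC3` input are our own assumptions, and it is not part of the
+samples):
+
+```cpp
+#include "opencv2/cann.hpp"
+#include "opencv2/cann_interface.hpp"
+
+// Illustrative workaround: threshold a CV_16S image by converting it to float16 first,
+// mirroring the MAT_THRESHOLD accuracy test.
+void thresholdOnNpu(const cv::Mat& src16s, cv::Mat& dst16s)
+{
+    cv::cann::setDevice(0);
+
+    cv::cann::AscendMat npuSrc, npuSrc16f, npuDst16f, npuDst16s;
+    npuSrc.upload(src16s);                  // copy host data to the NPU
+    npuSrc.convertTo(npuSrc16f, CV_16F);    // convert to a dtype the operator supports
+    cv::cann::threshold(npuSrc16f, npuDst16f, 128, 255, 0 /* THRESH_BINARY */);
+    npuDst16f.convertTo(npuDst16s, CV_16S); // convert back to the original depth
+    npuDst16s.download(dst16s);             // copy the result back to the host
+
+    cv::cann::resetDevice();
+}
+```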
+ + +| Operator | Supported Dtype | +| ---------------------- | ------------------------------------------------------------ | +| multiply (with scale) | float16,float32,int32 | +| divide (with scale) | float16,float,int32,int8,uint8 | +| bitwise add/or/xor/not | int32,int16,uint16 | +| flip | float16,float,int64,int32,int16,uint16 | +| transpose | float16,float,int64,int32,int16,int8,uint64,uint32,uint16,uint8,bool | +| rotate | float16,float,int64,int32,int16,uint16 | diff --git a/modules/cannops/tutorials/puppy.jpg b/modules/cannops/tutorials/puppy.jpg new file mode 100644 index 00000000000..b0f0595e5ce Binary files /dev/null and b/modules/cannops/tutorials/puppy.jpg differ diff --git a/modules/cannops/tutorials/puppy_noisy.jpg b/modules/cannops/tutorials/puppy_noisy.jpg new file mode 100644 index 00000000000..e90cadb1720 Binary files /dev/null and b/modules/cannops/tutorials/puppy_noisy.jpg differ diff --git a/modules/cannops/tutorials/puppy_noisy_rotate.jpg b/modules/cannops/tutorials/puppy_noisy_rotate.jpg new file mode 100644 index 00000000000..e62b04834dc Binary files /dev/null and b/modules/cannops/tutorials/puppy_noisy_rotate.jpg differ diff --git a/modules/cannops/tutorials/puppy_processed.jpg b/modules/cannops/tutorials/puppy_processed.jpg new file mode 100644 index 00000000000..296b47aefea Binary files /dev/null and b/modules/cannops/tutorials/puppy_processed.jpg differ diff --git a/modules/cudaarithm/CMakeLists.txt b/modules/cudaarithm/CMakeLists.txt index d552bb4ebe9..6ee7a9f96bb 100644 --- a/modules/cudaarithm/CMakeLists.txt +++ b/modules/cudaarithm/CMakeLists.txt @@ -6,22 +6,35 @@ set(the_description "CUDA-accelerated Operations on Matrices") ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow) -ocv_add_module(cudaarithm opencv_core OPTIONAL opencv_cudev WRAP python) +set(extra_dependencies "") +set(optional_dependencies "") +if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE) + list(APPEND extra_dependencies CUDA::cudart_static CUDA::nppial${CUDA_LIB_EXT} CUDA::nppc${CUDA_LIB_EXT} CUDA::nppitc${CUDA_LIB_EXT} CUDA::nppig${CUDA_LIB_EXT} CUDA::nppist${CUDA_LIB_EXT} CUDA::nppidei${CUDA_LIB_EXT}) + if(HAVE_CUBLAS) + list(APPEND optional_dependencies CUDA::cublas${CUDA_LIB_EXT}) + if(NOT CUDA_VERSION VERSION_LESS 10.1) + list(APPEND optional_dependencies CUDA::cublasLt${CUDA_LIB_EXT}) + endif() + endif() + if(HAVE_CUFFT) + # static version requires seperable compilation which is incompatible with opencv's current library structure + list(APPEND optional_dependencies CUDA::cufft) + endif() +else() + if(HAVE_CUBLAS) + list(APPEND optional_dependencies ${CUDA_cublas_LIBRARY}) + endif() + if(HAVE_CUFFT) + list(APPEND optional_dependencies ${CUDA_cufft_LIBRARY}) + endif() +endif() + +ocv_add_module(cudaarithm opencv_core ${extra_dependencies} OPTIONAL opencv_cudev ${optional_dependencies} WRAP python) ocv_module_include_directories() ocv_glob_module_sources() -set(extra_libs "") - -if(HAVE_CUBLAS) - list(APPEND extra_libs ${CUDA_cublas_LIBRARY}) -endif() - -if(HAVE_CUFFT) - list(APPEND extra_libs ${CUDA_cufft_LIBRARY}) -endif() - -ocv_create_module(${extra_libs}) +ocv_create_module() ocv_add_accuracy_tests(DEPENDS_ON opencv_imgproc) ocv_add_perf_tests(DEPENDS_ON opencv_imgproc) diff --git a/modules/cudabgsegm/CMakeLists.txt b/modules/cudabgsegm/CMakeLists.txt index ffc6a628aea..1d2ef64d154 100644 --- a/modules/cudabgsegm/CMakeLists.txt +++ b/modules/cudabgsegm/CMakeLists.txt @@ -5,5 +5,7 @@ endif() set(the_description "CUDA-accelerated Background 
Segmentation") ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow) - +if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE) + ocv_module_include_directories(${CUDAToolkit_INCLUDE_DIRS}) +endif() ocv_define_module(cudabgsegm opencv_video WRAP python) diff --git a/modules/cudacodec/CMakeLists.txt b/modules/cudacodec/CMakeLists.txt index 6ff9f1ae9d7..8df41f00a96 100644 --- a/modules/cudacodec/CMakeLists.txt +++ b/modules/cudacodec/CMakeLists.txt @@ -4,7 +4,11 @@ endif() set(the_description "CUDA-accelerated Video Encoding/Decoding") -ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wshadow) +if(WIN32) + ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512) +else() + ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wshadow -Wsign-compare -Wenum-compare) +endif() set(required_dependencies opencv_core opencv_videoio opencv_cudaarithm opencv_cudawarping) if(HAVE_NVCUVENC) @@ -18,10 +22,25 @@ ocv_glob_module_sources() set(extra_libs "") +if(WITH_NVCUVID AND NOT HAVE_NVCUVID) + message(WARNING "cudacodec::VideoReader requires Nvidia Video Codec SDK. Please resolve dependency or disable WITH_NVCUVID=OFF") +endif() + +if(WITH_NVCUVENC AND NOT HAVE_NVCUVENC) + message(WARNING "cudacodec::VideoWriter requires Nvidia Video Codec SDK. Please resolve dependency or disable WITH_NVCUVENC=OFF") +endif() + if(HAVE_NVCUVID OR HAVE_NVCUVENC) - list(APPEND extra_libs ${CUDA_CUDA_LIBRARY}) + if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE) + list(APPEND extra_libs CUDA::cuda_driver) + else() + list(APPEND extra_libs ${CUDA_CUDA_LIBRARY}) + endif() if(HAVE_NVCUVID) list(APPEND extra_libs ${CUDA_nvcuvid_LIBRARY}) + if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE) + list(APPEND extra_libs CUDA::nppicc${CUDA_LIB_EXT}) + endif() endif() if(HAVE_NVCUVENC) if(WIN32) diff --git a/modules/cudacodec/include/opencv2/cudacodec.hpp b/modules/cudacodec/include/opencv2/cudacodec.hpp index d6421c2b8a4..163417108e7 100644 --- a/modules/cudacodec/include/opencv2/cudacodec.hpp +++ b/modules/cudacodec/include/opencv2/cudacodec.hpp @@ -544,6 +544,14 @@ class CV_EXPORTS_W RawVideoSource @return `true` unless the property is unset set or not supported. */ virtual bool get(const int propertyId, double& propertyVal) const = 0; + + /** @brief Retrieve the index of the first frame that will returned after construction. + + @return index of the index of the first frame that will returned after construction. + + @note To reduce the decoding overhead when initializing VideoReader to start its decoding from frame N, RawVideoSource should seek to the first valid key frame less than or equal to N and return that index here. + */ + virtual int getFirstFrameIdx() const = 0; }; /** @brief VideoReader initialization parameters @@ -561,9 +569,10 @@ but it cannot go below the number determined by NVDEC. @param targetRoi Region of interest (x/width should be multiples of 4 and y/height multiples of 2) within the output frame to copy and resize the decoded frame to, defaults to the full frame. @param enableHistogram Request output of decoded luma histogram \a hist from VideoReader::nextFrame(GpuMat& frame, GpuMat& hist, Stream& stream), if hardware supported. +@param firstFrameIdx Index of the first frame to seek to on initialization of the VideoReader. 
*/ struct CV_EXPORTS_W_SIMPLE VideoReaderInitParams { - CV_WRAP VideoReaderInitParams() : udpSource(false), allowFrameDrop(false), minNumDecodeSurfaces(0), rawMode(0), enableHistogram(false){}; + CV_WRAP VideoReaderInitParams() : udpSource(false), allowFrameDrop(false), minNumDecodeSurfaces(0), rawMode(0), enableHistogram(false), firstFrameIdx(0){}; CV_PROP_RW bool udpSource; CV_PROP_RW bool allowFrameDrop; CV_PROP_RW int minNumDecodeSurfaces; @@ -572,6 +581,7 @@ struct CV_EXPORTS_W_SIMPLE VideoReaderInitParams { CV_PROP_RW cv::Rect srcRoi; CV_PROP_RW cv::Rect targetRoi; CV_PROP_RW bool enableHistogram; + CV_PROP_RW int firstFrameIdx; }; /** @brief Creates video reader. diff --git a/modules/cudacodec/src/ffmpeg_video_source.cpp b/modules/cudacodec/src/ffmpeg_video_source.cpp index 867910feed8..8718782a8c1 100644 --- a/modules/cudacodec/src/ffmpeg_video_source.cpp +++ b/modules/cudacodec/src/ffmpeg_video_source.cpp @@ -165,19 +165,21 @@ bool ParamSetsExist(unsigned char* parameterSets, const int szParameterSets, uns return paramSetStartCodeLen != 0 && packetStartCodeLen != 0 && parameterSets[paramSetStartCodeLen] == data[packetStartCodeLen]; } -cv::cudacodec::detail::FFmpegVideoSource::FFmpegVideoSource(const String& fname, const std::vector& _videoCaptureParams) +cv::cudacodec::detail::FFmpegVideoSource::FFmpegVideoSource(const String& fname, const std::vector& _videoCaptureParams, const int iMaxStartFrame) : videoCaptureParams(_videoCaptureParams) { if (!videoio_registry::hasBackend(CAP_FFMPEG)) CV_Error(Error::StsNotImplemented, "FFmpeg backend not found"); - cap.open(fname, CAP_FFMPEG, videoCaptureParams); - if (!cap.isOpened()) + videoCaptureParams.push_back(CAP_PROP_FORMAT); + videoCaptureParams.push_back(-1); + if (!cap.open(fname, CAP_FFMPEG, videoCaptureParams)) CV_Error(Error::StsUnsupportedFormat, "Unsupported video source"); - - if (!cap.set(CAP_PROP_FORMAT, -1)) // turn off video decoder (extract stream) - CV_Error(Error::StsUnsupportedFormat, "Fetching of RAW video streams is not supported"); CV_Assert(cap.get(CAP_PROP_FORMAT) == -1); + if (iMaxStartFrame) { + CV_Assert(cap.set(CAP_PROP_POS_FRAMES, iMaxStartFrame)); + firstFrameIdx = static_cast(cap.get(CAP_PROP_POS_FRAMES)); + } const int codecExtradataIndex = static_cast(cap.get(CAP_PROP_CODEC_EXTRADATA_INDEX)); Mat tmpExtraData; diff --git a/modules/cudacodec/src/ffmpeg_video_source.hpp b/modules/cudacodec/src/ffmpeg_video_source.hpp index ce8582f6503..b2c25817a4c 100644 --- a/modules/cudacodec/src/ffmpeg_video_source.hpp +++ b/modules/cudacodec/src/ffmpeg_video_source.hpp @@ -51,7 +51,7 @@ namespace cv { namespace cudacodec { namespace detail { class FFmpegVideoSource : public RawVideoSource { public: - FFmpegVideoSource(const String& fname, const std::vector& params); + FFmpegVideoSource(const String& fname, const std::vector& params, const int iMaxStartFrame); ~FFmpegVideoSource(); bool getNextPacket(unsigned char** data, size_t* size) CV_OVERRIDE; @@ -66,12 +66,15 @@ class FFmpegVideoSource : public RawVideoSource bool get(const int propertyId, double& propertyVal) const; + int getFirstFrameIdx() const { return firstFrameIdx; } + private: FormatInfo format_; VideoCapture cap; Mat rawFrame, extraData, dataWithHeader; int iFrame = 0; std::vector videoCaptureParams; + int firstFrameIdx = 0; }; }}} diff --git a/modules/cudacodec/src/video_reader.cpp b/modules/cudacodec/src/video_reader.cpp index b6ef2ca5376..6d71e544fa0 100644 --- a/modules/cudacodec/src/video_reader.cpp +++ b/modules/cudacodec/src/video_reader.cpp @@ 
-112,7 +112,7 @@ namespace { public: explicit VideoReaderImpl(const Ptr& source, const int minNumDecodeSurfaces, const bool allowFrameDrop = false , const bool udpSource = false, - const Size targetSz = Size(), const Rect srcRoi = Rect(), const Rect targetRoi = Rect(), const bool enableHistogram = false); + const Size targetSz = Size(), const Rect srcRoi = Rect(), const Rect targetRoi = Rect(), const bool enableHistogram = false, const int firstFrameIdx = 0); ~VideoReaderImpl(); bool nextFrame(GpuMat& frame, Stream& stream) CV_OVERRIDE; @@ -135,6 +135,9 @@ namespace bool get(const int propertyId, double& propertyVal) const CV_OVERRIDE; private: + bool skipFrame(); + bool aquireFrameInfo(std::pair& frameInfo, Stream& stream = Stream::Null()); + void releaseFrameInfo(const std::pair& frameInfo); bool internalGrab(GpuMat & frame, GpuMat & histogram, Stream & stream); void waitForDecoderInit(); @@ -154,6 +157,7 @@ namespace static const int rawPacketsBaseIdx = 2; ColorFormat colorFormat = ColorFormat::BGRA; static const String errorMsg; + int iFrame = 0; }; const String VideoReaderImpl::errorMsg = "Parsing/Decoding video source failed, check GPU memory is available and GPU supports requested functionality."; @@ -173,7 +177,7 @@ namespace } VideoReaderImpl::VideoReaderImpl(const Ptr& source, const int minNumDecodeSurfaces, const bool allowFrameDrop, const bool udpSource, - const Size targetSz, const Rect srcRoi, const Rect targetRoi, const bool enableHistogram) : + const Size targetSz, const Rect srcRoi, const Rect targetRoi, const bool enableHistogram, const int firstFrameIdx) : videoSource_(source), lock_(0) { @@ -190,6 +194,8 @@ namespace videoSource_->setVideoParser(videoParser_); videoSource_->start(); waitForDecoderInit(); + for(iFrame = videoSource_->getFirstFrameIdx(); iFrame < firstFrameIdx; iFrame++) + CV_Assert(skipFrame()); videoSource_->updateFormat(videoDecoder_->format()); } @@ -209,10 +215,7 @@ namespace CUvideoctxlock m_lock; }; - bool VideoReaderImpl::internalGrab(GpuMat& frame, GpuMat& histogram, Stream& stream) { - if (videoParser_->hasError()) - CV_Error(Error::StsError, errorMsg); - cudacodec::FormatInfo fmt; + bool VideoReaderImpl::aquireFrameInfo(std::pair& frameInfo, Stream& stream) { if (frames_.empty()) { CUVIDPARSERDISPINFO displayInfo; @@ -234,8 +237,6 @@ namespace bool isProgressive = displayInfo.progressive_frame != 0; const int num_fields = isProgressive ? 
1 : 2 + displayInfo.repeat_first_field; - fmt = videoDecoder_->format(); - videoSource_->updateFormat(fmt); for (int active_field = 0; active_field < num_fields; ++active_field) { @@ -243,25 +244,46 @@ namespace std::memset(&videoProcParams, 0, sizeof(CUVIDPROCPARAMS)); videoProcParams.progressive_frame = displayInfo.progressive_frame; - videoProcParams.second_field = active_field; - videoProcParams.top_field_first = displayInfo.top_field_first; - videoProcParams.unpaired_field = (num_fields == 1); + videoProcParams.second_field = active_field; + videoProcParams.top_field_first = displayInfo.top_field_first; + videoProcParams.unpaired_field = (num_fields == 1); videoProcParams.output_stream = StreamAccessor::getStream(stream); frames_.push_back(std::make_pair(displayInfo, videoProcParams)); } } + else { + for (auto& frame : frames_) + frame.second.output_stream = StreamAccessor::getStream(stream); + } if (frames_.empty()) return false; - std::pair frameInfo = frames_.front(); + frameInfo = frames_.front(); frames_.pop_front(); + return true; + } + + void VideoReaderImpl::releaseFrameInfo(const std::pair& frameInfo) { + // release the frame, so it can be re-used in decoder + if (frames_.empty()) + frameQueue_->releaseFrame(frameInfo.first); + } + + bool VideoReaderImpl::internalGrab(GpuMat& frame, GpuMat& histogram, Stream& stream) { + if (videoParser_->hasError()) + CV_Error(Error::StsError, errorMsg); + + std::pair frameInfo; + if (!aquireFrameInfo(frameInfo, stream)) + return false; { VideoCtxAutoLock autoLock(lock_); unsigned long long cuHistogramPtr = 0; + const cudacodec::FormatInfo fmt = videoDecoder_->format(); if (fmt.enableHistogram) frameInfo.second.histogram_dptr = &cuHistogramPtr; @@ -281,10 +303,16 @@ namespace videoDecoder_->unmapFrame(decodedFrame); } - // release the frame, so it can be re-used in decoder - if (frames_.empty()) - frameQueue_->releaseFrame(frameInfo.first); + releaseFrameInfo(frameInfo); + iFrame++; + return true; + } + bool VideoReaderImpl::skipFrame() { + std::pair frameInfo; + if (!aquireFrameInfo(frameInfo)) + return false; + releaseFrameInfo(frameInfo); return true; } @@ -399,6 +427,10 @@ namespace } bool VideoReaderImpl::get(const int propertyId, double& propertyVal) const { + if (propertyId == cv::VideoCaptureProperties::CAP_PROP_POS_FRAMES) { + propertyVal = static_cast(iFrame); + return true; + } return videoSource_->get(propertyId, propertyVal); } @@ -421,11 +453,10 @@ Ptr cv::cudacodec::createVideoReader(const String& filename, const CV_Assert(!filename.empty()); Ptr videoSource; - try { // prefer ffmpeg to cuvidGetSourceVideoFormat() which doesn't always return the corrct raw pixel format - Ptr source(new FFmpegVideoSource(filename, sourceParams)); + Ptr source(new FFmpegVideoSource(filename, sourceParams, params.firstFrameIdx)); videoSource.reset(new RawVideoSourceWrapper(source, params.rawMode)); } catch (...) 
@@ -433,16 +464,15 @@ Ptr cv::cudacodec::createVideoReader(const String& filename, const if (sourceParams.size()) throw; videoSource.reset(new CuvidVideoSource(filename)); } - return makePtr(videoSource, params.minNumDecodeSurfaces, params.allowFrameDrop, params.udpSource, params.targetSz, - params.srcRoi, params.targetRoi, params.enableHistogram); + params.srcRoi, params.targetRoi, params.enableHistogram, params.firstFrameIdx); } Ptr cv::cudacodec::createVideoReader(const Ptr& source, const VideoReaderInitParams params) { Ptr videoSource(new RawVideoSourceWrapper(source, params.rawMode)); return makePtr(videoSource, params.minNumDecodeSurfaces, params.allowFrameDrop, params.udpSource, params.targetSz, - params.srcRoi, params.targetRoi, params.enableHistogram); + params.srcRoi, params.targetRoi, params.enableHistogram, params.firstFrameIdx); } void cv::cudacodec::MapHist(const GpuMat& hist, Mat& histFull) { diff --git a/modules/cudacodec/src/video_source.cpp b/modules/cudacodec/src/video_source.cpp index a81b75e366d..169ffbb9bce 100644 --- a/modules/cudacodec/src/video_source.cpp +++ b/modules/cudacodec/src/video_source.cpp @@ -76,6 +76,10 @@ bool cv::cudacodec::detail::RawVideoSourceWrapper::get(const int propertyId, dou return source_->get(propertyId, propertyVal); } +int cv::cudacodec::detail::RawVideoSourceWrapper::getFirstFrameIdx() const { + return source_->getFirstFrameIdx(); +} + void cv::cudacodec::detail::RawVideoSourceWrapper::start() { stop_ = false; diff --git a/modules/cudacodec/src/video_source.hpp b/modules/cudacodec/src/video_source.hpp index 8c96a34f2d5..f7e4c0bd15b 100644 --- a/modules/cudacodec/src/video_source.hpp +++ b/modules/cudacodec/src/video_source.hpp @@ -58,6 +58,7 @@ class VideoSource virtual FormatInfo format() const = 0; virtual void updateFormat(const FormatInfo& videoFormat) = 0; virtual bool get(const int propertyId, double& propertyVal) const { return false; } + virtual int getFirstFrameIdx() const { return 0; } virtual void start() = 0; virtual void stop() = 0; virtual bool isStarted() const = 0; @@ -91,6 +92,7 @@ class RawVideoSourceWrapper : public VideoSource FormatInfo format() const CV_OVERRIDE; void updateFormat(const FormatInfo& videoFormat) CV_OVERRIDE; bool get(const int propertyId, double& propertyVal) const CV_OVERRIDE; + int getFirstFrameIdx() const CV_OVERRIDE; void start() CV_OVERRIDE; void stop() CV_OVERRIDE; bool isStarted() const CV_OVERRIDE; diff --git a/modules/cudacodec/test/test_video.cpp b/modules/cudacodec/test/test_video.cpp index 45365dab230..88df2fb1afb 100644 --- a/modules/cudacodec/test/test_video.cpp +++ b/modules/cudacodec/test/test_video.cpp @@ -113,6 +113,10 @@ struct CheckParams : SetDevice { }; +struct Seek : SetDevice +{ +}; + #if defined(HAVE_NVCUVID) ////////////////////////////////////////////////////// // VideoReader @@ -542,36 +546,22 @@ CUDA_TEST_P(CheckParams, Reader) ASSERT_TRUE(reader->get(cv::VideoCaptureProperties::CAP_PROP_OPEN_TIMEOUT_MSEC, msActual)); ASSERT_EQ(msActual, msReference); } - - { - std::vector exceptionsThrown = { false,true }; - std::vector capPropFormats = { -1,0 }; - for (int i = 0; i < capPropFormats.size(); i++) { - bool exceptionThrown = false; - try { - cv::Ptr reader = cv::cudacodec::createVideoReader(inputFile, { - cv::VideoCaptureProperties::CAP_PROP_FORMAT, capPropFormats.at(i) }); - } - catch (cv::Exception &ex) { - if (ex.code == Error::StsUnsupportedFormat) - exceptionThrown = true; - } - ASSERT_EQ(exceptionThrown, exceptionsThrown.at(i)); - } - } } CUDA_TEST_P(CheckParams, 
CaptureProps) { std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../highgui/video/big_buck_bunny.mp4"; cv::Ptr reader = cv::cudacodec::createVideoReader(inputFile); - double width, height, fps; + double width, height, fps, iFrame; ASSERT_TRUE(reader->get(cv::VideoCaptureProperties::CAP_PROP_FRAME_WIDTH, width)); ASSERT_EQ(672, width); ASSERT_TRUE(reader->get(cv::VideoCaptureProperties::CAP_PROP_FRAME_HEIGHT, height)); ASSERT_EQ(384, height); ASSERT_TRUE(reader->get(cv::VideoCaptureProperties::CAP_PROP_FPS, fps)); ASSERT_EQ(24, fps); + ASSERT_TRUE(reader->grab()); + ASSERT_TRUE(reader->get(cv::VideoCaptureProperties::CAP_PROP_POS_FRAMES, iFrame)); + ASSERT_EQ(iFrame, 1.); } CUDA_TEST_P(CheckDecodeSurfaces, Reader) @@ -619,6 +609,37 @@ CUDA_TEST_P(CheckInitParams, Reader) ASSERT_TRUE(reader->get(cv::cudacodec::VideoReaderProps::PROP_RAW_MODE, rawMode) && static_cast(rawMode) == params.rawMode); } +CUDA_TEST_P(Seek, Reader) +{ +#if defined(WIN32) + throw SkipTestException("Test disabled on Windows until the FFMpeg wrapper is updated to include PR24012."); +#endif + std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../highgui/video/big_buck_bunny.mp4"; + // seek to a non key frame + const int firstFrameIdx = 18; + + GpuMat frameGs; + { + cv::Ptr readerGs = cv::cudacodec::createVideoReader(inputFile); + ASSERT_TRUE(readerGs->set(cudacodec::ColorFormat::GRAY)); + for (int i = 0; i <= firstFrameIdx; i++) + ASSERT_TRUE(readerGs->nextFrame(frameGs)); + } + + cudacodec::VideoReaderInitParams params; + params.firstFrameIdx = firstFrameIdx; + cv::Ptr reader = cv::cudacodec::createVideoReader(inputFile, {}, params); + double iFrame = 0.; + ASSERT_TRUE(reader->get(cv::VideoCaptureProperties::CAP_PROP_POS_FRAMES, iFrame)); + ASSERT_EQ(iFrame, static_cast(firstFrameIdx)); + ASSERT_TRUE(reader->set(cudacodec::ColorFormat::GRAY)); + GpuMat frame; + ASSERT_TRUE(reader->nextFrame(frame)); + ASSERT_EQ(cuda::norm(frameGs, frame, NORM_INF), 0.0); + ASSERT_TRUE(reader->get(cv::VideoCaptureProperties::CAP_PROP_POS_FRAMES, iFrame)); + ASSERT_EQ(iFrame, static_cast(firstFrameIdx+1)); +} + #endif // HAVE_NVCUVID #if defined(HAVE_NVCUVID) && defined(HAVE_NVCUVENC) @@ -958,5 +979,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_Codec, CheckInitParams, testing::Combine( testing::Values("highgui/video/big_buck_bunny.mp4"), testing::Values(true,false), testing::Values(true,false), testing::Values(true,false))); +INSTANTIATE_TEST_CASE_P(CUDA_Codec, Seek, ALL_DEVICES); + #endif // HAVE_NVCUVID || HAVE_NVCUVENC }} // namespace diff --git a/modules/cudafeatures2d/CMakeLists.txt b/modules/cudafeatures2d/CMakeLists.txt index aba40283dd9..2b6023c0b66 100644 --- a/modules/cudafeatures2d/CMakeLists.txt +++ b/modules/cudafeatures2d/CMakeLists.txt @@ -5,5 +5,7 @@ endif() set(the_description "CUDA-accelerated Feature Detection and Description") ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4100 /wd4324 /wd4512 /wd4515 -Wundef -Wmissing-declarations -Wshadow -Wunused-parameter -Wshadow) - +if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE) + ocv_module_include_directories(${CUDAToolkit_INCLUDE_DIRS}) +endif() ocv_define_module(cudafeatures2d opencv_features2d opencv_cudafilters opencv_cudawarping WRAP python) diff --git a/modules/cudafilters/CMakeLists.txt b/modules/cudafilters/CMakeLists.txt index 08281c135ce..75ff3b26718 100644 --- a/modules/cudafilters/CMakeLists.txt +++ b/modules/cudafilters/CMakeLists.txt @@ -5,5 +5,8 @@ endif() set(the_description "CUDA-accelerated Image Filtering") 
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow) - -ocv_define_module(cudafilters opencv_imgproc opencv_cudaarithm WRAP python) +set(extra_libs "") +if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE) + set(extra_libs CUDA::nppif${CUDA_LIB_EXT} CUDA::nppim${CUDA_LIB_EXT}) +endif() +ocv_define_module(cudafilters opencv_imgproc opencv_cudaarithm ${extra_libs} WRAP python) diff --git a/modules/cudaimgproc/CMakeLists.txt b/modules/cudaimgproc/CMakeLists.txt index 8d06804ddcc..de818f6a8b3 100644 --- a/modules/cudaimgproc/CMakeLists.txt +++ b/modules/cudaimgproc/CMakeLists.txt @@ -5,5 +5,8 @@ endif() set(the_description "CUDA-accelerated Image Processing") ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4100 /wd4324 /wd4512 /wd4515 -Wundef -Wmissing-declarations -Wshadow -Wunused-parameter) - -ocv_define_module(cudaimgproc opencv_imgproc OPTIONAL opencv_cudev opencv_cudaarithm opencv_cudafilters WRAP python) +set(extra_libs "") +if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE) + set(extra_libs CUDA::nppial${CUDA_LIB_EXT} CUDA::nppist${CUDA_LIB_EXT} CUDA::nppicc${CUDA_LIB_EXT} CUDA::nppidei${CUDA_LIB_EXT}) +endif() +ocv_define_module(cudaimgproc opencv_imgproc ${extra_libs} OPTIONAL opencv_cudev opencv_cudaarithm opencv_cudafilters WRAP python) diff --git a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp index 9ee50c73052..4c9ee0f48e8 100644 --- a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp +++ b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp @@ -57,6 +57,7 @@ @{ @defgroup cudaimgproc_color Color space processing @defgroup cudaimgproc_hist Histogram Calculation + @defgroup cudaimgproc_shape Structural Analysis and Shape Descriptors @defgroup cudaimgproc_hough Hough Transform @defgroup cudaimgproc_feature Feature Detection @} @@ -779,9 +780,84 @@ CV_EXPORTS_AS(connectedComponentsWithAlgorithm) void connectedComponents(InputAr CV_EXPORTS_W void connectedComponents(InputArray image, OutputArray labels, int connectivity = 8, int ltype = CV_32S); - //! @} +//! @addtogroup cudaimgproc_shape +//! @{ + + /** @brief Order of image moments. + * @param FIRST_ORDER_MOMENTS First order moments + * @param SECOND_ORDER_MOMENTS Second order moments. + * @param THIRD_ORDER_MOMENTS Third order moments. + * */ +enum MomentsOrder { + FIRST_ORDER_MOMENTS = 1, + SECOND_ORDER_MOMENTS = 2, + THIRD_ORDER_MOMENTS = 3 +}; + +/** @brief Returns the number of image moments less than or equal to the largest image moments \a order. +@param order Order of largest moments to calculate with lower order moments requiring less computation. +@returns number of image moments. + +@sa cuda::moments, cuda::spatialMoments, cuda::MomentsOrder + */ +CV_EXPORTS_W int numMoments(const MomentsOrder order); + +/** @brief Calculates all of the spatial moments up to the 3rd order of a rasterized shape. + +Asynchronous version of cuda::moments() which only calculates the spatial (not centralized or normalized) moments, up to the 3rd order, of a rasterized shape. +Each moment is returned as a column entry in the 1D \a moments array. + +@param src Raster image (single-channel 2D array). +@param [out] moments 1D array with each column entry containing a spatial image moment. +@param binaryImage If it is true, all non-zero image pixels are treated as 1's. +@param order Order of largest moments to calculate with lower order moments requiring less computation. +@param momentsType Precision to use when calculating moments. 
Available types are `CV_32F` and `CV_64F` with the performance of `CV_32F` an order of magnitude greater than `CV_64F`. If the image is small the accuracy from `CV_32F` can be equal or very close to `CV_64F`. +@param stream Stream for the asynchronous version. + +@note For maximum performance pre-allocate a 1D GpuMat for \a moments of the correct type and size large enough to store the all the image moments of up to the desired \a order. e.g. With \a order === MomentsOrder::SECOND_ORDER_MOMENTS and \a momentsType == `CV_32F` \a moments can be allocated as +``` +GpuMat momentsDevice(1,numMoments(MomentsOrder::SECOND_ORDER_MOMENTS),CV_32F) +``` +The central and normalized moments can easily be calculated on the host by downloading the \a moments array and using the cv::Moments constructor. e.g. +``` +HostMem momentsHostMem(1, numMoments(MomentsOrder::SECOND_ORDER_MOMENTS), CV_32F); +momentsDevice.download(momentsHostMem, stream); +stream.waitForCompletion(); +Mat momentsMat = momentsHostMem.createMatHeader(); +cv::Moments cvMoments(momentsMat.at(0), momentsMat.at(1), momentsMat.at(2), momentsMat.at(3), momentsMat.at(4), momentsMat.at(5), momentsMat.at(6), momentsMat.at(7), momentsMat.at(8), momentsMat.at(9)); +``` +see the \a CUDA_TEST_P(Moments, Async) test inside opencv_contrib_source_code/modules/cudaimgproc/test/test_moments.cpp for an example. +@returns cv::Moments. +@sa cuda::moments +*/ +CV_EXPORTS_W void spatialMoments(InputArray src, OutputArray moments, const bool binaryImage = false, const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS, const int momentsType = CV_64F, Stream& stream = Stream::Null()); + +/** @brief Calculates all of the moments up to the 3rd order of a rasterized shape. + +The function computes moments, up to the 3rd order, of a rasterized shape. The +results are returned in the structure cv::Moments. + +@param src Raster image (single-channel 2D array). +@param binaryImage If it is true, all non-zero image pixels are treated as 1's. +@param order Order of largest moments to calculate with lower order moments requiring less computation. + @param momentsType Precision to use when calculating moments. Available types are `CV_32F` and `CV_64F` with the performance of `CV_32F` an order of magnitude greater than `CV_64F`. If the image is small the accuracy from `CV_32F` can be equal or very close to `CV_64F`. + +@note For maximum performance use the asynchronous version cuda::spatialMoments() as this version interally allocates and deallocates both GpuMat and HostMem to respectively perform the calculation on the device and download the result to the host. +The costly HostMem allocation cannot be avoided however the GpuMat device allocation can be by using BufferPool, e.g. +``` + setBufferPoolUsage(true); + setBufferPoolConfig(getDevice(), numMoments(order) * ((momentsType == CV_64F) ? sizeof(double) : sizeof(float)), 1); +``` +see the \a CUDA_TEST_P(Moments, Accuracy) test inside opencv_contrib_source_code/modules/cudaimgproc/test/test_moments.cpp for an example. +@returns cv::Moments. +@sa cuda::spatialMoments + */ +CV_EXPORTS_W Moments moments(InputArray src, const bool binaryImage = false, const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS, const int momentsType = CV_64F); + +//! 
@} cudaimgproc_shape + }} // namespace cv { namespace cuda { #endif /* OPENCV_CUDAIMGPROC_HPP */ diff --git a/modules/cudaimgproc/misc/python/test/test_cudaimgproc.py b/modules/cudaimgproc/misc/python/test/test_cudaimgproc.py index 0548cbcd8bc..f07617f53e5 100644 --- a/modules/cudaimgproc/misc/python/test/test_cudaimgproc.py +++ b/modules/cudaimgproc/misc/python/test/test_cudaimgproc.py @@ -89,5 +89,30 @@ def test_cvtColor(self): self.assertTrue(np.allclose(cv.cuda.cvtColor(cuMat, cv.COLOR_BGR2HSV).download(), cv.cvtColor(npMat, cv.COLOR_BGR2HSV))) + def test_moments(self): + # setup + src_host = (np.ones([10,10])).astype(np.uint8)*255 + cpu_moments = cv.moments(src_host, True) + moments_order = cv.cuda.THIRD_ORDER_MOMENTS + n_moments = cv.cuda.numMoments(cv.cuda.THIRD_ORDER_MOMENTS) + src_device = cv.cuda.GpuMat(src_host) + + # synchronous + cv.cuda.setBufferPoolUsage(True) + cv.cuda.setBufferPoolConfig(cv.cuda.getDevice(), n_moments * np.dtype(float).itemsize, 1); + gpu_moments = cv.cuda.moments(src_device, True, moments_order, cv.CV_64F) + self.assertTrue(len([1 for moment_type in cpu_moments if moment_type in gpu_moments and cpu_moments[moment_type] == gpu_moments[moment_type]]) == 24) + + # asynchronous + stream = cv.cuda.Stream() + moments_array_host = np.empty([1, n_moments], np.float64) + cv.cuda.registerPageLocked(moments_array_host) + moments_array_device = cv.cuda.GpuMat(1, n_moments, cv.CV_64F) + cv.cuda.spatialMoments(src_device, moments_array_device, True, moments_order, cv.CV_64F, stream) + moments_array_device.download(stream, moments_array_host); + stream.waitForCompletion() + cv.cuda.unregisterPageLocked(moments_array_host) + self.assertTrue(len([ 1 for moment_type,gpu_moment in zip(cpu_moments,moments_array_host[0]) if cpu_moments[moment_type] == gpu_moment]) == 10) + if __name__ == '__main__': NewOpenCVTests.bootstrap() \ No newline at end of file diff --git a/modules/cudaimgproc/perf/perf_moments.cpp b/modules/cudaimgproc/perf/perf_moments.cpp new file mode 100644 index 00000000000..ba91afbacfb --- /dev/null +++ b/modules/cudaimgproc/perf/perf_moments.cpp @@ -0,0 +1,61 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "perf_precomp.hpp" + +namespace opencv_test { namespace { +static void drawCircle(cv::Mat& dst, const cv::Vec3i& circle, bool fill) +{ + dst.setTo(Scalar::all(0)); + cv::circle(dst, Point2i(circle[0], circle[1]), circle[2], Scalar::all(255), fill ? 
-1 : 1, cv::LINE_AA); +} + +DEF_PARAM_TEST(Sz_Depth, Size, MatDepth); +PERF_TEST_P(Sz_Depth, SpatialMoments, Combine(CUDA_TYPICAL_MAT_SIZES, Values(MatDepth(CV_32F), MatDepth((CV_64F))))) +{ + const cv::Size size = GET_PARAM(0); + const int momentsType = GET_PARAM(1); + Mat imgHost(size, CV_8U); + const Vec3i circle(size.width / 2, size.height / 2, static_cast(static_cast(size.width / 2) * 0.9)); + drawCircle(imgHost, circle, true); + if (PERF_RUN_CUDA()) { + const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS; + const int nMoments = numMoments(order); + GpuMat momentsDevice(1, nMoments, momentsType); + const GpuMat imgDevice(imgHost); + TEST_CYCLE() cuda::spatialMoments(imgDevice, momentsDevice, false, order, momentsType); + SANITY_CHECK_NOTHING(); + } + else { + cv::Moments momentsHost; + TEST_CYCLE() momentsHost = cv::moments(imgHost, false); + SANITY_CHECK_NOTHING(); + } +} + +PERF_TEST_P(Sz_Depth, Moments, Combine(CUDA_TYPICAL_MAT_SIZES, Values(MatDepth(CV_32F), MatDepth(CV_64F)))) +{ + const cv::Size size = GET_PARAM(0); + const int momentsType = GET_PARAM(1); + Mat imgHost(size, CV_8U); + const Vec3i circle(size.width / 2, size.height / 2, static_cast(static_cast(size.width / 2) * 0.9)); + drawCircle(imgHost, circle, true); + if (PERF_RUN_CUDA()) { + const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS; + const int nMoments = numMoments(order); + setBufferPoolUsage(true); + setBufferPoolConfig(getDevice(), nMoments * ((momentsType == CV_64F) ? sizeof(double) : sizeof(float)), 1); + const GpuMat imgDevice(imgHost); + cv::Moments momentsHost; + TEST_CYCLE() momentsHost = cuda::moments(imgDevice, false, order, momentsType); + SANITY_CHECK_NOTHING(); + } + else { + cv::Moments momentsHost; + TEST_CYCLE() momentsHost = cv::moments(imgHost, false); + SANITY_CHECK_NOTHING(); + } +} + +}} diff --git a/modules/cudaimgproc/src/cuda/moments.cu b/modules/cudaimgproc/src/cuda/moments.cu new file mode 100644 index 00000000000..9828c5614b2 --- /dev/null +++ b/modules/cudaimgproc/src/cuda/moments.cu @@ -0,0 +1,186 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#if !defined CUDA_DISABLER + +#include +#include +#include "moments.cuh" + +namespace cv { namespace cuda { namespace device { namespace imgproc { + +constexpr int blockSizeX = 32; +constexpr int blockSizeY = 16; + +template +__device__ T butterflyWarpReduction(T value) { + for (int i = 16; i >= 1; i /= 2) + value += __shfl_xor_sync(0xffffffff, value, i, 32); + return value; +} + +template +__device__ T butterflyHalfWarpReduction(T value) { + for (int i = 8; i >= 1; i /= 2) + value += __shfl_xor_sync(0xffff, value, i, 32); + return value; +} + +template +__device__ void updateSums(const T val, const unsigned int x, T r[4]) { + const T x2 = x * x; + const T x3 = static_cast(x) * x2; + r[0] += val; + r[1] += val * x; + if (nMoments >= n12) r[2] += val * x2; + if (nMoments >= n123) r[3] += val * x3; +} + +template +__device__ void rowReductions(const PtrStepSz img, const bool binary, const unsigned int y, TMoments r[4], TMoments smem[][nMoments + 1]) { + for (int x = threadIdx.x; x < img.cols; x += blockDim.x) { + const TMoments val = (!binary || img(y, x) == 0) ? 
img(y, x) : 1; + updateSums(val, x, r); + } +} + +template +__device__ void rowReductionsCoalesced(const PtrStepSz img, const bool binary, const unsigned int y, TMoments r[4], const int offsetX, TMoments smem[][nMoments + 1]) { + const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX; + // load uncoalesced head + if (!fourByteAligned && threadIdx.x == 0) { + for (int x = 0; x < ::min(alignedOffset, static_cast(img.cols)); x++) { + const TMoments val = (!binary || img(y, x) == 0) ? img(y, x) : 1; + updateSums(val, x, r); + } + } + + // coalesced loads + const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? img.ptr(y) : img.ptr(y) + alignedOffset); + const int cols4 = fourByteAligned ? img.cols / 4 : (img.cols - alignedOffset) / 4; + for (int x = threadIdx.x; x < cols4; x += blockDim.x) { + const unsigned int data = rowPtrIntAligned[x]; +#pragma unroll 4 + for (int i = 0; i < 4; i++) { + const int iX = alignedOffset + 4 * x + i; + const uchar ucharVal = ((data >> i * 8) & 0xFFU); + const TMoments val = (!binary || ucharVal == 0) ? ucharVal : 1; + updateSums(val, iX, r); + } + } + + // load uncoalesced tail + if (threadIdx.x == 0) { + const int iTailStart = fourByteAligned ? cols4 * 4 : cols4 * 4 + alignedOffset; + for (int x = iTailStart; x < img.cols; x++) { + const TMoments val = (!binary || img(y, x) == 0) ? img(y, x) : 1; + updateSums(val, x, r); + } + } +} + +template +__global__ void spatialMoments(const PtrStepSz img, const bool binary, TMoments* moments, const int offsetX = 0) { + const unsigned int y = blockIdx.x * blockDim.y + threadIdx.y; + __shared__ TMoments smem[blockSizeY][nMoments + 1]; + if (threadIdx.y < nMoments && threadIdx.x < blockSizeY) + smem[threadIdx.x][threadIdx.y] = 0; + __syncthreads(); + + TMoments r[4] = { 0 }; + if (y < img.rows) { + if (coalesced) + rowReductionsCoalesced(img, binary, y, r, offsetX, smem); + else + rowReductions(img, binary, y, r, smem); + } + + const unsigned long y2 = y * y; + const TMoments y3 = static_cast(y2) * y; + const TMoments res = butterflyWarpReduction(r[0]); + if (res) { + smem[threadIdx.y][0] = res; //0th + smem[threadIdx.y][1] = butterflyWarpReduction(r[1]); //1st + smem[threadIdx.y][2] = y * res; //1st + if (nMoments >= n12) { + smem[threadIdx.y][3] = butterflyWarpReduction(r[2]); //2nd + smem[threadIdx.y][4] = smem[threadIdx.y][1] * y; //2nd + smem[threadIdx.y][5] = y2 * res; //2nd + } + if (nMoments >= n123) { + smem[threadIdx.y][6] = butterflyWarpReduction(r[3]); //3rd + smem[threadIdx.y][7] = smem[threadIdx.y][3] * y; //3rd + smem[threadIdx.y][8] = smem[threadIdx.y][1] * y2; //3rd + smem[threadIdx.y][9] = y3 * res; //3rd + } + } + __syncthreads(); + + if (threadIdx.x < blockSizeY && threadIdx.y < nMoments) + smem[threadIdx.y][nMoments] = butterflyHalfWarpReduction(smem[threadIdx.x][threadIdx.y]); + __syncthreads(); + + if (threadIdx.y == 0 && threadIdx.x < nMoments) { + if (smem[threadIdx.x][nMoments]) + cudev::atomicAdd(&moments[threadIdx.x], smem[threadIdx.x][nMoments]); + } +} + +template struct momentsDispatcherNonChar { + static void call(const PtrStepSz src, PtrStepSz moments, const bool binary, const int offsetX, const cudaStream_t stream) { + dim3 blockSize(blockSizeX, blockSizeY); + dim3 gridSize = dim3(divUp(src.rows, blockSizeY)); + spatialMoments << > > (src, binary, moments.ptr()); + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); + }; +}; + +template struct momentsDispatcherChar { + static void call(const PtrStepSz src, PtrStepSz moments, const bool binary, const 
int offsetX, const cudaStream_t stream) { + dim3 blockSize(blockSizeX, blockSizeY); + dim3 gridSize = dim3(divUp(src.rows, blockSizeY)); + if (offsetX) + spatialMoments << > > (src, binary, moments.ptr(), offsetX); + else + spatialMoments << > > (src, binary, moments.ptr()); + + if (stream == 0) + cudaSafeCall(cudaStreamSynchronize(stream)); + }; +}; + +template struct momentsDispatcher : momentsDispatcherNonChar {}; +template struct momentsDispatcher : momentsDispatcherChar {}; +template struct momentsDispatcher : momentsDispatcherChar {}; + +template +void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream) { + if (order == 1) + momentsDispatcher::call(static_cast>(src), static_cast>(moments), binary, offsetX, stream); + else if (order == 2) + momentsDispatcher::call(static_cast>(src), static_cast>(moments), binary, offsetX, stream); + else if (order == 3) + momentsDispatcher::call(static_cast>(src), static_cast>(moments), binary, offsetX, stream); +}; + +template void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); +template void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); +template void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); +template void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); +template void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); +template void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); +template void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); + +template void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); +template void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); +template void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); +template void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); +template void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); +template void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); +template void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); + +}}}} + +#endif /* CUDA_DISABLER */ diff --git a/modules/cudaimgproc/src/cuda/moments.cuh b/modules/cudaimgproc/src/cuda/moments.cuh new file mode 100644 index 00000000000..0041882b64f --- /dev/null +++ b/modules/cudaimgproc/src/cuda/moments.cuh @@ -0,0 +1,6 @@ +#pragma once +namespace cv { namespace cuda { namespace device { namespace imgproc { + constexpr int n1 = 3; + constexpr int n12 = 6; + constexpr int n123 = 10; 
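(Aside, not part of the patch.) The constants above count the raw spatial moments kept for a maximum order of 1, 2 or 3: n1 = 3, n12 = 6, n123 = 10, i.e. (order + 1)(order + 2) / 2. In the kernels, each thread's r[0..3] hold the row sums of val, val*x, val*x^2 and val*x^3, and the y-weighting is applied when the reduced row sums are written to shared memory. The host-side reference below is only a sketch of those ten sums for a single-channel 8-bit input; it is meant to clarify what the device code accumulates, not to mirror its implementation.

#include <opencv2/core.hpp>

// Reference (host-only) computation of the ten raw spatial moments, in the same
// order the kernel writes them: m00, m10, m01, m20, m11, m02, m30, m21, m12, m03.
static void rawMomentsReference(const cv::Mat& img, bool binary, double m[10])
{
    CV_Assert(img.type() == CV_8UC1);
    for (int i = 0; i < 10; i++) m[i] = 0;
    for (int y = 0; y < img.rows; y++)
    {
        double r[4] = {0, 0, 0, 0}; // row sums of val, val*x, val*x^2, val*x^3
        const uchar* row = img.ptr<uchar>(y);
        for (int x = 0; x < img.cols; x++)
        {
            const double val = binary ? (row[x] != 0 ? 1 : 0) : row[x];
            r[0] += val;
            r[1] += val * x;
            r[2] += val * x * x;
            r[3] += val * x * x * x;
        }
        const double y2 = static_cast<double>(y) * y, y3 = y2 * y;
        m[0] += r[0];       // m00
        m[1] += r[1];       // m10
        m[2] += y * r[0];   // m01
        m[3] += r[2];       // m20
        m[4] += y * r[1];   // m11
        m[5] += y2 * r[0];  // m02
        m[6] += r[3];       // m30
        m[7] += y * r[2];   // m21
        m[8] += y2 * r[1];  // m12
        m[9] += y3 * r[0];  // m03
    }
}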
+}}}} diff --git a/modules/cudaimgproc/src/moments.cpp b/modules/cudaimgproc/src/moments.cpp new file mode 100644 index 00000000000..ced5b2f8c66 --- /dev/null +++ b/modules/cudaimgproc/src/moments.cpp @@ -0,0 +1,67 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "precomp.hpp" +#include "cuda/moments.cuh" + +using namespace cv; +using namespace cv::cuda; + +int cv::cuda::numMoments(const MomentsOrder order) { + return order == MomentsOrder::FIRST_ORDER_MOMENTS ? device::imgproc::n1 : order == MomentsOrder::SECOND_ORDER_MOMENTS ? device::imgproc::n12 : device::imgproc::n123; +} + +#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER) + Moments cv::cuda::moments(InputArray src, const bool binary, const MomentsOrder order, const int momentsType) { throw_no_cuda(); } + void spatialMoments(InputArray src, OutputArray moments, const bool binary, const MomentsOrder order, const int momentsType, Stream& stream) { throw_no_cuda(); } +#else /* !defined (HAVE_CUDA) */ + +namespace cv { namespace cuda { namespace device { namespace imgproc { + template + void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); +}}}} + +void cv::cuda::spatialMoments(InputArray src, OutputArray moments, const bool binary, const MomentsOrder order, const int momentsType, Stream& stream) { + CV_Assert(src.depth() <= CV_64F); + const GpuMat srcDevice = getInputMat(src, stream); + + CV_Assert(momentsType == CV_32F || momentsType == CV_64F); + const int nMoments = numMoments(order); + const int momentsCols = nMoments < moments.cols() ? moments.cols() : nMoments; + GpuMat momentsDevice = getOutputMat(moments, 1, momentsCols, momentsType, stream); + momentsDevice.setTo(0); + + Point ofs; Size wholeSize; + srcDevice.locateROI(wholeSize, ofs); + + typedef void (*func_t)(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream); + static const func_t funcs[7][2] = + { + {device::imgproc::moments, device::imgproc::moments }, + {device::imgproc::moments, device::imgproc::moments }, + {device::imgproc::moments, device::imgproc::moments}, + {device::imgproc::moments, device::imgproc::moments }, + {device::imgproc::moments, device::imgproc::moments }, + {device::imgproc::moments, device::imgproc::moments }, + {device::imgproc::moments, device::imgproc::moments } + }; + + const func_t func = funcs[srcDevice.depth()][momentsType == CV_64F]; + func(srcDevice, momentsDevice, binary, static_cast(order), ofs.x, StreamAccessor::getStream(stream)); + syncOutput(momentsDevice, moments, stream); +} + +Moments cv::cuda::moments(InputArray src, const bool binary, const MomentsOrder order, const int momentsType) { + Stream& stream = Stream::Null(); + HostMem dst; + spatialMoments(src, dst, binary, order, momentsType, stream); + stream.waitForCompletion(); + Mat moments = dst.createMatHeader(); + if(momentsType == CV_32F) + return Moments(moments.at(0), moments.at(1), moments.at(2), moments.at(3), moments.at(4), moments.at(5), moments.at(6), moments.at(7), moments.at(8), moments.at(9)); + else + return Moments(moments.at(0), moments.at(1), moments.at(2), moments.at(3), moments.at(4), moments.at(5), moments.at(6), moments.at(7), moments.at(8), moments.at(9)); +} + +#endif /* !defined (HAVE_CUDA) */ diff --git 
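Before the accompanying tests, a usage sketch of the two entry points wrapped above. This is illustrative only: it assumes the declarations are exposed through opencv2/cudaimgproc.hpp (cv::cuda::moments, cv::cuda::spatialMoments, cv::cuda::numMoments, cv::cuda::MomentsOrder) and uses a synthetic input so the snippet stays self-contained.

#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/cudaimgproc.hpp>

int main()
{
    // Synthetic single-channel test image: a filled circle on a black background.
    cv::Mat img(480, 640, CV_8UC1, cv::Scalar::all(0));
    cv::circle(img, cv::Point(320, 240), 100, cv::Scalar(255), -1);
    cv::cuda::GpuMat d_img(img);

    // Blocking convenience call: returns a populated cv::Moments on the host.
    cv::Moments m = cv::cuda::moments(d_img, /*binary=*/true,
                                      cv::cuda::MomentsOrder::THIRD_ORDER_MOMENTS, CV_64F);

    // Stream-based path: the raw moments stay on the device until downloaded.
    cv::cuda::Stream stream;
    const int nMoments = cv::cuda::numMoments(cv::cuda::MomentsOrder::THIRD_ORDER_MOMENTS);
    cv::cuda::GpuMat d_moments(1, nMoments, CV_64F);
    cv::cuda::spatialMoments(d_img, d_moments, /*binary=*/true,
                             cv::cuda::MomentsOrder::THIRD_ORDER_MOMENTS, CV_64F, stream);
    cv::cuda::HostMem h_moments;
    d_moments.download(h_moments, stream);
    stream.waitForCompletion();

    return m.m00 > 0 ? 0 : 1;
}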
a/modules/cudaimgproc/test/test_moments.cpp b/modules/cudaimgproc/test/test_moments.cpp new file mode 100644 index 00000000000..c5dd889f095 --- /dev/null +++ b/modules/cudaimgproc/test/test_moments.cpp @@ -0,0 +1,124 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "test_precomp.hpp" + +#ifdef HAVE_CUDA + +namespace opencv_test { namespace { + +/////////////////////////////////////////////////////////////////////////////////////////////////////// +// Moments + +CV_ENUM(MaxMomentsOrder, MomentsOrder::FIRST_ORDER_MOMENTS, MomentsOrder::SECOND_ORDER_MOMENTS, MomentsOrder::THIRD_ORDER_MOMENTS) + +PARAM_TEST_CASE(Moments, cv::cuda::DeviceInfo, cv::Size, bool, MatDepth, MatDepth, UseRoi, MaxMomentsOrder) +{ + DeviceInfo devInfo; + Size size; + bool isBinary; + float pcWidth = 0.6f; + int momentsType; + int imgType; + bool useRoi; + MomentsOrder order; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + size = GET_PARAM(1); + isBinary = GET_PARAM(2); + momentsType = GET_PARAM(3); + imgType = GET_PARAM(4); + useRoi = GET_PARAM(5); + order = static_cast(static_cast(GET_PARAM(6))); + cv::cuda::setDevice(devInfo.deviceID()); + } + + static void drawCircle(cv::Mat& dst, const cv::Vec3i& circle, bool fill) + { + dst.setTo(Scalar::all(0)); + cv::circle(dst, Point2i(circle[0], circle[1]), circle[2], Scalar::all(255), fill ? -1 : 1, cv::LINE_AA); + } +}; + +bool Equal(const double m0, const double m1, const double absPcErr) { + if (absPcErr == 0) return m0 == m1; + if (m0 == 0) { + if (m1 < absPcErr) return true; + else return false; + } + const double pcDiff = abs(m0 - m1) / m1; + return pcDiff < absPcErr; +} + +void CheckMoments(const cv::Moments m0, const cv::Moments m1, const MomentsOrder order, const int momentsType) { + double absPcErr = momentsType == CV_64F ? 0 : 5e-7; + ASSERT_TRUE(Equal(m0.m00, m1.m00, absPcErr)) << "m0.m00: " << m0.m00 << ", m1.m00: " << m1.m00 << ", absPcErr: " << absPcErr; + ASSERT_TRUE(Equal(m0.m10, m1.m10, absPcErr)) << "m0.m10: " << m0.m10 << ", m1.m10: " << m1.m10 << ", absPcErr: " << absPcErr; + ASSERT_TRUE(Equal(m0.m01, m1.m01, absPcErr)) << "m0.m01: " << m0.m01 << ", m1.m01: " << m1.m01 << ", absPcErr: " << absPcErr; + if (static_cast(order) >= static_cast(MomentsOrder::SECOND_ORDER_MOMENTS)) { + ASSERT_TRUE(Equal(m0.m20, m1.m20, absPcErr)) << "m0.m20: " << m0.m20 << ", m1.m20: " << m1.m20 << ", absPcErr: " << absPcErr; + ASSERT_TRUE(Equal(m0.m11, m1.m11, absPcErr)) << "m0.m11: " << m0.m11 << ", m1.m11: " << m1.m11 << ", absPcErr: " << absPcErr; + ASSERT_TRUE(Equal(m0.m02, m1.m02, absPcErr)) << "m0.m02: " << m0.m02 << ", m1.m02: " << m1.m02 << ", absPcErr: " << absPcErr; + } + if (static_cast(order) >= static_cast(MomentsOrder::THIRD_ORDER_MOMENTS)) { + ASSERT_TRUE(Equal(m0.m30, m1.m30, absPcErr)) << "m0.m30: " << m0.m30 << ", m1.m30: " << m1.m30 << ", absPcErr: " << absPcErr; + ASSERT_TRUE(Equal(m0.m21, m1.m21, absPcErr)) << "m0.m21: " << m0.m21 << ", m1.m21: " << m1.m21 << ", absPcErr: " << absPcErr; + ASSERT_TRUE(Equal(m0.m12, m1.m12, absPcErr)) << "m0.m12: " << m0.m12 << ", m1.m12: " << m1.m12 << ", absPcErr: " << absPcErr; + ASSERT_TRUE(Equal(m0.m03, m1.m03, absPcErr)) << "m0.m03: " << m0.m03 << ", m1.m03: " << m1.m03 << ", absPcErr: " << absPcErr; + } +} + +CUDA_TEST_P(Moments, Accuracy) +{ + Mat imgHost(size, imgType); + const Rect roi = useRoi ? 
Rect(1, 0, imgHost.cols - 2, imgHost.rows) : Rect(0, 0, imgHost.cols, imgHost.rows); + const Vec3i circle(size.width / 2, size.height / 2, static_cast(static_cast(size.width/2) * pcWidth)); + drawCircle(imgHost, circle, true); + const GpuMat imgDevice(imgHost); + const int nMoments = numMoments(order); + setBufferPoolUsage(true); + setBufferPoolConfig(getDevice(), nMoments * ((momentsType == CV_64F) ? sizeof(double) : sizeof(float)), 1); + const cv::Moments moments = cuda::moments(imgDevice(roi), isBinary, order, momentsType); + Mat imgHostFloat; imgHost(roi).convertTo(imgHostFloat, CV_32F); + const cv::Moments momentsGs = cv::moments(imgHostFloat, isBinary); + CheckMoments(momentsGs, moments, order, momentsType); +} + +CUDA_TEST_P(Moments, Async) +{ + Stream stream; + const int nMoments = numMoments(order); + GpuMat momentsDevice(1, nMoments, momentsType); + Mat imgHost(size, imgType); + const Rect roi = useRoi ? Rect(1, 0, imgHost.cols - 2, imgHost.rows) : Rect(0, 0, imgHost.cols, imgHost.rows); + const Vec3i circle(size.width / 2, size.height / 2, static_cast(static_cast(size.width/2) * pcWidth)); + drawCircle(imgHost, circle, true); + const GpuMat imgDevice(imgHost); + cuda::spatialMoments(imgDevice(roi), momentsDevice, isBinary, order, momentsType, stream); + HostMem momentsHost(1, nMoments, momentsType); + momentsDevice.download(momentsHost, stream); + stream.waitForCompletion(); + Mat momentsHost64F = momentsHost.createMatHeader(); + if (momentsType == CV_32F) + momentsHost.createMatHeader().convertTo(momentsHost64F, CV_64F); + const cv::Moments moments = cv::Moments(momentsHost64F.at(0), momentsHost64F.at(1), momentsHost64F.at(2), momentsHost64F.at(3), momentsHost64F.at(4), momentsHost64F.at(5), momentsHost64F.at(6), momentsHost64F.at(7), momentsHost64F.at(8), momentsHost64F.at(9)); + Mat imgHostAdjustedType = imgHost(roi); + if (imgType != CV_8U && imgType != CV_32F) + imgHost(roi).convertTo(imgHostAdjustedType, CV_32F); + const cv::Moments momentsGs = cv::moments(imgHostAdjustedType, isBinary); + CheckMoments(momentsGs, moments, order, momentsType); +} + +#define SIZES DIFFERENT_SIZES +#define GRAYSCALE_BINARY testing::Bool() +#define MOMENTS_TYPE testing::Values(MatDepth(CV_32F), MatDepth(CV_64F)) +#define IMG_TYPE ALL_DEPTH +#define USE_ROI WHOLE_SUBMAT +#define MOMENTS_ORDER testing::Values(MaxMomentsOrder(MomentsOrder::FIRST_ORDER_MOMENTS), MaxMomentsOrder(MomentsOrder::SECOND_ORDER_MOMENTS), MaxMomentsOrder(MomentsOrder::THIRD_ORDER_MOMENTS)) +INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, Moments, testing::Combine(ALL_DEVICES, SIZES, GRAYSCALE_BINARY, MOMENTS_TYPE, IMG_TYPE, USE_ROI, MOMENTS_ORDER)); +}} // namespace + +#endif // HAVE_CUDA diff --git a/modules/cudalegacy/CMakeLists.txt b/modules/cudalegacy/CMakeLists.txt index 379ea88164a..ecb54de44af 100644 --- a/modules/cudalegacy/CMakeLists.txt +++ b/modules/cudalegacy/CMakeLists.txt @@ -5,6 +5,8 @@ endif() set(the_description "CUDA-accelerated Computer Vision (legacy)") ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4130 /wd4324 /wd4512 /wd4310 -Wundef -Wmissing-declarations -Wuninitialized -Wshadow -Wdeprecated-declarations -Wstrict-aliasing -Wtautological-compare) - +if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE) + ocv_module_include_directories(${CUDAToolkit_INCLUDE_DIRS}) +endif() ocv_define_module(cudalegacy opencv_core opencv_video OPTIONAL opencv_objdetect opencv_imgproc opencv_3d opencv_stereo opencv_calib opencv_cudaarithm opencv_cudafilters opencv_cudaimgproc) diff --git a/modules/cudaoptflow/src/farneback.cpp 
b/modules/cudaoptflow/src/farneback.cpp index 7cc8373f72b..eb82d0c34e4 100644 --- a/modules/cudaoptflow/src/farneback.cpp +++ b/modules/cudaoptflow/src/farneback.cpp @@ -140,6 +140,7 @@ namespace int polyN_; double polySigma_; int flags_; + Event sourceStreamComplete; private: void prepareGaussian( @@ -317,7 +318,10 @@ namespace Stream streams[5]; if (stream) + { streams[0] = stream; + sourceStreamComplete.record(); + } Size size = frame0.size(); GpuMat prevFlowX, prevFlowY, curFlowX, curFlowY; @@ -336,6 +340,8 @@ namespace } frame0.convertTo(frames_[0], CV_32F, streams[0]); + if (stream) + streams[1].waitEvent(sourceStreamComplete); frame1.convertTo(frames_[1], CV_32F, streams[1]); if (fastPyramids_) diff --git a/modules/cudaoptflow/test/test_optflow.cpp b/modules/cudaoptflow/test/test_optflow.cpp index 214e6e48ffe..985143165df 100644 --- a/modules/cudaoptflow/test/test_optflow.cpp +++ b/modules/cudaoptflow/test/test_optflow.cpp @@ -355,6 +355,66 @@ INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, FarnebackOpticalFlow, testing::Combine( testing::Values(FarnebackOptFlowFlags(0), FarnebackOptFlowFlags(cv::OPTFLOW_FARNEBACK_GAUSSIAN)), testing::Values(UseInitFlow(false), UseInitFlow(true)))); + +PARAM_TEST_CASE(FarnebackOpticalFlowAsync, cv::cuda::DeviceInfo, PyrScale, PolyN, FarnebackOptFlowFlags) +{ + cv::cuda::DeviceInfo devInfo; + double pyrScale; + int polyN; + int flags; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + pyrScale = GET_PARAM(1); + polyN = GET_PARAM(2); + flags = GET_PARAM(3); + + cv::cuda::setDevice(devInfo.deviceID()); + } +}; + +CUDA_TEST_P(FarnebackOpticalFlowAsync, Accuracy) +{ + cv::Mat frame0Mat = readImage("opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(frame0Mat.empty()); + + cv::Mat frame1Mat = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(frame1Mat.empty()); + + cv::Ptr farn = cv::cuda::FarnebackOpticalFlow::create(); + farn->setPyrScale(pyrScale); + farn->setPolyN(polyN); + farn->setPolySigma(1.1); + farn->setFlags(flags); + + Stream sourceStream; + HostMem dummyHost(4000, 4000, CV_8UC3), frame0(frame0Mat), frame1(frame1Mat); + GpuMat d_flow, dummyDevice(dummyHost.size(), dummyHost.type()), frame0Device(frame0.size(), frame0.type()), frame1Device(frame1.size(), frame1.type()); + + // initialize and warm up CUDA kernels to ensure this doesn't occur during the test + farn->calc(loadMat(frame0Mat), loadMat(frame1Mat), d_flow); + d_flow.setTo(0); + + frame0Device.upload(frame0, sourceStream); + // place extra work in sourceStream to test internal stream synchronization by delaying the upload of frame1 that stream, see https://github.com/opencv/opencv/issues/24540 + dummyDevice.upload(dummyHost, sourceStream); + frame1Device.upload(frame1, sourceStream); + farn->calc(frame0Device, frame1Device, d_flow, sourceStream); + + Mat flow; + cv::calcOpticalFlowFarneback( + frame0, frame1, flow, farn->getPyrScale(), farn->getNumLevels(), farn->getWinSize(), + farn->getNumIters(), farn->getPolyN(), farn->getPolySigma(), farn->getFlags()); + EXPECT_MAT_SIMILAR(flow, d_flow, 1e-4); +} + +INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, FarnebackOpticalFlowAsync, testing::Combine( + ALL_DEVICES, + testing::Values(PyrScale(0.3)), + testing::Values(PolyN(5)), + testing::Values(FarnebackOptFlowFlags(0)))); + ////////////////////////////////////////////////////// // OpticalFlowDual_TVL1 diff --git a/modules/cudastereo/CMakeLists.txt b/modules/cudastereo/CMakeLists.txt index 3d54f44a9f1..45d54a4bc73 100644 --- 
a/modules/cudastereo/CMakeLists.txt +++ b/modules/cudastereo/CMakeLists.txt @@ -5,5 +5,7 @@ endif() set(the_description "CUDA-accelerated Stereo Correspondence") ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow) - +if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE) + ocv_module_include_directories(${CUDAToolkit_INCLUDE_DIRS}) +endif() ocv_define_module(cudastereo opencv_3d opencv_stereo OPTIONAL opencv_cudev WRAP python) diff --git a/modules/cudastereo/src/cuda/stereobm.cu b/modules/cudastereo/src/cuda/stereobm.cu index 73df35ff63d..afd922c318a 100644 --- a/modules/cudastereo/src/cuda/stereobm.cu +++ b/modules/cudastereo/src/cuda/stereobm.cu @@ -504,7 +504,7 @@ namespace cv { namespace cuda { namespace device CV_Error(cv::Error::StsBadArg, "Unsupported window size"); cudaSafeCall( cudaMemset2DAsync(disp.data, disp.step, 0, disp.cols, disp.rows, stream) ); - cudaSafeCall( cudaMemset2DAsync(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows, stream) ); + cudaSafeCall( cudaMemset2DAsync(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), minSSD_buf.rows, stream) ); size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize(); callers[winsz2](left, right, disp, maxdisp, uniquenessRatio, minSSD_buf.data, minssd_step, left.cols, left.rows, stream); diff --git a/modules/cudawarping/CMakeLists.txt b/modules/cudawarping/CMakeLists.txt index 6370189b75c..67b4a1d6a62 100644 --- a/modules/cudawarping/CMakeLists.txt +++ b/modules/cudawarping/CMakeLists.txt @@ -5,5 +5,8 @@ endif() set(the_description "CUDA-accelerated Image Warping") ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow) - -ocv_define_module(cudawarping opencv_core opencv_imgproc OPTIONAL opencv_cudev WRAP python) +set(extra_libs "") +if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE) + set(extra_libs CUDA::nppial${CUDA_LIB_EXT} CUDA::nppig${CUDA_LIB_EXT}) +endif() +ocv_define_module(cudawarping opencv_core opencv_imgproc ${extra_libs} OPTIONAL opencv_cudev WRAP python) diff --git a/modules/cudev/include/opencv2/cudev/util/atomic.hpp b/modules/cudev/include/opencv2/cudev/util/atomic.hpp index 190e8ee48b3..600f836749c 100644 --- a/modules/cudev/include/opencv2/cudev/util/atomic.hpp +++ b/modules/cudev/include/opencv2/cudev/util/atomic.hpp @@ -83,7 +83,7 @@ __device__ __forceinline__ float atomicAdd(float* address, float val) __device__ static double atomicAdd(double* address, double val) { -#if CV_CUDEV_ARCH >= 130 +#if CV_CUDEV_ARCH < 600 unsigned long long int* address_as_ull = (unsigned long long int*) address; unsigned long long int old = *address_as_ull, assumed; do { @@ -93,9 +93,7 @@ __device__ static double atomicAdd(double* address, double val) } while (assumed != old); return __longlong_as_double(old); #else - CV_UNUSED(address); - CV_UNUSED(val); - return 0.0; + return ::atomicAdd(address, val); #endif } diff --git a/modules/cudev/test/CMakeLists.txt b/modules/cudev/test/CMakeLists.txt index d036daf5372..ff936cad54c 100644 --- a/modules/cudev/test/CMakeLists.txt +++ b/modules/cudev/test/CMakeLists.txt @@ -15,17 +15,23 @@ if(OCV_DEPENDENCIES_FOUND) ocv_cuda_filter_options() - if(CUDA_VERSION VERSION_LESS "11.0") - ocv_update(OPENCV_CUDA_OPTIONS_opencv_test_cudev "-std=c++11") + set(target_libs ${test_deps} ${OPENCV_LINKER_LIBS}) + if(NOT ENABLE_CUDA_FIRST_CLASS_LANGUAGE) + ocv_check_windows_crt_linkage() + set(target_libs ${target_libs} ${CUDA_LIBRARIES}) + if(CUDA_VERSION VERSION_LESS 
"11.0") + ocv_update(OPENCV_CUDA_OPTIONS_opencv_test_cudev "-std=c++11") + else() + ocv_update(OPENCV_CUDA_OPTIONS_opencv_test_cudev "-std=c++14") + ocv_warnings_disable(CMAKE_CXX_FLAGS -Wdeprecated-declarations) + endif() + CUDA_ADD_EXECUTABLE(${the_target} ${OPENCV_TEST_${the_module}_SOURCES} OPTIONS ${OPENCV_CUDA_OPTIONS_opencv_test_cudev}) else() - ocv_update(OPENCV_CUDA_OPTIONS_opencv_test_cudev "-std=c++14") - ocv_warnings_disable(CMAKE_CXX_FLAGS -Wdeprecated-declarations) + ocv_add_executable(${the_target} ${OPENCV_TEST_${the_module}_SOURCES}) endif() - CUDA_ADD_EXECUTABLE(${the_target} ${OPENCV_TEST_${the_module}_SOURCES} OPTIONS ${OPENCV_CUDA_OPTIONS_opencv_test_cudev}) - ocv_target_link_libraries(${the_target} PRIVATE - ${test_deps} ${OPENCV_LINKER_LIBS} ${CUDA_LIBRARIES} - ) + ocv_target_link_libraries(${the_target} PRIVATE ${target_libs}) + add_dependencies(opencv_tests ${the_target}) set_target_properties(${the_target} PROPERTIES LABELS "${OPENCV_MODULE_${the_module}_LABEL}") diff --git a/modules/freetype/include/opencv2/freetype.hpp b/modules/freetype/include/opencv2/freetype.hpp index e62d058a876..90007badd1d 100644 --- a/modules/freetype/include/opencv2/freetype.hpp +++ b/modules/freetype/include/opencv2/freetype.hpp @@ -76,7 +76,7 @@ class CV_EXPORTS_W FreeType2 : public Algorithm public: /** @brief Load font data. -The function loadFontData loads font data. +The function loadFontData loads font data from file. @param fontFileName FontFile Name @param idx face_index to select a font faces in a single file. @@ -84,6 +84,19 @@ The function loadFontData loads font data. CV_WRAP virtual void loadFontData(String fontFileName, int idx) = 0; +/** @brief Load font data. + +The function loadFontData loads font data from memory. +The data is not copied, the user needs to make sure the data lives at least as long as FreeType2. +After the FreeType2 object is destroyed, the buffer can be safely deallocated. + +@param pBuf pointer to buffer containing font data +@param bufSize size of buffer +@param idx face_index to select a font faces in a single file. +*/ + + CV_WRAP virtual void loadFontData(char* pBuf, size_t bufSize, int idx) = 0; + /** @brief Set Split Number from Bezier-curve to line The function setSplitNumber set the number of split points from bezier-curve to line. 
diff --git a/modules/freetype/src/freetype.cpp b/modules/freetype/src/freetype.cpp index b8e605e5104..d8934e361a2 100644 --- a/modules/freetype/src/freetype.cpp +++ b/modules/freetype/src/freetype.cpp @@ -67,6 +67,7 @@ class CV_EXPORTS_W FreeType2Impl CV_FINAL : public FreeType2 FreeType2Impl(); ~FreeType2Impl(); void loadFontData(String fontFileName, int idx) CV_OVERRIDE; + void loadFontData(char* pBuf, size_t bufSize, int idx) CV_OVERRIDE; void setSplitNumber( int num ) CV_OVERRIDE; void putText( InputOutputArray img, const String& text, Point org, @@ -87,6 +88,8 @@ class CV_EXPORTS_W FreeType2Impl CV_FINAL : public FreeType2 int mCtoL; hb_font_t *mHb_font; + void loadFontData(FT_Open_Args &args, int idx); + void putTextBitmapMono( InputOutputArray img, const String& text, Point org, int fontHeight, Scalar color, @@ -179,18 +182,54 @@ FreeType2Impl::~FreeType2Impl() } void FreeType2Impl::loadFontData(String fontFileName, int idx) +{ + FT_Open_Args args + { + FT_OPEN_PATHNAME, + nullptr, // memory_base + 0, // memory_size + const_cast<FT_String*>(fontFileName.c_str()), + nullptr, // stream + nullptr, // driver + 0, // num_params + nullptr // params + }; + + this->loadFontData(args, idx); +} + +void FreeType2Impl::loadFontData(char* pBuf, size_t bufSize, int idx) +{ + CV_Assert( pBuf != nullptr ); + + FT_Open_Args args + { + FT_OPEN_MEMORY, + reinterpret_cast<const FT_Byte*>(pBuf), + static_cast<FT_Long>(bufSize), + nullptr, // pathname + nullptr, // stream + nullptr, // driver + 0, // num_params + nullptr // params + }; + + this->loadFontData(args, idx); +} + +void FreeType2Impl::loadFontData(FT_Open_Args &args, int idx) { CV_Assert( idx >= 0 ); - if( mIsFaceAvailable == true ) + if ( mIsFaceAvailable == true ) { - hb_font_destroy (mHb_font); + hb_font_destroy(mHb_font); CV_Assert(!FT_Done_Face(mFace)); } mIsFaceAvailable = false; - CV_Assert( !FT_New_Face( mLibrary, fontFileName.c_str(), static_cast<int>(idx), &(mFace) ) ); + CV_Assert( !FT_Open_Face(mLibrary, &args, idx, &mFace) ); - mHb_font = hb_ft_font_create (mFace, NULL); + mHb_font = hb_ft_font_create(mFace, NULL); if ( mHb_font == NULL ) { CV_Assert(!FT_Done_Face(mFace)); diff --git a/modules/freetype/test/test_basic.cpp b/modules/freetype/test/test_basic.cpp index 4c4e0c3d7ce..5a646db45f5 100644 --- a/modules/freetype/test/test_basic.cpp +++ b/modules/freetype/test/test_basic.cpp @@ -55,6 +55,39 @@ TEST(Freetype_Basic, success ) EXPECT_NO_THROW( ft2->putText(dst, "Basic,success", Point( 0, 50), 50, col, -1, LINE_AA, true ) ); } +TEST(Freetype_Basic, in_memory_font ) +{ + const string root = cvtest::TS::ptr()->get_data_path(); + const string font_path = root + "freetype/mplus/Mplus1-Regular.ttf"; + + cv::Ptr<cv::freetype::FreeType2> ft2; + EXPECT_NO_THROW( ft2 = cv::freetype::createFreeType2() ); + EXPECT_NO_THROW( ft2->loadFontData( font_path, 0 ) ); + + Mat dst(600,600, CV_8UC3, Scalar::all(255) ); + Scalar col(128,64,255,192); + EXPECT_NO_THROW( ft2->putText(dst, "Basic,success", Point( 0, 50), 50, col, -1, LINE_AA, true ) ); + + FILE* fp = fopen(font_path.c_str(), "rb"); + ASSERT_TRUE(fp != NULL); + fseek(fp, 0, SEEK_END); + const size_t file_size = ftell(fp); + fseek(fp, 0, SEEK_SET); + + std::vector<char> font_buffer(file_size); + const size_t actual_read = fread(&font_buffer[0], 1, file_size, fp); + fclose(fp); + ASSERT_EQ(file_size, actual_read); + + cv::Ptr<cv::freetype::FreeType2> ft2_in_memory; + EXPECT_NO_THROW( ft2_in_memory = cv::freetype::createFreeType2() ); + EXPECT_NO_THROW( ft2_in_memory->loadFontData( &font_buffer[0], file_size, 0 ) ); + Mat dst_in_memory(600,600, CV_8UC3, Scalar::all(255) ); +
EXPECT_NO_THROW( ft2_in_memory->putText(dst_in_memory, "Basic,success", Point( 0, 50), 50, col, -1, LINE_AA, true ) ); + + EXPECT_PRED_FORMAT2(cvtest::MatComparator(0, 0), dst, dst_in_memory); +} + /****************** * loadFontData() *****************/ @@ -105,6 +138,37 @@ TEST(Freetype_loadFontData, call_multiple) EXPECT_NO_THROW( ft2->putText(dst, "call_mutilple", Point( 0, 50), 50, col, -1, LINE_AA, true ) ); } +TEST(Freetype_loadFontDataMemory, nullptr ) +{ + cv::Ptr<cv::freetype::FreeType2> ft2; + EXPECT_NO_THROW( ft2 = cv::freetype::createFreeType2() ); + EXPECT_ANY_THROW( ft2->loadFontData( nullptr, 0, 0 ) ); +} + +TEST(Freetype_loadFontDataMemory, broken_data ) +{ + const string root = cvtest::TS::ptr()->get_data_path(); + const string font_path = root + "freetype/mplus/Mplus1-Regular.ttf"; + + FILE* fp = fopen(font_path.c_str(), "rb"); + ASSERT_TRUE(fp != NULL); + fseek(fp, 0, SEEK_END); + const size_t file_size = ftell(fp); + fseek(fp, 0, SEEK_SET); + + std::vector<char> font_buffer(file_size); + const size_t actual_read = fread(&font_buffer[0], 1, file_size, fp); + fclose(fp); + ASSERT_EQ(file_size, actual_read); + + cv::Ptr<cv::freetype::FreeType2> ft2_in_memory; + EXPECT_NO_THROW( ft2_in_memory = cv::freetype::createFreeType2() ); + + font_buffer[0] = ~font_buffer[0]; // corrupt the font buffer so loading fails. + + EXPECT_ANY_THROW( ft2_in_memory->loadFontData( &font_buffer[0], file_size, 0 ) ); +} + typedef testing::TestWithParam idx_range; TEST_P(idx_range, failed ) diff --git a/modules/hfs/CMakeLists.txt b/modules/hfs/CMakeLists.txt index 69fb0c940cd..c21eaee62a0 100644 --- a/modules/hfs/CMakeLists.txt +++ b/modules/hfs/CMakeLists.txt @@ -1,7 +1,10 @@ if(HAVE_CUDA) add_definitions(-D_HFS_CUDA_ON_) ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) + if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE) + ocv_module_include_directories(${CUDAToolkit_INCLUDE_DIRS}) + endif() endif() set(the_description "Hierarchical Feature Selection for Efficient Image Segmentation") -ocv_define_module(hfs opencv_core opencv_imgproc WRAP python) \ No newline at end of file +ocv_define_module(hfs opencv_core opencv_imgproc OPTIONAL WRAP python) diff --git a/modules/xfeatures2d/include/opencv2/xfeatures2d.hpp b/modules/xfeatures2d/include/opencv2/xfeatures2d.hpp index 0df14995f3d..841784782de 100644 --- a/modules/xfeatures2d/include/opencv2/xfeatures2d.hpp +++ b/modules/xfeatures2d/include/opencv2/xfeatures2d.hpp @@ -1269,7 +1269,7 @@ CV_EXPORTS_W void matchGMS(const Size& size1, const Size& size2, const std::vect */ CV_EXPORTS_W void matchLOGOS(const std::vector<KeyPoint>& keypoints1, const std::vector<KeyPoint>& keypoints2, const std::vector<int>& nn1, const std::vector<int>& nn2, - std::vector<DMatch>& matches1to2); + CV_OUT std::vector<DMatch>& matches1to2); //! 
@} diff --git a/modules/xfeatures2d/misc/python/test/test_descriptors.py b/modules/xfeatures2d/misc/python/test/test_descriptors.py index ca8bbcbc026..7e69311c860 100644 --- a/modules/xfeatures2d/misc/python/test/test_descriptors.py +++ b/modules/xfeatures2d/misc/python/test/test_descriptors.py @@ -19,6 +19,18 @@ def test_create(self): img1 = np.zeros((100, 100, 3), dtype=np.uint8) kp1_ = msd.detect(img1, None) +class matchLOGOS_test(NewOpenCVTests): + + def test_basic(self): + + frame = self.get_sample('python/images/baboon.png', cv.IMREAD_COLOR) + detector = cv.AKAZE_create(threshold = 0.003) + + keypoints1, descrs1 = detector.detectAndCompute(frame, None) + keypoints2, descrs2 = detector.detectAndCompute(frame, None) + matches1to2 = cv.xfeatures2d.matchLOGOS(keypoints1, keypoints2, range(len(keypoints1)), range(len(keypoints2))) + self.assertFalse(matches1to2 is None) + if __name__ == '__main__': NewOpenCVTests.bootstrap() diff --git a/modules/xfeatures2d/src/cuda/surf.cu b/modules/xfeatures2d/src/cuda/surf.cu index b8ef4d627e4..2630cf9e64b 100644 --- a/modules/xfeatures2d/src/cuda/surf.cu +++ b/modules/xfeatures2d/src/cuda/surf.cu @@ -42,7 +42,7 @@ #include "opencv2/opencv_modules.hpp" -#ifdef HAVE_OPENCV_CUDAARITHM +#if defined(HAVE_OPENCV_CUDAARITHM) && defined(OPENCV_ENABLE_NONFREE) #include "opencv2/core/cuda/common.hpp" #include "opencv2/core/cuda/limits.hpp"
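To close this part of the diff: the CV_OUT annotation added to matchLOGOS above only changes the generated Python wrapper (matches1to2 becomes a return value, which the new test_descriptors.py test relies on); the C++ signature is unchanged. A hedged C++ sketch follows, with a placeholder image path and AKAZE settings borrowed from the Python test.

#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/features2d.hpp>
#include <opencv2/xfeatures2d.hpp>
#include <numeric>
#include <vector>

int main()
{
    cv::Mat img = cv::imread("baboon.png", cv::IMREAD_COLOR);  // placeholder image path
    if (img.empty()) return 1;

    cv::Ptr<cv::AKAZE> akaze = cv::AKAZE::create();
    akaze->setThreshold(0.003);

    std::vector<cv::KeyPoint> kpts1, kpts2;
    cv::Mat desc1, desc2;
    akaze->detectAndCompute(img, cv::noArray(), kpts1, desc1);
    akaze->detectAndCompute(img, cv::noArray(), kpts2, desc2);

    // matchLOGOS expects precomputed nearest-neighbour indices; the identity
    // mapping below mirrors the range(len(...)) arguments in the Python test.
    std::vector<int> nn1(kpts1.size()), nn2(kpts2.size());
    std::iota(nn1.begin(), nn1.end(), 0);
    std::iota(nn2.begin(), nn2.end(), 0);

    std::vector<cv::DMatch> matches1to2;
    cv::xfeatures2d::matchLOGOS(kpts1, kpts2, nn1, nn2, matches1to2);
    return matches1to2.empty() ? 1 : 0;
}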