diff --git a/.github/workflows/PR-4.x.yaml b/.github/workflows/PR-4.x.yaml index 250b5e157a6..f33cc37d5d5 100644 --- a/.github/workflows/PR-4.x.yaml +++ b/.github/workflows/PR-4.x.yaml @@ -26,3 +26,10 @@ jobs: macOS-X64: uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-4.x-macOS-x86_64.yaml@main + + Linux-RISC-V-Clang: + uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-4.x-RISCV.yaml@main + + openEuler2203-x64: + if: "${{ contains(github.event.pull_request.labels.*.name, 'category: cann') }}" + uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-4.x-O22-CANN.yaml@main diff --git a/modules/aruco/src/charuco.cpp b/modules/aruco/src/charuco.cpp index acf691a28b0..42dbd0d4a05 100644 --- a/modules/aruco/src/charuco.cpp +++ b/modules/aruco/src/charuco.cpp @@ -39,7 +39,7 @@ void detectCharucoDiamond(InputArray _image, InputArrayOfArrays _markerCorners, vector markerCorners; _markerCorners.getMatVector(markerCorners); - detector.detectBoard(_image, _diamondCorners, _diamondIds, markerCorners, _markerIds.getMat()); + detector.detectDiamonds(_image, _diamondCorners, _diamondIds, markerCorners, _markerIds.getMat()); } diff --git a/modules/cannops/CMakeLists.txt b/modules/cannops/CMakeLists.txt new file mode 100644 index 00000000000..0c16c5eb143 --- /dev/null +++ b/modules/cannops/CMakeLists.txt @@ -0,0 +1,17 @@ + if(IOS OR WINRT OR ANDROID OR APPLE OR WIN32 OR (NOT HAVE_CANN)) + ocv_module_disable(cannops) + endif() + +set(the_description "Ascend-accelerated Operations on Matrices") + +ocv_add_module(cannops opencv_core WRAP python) +ocv_module_include_directories(${CANN_INCLUDE_DIRS}) +ocv_glob_module_sources() +ocv_install_used_external_targets(${CANN_LIBRARIES}) +ocv_create_module(${CANN_LIBRARIES}) + +ocv_include_directories(${CMAKE_SOURCE_DIR}/modules/ts/include) + +ocv_add_accuracy_tests(DEPENDS_ON opencv_cannops) +ocv_add_perf_tests(DEPENDS_ON opencv_cannops) +ocv_add_samples(opencv_cannops) diff --git a/modules/cannops/Dockerfile 
b/modules/cannops/Dockerfile new file mode 100644 index 00000000000..939999eed4f --- /dev/null +++ b/modules/cannops/Dockerfile @@ -0,0 +1,67 @@ +# User guides +# +# 0. Install Ascend driver on host. +# (https://www.hiascend.com/en/hardware/firmware-drivers) +# +# 1. Run docker container. +# docker run -it \ +# --name opencv \ +# --device /dev/davinci0 \ +# --device /dev/davinci_manager \ +# --device /dev/devmm_svm \ +# --device /dev/hisi_hdc \ +# -v /usr/local/dcmi:/usr/local/dcmi \ +# -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +# -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +# opencv bash +# +# 2. Check environment. +# npu-smi info +# +# 3. Compile opencv with Ascend NPU backend. +# cmake -DWITH_CANN=1 +# +# 4. Run opencv_test_cannops. +# ./bin/opencv_test_cannops + +FROM openeuler/openeuler:22.03-lts-sp2 + +RUN yum install -y \ + git \ + wget \ + gcc \ + g++ \ + cmake \ + make \ + python-pip \ + python3-devel + +RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple \ + numpy \ + sympy \ + decorator \ + scipy \ + attrs \ + psutil + +# Install CANN +RUN wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%207.0.RC1/Ascend-cann-toolkit_7.0.RC1_linux-"$(uname -i)".run && \ + chmod +x Ascend-cann-toolkit_7.0.RC1_linux-"$(uname -i)".run && \ + ./Ascend-cann-toolkit_7.0.RC1_linux-"$(uname -i)".run --quiet --install && \ + rm -f ./Ascend-cann-toolkit_7.0.RC1_linux-"$(uname -i)".run + +# Install kernel +RUN wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%207.0.RC1/Ascend-cann-kernels-310p_7.0.RC1_linux.run && \ + chmod +x Ascend-cann-kernels-310p_7.0.RC1_linux.run && \ + ./Ascend-cann-kernels-310p_7.0.RC1_linux.run --quiet --install && \ + rm -f ./Ascend-cann-kernels-310p_7.0.RC1_linux.run + +ENV LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH:/usr/lib64 +ENV 
ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest +ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:$LD_LIBRARY_PATH +ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:$PYTHONPATH +ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:$PATH +ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME} +ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp +ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit +ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME} diff --git a/modules/cannops/include/opencv2/cann.hpp b/modules/cannops/include/opencv2/cann.hpp new file mode 100644 index 00000000000..30555dd8257 --- /dev/null +++ b/modules/cannops/include/opencv2/cann.hpp @@ -0,0 +1,328 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_CANNOPS_CANN_HPP +#define OPENCV_CANNOPS_CANN_HPP + +#include "opencv2/core.hpp" + +/** + @defgroup cann Ascend-accelerated Computer Vision + @{ + @defgroup canncore Core part + @{ + @defgroup cann_struct Data Structures + @defgroup cann_init Initializeation and Information + @} + @} + */ + +namespace cv +{ +namespace cann +{ +class AscendStream; + +//! @addtogroup cann_struct +//! @{ + +//=================================================================================== +// AscendMat +//=================================================================================== + +/** @brief Base storage class for NPU memory with reference counting. + * AscendMat class has a similar interface with Mat and AscendMat, and work on [Ascend + * NPU](https://www.hiascend.com/) backend. 
+ * @sa Mat cuda::GpuMat + */ +class AscendStream; +class CV_EXPORTS_W AscendMat +{ +public: + class CV_EXPORTS_W Allocator + { + public: + virtual ~Allocator() {} + // basic allocator + virtual std::shared_ptr allocate(size_t size) = 0; + // allocator must fill data, step and refcount fields + virtual bool allocate(AscendMat* mat, int rows, int cols, size_t elemSize) = 0; + }; + + /** + * @brief Create default allocator for AscendMat. This allocator alloc memory from device for + * specific size. + */ + CV_WRAP static AscendMat::Allocator* defaultAllocator(); + + /** + * @brief Set allocator for AscendMat. + * @param allocator + */ + CV_WRAP static void setDefaultAllocator(AscendMat::Allocator* allocator); + + //! default constructor + CV_WRAP explicit AscendMat(AscendMat::Allocator* allocator_ = AscendMat::defaultAllocator()); + + //! constructs AscendMat of the specified size and type + CV_WRAP AscendMat(int rows, int cols, int type, + AscendMat::Allocator* allocator = AscendMat::defaultAllocator()); + //! constructs AscendMat of the specified size and type + CV_WRAP AscendMat(Size size, int type, + AscendMat::Allocator* allocator = AscendMat::defaultAllocator()); + + //! constructs AscendMat and fills it with the specified value s + CV_WRAP AscendMat(int rows, int cols, int type, Scalar& s, + AscendMat::Allocator* allocator = AscendMat::defaultAllocator()); + //! constructs AscendMat and fills it with the specified value s + CV_WRAP AscendMat(Size size, int type, Scalar& s, + AscendMat::Allocator* allocator = AscendMat::defaultAllocator()); + + //! copy constructor + CV_WRAP AscendMat(const AscendMat& m); + + //! constructs AscendMat by crop a certain area from another + CV_WRAP AscendMat(InputArray _m, const Rect& roi); + CV_WRAP AscendMat(InputArray _m, const Rect& roi, AscendStream& stream); + + //! 
builds AscendMat from host memory (Blocking call) + CV_WRAP explicit AscendMat(InputArray arr, AscendStream& stream, + AscendMat::Allocator* allocator = AscendMat::defaultAllocator()); + + //! assignment operators + AscendMat& operator=(const AscendMat& m); + + //! sets some of the AscendMat elements to s (Blocking call) + CV_WRAP AscendMat& setTo(const Scalar& s); + //! sets some of the AscendMat elements to s (Non-Blocking call) + CV_WRAP AscendMat& setTo(const Scalar& s, AscendStream& stream); + + //! sets all of the AscendMat elements to float (Blocking call) + CV_WRAP AscendMat& setTo(float sc); + + //! sets all of the AscendMat elements to float (Non-Blocking call) + CV_WRAP AscendMat& setTo(float sc, AscendStream& stream); + + //! swaps with other smart pointer + CV_WRAP void swap(AscendMat& mat); + + //! allocates new AscendMat data unless the AscendMat already has specified size and type + CV_WRAP void create(int rows, int cols, int type); + + //! upload host memory data to AscendMat (Blocking call) + CV_WRAP void upload(InputArray arr); + //! upload host memory data to AscendMat (Non-Blocking call) + CV_WRAP void upload(InputArray arr, AscendStream& stream); + + //! download data from AscendMat to host (Blocking call) + CV_WRAP void download(OutputArray dst) const; + //! download data from AscendMat to host (Non-Blocking call) + CV_WRAP void download(OutputArray dst, AscendStream& stream) const; + + //! converts AscendMat to another datatype (Blocking call) + CV_WRAP void convertTo(CV_OUT AscendMat& dst, int rtype) const; + + //! converts AscendMat to another datatype (Non-Blocking call) + CV_WRAP void convertTo(CV_OUT AscendMat& dst, int rtype, AscendStream& stream) const; + + //! converts AscendMat to another datatype, dst mat is allocated. (Non-Blocking call) + CV_WRAP void convertTo(CV_OUT AscendMat& dst, AscendStream& stream) const; + + //! returns true iff the AscendMat data is continuous + //! (i.e. 
when there are no gaps between successive rows) + CV_WRAP bool isContinuous() const; + + //! returns element size in bytes + CV_WRAP size_t elemSize() const; + + //! returns the size of element channel in bytes + CV_WRAP size_t elemSize1() const; + + //! returns element type + CV_WRAP int type() const; + + //! returns element type + CV_WRAP int depth() const; + + //! returns number of channels + CV_WRAP int channels() const; + + //! returns step/elemSize1() + CV_WRAP size_t step1() const; + + //! returns AscendMat size : width == number of columns, height == number of rows + CV_WRAP Size size() const; + + //! returns true if AscendMat data is NULL + CV_WRAP bool empty() const; + + //! internal use method: updates the continuity flag + CV_WRAP void updateContinuityFlag(); + + /*! includes several bit-fields: + - the magic signature + - continuity flag + - depth + - number of channels + */ + int flags; + + //! the number of rows and columns + int rows, cols; + + //! a distance between successive rows in bytes; includes the gap if any + CV_PROP size_t step; + + //! pointer to the data + std::shared_ptr data; + + //! helper fields used in locateROI and adjustROI + uchar* datastart; + const uchar* dataend; + + //! allocator + Allocator* allocator; +}; + +class AscendStream; +class AscendStreamAccessor; +class AscendEvent; +class AscendEventAccessor; +class DefaultDeviceInitializer; + +//=================================================================================== +// AscendStream +//=================================================================================== + +/** @brief In AscendCL Stream(AscendStream) is a task queue. Stream is used to manage the + * parallelism of tasks. The tasks inside a Stream are executed sequentially, that is, the Stream + * executes sequentially according to the sent tasks; the tasks in different Streams are executed in + * parallel. 
+ * + * All Non-blocking functions should pass parameter stream, These function returns immediately after + * the task is submitted. Caller should wait stream until completion. + * + * Blocking functions implicityly use the default stream, and synchronize stream before function + * return. + * @sa cuda::Stream + */ + +// TODO: Stream is defined in namespace cuda, and pybind code does not use a namespace of stream, +// change stream name to AscendStream to avoid confilct. +class CV_EXPORTS_W AscendStream +{ +public: + CV_WRAP AscendStream(); + + //! blocks the current CPU thread until all operations in the stream are complete. + CV_WRAP void waitForCompletion(); + + //! blocks the current CPU thread until event trigger. + CV_WRAP void waitAscendEvent(const cv::cann::AscendEvent& event); + + /** + * @brief return default AscendStream object for default Acl stream. + */ + CV_WRAP static AscendStream& Null(); + + // acl symbols CANNOT used in any hpp files. Use a inner class to avoid acl symbols defined in + // hpp. + class Impl; + + void addTensorHolder(const std::shared_ptr& holder); + +private: + Ptr impl_; + AscendStream(const Ptr& impl); + + friend class AscendStreamAccessor; + friend class DefaultDeviceInitializer; +}; + +/** + * @brief AscendEvent to synchronize between different streams. + */ +class CV_EXPORTS_W AscendEvent +{ +public: + CV_WRAP AscendEvent(); + + //! records an event + CV_WRAP void record(AscendStream& stream); + + //! waits for an event to complete + CV_WRAP void waitForComplete() const; + + class Impl; + +private: + Ptr impl_; + AscendEvent(const Ptr& impl); + + friend class AscendEventAccessor; +}; + +/** @brief Bindings overload to create a Stream object from the address stored in an existing CANN + * Runtime API stream pointer (aclrtStream). + * @param AscendStreamAddress Memory address stored in a CANN Runtime API stream pointer + * (aclrtStream). 
The created Stream object does not perform any allocation or deallocation and + * simply wraps existing raw CANN Runtime API stream pointer. + * @note Overload for generation of bindings only, not exported or intended for use internally fro + * C++. + */ +CV_EXPORTS_W AscendStream wrapStream(size_t AscendStreamAddress); + +//! @} cann_struct + +//=================================================================================== +// Initialization & Info +//=================================================================================== + +//! @addtogroup cann_init +//! @{ + +//! Get Ascend matrix object from Input array, upload matrix memory if need. (Non-Blocking call) +AscendMat getInputMat(InputArray src, AscendStream& stream); + +//! Get Ascend matrix object from Output array, upload matrix memory if need. +AscendMat getOutputMat(OutputArray dst, int rows, int cols, int type, AscendStream& stream); + +//! Sync output matrix to Output array, download matrix memory if need. +void syncOutput(const AscendMat& dst, OutputArray _dst, AscendStream& stream); + +/** + * @brief Choose Ascend npu device. + */ +CV_EXPORTS_W void setDevice(int device); + +/** + * @brief Clear all context created in current Ascend device. + */ +CV_EXPORTS_W void resetDevice(); + +/** + * @brief Get current Ascend device. + */ +CV_EXPORTS_W int32_t getDevice(); + +/** + * @brief init AscendCL. + */ +CV_EXPORTS_W void initAcl(); + +/** + * @brief finalize AscendCL. + * @note finalizeAcl only can be called once for a process. Call this function after all AscendCL + * options finished. + */ +CV_EXPORTS_W void finalizeAcl(); + +//! 
@} cann_init + +} // namespace cann +} // namespace cv + +#include "opencv2/cann.inl.hpp" + +#endif // OPENCV_CANNOPS_CANN_HPP diff --git a/modules/cannops/include/opencv2/cann.inl.hpp b/modules/cannops/include/opencv2/cann.inl.hpp new file mode 100644 index 00000000000..4a97466b375 --- /dev/null +++ b/modules/cannops/include/opencv2/cann.inl.hpp @@ -0,0 +1,97 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_CANNOPS_CANN_INL_HPP +#define OPENCV_CANNOPS_CANN_INL_HPP + +#include "opencv2/cann.hpp" + +namespace cv +{ +namespace cann +{ +inline AscendMat::AscendMat(AscendMat::Allocator* allocator_) + : flags(0), rows(0), cols(0), step(0), datastart(0), dataend(0), + allocator(allocator_) +{ + // Empty mat is also continuous. + flags |= Mat::CONTINUOUS_FLAG; +} + +inline AscendMat::AscendMat(int rows_, int cols_, int type_, AscendMat::Allocator* allocator_) + : flags(0), rows(0), cols(0), step(0), datastart(0), dataend(0), + allocator(allocator_) +{ + if (rows_ > 0 && cols_ > 0) + create(rows_, cols_, type_); +} + +inline AscendMat::AscendMat(Size size_, int type_, AscendMat::Allocator* allocator_) + : flags(0), rows(0), cols(0), step(0), datastart(0), dataend(0), + allocator(allocator_) +{ + if (size_.height > 0 && size_.width > 0) + create(size_.height, size_.width, type_); +} + +inline AscendMat::AscendMat(InputArray arr, AscendStream& stream, AscendMat::Allocator* allocator_) + : flags(0), rows(0), cols(0), step(0), datastart(0), dataend(0), + allocator(allocator_) +{ + upload(arr, stream); +} + +inline AscendMat::AscendMat(const AscendMat& m) + : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), + datastart(m.datastart), dataend(m.dataend), allocator(m.allocator) +{} + +inline AscendMat& AscendMat::operator=(const AscendMat& m) +{ + if (this != &m) + { + AscendMat temp(m); + 
swap(temp); + } + + return *this; +} + +inline void AscendMat::swap(AscendMat& b) +{ + std::swap(flags, b.flags); + std::swap(rows, b.rows); + std::swap(cols, b.cols); + std::swap(step, b.step); + std::swap(data, b.data); + std::swap(datastart, b.datastart); + std::swap(dataend, b.dataend); + std::swap(allocator, b.allocator); +} + +inline bool AscendMat::isContinuous() const { return (flags & Mat::CONTINUOUS_FLAG) != 0; } + +inline size_t AscendMat::elemSize() const { return CV_ELEM_SIZE(flags); } + +inline size_t AscendMat::elemSize1() const { return CV_ELEM_SIZE1(flags); } + +inline int AscendMat::type() const { return CV_MAT_TYPE(flags); } + +inline int AscendMat::depth() const { return CV_MAT_DEPTH(flags); } + +inline int AscendMat::channels() const { return CV_MAT_CN(flags); } + +inline size_t AscendMat::step1() const { return step / elemSize1(); } + +inline Size AscendMat::size() const { return Size(cols, rows); } + +inline bool AscendMat::empty() const { return data == 0; } + +inline AscendStream::AscendStream(const Ptr& impl) : impl_(impl) {} + +inline AscendEvent::AscendEvent(const Ptr& impl) : impl_(impl) {} +} // namespace cann +} // namespace cv + +#endif // OPENCV_CANNOPS_CANN_INL_HPP diff --git a/modules/cannops/include/opencv2/cann_call.hpp b/modules/cannops/include/opencv2/cann_call.hpp new file mode 100644 index 00000000000..651bff8bba0 --- /dev/null +++ b/modules/cannops/include/opencv2/cann_call.hpp @@ -0,0 +1,157 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#ifndef OPENCV_CANNOPS_CANN_CALL_HPP +#define OPENCV_CANNOPS_CANN_CALL_HPP + +#include +#include +#include +#include +#include "opencv2/cann.hpp" + +class aclopAttr; + +namespace cv +{ +namespace cann +{ +// Warpper for functions in CANN, callers should not call CANN's api directly, but should call the +// function provided in cann_call. +void aclrtMallocWarpper(void** data, size_t size); +void aclrtFreeWarpper(void* data); +void aclrtMemcpyWarpper(std::shared_ptr& dst, size_t offset, const void* src, size_t size, + AscendStream& stream); +void aclrtMemcpyWarpper(void* dst, const std::shared_ptr& src, size_t offset, size_t size, + AscendStream& stream); +void aclrtMemcpyWarpper(std::shared_ptr& dst, size_t dstOffset, + const std::shared_ptr& src, size_t srcOffset, size_t size, + AscendStream& stream); +void aclrtMemcpy2dWarpper(std::shared_ptr& dst, size_t offset, size_t dpitch, + const void* src, size_t spitch, size_t width, size_t length, + AscendStream& stream); +void aclrtMemcpy2dWarpper(void* dst, size_t dpitch, const std::shared_ptr& src, + size_t offset, size_t spitch, size_t width, size_t length, + AscendStream& stream); +void aclrtMemsetWarpper(std::shared_ptr& ptr, int32_t value, size_t count, + AscendStream& stream); +//! Type mapping between opencv and cann. +aclDataType getACLType(int opencvdepth); +//! Malloc and upload raw data to devices. +std::shared_ptr mallocAndUpload(const void* data, size_t size, AscendStream& stream, + AscendMat::Allocator* allocator); +/** + * @brief Warpper of CANN streams. + */ +class AscendStream::Impl +{ +public: + aclrtStream stream; + bool ownStream; + /** + * @brief Ascend and CANN use stream to implement asynchronous calls. Which means when function + * returns, operator may not finish, even not start. If caller free any tensors that participate + * in this operatation, it have a chance to access invalid memory. 
+ * All tensors should add to holder, holder will be cleaned by waitForCompletion function, or when + * the stream is destructing. + */ + std::set> tensorHolders; + Impl(); + explicit Impl(aclrtStream stream); + void AddTensorHolder(const std::shared_ptr& tensorData); +}; + +/** + * @brief Warpper of CANN event. + */ +class AscendEvent::Impl +{ +public: + aclrtEvent event; + bool ownEvent; + + Impl(); + explicit Impl(aclrtEvent event); + ~Impl(); +}; + +/** + * @brief Parameter type for call_call interfaces. + */ +struct AscendTensor +{ + const char* name; + std::shared_ptr data; + size_t dataSize; + std::vector dims; + aclDataType dtype; + aclFormat format; + AscendTensor(){}; + AscendTensor(std::shared_ptr _data, size_t _dataSize, int64_t* _dims, size_t _dimSize, + aclDataType _dtype, const char* _name = "", aclFormat _format = ACL_FORMAT_ND); + AscendTensor(std::shared_ptr _data, size_t _dataSize, std::vector& _dims, + aclDataType _dtype, const char* _name = "", aclFormat _format = ACL_FORMAT_ND) + : name(_name), data(_data), dataSize(_dataSize), dims(_dims), dtype(_dtype), + format(_format){}; + AscendTensor(const AscendMat& ascendMat, const char* _name = "", + aclFormat format = ACL_FORMAT_ND); +}; + +/** + * @brief Interface to call operators in CANN package. 
+ */ +class OperatorRunner +{ +private: + std::vector inputBuffers_; + std::vector outputBuffers_; + std::vector inputDesc_; + std::vector outputDesc_; + aclopAttr* opAttr_; + bool opAttrInit; + std::string op; + + std::set> holder; + + OperatorRunner& addInput(AscendTensor& mat); + OperatorRunner& addOutput(AscendTensor& mat); + +public: + OperatorRunner() : opAttrInit(false) {} + virtual ~OperatorRunner() { reset(); } + OperatorRunner& setOp(const char* op); + OperatorRunner& addInput(const AscendMat& mat); + OperatorRunner& addOutput(AscendMat& mat); + OperatorRunner& addAttr(float value, const char* name); + OperatorRunner& addAttr(const char* value, const char* name); + OperatorRunner& addAttr(int value, const char* name); + OperatorRunner& addAttr(bool value, const char* name); + OperatorRunner& addAttr(const int64_t* value, int size, const char* name); + OperatorRunner& addInput(const AscendMat& mat, const char* name); + OperatorRunner& addInput(const Scalar& sc, int type, const char* name); + + template + OperatorRunner& addInput(const T* value, int64_t* dims, size_t dimSize, aclDataType type, + const char* name) + { + int64_t size = dims[0]; + for (size_t i = 1; i < dimSize; i++) + size *= dims[i]; + + size_t dataSize = size * sizeof(T); + std::shared_ptr ptr = + mallocAndUpload(value, dataSize, AscendStream::Null(), AscendMat::defaultAllocator()); + + AscendTensor tensor(ptr, dataSize, dims, dimSize, type, name); + return addInput(tensor); + } + OperatorRunner& addOutput(AscendMat& mat, const char* name); + OperatorRunner& reset(); + OperatorRunner& run(AscendStream& stream); +}; + +} // namespace cann +} // namespace cv + +#endif // OPENCV_CANNOPS_CANN_CALL_HPP diff --git a/modules/cannops/include/opencv2/cann_interface.hpp b/modules/cannops/include/opencv2/cann_interface.hpp new file mode 100644 index 00000000000..6667eb58519 --- /dev/null +++ b/modules/cannops/include/opencv2/cann_interface.hpp @@ -0,0 +1,516 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_CANNOPS_CANN_INTERFACE_HPP +#define OPENCV_CANNOPS_CANN_INTERFACE_HPP + +#include "opencv2/cann.hpp" + +namespace cv +{ +namespace cann +{ + +/** + @addtogroup cann + @{ + @defgroup cannops Operations for Ascend Backend. + @{ + @defgroup cannops_elem Per-element Operations + @defgroup cannops_core Core Operations on Matrices + @defgroup cannimgproc Image Processing + @} + @} + */ + +//! @addtogroup cannops_elem +//! @{ + +/** @brief Computes a matrix-matrix or matrix-scalar sum. + * @param src1 First source matrix or scalar. + * @param src2 Second source matrix or scalar. Matrix should have the same size and type as src1 . + * @param dst Destination matrix that has the same size and number of channels as the input + * array(s). The depth is defined by dtype or src1 depth. + * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the + * destination array to be changed. The mask can be used only with single channel images. + * @param dtype Optional depth of the output array. + * @param stream AscendStream for the asynchronous version. + * @sa cv::add cuda::add + */ +CV_EXPORTS_W void add(const InputArray src1, const InputArray src2, OutputArray dst, + const InputArray mask = noArray(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); +// This code should not be compiled nor analyzed by doxygen. This interface only for python binding +// code generation. add(InputArray, InputArray ...) 
can accept Scalar as its parametr.(Scalar -> Mat +// -> InputArray) +#ifdef NEVER_DEFINED +CV_EXPORTS_W void add(const InputArray src1, const Scalar& src2, OutputArray dst, + const InputArray mask = noArray(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); +CV_EXPORTS_W void add(const Scalar& src1, const InputArray src2, OutputArray dst, + const InputArray mask = noArray(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); +#endif +// More overload functions. In order to decouple from the main opencv repository and simplify +// user calling methods, besides the traditional Input/OutputArray parameters, some +// overloaded functions for the AcendMat parameter is also provided. +/** @overload */ +CV_EXPORTS_W void add(const AscendMat& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void add(const AscendMat& src1, const Scalar& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void add(const Scalar& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); + +/** @brief Computes a matrix-matrix or matrix-scalar difference. + * @param src1 First source matrix or scalar. + * @param src2 Second source matrix or scalar. Matrix should have the same size and type as src1 . + * @param dst Destination matrix that has the same size and number of channels as the input + * array(s). The depth is defined by dtype or src1 depth. + * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the + * destination array to be changed. The mask can be used only with single channel images. + * @param dtype Optional depth of the output array. 
+ * @param stream AscendStream for the asynchronous version. + * @sa cv::subtract cuda::subtract + */ +CV_EXPORTS_W void subtract(const InputArray src1, const InputArray src2, OutputArray dst, + const InputArray mask = noArray(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); +#ifdef NEVER_DEFINED +CV_EXPORTS_W void subtract(const InputArray src1, const Scalar& src2, OutputArray dst, + const InputArray mask = noArray(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); +CV_EXPORTS_W void subtract(const Scalar& src1, const InputArray src2, OutputArray dst, + const InputArray mask = noArray(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); +#endif +/** @overload */ +CV_EXPORTS_W void subtract(const AscendMat& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void subtract(const AscendMat& src1, const Scalar& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void subtract(const Scalar& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + const AscendMat& mask = AscendMat(), int dtype = -1, + AscendStream& stream = AscendStream::Null()); + +/** @brief Computes a matrix-matrix or matrix-scalar per-element product. + * @param src1 First source matrix or scalar. + * @param src2 Second source matrix or scalar. Matrix should have the same size and type as src1 . + * @param dst Destination matrix that has the same size and number of channels as the input + * array(s). The depth is defined by dtype or src1 depth. + * @param scale Optional scale factor. + * @param dtype Optional depth of the output array. + * @param stream AscendStream for the asynchronous version. 
+ * @sa cv::multiply cuda::multiply + */ +CV_EXPORTS_W void multiply(const InputArray src1, const InputArray src2, OutputArray dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +#ifdef NEVER_DEFINED +CV_EXPORTS_W void multiply(const InputArray src1, const Scalar& src2, OutputArray dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +CV_EXPORTS_W void multiply(const Scalar& src1, const InputArray src2, OutputArray dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +#endif +/** @overload */ +CV_EXPORTS_W void multiply(const AscendMat& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void multiply(const AscendMat& src1, const Scalar& src2, CV_OUT AscendMat& dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void multiply(const Scalar& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); + +/** @brief Computes a matrix-matrix or matrix-scalar division. + * @param src1 First source matrix or scalar. + * @param src2 Second source matrix or scalar. Matrix should have the same size and type as src1 . + * @param dst Destination matrix that has the same size and number of channels as the input + * array(s). The depth is defined by dtype or src1 depth. + * @param scale Optional scale factor. + * @param dtype Optional depth of the output array. + * @param stream AscendStream for the asynchronous version. 
+ * @sa cv::divide cuda::divide + */ +CV_EXPORTS_W void divide(const InputArray src1, const InputArray src2, OutputArray dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +#ifdef NEVER_DEFINED +CV_EXPORTS_W void divide(const InputArray src1, const Scalar& src2, OutputArray dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +CV_EXPORTS_W void divide(const Scalar& src1, const InputArray src2, OutputArray dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +#endif +CV_EXPORTS_W void divide(const AscendMat& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void divide(const AscendMat& src1, const Scalar& src2, CV_OUT AscendMat& dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void divide(const Scalar& src1, const AscendMat& src2, CV_OUT AscendMat& dst, + float scale = 1, int dtype = -1, + AscendStream& stream = AscendStream::Null()); + +/** @brief Performs a per-element bitwise conjunction of two matrices (or of matrix and scalar). + * @param src1 First source matrix or scalar. + * @param src2 Second source matrix or scalar. + * @param dst Destination matrix that has the same size and number of channels as the input + * array(s). The depth is defined by dtype or src1 depth. + * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the + * destination array to be changed. The mask can be used only with single channel images. + * @param stream AscendStream for the asynchronous version. 
 * @sa cv::bitwise_and cuda::bitwise_and
 */
CV_EXPORTS_W void bitwise_and(const InputArray src1, const InputArray src2, OutputArray dst,
                              const InputArray mask = noArray(),
                              AscendStream& stream = AscendStream::Null());
// Guarded by NEVER_DEFINED (never compiled); presumably present only for
// documentation/binding generators -- TODO confirm.
#ifdef NEVER_DEFINED
CV_EXPORTS_W void bitwise_and(const InputArray src1, const Scalar& src2, OutputArray dst,
                              const InputArray mask = noArray(),
                              AscendStream& stream = AscendStream::Null());
CV_EXPORTS_W void bitwise_and(const Scalar& src1, const InputArray src2, OutputArray dst,
                              const InputArray mask = noArray(),
                              AscendStream& stream = AscendStream::Null());
#endif
/** @overload */
CV_EXPORTS_W void bitwise_and(const AscendMat& src1, const AscendMat& src2, CV_OUT AscendMat& dst,
                              const AscendMat& mask = AscendMat(),
                              AscendStream& stream = AscendStream::Null());
/** @overload */
CV_EXPORTS_W void bitwise_and(const AscendMat& src1, const Scalar& src2, CV_OUT AscendMat& dst,
                              const AscendMat& mask = AscendMat(),
                              AscendStream& stream = AscendStream::Null());
/** @overload */
CV_EXPORTS_W void bitwise_and(const Scalar& src1, const AscendMat& src2, CV_OUT AscendMat& dst,
                              const AscendMat& mask = AscendMat(),
                              AscendStream& stream = AscendStream::Null());

/** @brief Performs a per-element bitwise disjunction of two matrices (or of matrix and scalar).
 * @param src1 First source matrix or scalar.
 * @param src2 Second source matrix or scalar.
 * @param dst Destination matrix that has the same size and number of channels as the input
 * array(s). The depth is defined by dtype or src1 depth.
 * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
 * destination array to be changed. The mask can be used only with single channel images.
 * @param stream AscendStream for the asynchronous version.
 * @sa cv::bitwise_or cuda::bitwise_or
 */
CV_EXPORTS_W void bitwise_or(const InputArray src1, const InputArray src2, OutputArray dst,
                             const InputArray mask = noArray(),
                             AscendStream& stream = AscendStream::Null());
#ifdef NEVER_DEFINED
CV_EXPORTS_W void bitwise_or(const InputArray src1, const Scalar& src2, OutputArray dst,
                             const InputArray mask = noArray(),
                             AscendStream& stream = AscendStream::Null());
CV_EXPORTS_W void bitwise_or(const Scalar& src1, const InputArray src2, OutputArray dst,
                             const InputArray mask = noArray(),
                             AscendStream& stream = AscendStream::Null());
#endif
/** @overload */
CV_EXPORTS_W void bitwise_or(const AscendMat& src1, const AscendMat& src2, CV_OUT AscendMat& dst,
                             const AscendMat& mask = AscendMat(),
                             AscendStream& stream = AscendStream::Null());
/** @overload */
CV_EXPORTS_W void bitwise_or(const AscendMat& src1, const Scalar& src2, CV_OUT AscendMat& dst,
                             const AscendMat& mask = AscendMat(),
                             AscendStream& stream = AscendStream::Null());
/** @overload */
CV_EXPORTS_W void bitwise_or(const Scalar& src1, const AscendMat& src2, CV_OUT AscendMat& dst,
                             const AscendMat& mask = AscendMat(),
                             AscendStream& stream = AscendStream::Null());

/** @brief Performs a per-element bitwise exclusive or operation of two matrices (or of matrix and
 * scalar).
 * @param src1 First source matrix or scalar.
 * @param src2 Second source matrix or scalar.
 * @param dst Destination matrix that has the same size and number of channels as the input
 * array(s). The depth is defined by dtype or src1 depth.
 * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
 * destination array to be changed. The mask can be used only with single channel images.
 * @param stream AscendStream for the asynchronous version.
 * @sa cv::bitwise_xor cuda::bitwise_xor
 */
CV_EXPORTS_W void bitwise_xor(const InputArray src1, const InputArray src2, OutputArray dst,
                              const InputArray mask = noArray(),
                              AscendStream& stream = AscendStream::Null());
#ifdef NEVER_DEFINED
CV_EXPORTS_W void bitwise_xor(const InputArray src1, const Scalar& src2, OutputArray dst,
                              const InputArray mask = noArray(),
                              AscendStream& stream = AscendStream::Null());
CV_EXPORTS_W void bitwise_xor(const Scalar& src1, const InputArray src2, OutputArray dst,
                              const InputArray mask = noArray(),
                              AscendStream& stream = AscendStream::Null());
#endif
/** @overload */
CV_EXPORTS_W void bitwise_xor(const AscendMat& src1, const AscendMat& src2, CV_OUT AscendMat& dst,
                              const AscendMat& mask = AscendMat(),
                              AscendStream& stream = AscendStream::Null());
/** @overload */
CV_EXPORTS_W void bitwise_xor(const AscendMat& src1, const Scalar& src2, CV_OUT AscendMat& dst,
                              const AscendMat& mask = AscendMat(),
                              AscendStream& stream = AscendStream::Null());
/** @overload */
CV_EXPORTS_W void bitwise_xor(const Scalar& src1, const AscendMat& src2, CV_OUT AscendMat& dst,
                              const AscendMat& mask = AscendMat(),
                              AscendStream& stream = AscendStream::Null());

/** @brief Performs a per-element bitwise inversion.
 * @param src Source matrix.
 * @param dst Destination matrix that has the same size and number of channels as the input
 * array(s). The depth is defined by dtype or src1 depth.
 * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
 * destination array to be changed. The mask can be used only with single channel images.
 * @param stream AscendStream for the asynchronous version.
 * @sa cv::bitwise_not cuda::bitwise_not
 */
CV_EXPORTS_W void bitwise_not(const InputArray src, OutputArray dst,
                              const InputArray mask = noArray(),
                              AscendStream& stream = AscendStream::Null());
/** @overload */
CV_EXPORTS_W void bitwise_not(const AscendMat& src, CV_OUT AscendMat& dst,
                              const AscendMat& mask = AscendMat(),
                              AscendStream& stream = AscendStream::Null());

/** @brief Computes the weighted sum of two arrays.

@param src1 First source array.
@param alpha Weight for the first array elements.
@param src2 Second source array of the same size and channel number as src1 .
@param beta Weight for the second array elements.
@param dst Destination array that has the same size and number of channels as the input arrays.
@param gamma Scalar added to each sum.
@param dtype Optional depth of the destination array. When both input arrays have the same depth,
dtype can be set to -1, which will be equivalent to src1.depth().
@param stream AscendStream for the asynchronous version.

The function addWeighted calculates the weighted sum of two arrays as follows:

\f[\texttt{dst} (I)= \texttt{saturate} ( \texttt{src1} (I)* \texttt{alpha} + \texttt{src2} (I)*
\texttt{beta} + \texttt{gamma} )\f]

where I is a multi-dimensional index of array elements. In case of multi-channel arrays, each
channel is processed independently.

@sa cv::addWeighted cv::cuda::addWeighted
 */
CV_EXPORTS_W void addWeighted(const InputArray src1, double alpha, const InputArray src2,
                              double beta, double gamma, OutputArray dst, int dtype = -1,
                              AscendStream& stream = AscendStream::Null());
/** @overload */
CV_EXPORTS_W void addWeighted(const AscendMat& src1, double alpha, const AscendMat& src2,
                              double beta, double gamma, CV_OUT AscendMat& dst, int dtype = -1,
                              AscendStream& stream = AscendStream::Null());

/** @brief Applies a fixed-level threshold to each array element.

@param src Source array (single-channel).
+@param dst Destination array with the same size and type as src . +@param thresh Threshold value. +@param maxval Maximum value to use with THRESH_BINARY and THRESH_BINARY_INV threshold types. +@param type Threshold type. For details, see threshold . The THRESH_MASK, THRESH_OTSU and +THRESH_TRIANGLE threshold types are not supported. +@param stream AscendStream for the asynchronous version. + +@sa cv::threshold cv::cuda::threshold +*/ +CV_EXPORTS_W double threshold(const InputArray src, OutputArray dst, double thresh, double maxval, + int type, AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W double threshold(const AscendMat& src, CV_OUT AscendMat& dst, double thresh, + double maxval, int type, AscendStream& stream = AscendStream::Null()); + +//! @} cannops_elem + +//! @addtogroup cannops_core +//! @{ + +/** @brief Makes a multi-channel matrix out of several single-channel matrices. + +@param src Array/vector of source matrices. +@param n Number of source matrices. +@param dst Destination matrix. +@param stream AscendStream for the asynchronous version. + +@sa cv::merge cv::cuda::merge + */ +CV_EXPORTS_W void merge(const AscendMat* src, size_t n, CV_OUT AscendMat& dst, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void merge(const std::vector& src, CV_OUT AscendMat& dst, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void merge(const AscendMat* src, size_t n, OutputArray& dst, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void merge(const std::vector& src, OutputArray& dst, + AscendStream& stream = AscendStream::Null()); + +/** @brief Copies each plane of a multi-channel matrix into an array. + +@param src Source matrix. +@param dst Destination array/vector of single-channel matrices. +@param stream AscendStream for the asynchronous version. 
+ +@sa cv::split cv::cuda::split + */ +CV_EXPORTS_W void split(const AscendMat& src, AscendMat* dst, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void split(const AscendMat& src, CV_OUT std::vector& dst, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void split(const InputArray src, AscendMat* dst, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void split(const InputArray src, CV_OUT std::vector& dst, + AscendStream& stream = AscendStream::Null()); + +/** @brief Transposes a matrix. + +@param src Source matrix. +@param dst Destination matrix. +@param stream AscendStream for the asynchronous version. + +@sa cv::transpose cv::cuda::transpose + */ +CV_EXPORTS_W void transpose(InputArray src, OutputArray dst, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void transpose(const AscendMat& src, CV_OUT AscendMat& dst, + AscendStream& stream = AscendStream::Null()); +/** @brief Flips a 2D matrix around vertical, horizontal, or both axes. + +@param src Source matrix. +@param dst Destination matrix. +@param flipCode Flip mode for the source: +- 0 Flips around x-axis. +- \> 0 Flips around y-axis. +- \< 0 Flips around both axes. +@param stream AscendStream for the asynchronous version. + +@sa cv::flip cv::cuda::flip + */ +CV_EXPORTS_W void flip(InputArray src, OutputArray dst, int flipCode, + AscendStream& stream = AscendStream::Null()); +/** @overload */ +CV_EXPORTS_W void flip(const AscendMat& src, CV_OUT AscendMat& dst, int flipCode, + AscendStream& stream = AscendStream::Null()); +/** @brief Rotates a 2D array in multiples of 90 degrees. +The function cv::rotate rotates the array in one of three different ways: +* Rotate by 90 degrees clockwise (rotateCode = ROTATE_90_CLOCKWISE). +* Rotate by 180 degrees clockwise (rotateCode = ROTATE_180). +* Rotate by 270 degrees clockwise (rotateCode = ROTATE_90_COUNTERCLOCKWISE). +@param src input array. 
@param dst output array of the same type as src. The size is the same with ROTATE_180,
and the rows and cols are switched for ROTATE_90_CLOCKWISE and ROTATE_90_COUNTERCLOCKWISE.
@param rotateCode an enum to specify how to rotate the array; see the enum #RotateFlags
@param stream AscendStream for the asynchronous version.

@sa cv::rotate
*/
CV_EXPORTS_W void rotate(InputArray src, OutputArray dst, int rotateCode,
                         AscendStream& stream = AscendStream::Null());
/** @overload */
// NOTE(review): this overload names the parameter rotateMode while the
// InputArray overload uses rotateCode -- consider unifying.
CV_EXPORTS_W void rotate(const AscendMat& src, CV_OUT AscendMat& dst, int rotateMode,
                         AscendStream& stream = AscendStream::Null());

/** @brief crop a 2D array.
The function crops the matrix by given cv::Rect.
Output matrix must be of the same depth as input one, size is specified by given rect size.

@param src input array.
@param rect a rectangle specifying the region to crop.
@param stream AscendStream for the asynchronous version.

@sa cv::gapi::crop
*/
CV_EXPORTS_W AscendMat crop(InputArray src, const Rect& rect,
                            AscendStream& stream = AscendStream::Null());
/** @overload */
CV_EXPORTS_W AscendMat crop(const AscendMat& src, const Rect& rect,
                            AscendStream& stream = AscendStream::Null());
/** @brief Resizes an image src down to or up to the specified size.
@param src input image
@param dst output image; it has the size dsize (when it is non-zero) or the size computed from
src.size(), fx, and fy; the type of dst is the same as of src.
@param dsize output image size; if it equals zero, it is computed as:
 \f[𝚍𝚜𝚒𝚣𝚎 = 𝚂𝚒𝚣𝚎(𝚛𝚘𝚞𝚗𝚍(𝚏𝚡*𝚜𝚛𝚌.𝚌𝚘𝚕𝚜), 𝚛𝚘𝚞𝚗𝚍(𝚏𝚢*𝚜𝚛𝚌.𝚛𝚘𝚠𝚜))\f]
 Either dsize or both fx and fy must be non-zero.
@param fx scale factor along the horizontal axis; when it equals 0, it is computed as
\f[(𝚍𝚘𝚞𝚋𝚕𝚎)𝚍𝚜𝚒𝚣𝚎.𝚠𝚒𝚍𝚝𝚑/𝚜𝚛𝚌.𝚌𝚘𝚕𝚜\f]

@param fy scale factor along the vertical axis; when it equals 0, it is computed as
\f[(𝚍𝚘𝚞𝚋𝚕𝚎)𝚍𝚜𝚒𝚣𝚎.𝚑𝚎𝚒𝚐𝚑𝚝/𝚜𝚛𝚌.𝚛𝚘𝚠𝚜\f]
@param interpolation interpolation method(see **cv.cann.InterpolationFlags**)
@sa cv::resize
*/

//! interpolation algorithm
enum InterpolationFlags
{
    /** nearest neighbor interpolation */
    INTER_NEAREST = 0,
    /** bilinear interpolation */
    INTER_LINEAR = 1,
    /** bicubic interpolation */
    INTER_CUBIC = 2,
    /** resampling using pixel area relation. It may be a preferred method for image decimation, as
    it gives moire'-free results. But when the image is zoomed, it is similar to the INTER_NEAREST
    method. */
    INTER_AREA = 3,
    /** mask for interpolation codes */
    INTER_MAX = 7,
};

CV_EXPORTS_W void resize(InputArray _src, OutputArray _dst, Size dsize, double inv_scale_x,
                         double inv_scale_y, int interpolation,
                         AscendStream& stream = AscendStream::Null());
/** @overload */
CV_EXPORTS_W void resize(const AscendMat& src, CV_OUT AscendMat& dst, Size dsize, double inv_scale_x,
                         double inv_scale_y, int interpolation,
                         AscendStream& stream = AscendStream::Null());

//! @} cannops_core

//! @addtogroup cannimgproc
//! @{

/** @brief Converts an image from one color space to another.

@param src Source image with CV_8U , CV_16U , or CV_32F depth and 1, 3, or 4 channels.
@param dst Destination image.
@param code Color space conversion code. For details, see cvtColor .
@param dstCn Number of channels in the destination image. If the parameter is 0, the number of the
channels is derived automatically from src and the code .
@param stream AscendStream for the asynchronous version.

@sa cv::cvtColor cv::cuda::cvtColor
 */
CV_EXPORTS_W void cvtColor(const InputArray src, OutputArray dst, int code, int dstCn = 0,
                           AscendStream& stream = AscendStream::Null());
/** @overload */
CV_EXPORTS_W void cvtColor(const AscendMat& src, CV_OUT AscendMat& dst, int code, int dstCn = 0,
                           AscendStream& stream = AscendStream::Null());

//!
@} cannimgproc + +} // namespace cann +} // namespace cv + +#endif // OPENCV_CANNOPS_CANN_INTERFACE_HPP diff --git a/modules/cannops/include/opencv2/cann_private.hpp b/modules/cannops/include/opencv2/cann_private.hpp new file mode 100644 index 00000000000..bcbe33feb19 --- /dev/null +++ b/modules/cannops/include/opencv2/cann_private.hpp @@ -0,0 +1,33 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_CANNOPS_CANN_PRIVATE_HPP +#define OPENCV_CANNOPS_CANN_PRIVATE_HPP +#include "opencv2/cann.hpp" + +namespace cv +{ +namespace cann +{ +void arithm_op(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const char* op, + AscendStream& stream); +void arithm_op(const AscendMat& src, const Scalar& sc, AscendMat& dst, const char* op, + AscendStream& stream); +void arithm_op(const Scalar& sc, const AscendMat& src, AscendMat& dst, const char* op, + AscendStream& stream); +void arithm_op(const AscendMat& src, AscendMat& dst, const char* op, AscendStream& stream); +void arithm_op(const AscendMat& src, float scalar, AscendMat& dst, const char* op, + AscendStream& stream); +void transpose(const AscendMat& src, int64_t* perm, AscendMat& dst, AscendStream& stream); +void flip(const AscendMat& src, std::vector& asixs, AscendMat& dst, AscendStream& stream); +void crop(const AscendMat& src, AscendMat& dst, const AscendMat& sizeSrcNpu, int64_t* offset, + AscendStream& stream); +void transData(const AscendMat& src, AscendMat& dst, const char* from, const char* to, + AscendStream& stream); +void resize(const AscendMat& src, AscendMat& dst, int32_t* dstSize, int interpolation, + AscendStream& stream); +} // namespace cann +} // namespace cv + +#endif // OPENCV_CANNOPS_CANN_PRIVATE_HPP diff --git a/modules/cannops/include/opencv2/stream_accessor.hpp b/modules/cannops/include/opencv2/stream_accessor.hpp new file mode 
100644 index 00000000000..ff64d7dcbc0 --- /dev/null +++ b/modules/cannops/include/opencv2/stream_accessor.hpp @@ -0,0 +1,39 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_CANNOPS_STREAM_ACCESSOR_HPP +#define OPENCV_CANNOPS_STREAM_ACCESSOR_HPP + +#include +#include "opencv2/cann.hpp" + +namespace cv +{ +namespace cann +{ +//! @addtogroup cann_struct +//! @{ + +/** @brief Class that enables getting aclrtAscendStream from cann::AscendStream + */ +struct AscendStreamAccessor +{ + CV_EXPORTS static aclrtStream getStream(const AscendStream& stream); + CV_EXPORTS static AscendStream wrapStream(aclrtStream stream); +}; + +/** @brief Class that enables getting aclrtAscendEvent from cann::AscendEvent + */ +struct AscendEventAccessor +{ + CV_EXPORTS static aclrtEvent getEvent(const AscendEvent& event); + CV_EXPORTS static AscendEvent wrapEvent(aclrtEvent event); +}; + +//! @} cann_struct + +} // namespace cann +} // namespace cv + +#endif // OPENCV_CANNOPS_STREAM_ACCESSOR_HPP diff --git a/modules/cannops/misc/python/pyopencv_cann.hpp b/modules/cannops/misc/python/pyopencv_cann.hpp new file mode 100644 index 00000000000..02d62487c6a --- /dev/null +++ b/modules/cannops/misc/python/pyopencv_cann.hpp @@ -0,0 +1,28 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#ifndef OPENCV_CANNOPS_PYOPENCV_CANN_HPP +#define OPENCV_CANNOPS_PYOPENCV_CANN_HPP + +#ifdef HAVE_OPENCV_CORE + +#include "opencv2/cann.hpp" + +typedef std::vector vector_AscendMat; +typedef cann::AscendMat::Allocator AscendMat_Allocator; + +CV_PY_TO_CLASS(cann::AscendMat); +CV_PY_TO_CLASS(cann::AscendStream); + +CV_PY_TO_CLASS_PTR(cann::AscendMat); +CV_PY_TO_CLASS_PTR(cann::AscendMat::Allocator); + +CV_PY_FROM_CLASS(cann::AscendMat); +CV_PY_FROM_CLASS(cann::AscendStream); + +CV_PY_FROM_CLASS_PTR(cann::AscendMat::Allocator); + +#endif // HAVE_OPENCV_CORE + +#endif // OPENCV_CANNOPS_PYOPENCV_CANN_HPP diff --git a/modules/cannops/misc/python/test/test_cannops.py b/modules/cannops/misc/python/test/test_cannops.py new file mode 100644 index 00000000000..f1b53bc192c --- /dev/null +++ b/modules/cannops/misc/python/test/test_cannops.py @@ -0,0 +1,281 @@ +# This file is part of OpenCV project. +# It is subject to the license terms in the LICENSE file found in the top-level directory +# of this distribution and at http://opencv.org/license.html. 
+ +import cv2 as cv +from tests_common import NewOpenCVTests +import numpy as np + +def genMask(mask, listx, listy): + for row in range(mask.shape[0]): + for col in range(mask.shape[1]): + if (row in listx and col in listx) or (row in listy and col in listy): + mask[row][col] = 1 + mask = mask.astype(np.uint8) + return mask + + +mask = np.zeros((5, 5)) +listx = [0, 1] +listy = [1, 2] +mask = genMask(mask, listx, listy) + + +class cannop_test(NewOpenCVTests): + def test_ascend(self): + cv.cann.initAcl() + cv.cann.getDevice() + cv.cann.setDevice(0) + stream = cv.cann.AscendStream_Null() + cv.cann.wrapStream(id(stream)) + cv.cann.resetDevice() + + def test_arithmetic(self): + # input data + npMat1 = np.random.random((5, 5, 3)).astype(int) + npMat2 = np.random.random((5, 5, 3)).astype(int) + cv.cann.setDevice(0) + + # ACLMat input data + aclMat1 = cv.cann.AscendMat() + aclMat1.upload(npMat1) + aclMat2 = cv.cann.AscendMat() + aclMat2.upload(npMat2) + aclMask = cv.cann.AscendMat() + aclMask.upload(mask) + aclMatDst = cv.cann.AscendMat(aclMat1.size(), aclMat1.type()) + + # InputArray interface test + self.assertTrue(np.allclose(cv.cann.add( + npMat1, npMat2), cv.add(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.subtract( + npMat1, npMat2), cv.subtract(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.multiply( + npMat1, npMat2, scale=2), cv.multiply(npMat1, npMat2, scale=2))) + self.assertTrue(np.allclose(cv.cann.divide( + npMat1, npMat2, scale=2), cv.divide(npMat1, npMat2, scale=2))) + + # AscendMat interface test + self.assertTrue(np.allclose(cv.cann.add(aclMat1, aclMat2).download(), + cv.add(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.subtract(aclMat1, aclMat2).download(), + cv.subtract(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.multiply(aclMat1, aclMat2, scale=2).download(), + cv.multiply(npMat1, npMat2, scale=2))) + self.assertTrue(np.allclose(cv.cann.divide(aclMat1, aclMat2, scale=2).download(), + cv.divide(npMat1, npMat2, 
scale=2))) + + # mask + self.assertTrue(np.allclose(cv.cann.add( + npMat1, npMat2, mask=mask), cv.add(npMat1, npMat2, mask=mask))) + self.assertTrue(np.allclose(cv.cann.subtract( + npMat1, npMat2, mask=mask), cv.subtract(npMat1, npMat2, mask=mask))) + self.assertTrue(np.allclose(cv.cann.multiply(npMat1, npMat2, scale=2), + cv.multiply(npMat1, npMat2, scale=2))) + self.assertTrue(np.allclose(cv.cann.divide(npMat1, npMat2, scale=2), + cv.divide(npMat1, npMat2, scale=2))) + self.assertTrue(np.allclose(cv.cann.addWeighted(npMat1, 2, npMat2, 4, 3), + cv.addWeighted(npMat1, 2, npMat2, 4, 3))) + + self.assertTrue(np.allclose(cv.cann.add(aclMat1, aclMat2, mask=aclMask).download(), + cv.add(npMat1, npMat2, mask=mask))) + self.assertTrue(np.allclose(cv.cann.subtract(aclMat1, aclMat2, mask=aclMask).download(), + cv.subtract(npMat1, npMat2, mask=mask))) + self.assertTrue(np.allclose(cv.cann.multiply(aclMat1, aclMat2, scale=2).download(), + cv.multiply(npMat1, npMat2, scale=2))) + self.assertTrue(np.allclose(cv.cann.divide(aclMat1, aclMat2, scale=2).download(), + cv.divide(npMat1, npMat2, scale=2))) + self.assertTrue(np.allclose(cv.cann.addWeighted(aclMat1, 2, aclMat2, 4, 3).download(), + cv.addWeighted(npMat1, 2, npMat2, 4, 3))) + + # stream + stream = cv.cann.AscendStream() + matDst = cv.cann.add(npMat1, npMat2, stream=stream) + stream.waitForCompletion() + self.assertTrue(np.allclose(matDst, cv.add(npMat1, npMat2))) + matDst = cv.cann.add(npMat1, npMat2, mask=mask, stream=stream) + stream.waitForCompletion() + self.assertTrue(np.allclose(matDst, cv.add(npMat1, npMat2, mask=mask))) + matDst = cv.cann.subtract(npMat1, npMat2, mask=mask, stream=stream) + stream.waitForCompletion() + self.assertTrue(np.allclose( + matDst, cv.subtract(npMat1, npMat2, mask=mask))) + + # stream AsceendMat + aclMatDst = cv.cann.add(aclMat1, aclMat2, stream=stream) + stream.waitForCompletion() + self.assertTrue(np.allclose(aclMatDst.download(), + cv.add(npMat1, npMat2))) + + aclMatDst = 
cv.cann.add(aclMat1, aclMat2, mask=aclMask, stream=stream) + stream.waitForCompletion() + self.assertTrue(np.allclose(aclMatDst.download(), + cv.add(npMat1, npMat2, mask=mask))) + + aclMatDst = cv.cann.subtract(aclMat1, aclMat2, mask=aclMask, stream=stream) + stream.waitForCompletion() + self.assertTrue(np.allclose(aclMatDst.download(), + cv.subtract(npMat1, npMat2, mask=mask))) + + cv.cann.resetDevice() + + def test_logical(self): + npMat1 = np.random.random((5, 5, 3)).astype(np.uint16) + npMat2 = np.random.random((5, 5, 3)).astype(np.uint16) + cv.cann.setDevice(0) + + # ACLMat input data + aclMat1 = cv.cann.AscendMat() + aclMat1.upload(npMat1) + aclMat2 = cv.cann.AscendMat() + aclMat2.upload(npMat2) + aclMask = cv.cann.AscendMat() + aclMask.upload(mask) + + self.assertTrue(np.allclose(cv.cann.bitwise_or(npMat1, npMat2), + cv.bitwise_or(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_or( + npMat1, npMat2), cv.bitwise_or(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_and(npMat1, npMat2), + cv.bitwise_and(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_and( + npMat1, npMat2), cv.bitwise_and(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_xor(npMat1, npMat2), + cv.bitwise_xor(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_xor( + npMat1, npMat2), cv.bitwise_xor(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_not(npMat1), + cv.bitwise_not(npMat1))) + self.assertTrue(np.allclose( + cv.cann.bitwise_not(npMat1), cv.bitwise_not(npMat1))) + self.assertTrue(np.allclose(cv.cann.bitwise_and(npMat1, npMat2, mask=mask), + cv.bitwise_and(npMat1, npMat2, mask=mask))) + self.assertTrue(np.allclose(cv.cann.bitwise_or(npMat1, npMat2, mask=mask), + cv.bitwise_or(npMat1, npMat2, mask=mask))) + self.assertTrue(np.allclose(cv.cann.bitwise_not(npMat1, mask=mask), + cv.bitwise_not(npMat1, mask=mask))) + self.assertTrue(np.allclose(cv.cann.bitwise_xor(npMat1, npMat2, mask=mask), + 
cv.bitwise_xor(npMat1, npMat2, mask=mask))) + + # AscendMat interface + self.assertTrue(np.allclose(cv.cann.bitwise_or(aclMat1, aclMat2).download(), + cv.bitwise_or(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_or(aclMat1, aclMat2).download(), + cv.bitwise_or(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_and(aclMat1, aclMat2).download(), + cv.bitwise_and(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_and( + aclMat1, aclMat2).download(), cv.bitwise_and(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_xor(aclMat1, aclMat2).download(), + cv.bitwise_xor(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_xor( + aclMat1, aclMat2).download(), cv.bitwise_xor(npMat1, npMat2))) + self.assertTrue(np.allclose(cv.cann.bitwise_not(aclMat1).download(), + cv.bitwise_not(npMat1))) + self.assertTrue(np.allclose(cv.cann.bitwise_not(aclMat1).download(), + cv.bitwise_not(npMat1))) + self.assertTrue(np.allclose(cv.cann.bitwise_and(aclMat1, aclMat2, mask=aclMask).download(), + cv.bitwise_and(npMat1, npMat2, mask=mask))) + self.assertTrue(np.allclose(cv.cann.bitwise_or(aclMat1, aclMat2, mask=aclMask).download(), + cv.bitwise_or(npMat1, npMat2, mask=mask))) + self.assertTrue(np.allclose(cv.cann.bitwise_not(aclMat1, mask=aclMask).download(), + cv.bitwise_not(npMat1, mask=mask))) + self.assertTrue(np.allclose(cv.cann.bitwise_xor(aclMat1, aclMat2, mask=aclMask).download(), + cv.bitwise_xor(npMat1, npMat2, mask=mask))) + cv.cann.resetDevice() + + def test_imgproc(self): + npMat = (np.random.random((128, 128, 3)) * 255).astype(np.uint8) + cv.cann.setDevice(0) + aclMat = cv.cann.AscendMat() + aclMatDst = aclMat + aclMat.upload(npMat) + + # TODO try pass out param, not use return value. 
+ # merge & split + self.assertTrue(np.allclose( + cv.cann.merge(cv.cann.split(npMat)).download(), npMat)) + self.assertTrue(np.allclose( + cv.cann.merge(cv.cann.split(aclMat)).download(), npMat)) + + # transpose + self.assertTrue(np.allclose( + cv.cann.transpose(npMat), cv.transpose(npMat))) + self.assertTrue(np.allclose( + cv.cann.transpose(aclMat).download(), cv.transpose(npMat))) + + # crop + w_off, h_off, crop_w, crop_h = 0, 0, 64, 64 + roi = [w_off, h_off, crop_w, crop_h] + self.assertTrue(np.allclose( + cv.cann.crop(npMat, roi).download(), npMat[w_off:crop_w, h_off:crop_h])) + self.assertTrue(np.allclose( + cv.cann.crop(aclMat, roi).download(), npMat[w_off:crop_w, h_off:crop_h])) + + # resize + dstSize = np.array([crop_w, crop_h]) + aclMat32F = cv.cann.AscendMat() + aclMat32F.upload(npMat.astype(np.float32)) + self.assertTrue(np.allclose(cv.cann.resize(npMat.astype(np.float32), dstSize, 0, 0, 3), + cv.resize(npMat.astype(np.float32), dstSize, 0, 0, 3))) + self.assertTrue(np.allclose(cv.cann.resize(aclMat32F, dstSize, 0, 0, 3).download(), + cv.resize(npMat.astype(np.float32), dstSize, 0, 0, 3))) + # flip + flipMode = [0, 1, -1] + for fMode in flipMode: + self.assertTrue(np.allclose(cv.cann.flip( + npMat, fMode), cv.flip(npMat, fMode))) + self.assertTrue(np.allclose(cv.cann.flip( + aclMat, fMode).download(), cv.flip(npMat, fMode))) + + # rotate + rotateMode = [0, 1, 2] + for rMode in rotateMode: + self.assertTrue(np.allclose(cv.cann.rotate( + npMat, rMode), cv.rotate(npMat, rMode))) + self.assertTrue(np.allclose(cv.cann.rotate( + aclMat, rMode).download(), cv.rotate(npMat, rMode))) + + # cvtColcor + cvtModeC1 = [cv.COLOR_GRAY2BGR, cv.COLOR_GRAY2BGRA] + cvtModeC3 = [cv.COLOR_BGR2GRAY, cv.COLOR_BGRA2BGR, cv.COLOR_BGR2RGBA, cv.COLOR_RGBA2BGR, + cv.COLOR_BGR2RGB, cv.COLOR_BGRA2RGBA, cv.COLOR_RGB2GRAY, cv.COLOR_BGRA2GRAY, + cv.COLOR_RGBA2GRAY, cv.COLOR_BGR2BGRA, cv.COLOR_BGR2YUV, cv.COLOR_RGB2YUV, + cv.COLOR_YUV2BGR, cv.COLOR_YUV2RGB, cv.COLOR_BGR2YCrCb, 
cv.COLOR_RGB2YCrCb, + cv.COLOR_YCrCb2BGR, cv.COLOR_YCrCb2RGB, cv.COLOR_BGR2XYZ, cv.COLOR_RGB2XYZ, + cv.COLOR_XYZ2BGR, cv.COLOR_XYZ2RGB,] + for cvtM in cvtModeC3: + self.assertTrue(np.allclose(cv.cann.cvtColor( + npMat, cvtM), cv.cvtColor(npMat, cvtM), 1)) + self.assertTrue(np.allclose(cv.cann.cvtColor( + aclMat, cvtM).download(), cv.cvtColor(npMat, cvtM), 1)) + + npMatC1 = (np.random.random((128, 128, 1)) * 255).astype(np.uint8) + aclMatC1 = cv.cann.AscendMat() + aclMatC1.upload(npMatC1) + for cvtM in cvtModeC1: + self.assertTrue(np.allclose(cv.cann.cvtColor( + npMatC1, cvtM), cv.cvtColor(npMatC1, cvtM), 1)) + self.assertTrue(np.allclose(cv.cann.cvtColor( + aclMatC1, cvtM).download(), cv.cvtColor(npMatC1, cvtM), 1)) + + # threshold + threshType = [cv.THRESH_BINARY, cv.THRESH_BINARY_INV, + cv.THRESH_TRUNC, cv.THRESH_TOZERO, cv.THRESH_TOZERO_INV] + for tType in threshType: + cvRet, cvThresh = cv.threshold( + npMat.astype(np.uint8), 127, 255, tType) + cannRet, cannThresh = cv.cann.threshold( + npMat.astype(np.float32), 127, 255, tType) + self.assertTrue(np.allclose(cvThresh, cannThresh)) + self.assertTrue(np.allclose(cvRet, cannRet)) + + aclMat.upload(npMat.astype(np.float32)) + cannRet, cannThresh = cv.cann.threshold( + aclMat, 127, 255, tType) + self.assertTrue(np.allclose(cvThresh, cannThresh.download())) + self.assertTrue(np.allclose(cvRet, cannRet)) + cv.cann.resetDevice() + +if __name__ == '__main__': + NewOpenCVTests.bootstrap() diff --git a/modules/cannops/perf/perf_core.cpp b/modules/cannops/perf/perf_core.cpp new file mode 100644 index 00000000000..a9d86fca881 --- /dev/null +++ b/modules/cannops/perf/perf_core.cpp @@ -0,0 +1,161 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include "perf_precomp.hpp" +#include "opencv2/cann_interface.hpp" + +namespace opencv_test +{ +namespace +{ +#define TYPICAL_ASCEND_MAT_SIZES \ + Values(::perf::sz1080p, ::perf::sz2K, ::perf::sz2160p, ::perf::sz4320p) +#define DEF_PARAM_TEST(name, ...) \ + typedef ::perf::TestBaseWithParam> name + +DEF_PARAM_TEST(NPU, Size); +DEF_PARAM_TEST(CPU, Size); + +PERF_TEST_P(NPU, MERGE, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC1); + Mat dst; + declare.in(mat, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + AscendMat ascendMat[3]; + ascendMat[0].upload(mat); + ascendMat[1].upload(mat); + ascendMat[2].upload(mat); + + TEST_CYCLE() { cv::cann::merge(&ascendMat[0], 3, dst); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, MERGE, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC1); + Mat dst; + declare.in(mat, WARMUP_RNG); + Mat mats[3] = {mat, mat, mat}; + TEST_CYCLE() { cv::merge(&mats[0], 3, dst); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, SPLIT, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + declare.in(mat, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + AscendMat ascendMat[3]; + + TEST_CYCLE() { cv::cann::split(mat, &ascendMat[0]); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, SPLIT, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + declare.in(mat, WARMUP_RNG); + Mat mats[3] = {mat, mat, mat}; + TEST_CYCLE() { cv::split(mat, &mats[0]); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, TRANSPOSE, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::transpose(mat, dst); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, TRANSPOSE, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + TEST_CYCLE() { cv::transpose(mat, dst); } + SANITY_CHECK_NOTHING(); 
+} + +PERF_TEST_P(NPU, FLIP, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::flip(mat, dst, -1); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, FLIP, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + TEST_CYCLE() { cv::flip(mat, dst, -1); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, ROTATE, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::rotate(mat, dst, 1); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, ROTATE, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + TEST_CYCLE() { cv::rotate(mat, dst, 1); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, CROP, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + Rect b(1, 2, 4, 4); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { AscendMat cropped_cann(mat, b); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, CROP, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + Rect b(1, 2, 4, 4); + TEST_CYCLE() { Mat cropped_cv(mat, b); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, CROP_OVERLOAD, TYPICAL_ASCEND_MAT_SIZES) +{ + Mat mat(GET_PARAM(0), CV_8UC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + Rect b(1, 2, 4, 4); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::crop(mat, b); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} +} // namespace +} // namespace opencv_test diff --git a/modules/cannops/perf/perf_cvtcolor.cpp b/modules/cannops/perf/perf_cvtcolor.cpp new file mode 100644 index 00000000000..c868d4fec04 --- /dev/null +++ 
b/modules/cannops/perf/perf_cvtcolor.cpp @@ -0,0 +1,69 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "perf_precomp.hpp" +#include "opencv2/cann_interface.hpp" + +namespace opencv_test +{ +namespace +{ + +#define CVT_COLORS_3 \ + Values(COLOR_BGR2BGRA, COLOR_BGRA2BGR, COLOR_BGR2RGBA, COLOR_RGBA2BGR, COLOR_BGR2RGB, \ + COLOR_BGRA2RGBA, COLOR_BGR2GRAY, COLOR_BGRA2GRAY, COLOR_RGBA2GRAY, COLOR_BGR2XYZ, \ + COLOR_RGB2XYZ, COLOR_XYZ2BGR, COLOR_XYZ2RGB, COLOR_BGR2YCrCb, COLOR_RGB2YCrCb, \ + COLOR_YCrCb2BGR, COLOR_YCrCb2RGB, COLOR_BGR2YUV, COLOR_RGB2YUV, COLOR_YUV2BGR, \ + COLOR_YUV2RGB) +#define CVT_COLORS_1 Values(COLOR_GRAY2BGR, COLOR_GRAY2BGRA) +#define TYPICAL_ASCEND_MAT_SIZES \ + Values(::perf::sz1080p, ::perf::sz2K) +#define DEF_PARAM_TEST(name, ...) \ + typedef ::perf::TestBaseWithParam> name + +DEF_PARAM_TEST(NPU, Size, ColorConversionCodes); +DEF_PARAM_TEST(CPU, Size, ColorConversionCodes); + +PERF_TEST_P(NPU, CVT_COLOR_3, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, CVT_COLORS_3)) +{ + Mat mat(GET_PARAM(0), CV_32FC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::cvtColor(mat, dst, GET_PARAM(1)); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, CVT_COLOR_3, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, CVT_COLORS_3)) +{ + Mat mat(GET_PARAM(0), CV_32FC3); + Mat dst; + declare.in(mat, WARMUP_RNG); + TEST_CYCLE() { cv::cvtColor(mat, dst, GET_PARAM(1)); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, CVT_COLOR_1, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, CVT_COLORS_1)) +{ + Mat mat(GET_PARAM(0), CV_32FC1); + Mat dst; + declare.in(mat, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::cvtColor(mat, dst, GET_PARAM(1)); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, 
CVT_COLOR_1, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, CVT_COLORS_1)) +{ + Mat mat(GET_PARAM(0), CV_32FC1); + Mat dst; + declare.in(mat, WARMUP_RNG); + TEST_CYCLE() { cv::cvtColor(mat, dst, GET_PARAM(1)); } + SANITY_CHECK_NOTHING(); +} + +} // namespace +} // namespace opencv_test diff --git a/modules/cannops/perf/perf_element_operations.cpp b/modules/cannops/perf/perf_element_operations.cpp new file mode 100644 index 00000000000..0612abe6085 --- /dev/null +++ b/modules/cannops/perf/perf_element_operations.cpp @@ -0,0 +1,211 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "perf_precomp.hpp" +#include "opencv2/cann_interface.hpp" + +namespace opencv_test +{ +namespace +{ + +#define ARITHM_MAT_DEPTH Values(CV_32S, CV_32SC3) +#define TYPICAL_ASCEND_MAT_SIZES \ + Values(::perf::sz1080p, ::perf::sz2K, ::perf::sz2160p, ::perf::sz4320p) +#define DEF_PARAM_TEST(name, ...) 
\ + typedef ::perf::TestBaseWithParam> name + +DEF_PARAM_TEST(NPU, Size, int); +DEF_PARAM_TEST(CPU, Size, int); + +PERF_TEST_P(NPU, MAT_ADD_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::add(mat1, mat2, dst, noArray(), -1); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, MAT_ADD_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + TEST_CYCLE() { cv::add(mat1, mat2, dst, noArray(), -1); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, MAT_SUB_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::subtract(mat1, mat2, dst, noArray(), -1); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, MAT_SUB_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + TEST_CYCLE() { cv::subtract(mat1, mat2, dst, noArray(), -1); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, MAT_MUL_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::multiply(mat1, mat2, dst, 1, -1); } + 
cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, MAT_MUL_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + TEST_CYCLE() { cv::multiply(mat1, mat2, dst, 1, -1); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, MAT_DIV_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::divide(mat1, mat2, dst, 1, -1); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, MAT_DIV_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + TEST_CYCLE() { cv::divide(mat1, mat2, dst, 1, -1); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, MAT_BITWISE_AND_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::bitwise_and(mat1, mat2, dst, noArray()); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, MAT_BITWISE_AND_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + TEST_CYCLE() { cv::bitwise_and(mat1, mat2, dst, noArray()); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, MAT_BITWISE_OR_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, 
ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::bitwise_or(mat1, mat2, dst, noArray()); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, MAT_BITWISE_OR_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + TEST_CYCLE() { cv::bitwise_or(mat1, mat2, dst, noArray()); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, MAT_BITWISE_XOR_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::bitwise_xor(mat1, mat2, dst, noArray()); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, MAT_BITWISE_XOR_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat1(GET_PARAM(0), GET_PARAM(1)); + Mat mat2(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat1, WARMUP_RNG); + declare.in(mat2, WARMUP_RNG); + TEST_CYCLE() { cv::bitwise_xor(mat1, mat2, dst, noArray()); } + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(NPU, MAT_BITWISE_NOT_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat, WARMUP_RNG); + cv::cann::setDevice(DEVICE_ID); + TEST_CYCLE() { cv::cann::bitwise_not(mat, dst, noArray()); } + cv::cann::resetDevice(); + SANITY_CHECK_NOTHING(); +} + +PERF_TEST_P(CPU, MAT_BITWISE_NOT_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, ARITHM_MAT_DEPTH)) +{ + Mat mat(GET_PARAM(0), GET_PARAM(1)); + Mat dst; + declare.in(mat, WARMUP_RNG); + 
TEST_CYCLE() { cv::bitwise_not(mat, dst, noArray()); } + SANITY_CHECK_NOTHING(); +} + +} // namespace +} // namespace opencv_test diff --git a/modules/cannops/perf/perf_main.cpp b/modules/cannops/perf/perf_main.cpp new file mode 100644 index 00000000000..33503ac4158 --- /dev/null +++ b/modules/cannops/perf/perf_main.cpp @@ -0,0 +1,23 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "perf_precomp.hpp" +#include "opencv2/cann_interface.hpp" +using namespace perf; + +class CannEnvironment : public ::testing::Environment +{ +public: + virtual ~CannEnvironment() = default; + virtual void SetUp() CV_OVERRIDE { cv::cann::initAcl(); } + virtual void TearDown() CV_OVERRIDE { cv::cann::finalizeAcl(); } +}; + +static void initTests() +{ + CannEnvironment* cannEnv = new CannEnvironment(); + ::testing::AddGlobalTestEnvironment(cannEnv); +} + +CV_PERF_TEST_MAIN("cannops", initTests()) diff --git a/modules/cannops/perf/perf_precomp.hpp b/modules/cannops/perf/perf_precomp.hpp new file mode 100644 index 00000000000..59e2fa03d7b --- /dev/null +++ b/modules/cannops/perf/perf_precomp.hpp @@ -0,0 +1,19 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#ifndef __OPENCV_PERF_PRECOMP_HPP__ +#define __OPENCV_PERF_PRECOMP_HPP__ + +#include "opencv2/ts.hpp" +#include "opencv2/ts/ts_perf.hpp" +#include "opencv2/cann.hpp" + +#define DEVICE_ID 0 + +using namespace perf; +using namespace testing; +using namespace cv; +using namespace cv::cann; + +#endif diff --git a/modules/cannops/samples/image_processing.cpp b/modules/cannops/samples/image_processing.cpp new file mode 100644 index 00000000000..9dca2176dfd --- /dev/null +++ b/modules/cannops/samples/image_processing.cpp @@ -0,0 +1,60 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include +#include +#include +#include + +int main(int argc, char* argv[]) +{ + cv::CommandLineParser parser(argc, argv, + "{@input|puppy.png|path to input image}" + "{@output|output.png|path to output image}" + "{help||show help}"); + parser.about("This is a sample for image processing with Ascend NPU. \n"); + if (argc != 3 || parser.has("help")) + { + parser.printMessage(); + return 0; + } + + std::string imagePath = parser.get(0); + std::string outputPath = parser.get(1); + + // read input image and generate guass noise + //! [input_noise] + cv::Mat img = cv::imread(imagePath); + // Generate gauss noise that will be added into the input image + cv::Mat gaussNoise(img.rows, img.cols, img.type()); + cv::RNG rng; + rng.fill(gaussNoise, cv::RNG::NORMAL, 0, 25); + //! [input_noise] + + // setup cann + //! [setup] + cv::cann::initAcl(); + cv::cann::setDevice(0); + //! [setup] + + //! 
[image-process] + cv::Mat output; + // add gauss noise to the image + cv::cann::add(img, gaussNoise, output); + // rotate the image with a certain mode (0, 1 and 2, correspond to rotation of 90, 180 and 270 + // degrees clockwise respectively) + cv::cann::rotate(output, output, 0); + // flip the image with a certain mode (0, positive and negative number, correspond to flipping + // around the x-axis, y-axis and both axes respectively) + cv::cann::flip(output, output, 0); + //! [image-process] + + cv::imwrite(outputPath, output); + + //! [tear-down-cann] + cv::cann::resetDevice(); + cv::cann::finalizeAcl(); + //! [tear-down-cann] + return 0; +} diff --git a/modules/cannops/samples/image_processing.py b/modules/cannops/samples/image_processing.py new file mode 100644 index 00000000000..dc974bdd78c --- /dev/null +++ b/modules/cannops/samples/image_processing.py @@ -0,0 +1,42 @@ +# This file is part of OpenCV project. +# It is subject to the license terms in the LICENSE file found in the top-level directory +# of this distribution and at http://opencv.org/license.html. + +import numpy as np +import cv2 +import argparse + +parser = argparse.ArgumentParser(description='This is a sample for image processing with Ascend NPU.') +parser.add_argument('image', help='path to input image') +parser.add_argument('output', help='path to output image') +args = parser.parse_args() + +# read input image and generate guass noise +#! [input_noise] +img = cv2.imread(args.image) +# Generate gauss noise that will be added into the input image +gaussNoise = np.random.normal(0, 25,(img.shape[0], img.shape[1], img.shape[2])).astype(img.dtype) +#! [input_noise] + +# setup cann +#! [setup] +cv2.cann.initAcl() +cv2.cann.setDevice(0) +#! [setup] + +#! 
[image-process] +# add gauss noise to the image +output = cv2.cann.add(img, gaussNoise) +# rotate the image with a certain mode (0, 1 and 2, correspond to rotation of 90, 180 +# and 270 degrees clockwise respectively) +output = cv2.cann.rotate(output, 0) +# flip the image with a certain mode (0, positive and negative number, correspond to flipping +# around the x-axis, y-axis and both axes respectively) +output = cv2.cann.flip(output, 0) +#! [image-process] + +cv2.imwrite(args.output, output) + +#! [tear-down-cann] +cv2.cann.finalizeAcl() +#! [tear-down-cann] diff --git a/modules/cannops/src/ascend_mat.cpp b/modules/cannops/src/ascend_mat.cpp new file mode 100644 index 00000000000..ba17a545bb7 --- /dev/null +++ b/modules/cannops/src/ascend_mat.cpp @@ -0,0 +1,232 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "precomp.hpp" +#include + +namespace +{ +class DefaultAllocator : public cv::cann::AscendMat::Allocator +{ +public: + std::shared_ptr allocate(size_t size) CV_OVERRIDE; + bool allocate(cv::cann::AscendMat* mat, int rows, int cols, size_t elemSize) CV_OVERRIDE; +}; + +std::shared_ptr DefaultAllocator::allocate(size_t size) +{ + uchar* data; + cv::cann::aclrtMallocWarpper((void**)(&data), size); + return std::shared_ptr(data, [](void* ptr) { cv::cann::aclrtFreeWarpper(ptr); }); +} + +bool DefaultAllocator::allocate(cv::cann::AscendMat* mat, int rows, int cols, size_t elemSize) +{ + mat->data = allocate(elemSize * cols * rows); + mat->step = cols * elemSize; + + return true; +} + +DefaultAllocator cannDefaultAllocator; +cv::cann::AscendMat::Allocator* g_defaultAllocator = &cannDefaultAllocator; +} // namespace + +namespace cv +{ +namespace cann +{ +AscendMat::Allocator* AscendMat::defaultAllocator() { return g_defaultAllocator; } + +void AscendMat::setDefaultAllocator(AscendMat::Allocator* allocator) 
+{ + CV_Assert(allocator != 0); + g_defaultAllocator = allocator; +} + +// TODO: this function is copied from matrix.cpp, which is a local symbol there and can not +// be refreneced, consider optimizing. +static int updateContinuityFlag(int flags, int dims, const int* size, const size_t* step) +{ + int i, j; + for (i = 0; i < dims; i++) + { + if (size[i] > 1) + break; + } + + uint64 t = (uint64)size[std::min(i, dims - 1)] * CV_MAT_CN(flags); + for (j = dims - 1; j > i; j--) + { + t *= size[j]; + if (step[j] * size[j] < step[j - 1]) + break; + } + + if (j <= i && t == (uint64)(int)t) + return flags | Mat::CONTINUOUS_FLAG; + return flags & ~Mat::CONTINUOUS_FLAG; +} + +void AscendMat::updateContinuityFlag() +{ + int sz[] = {rows, cols}; + size_t steps[] = {step, elemSize()}; + flags = cv::cann::updateContinuityFlag(flags, 2, sz, steps); +} + +void AscendMat::create(int _rows, int _cols, int _type) +{ + CV_DbgAssert(_rows >= 0 && _cols >= 0); + + _type &= Mat::TYPE_MASK; + + if (rows == _rows && cols == _cols && type() == _type && data) + return; + + if (_rows > 0 && _cols > 0) + { + flags = Mat::MAGIC_VAL + _type; + rows = _rows; + cols = _cols; + + const size_t esz = elemSize(); + + bool allocSuccess = allocator->allocate(this, rows, cols, esz); + + if (!allocSuccess) + { + // custom allocator fails, try default allocator + allocator = defaultAllocator(); + allocSuccess = allocator->allocate(this, rows, cols, esz); + CV_Assert(allocSuccess); + } + + if (esz * cols == step) + flags |= Mat::CONTINUOUS_FLAG; + + datastart = data.get(); + dataend = data.get() + step * (rows - 1) + cols * esz; + } +} + +void AscendMat::upload(InputArray arr) { upload(arr, AscendStream::Null()); } + +void AscendMat::upload(InputArray arr, AscendStream& stream) +{ + Mat mat = arr.getMat(); + CV_DbgAssert(!mat.empty()); + create(mat.rows, mat.cols, mat.type()); + aclrtMemcpy2dWarpper(data, 0, step, mat.data, mat.step[0], cols * elemSize(), rows, stream); +} + +void 
AscendMat::download(OutputArray dst) const { download(dst, AscendStream::Null()); } + +void AscendMat::download(OutputArray _dst, AscendStream& stream) const +{ + CV_DbgAssert(!empty()); + + _dst.create(size(), type()); + Mat dst = _dst.getMat(); + aclrtMemcpy2dWarpper(dst.data, dst.step[0], data, 0, step, cols * elemSize(), rows, stream); +} + +AscendMat::AscendMat(int rows_, int cols_, int type_, Scalar& s_, AscendMat::Allocator* allocator_) + : flags(0), rows(rows_), cols(cols_), step(0), datastart(0), dataend(0), allocator(allocator_) +{ + create(rows_, cols_, type_); + setTo(s_); +} + +AscendMat::AscendMat(Size size_, int type_, Scalar& s_, AscendMat::Allocator* allocator_) + : flags(0), rows(size_.height), cols(size_.width), step(0), datastart(0), dataend(0), + allocator(allocator_) +{ + create(size_.height, size_.width, type_); + setTo(s_); +} + +AscendMat::AscendMat(InputArray _m, const Rect& roi) : AscendMat(_m, roi, AscendStream::Null()) {} + +AscendMat::AscendMat(InputArray _m, const Rect& roi, AscendStream& stream) + : rows(roi.height), cols(roi.width), allocator(defaultAllocator()) +{ + AscendMat m; + m.upload(_m, stream); + step = m.step; + data = m.data; + flags = m.flags; + CV_Assert(0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.cols && 0 <= roi.y && + 0 <= roi.height && roi.y + roi.height <= m.rows); + size_t esz = CV_ELEM_SIZE(flags); + size_t sizeMem = esz * roi.width * roi.height * m.channels(); + size_t offset = roi.y * m.step + roi.x * esz; + + void* dst = malloc(sizeMem); + size_t dpitch = roi.width * esz; + std::shared_ptr dstDevice = allocator->allocate(sizeMem); + aclrtMemcpy2dWarpper(dst, dpitch, data, offset, step, dpitch, roi.height, stream); + aclrtMemcpy2dWarpper(dstDevice, 0, dpitch, dst, dpitch, dpitch, roi.height, stream); + data = dstDevice; + step = dpitch; + free(dst); + updateContinuityFlag(); +} + +AscendMat& AscendMat::setTo(const Scalar& sc) { return setTo(sc, AscendStream::Null()); } + +AscendMat& 
AscendMat::setTo(const Scalar& sc, AscendStream& stream) +{ + size_t totalBytes = (size_t)rows * cols * elemSize(); + if (totalBytes == 0) + return *this; + + aclrtMemsetWarpper(data, 0, totalBytes, stream); + AscendMat dst(rows, cols, type()); + arithm_op(*this, sc, dst, "Add", stream); + swap(dst); + + return *this; +} + +AscendMat& AscendMat::setTo(float sc) { return setTo(sc, AscendStream::Null()); } + +AscendMat& AscendMat::setTo(float sc, AscendStream& stream) +{ + size_t totalBytes = (size_t)rows * cols * elemSize(); + if (totalBytes == 0) + return *this; + + aclrtMemsetWarpper(data, 0, totalBytes, stream); + + AscendMat dst(rows, cols, type()); + arithm_op(*this, sc, dst, "Adds", stream); + swap(dst); + + return *this; +} + +void AscendMat::convertTo(AscendMat& dst, int rtype) const +{ + convertTo(dst, rtype, AscendStream::Null()); +} + +void AscendMat::convertTo(AscendMat& dst, int _rtype, AscendStream& stream) const +{ + int cn = channels(); + dst.create(rows, cols, CV_MAKE_TYPE(_rtype, cn)); + convertTo(dst, stream); +} + +void AscendMat::convertTo(AscendMat& dst, AscendStream& stream) const +{ + OperatorRunner runner; + runner.setOp("Cast") + .addInput(*this, "x") + .addOutput(dst, "y") + .addAttr((int32_t)(getACLType(dst.depth())), "dst_type") + .run(stream); +} +} // namespace cann +} // namespace cv diff --git a/modules/cannops/src/cann_call.cpp b/modules/cannops/src/cann_call.cpp new file mode 100644 index 00000000000..3b83052ccbe --- /dev/null +++ b/modules/cannops/src/cann_call.cpp @@ -0,0 +1,524 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include +#include +#include "precomp.hpp" +#include "opencv2/core/private.hpp" +namespace cv +{ +namespace cann +{ +/*******************************Acl Error Checker*****************************/ +static inline void checkAclError(aclError err, const char* file, const int line, const char* func) +{ + if (ACL_SUCCESS != err) + { + const char* errMsg = aclGetRecentErrMsg(); + cv::error(cv::Error::StsError, errMsg == nullptr ? "" : errMsg, func, file, line); + } +} + +static inline void checkAclPtr(void* ptr, const char* file, const int line, const char* func) +{ + if (nullptr == ptr) + { + const char* errMsg = aclGetRecentErrMsg(); + cv::error(cv::Error::StsError, errMsg == nullptr ? "" : errMsg, func, file, line); + } +} + +#define CV_ACL_SAFE_CALL(expr) checkAclError((expr), __FILE__, __LINE__, CV_Func) +#define CV_ACL_SAFE_CALL_PTR(expr) \ + ({ \ + auto ptr = (expr); \ + checkAclPtr(ptr, __FILE__, __LINE__, CV_Func); \ + ptr; \ + }) + +/******************************Acl Runtime Warpper****************************/ +void aclrtMallocWarpper(void** data, size_t size) +{ + CV_ACL_SAFE_CALL(aclrtMalloc(data, size, ACL_MEM_MALLOC_HUGE_FIRST)); +} + +void aclrtFreeWarpper(void* data) { CV_ACL_SAFE_CALL(aclrtFree(data)); } + +void aclrtMemcpyWarpper(std::shared_ptr& dst, size_t offset, const void* src, size_t size, + AscendStream& stream) +{ + aclrtStream rawStream = AscendStreamAccessor::getStream(stream); + if (rawStream == nullptr) + CV_ACL_SAFE_CALL( + aclrtMemcpy(dst.get() + offset, size, src, size, ACL_MEMCPY_HOST_TO_DEVICE)); + else + { + CV_ACL_SAFE_CALL(aclrtMemcpyAsync(dst.get() + offset, size, src, size, + ACL_MEMCPY_HOST_TO_DEVICE, rawStream)); + if (offset == 0) + stream.addTensorHolder(dst); + } +} + +void aclrtMemcpyWarpper(void* dst, const std::shared_ptr& src, size_t offset, size_t size, + AscendStream& stream) +{ + aclrtStream rawStream = AscendStreamAccessor::getStream(stream); + if (rawStream == nullptr) + CV_ACL_SAFE_CALL( + aclrtMemcpy(dst, size, 
src.get() + offset, size, ACL_MEMCPY_DEVICE_TO_HOST)); + else + { + CV_ACL_SAFE_CALL(aclrtMemcpyAsync(dst, size, src.get() + offset, size, + ACL_MEMCPY_DEVICE_TO_HOST, rawStream)); + if (offset == 0) + stream.addTensorHolder(src); + } +} + +void aclrtMemcpyWarpper(std::shared_ptr& dst, size_t dstOffset, + const std::shared_ptr& src, size_t srcOffset, size_t size, + AscendStream& stream) +{ + aclrtStream rawStream = AscendStreamAccessor::getStream(stream); + if (rawStream == nullptr) + CV_ACL_SAFE_CALL(aclrtMemcpy(dst.get() + dstOffset, size, src.get() + srcOffset, size, + ACL_MEMCPY_DEVICE_TO_DEVICE)); + else + { + CV_ACL_SAFE_CALL(aclrtMemcpyAsync(dst.get() + dstOffset, size, src.get() + srcOffset, size, + ACL_MEMCPY_DEVICE_TO_DEVICE, rawStream)); + if (srcOffset == 0) + stream.addTensorHolder(src); + if (dstOffset == 0) + stream.addTensorHolder(dst); + } +} + +void aclrtMemcpy2dWarpper(std::shared_ptr& dst, size_t offset, size_t dpitch, + const void* src, size_t spitch, size_t width, size_t length, + AscendStream& stream) +{ + aclrtStream rawStream = AscendStreamAccessor::getStream(stream); + if (rawStream == nullptr) + CV_ACL_SAFE_CALL(aclrtMemcpy2d(dst.get() + offset, dpitch, src, spitch, width, length, + ACL_MEMCPY_HOST_TO_DEVICE)); + else + { + CV_ACL_SAFE_CALL(aclrtMemcpy2dAsync(dst.get() + offset, dpitch, src, spitch, width, length, + ACL_MEMCPY_HOST_TO_DEVICE, rawStream)); + stream.addTensorHolder(dst); + } +} + +void aclrtMemcpy2dWarpper(void* dst, size_t dpitch, const std::shared_ptr& src, + size_t offset, size_t spitch, size_t width, size_t length, + AscendStream& stream) +{ + aclrtStream rawStream = AscendStreamAccessor::getStream(stream); + if (rawStream == nullptr) + CV_ACL_SAFE_CALL(aclrtMemcpy2d(dst, dpitch, src.get() + offset, spitch, width, length, + ACL_MEMCPY_DEVICE_TO_HOST)); + else + { + CV_ACL_SAFE_CALL(aclrtMemcpy2dAsync(dst, dpitch, src.get() + offset, spitch, width, length, + ACL_MEMCPY_DEVICE_TO_HOST, rawStream)); + 
stream.addTensorHolder(src); + } +} + +void aclrtMemsetWarpper(std::shared_ptr& ptr, int32_t value, size_t count, + AscendStream& stream) +{ + aclrtStream rawStream = AscendStreamAccessor::getStream(stream); + if (rawStream == nullptr) + CV_ACL_SAFE_CALL(aclrtMemset(ptr.get(), count, value, count)); + else + { + CV_ACL_SAFE_CALL(aclrtMemsetAsync(ptr.get(), count, value, count, rawStream)); + stream.addTensorHolder(ptr); + } +} + +aclDataType getACLType(int opencvdepth) +{ + switch (opencvdepth) + { + case CV_8S: + return ACL_INT8; + case CV_16S: + return ACL_INT16; + case CV_8U: + return ACL_UINT8; + case CV_16U: + return ACL_UINT16; + case CV_32S: + return ACL_INT32; + case CV_32F: + return ACL_FLOAT; + case CV_64F: + return ACL_DOUBLE; + case CV_16F: + return ACL_FLOAT16; + default: + return ACL_DT_UNDEFINED; + } +} + +std::shared_ptr mallocAndUpload(const void* data, size_t size, AscendStream& stream, + AscendMat::Allocator* allocator) +{ + std::shared_ptr ptr = allocator->allocate(size); + aclrtStream rawStream = AscendStreamAccessor::getStream(stream); + + if (rawStream == nullptr) + CV_ACL_SAFE_CALL(aclrtMemcpy(ptr.get(), size, data, size, ACL_MEMCPY_HOST_TO_DEVICE)); + else + CV_ACL_SAFE_CALL( + aclrtMemcpyAsync(ptr.get(), size, data, size, ACL_MEMCPY_HOST_TO_DEVICE, rawStream)); + return ptr; +} + +/**************************Acl attribute preparation**************************/ + +OperatorRunner& OperatorRunner::reset() +{ + holder.clear(); + op.clear(); + for (auto desc : inputDesc_) + { + aclDestroyTensorDesc(desc); + } + for (auto desc : outputDesc_) + { + aclDestroyTensorDesc(desc); + } + for (auto buf : inputBuffers_) + { + CV_ACL_SAFE_CALL(aclDestroyDataBuffer(buf)); + } + for (auto buf : outputBuffers_) + { + CV_ACL_SAFE_CALL(aclDestroyDataBuffer(buf)); + } + if (opAttrInit) + aclopDestroyAttr(opAttr_); + inputDesc_.clear(); + outputDesc_.clear(); + inputBuffers_.clear(); + outputBuffers_.clear(); + opAttrInit = false; + return *this; +} + 
+OperatorRunner& OperatorRunner::setOp(const char* opName) +{ + reset(); + opAttr_ = CV_ACL_SAFE_CALL_PTR(aclopCreateAttr()); + opAttrInit = true; + op = std::string(opName); + return *this; +} + +OperatorRunner& OperatorRunner::addAttr(float value, const char* name) +{ + CV_ACL_SAFE_CALL(aclopSetAttrFloat(opAttr_, name, value)); + return *this; +} + +OperatorRunner& OperatorRunner::addAttr(const char* value, const char* name) +{ + CV_ACL_SAFE_CALL(aclopSetAttrString(opAttr_, name, value)); + return *this; +} + +OperatorRunner& OperatorRunner::addAttr(int value, const char* name) +{ + CV_ACL_SAFE_CALL(aclopSetAttrInt(opAttr_, name, value)); + return *this; +} + +OperatorRunner& OperatorRunner::addAttr(bool value, const char* name) +{ + CV_ACL_SAFE_CALL(aclopSetAttrBool(opAttr_, name, value)); + return *this; +} + +OperatorRunner& OperatorRunner::addAttr(const int64_t* value, int size, const char* name) +{ + CV_ACL_SAFE_CALL(aclopSetAttrListInt(opAttr_, name, size, value)); + return *this; +} + +OperatorRunner& OperatorRunner::addInput(AscendTensor& tensor) +{ + auto descPtr = CV_ACL_SAFE_CALL_PTR( + aclCreateTensorDesc(tensor.dtype, tensor.dims.size(), &tensor.dims[0], tensor.format)); + if (descPtr != nullptr) + { + if (tensor.name != nullptr && strlen(tensor.name) != 0) + aclSetTensorDescName(descPtr, tensor.name); + inputDesc_.push_back(descPtr); + } + auto bufPtr = CV_ACL_SAFE_CALL_PTR(aclCreateDataBuffer(tensor.data.get(), tensor.dataSize)); + if (bufPtr != nullptr) + inputBuffers_.push_back(bufPtr); + holder.insert(tensor.data); + return *this; +} + +OperatorRunner& OperatorRunner::addOutput(AscendTensor& tensor) +{ + auto descPtr = CV_ACL_SAFE_CALL_PTR( + aclCreateTensorDesc(tensor.dtype, tensor.dims.size(), &tensor.dims[0], tensor.format)); + if (descPtr != nullptr) + { + if (tensor.name != nullptr && strlen(tensor.name) != 0) + aclSetTensorDescName(descPtr, tensor.name); + outputDesc_.push_back(descPtr); + } + auto bufPtr = 
CV_ACL_SAFE_CALL_PTR(aclCreateDataBuffer(tensor.data.get(), tensor.dataSize)); + if (bufPtr != nullptr) + outputBuffers_.push_back(bufPtr); + holder.insert(tensor.data); + return *this; +} + +OperatorRunner& OperatorRunner::addInput(const AscendMat& mat, const char* name) +{ + AscendTensor tensor(mat, name); + return addInput(tensor); +} + +OperatorRunner& OperatorRunner::addOutput(AscendMat& mat, const char* name) +{ + AscendTensor tensor(mat, name); + return addOutput(tensor); +} + +OperatorRunner& OperatorRunner::addInput(const Scalar& sc, int type, const char* name) +{ + uchar rawData[32]; + cv::scalarToRawData(sc, rawData, type, 0); + std::shared_ptr scPtr = mallocAndUpload( + rawData, (CV_ELEM_SIZE(type)), AscendStream::Null(), AscendMat::defaultAllocator()); + + int64_t dims[] = {1, 1, 1, (CV_MAT_CN(type))}; + AscendTensor tensor(scPtr, (CV_ELEM_SIZE(type)), dims, sizeof(dims) / sizeof(dims[0]), + getACLType(CV_MAT_DEPTH(type)), name); + return addInput(tensor); +} + +OperatorRunner& OperatorRunner::run(AscendStream& stream) +{ + aclrtStream rawStream = AscendStreamAccessor::getStream(stream); + CV_ACL_SAFE_CALL(aclopCompileAndExecute(op.c_str(), inputDesc_.size(), inputDesc_.data(), + inputBuffers_.data(), outputDesc_.size(), + outputDesc_.data(), outputBuffers_.data(), opAttr_, + ACL_ENGINE_SYS, ACL_COMPILE_SYS, NULL, rawStream)); + if (rawStream == nullptr) + CV_ACL_SAFE_CALL(aclrtSynchronizeStream(rawStream)); + else + { + for (const auto& ptr : holder) + stream.addTensorHolder(ptr); + } + return *this; +} + +/********************************Ascend Tensor********************************/ + +AscendTensor::AscendTensor(std::shared_ptr _data, size_t _dataSize, int64_t* _dims, + size_t _dimSize, aclDataType _dtype, const char* _name, + aclFormat _format) + : name(_name), data(_data), dataSize(_dataSize), dtype(_dtype), format(_format) +{ + dims.assign(_dims, _dims + _dimSize); +} + +AscendTensor::AscendTensor(const AscendMat& ascendMat, const char* _name, 
aclFormat _format) + : name(_name), format(_format) +{ + data = ascendMat.data; + // Ascend can't process with gaps in matrix. + CV_Assert(ascendMat.isContinuous()); + dataSize = ascendMat.rows * ascendMat.cols * ascendMat.elemSize(); + + switch (_format) + { + case ACL_FORMAT_NHWC: + case ACL_FORMAT_ND: + dims.resize(4); + // Batch, default = 1. + dims[0] = 1; + // Default OpenCV image format = NHWC. + dims[1] = ascendMat.rows; + dims[2] = ascendMat.cols; + dims[3] = ascendMat.channels(); + break; + case ACL_FORMAT_NCHW: + dims.resize(4); + dims[0] = 1; + dims[1] = ascendMat.channels(); + dims[2] = ascendMat.rows; + dims[3] = ascendMat.cols; + break; + default: + CV_Error(Error::StsBadArg, "Unknown/unsupported matrix format"); + } + + dtype = getACLType(ascendMat.depth()); +} + +/**********************************Device*************************************/ +void setDevice(int device_id) +{ + aclrtContext context; + CV_ACL_SAFE_CALL(aclrtSetDevice(device_id)); + CV_ACL_SAFE_CALL(aclrtCreateContext(&context, device_id)); +} + +void resetDevice() { CV_ACL_SAFE_CALL(aclrtResetDevice(getDevice())); } + +int32_t getDevice() +{ + int32_t deviceId; + CV_ACL_SAFE_CALL(aclrtGetDevice(&deviceId)); + return deviceId; +} + +void initAcl() { CV_ACL_SAFE_CALL(aclInit(nullptr)); } + +void finalizeAcl() { CV_ACL_SAFE_CALL(aclFinalize()); } + +class DefaultDeviceInitializer +{ +public: + DefaultDeviceInitializer(); + ~DefaultDeviceInitializer(); + + AscendStream& getNullAscendStream(int deviceId); + +private: + std::vector> streams_; + Mutex streams_mtx_; +}; + +DefaultDeviceInitializer::DefaultDeviceInitializer() {} + +DefaultDeviceInitializer::~DefaultDeviceInitializer() { streams_.clear(); } + +AscendStream& DefaultDeviceInitializer::getNullAscendStream(int deviceId) +{ + AutoLock lock(streams_mtx_); + + if (streams_.empty()) + { + uint32_t deviceCount; + CV_ACL_SAFE_CALL(aclrtGetDeviceCount(&deviceCount)); + + if (deviceCount > 0) + streams_.resize(deviceCount); + } + + 
CV_DbgAssert(deviceId >= 0 && deviceId < static_cast(streams_.size())); + + if (streams_[deviceId].empty()) + { + aclrtStream stream = nullptr; + Ptr impl = makePtr(stream); + streams_[deviceId] = Ptr(new AscendStream(impl)); + } + + return *streams_[deviceId]; +} + +DefaultDeviceInitializer initializer; + +/***********************************Event*************************************/ +AscendEvent::Impl::Impl() : event(nullptr), ownEvent(true) +{ + CV_ACL_SAFE_CALL(aclrtCreateEvent(&event)); +} + +AscendEvent::Impl::Impl(aclrtEvent e) : event(e), ownEvent(false) {} + +AscendEvent::Impl::~Impl() +{ + if (event && ownEvent) + { + CV_ACL_SAFE_CALL(aclrtDestroyEvent(event)); + } +} + +aclrtEvent AscendEventAccessor::getEvent(const AscendEvent& event) { return event.impl_->event; } + +AscendEvent AscendEventAccessor::wrapEvent(aclrtEvent event) +{ + return AscendEvent(makePtr(event)); +} + +AscendEvent::AscendEvent() { impl_ = makePtr(); } + +void AscendEvent::record(AscendStream& stream) +{ + CV_ACL_SAFE_CALL(aclrtRecordEvent(impl_->event, AscendStreamAccessor::getStream(stream))); +} + +void AscendEvent::waitForComplete() const { CV_ACL_SAFE_CALL(aclrtSynchronizeEvent(impl_->event)); } + +/************************************Stream***********************************/ +void AscendStream::Impl::AddTensorHolder(const std::shared_ptr& tensorData) +{ + tensorHolders.insert(tensorData); +} + +AscendStream::Impl::Impl() : stream(nullptr), ownStream(true) +{ + CV_ACL_SAFE_CALL(aclrtCreateStream(&stream)); +} + +AscendStream::Impl::Impl(aclrtStream s) : stream(s), ownStream(false) {} + +aclrtStream AscendStreamAccessor::getStream(const AscendStream& stream) +{ + return stream.impl_->stream; +} + +AscendStream AscendStreamAccessor::wrapStream(aclrtStream stream) +{ + return AscendStream(makePtr(stream)); +} + +AscendStream wrapStream(size_t AscendStreamAddress) +{ + return AscendStreamAccessor::wrapStream(reinterpret_cast(AscendStreamAddress)); +} + 
+AscendStream::AscendStream() { impl_ = makePtr(); } + +void AscendStream::waitForCompletion() +{ + CV_ACL_SAFE_CALL(aclrtSynchronizeStream(impl_->stream)); + impl_->tensorHolders.clear(); +} + +void AscendStream::waitAscendEvent(const AscendEvent& event) +{ + CV_ACL_SAFE_CALL(aclrtStreamWaitEvent(impl_->stream, AscendEventAccessor::getEvent(event))); +} + +AscendStream& AscendStream::Null() +{ + const uint32_t deviceId = getDevice(); + return initializer.getNullAscendStream(deviceId); +} + +void AscendStream::addTensorHolder(const std::shared_ptr& holder) +{ + impl_->AddTensorHolder(holder); +} + +} // namespace cann +} // namespace cv diff --git a/modules/cannops/src/color.cpp b/modules/cannops/src/color.cpp new file mode 100644 index 00000000000..f08a785e576 --- /dev/null +++ b/modules/cannops/src/color.cpp @@ -0,0 +1,777 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "precomp.hpp" + +namespace cv +{ +namespace cann +{ +// Integer type images will have a loss of accuracy during calculation, so they must be converted to +// float before calculation. +static AscendMat convertTo(const AscendMat& src, int dtype, AscendStream& stream) +{ + AscendMat ret; + if (src.depth() != dtype) + src.convertTo(ret, dtype, stream); + else + ret = src; + return ret; +} + +static void convertBack(const AscendMat& src, AscendMat& dst, AscendStream& stream) +{ + if (src.depth() != dst.depth()) + src.convertTo(dst, stream); +} + +//! Set alpha channel to a Mat. +static void matAlphaSet(AscendMat& mat, int dtype, AscendStream& stream) +{ + if (dtype < 0) + dtype = mat.depth(); + + if (mat.depth() == CV_8U || mat.depth() == CV_16U) + { + size_t size = mat.rows * mat.step; + aclrtMemsetWarpper(mat.data, 255, size, stream); + } + else + { + if (dtype == CV_32F) + mat.setTo(1.0f, stream); + else + mat.setTo((dtype == CV_8U ? 
(1 << 8) : (1 << 16)) - 1, stream); + } +} + +inline void checkImg(const AscendMat& mat) +{ + int depth = mat.depth(); + CV_Assert(!mat.empty()); + CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_32F); +} + +inline void cvtBGRtoBGR(const AscendMat& src, AscendMat& dst, int dcn, bool swapBlue, + AscendStream& stream) +{ + checkImg(src); + CV_Assert(src.channels() == 3 || src.channels() == 4); + + AscendMat matChannels[4]; + split(src, matChannels, stream); + + if (swapBlue) + std::swap(matChannels[0], matChannels[2]); + + if (dcn == 4 && src.channels() != 4) + { + AscendMat& alpha = matChannels[3]; + alpha.create(src.rows, src.cols, CV_MAKE_TYPE(src.depth(), 1)); + matAlphaSet(alpha, -1, stream); + } + + merge(matChannels, dcn, dst, stream); +} + +inline void cvtBGRtoBGR(InputArray& _src, OutputArray& _dst, int dcn, bool swapBlue, + AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + cvtBGRtoBGR(src, dst, dcn, swapBlue, stream); + dst.download(_dst, stream); +} + +// TODO duplicated code +static const float B2YF = 0.114f; +static const float G2YF = 0.587f; +static const float R2YF = 0.299f; + +inline void cvtBGRtoGray(const AscendMat& src, AscendMat& dst, int, bool swapBlue, + AscendStream& stream) +{ + checkImg(src); + CV_Assert(src.channels() == 3 || src.channels() == 4); + + float coeffs[] = {B2YF, G2YF, R2YF}; + dst.create(src.rows, src.cols, CV_MAKE_TYPE(src.depth(), 1)); + AscendMat formatedSrc = convertTo(src, CV_32F, stream); + AscendMat formatedDst = convertTo(dst, CV_32F, stream); + + // For RGB + if (swapBlue) + std::swap(coeffs[0], coeffs[2]); + + Scalar sc = {coeffs[0], coeffs[1], coeffs[2], 0}; + AscendMat grayRet(formatedSrc.rows, formatedSrc.cols, formatedSrc.type()); + arithm_op(formatedSrc, sc, grayRet, "Mul", stream); + + AscendMat matChannels[4]; + split(grayRet, matChannels, stream); + + OperatorRunner runner; + runner.setOp("AddN") + .addInput(matChannels[0], "x0") + .addInput(matChannels[1], "x1") + 
.addInput(matChannels[2], "x2") + .addOutput(formatedDst, "y") + .addAttr(3, "N") + .run(stream); + + convertBack(formatedDst, dst, stream); +} + +inline void cvtBGRtoGray(const InputArray& _src, OutputArray& _dst, int, bool swapBlue, + AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + cvtBGRtoGray(src, dst, 0, swapBlue, stream); + dst.download(_dst, stream); +} + +inline void cvtGraytoBGR(const AscendMat& src, AscendMat& dst, int dcn, bool, AscendStream& stream) +{ + checkImg(src); + CV_Assert(src.channels() == 1); + + AscendMat matChannels[4]; + for (int i = 0; i < 3; i++) + matChannels[i] = src; + + if (dcn == 4) + { + AscendMat& alpha = matChannels[3]; + alpha.create(src.rows, src.cols, CV_MAKE_TYPE(src.depth(), 1)); + matAlphaSet(alpha, -1, stream); + } + + merge(matChannels, dcn, dst, stream); +} + +inline void cvtGraytoBGR(const InputArray& _src, OutputArray& _dst, int dcn, bool, + AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + cvtGraytoBGR(src, dst, dcn, false, stream); + dst.download(_dst, stream); +} + +static const float RGB2XYZ_D65[] = {0.412453, 0.357580, 0.180423, 0.212671, 0.715160, + 0.072169, 0.019334, 0.119193, 0.950227}; + +static const float XYZ2RGB_D65[] = {3.240479, -1.53715, -0.498535, -0.969256, 1.875991, + 0.041556, 0.055648, -0.204043, 1.057311}; + +inline void matMulRGB(const AscendMat& src, AscendMat& dst, float* matrix, AscendStream& stream) +{ + checkImg(src); + CV_Assert(src.channels() == 3); + + dst.create(src.rows, src.cols, src.type()); + AscendMat formatedSrc = convertTo(src, CV_32F, stream); + AscendMat formatedDst = convertTo(dst, CV_32F, stream); + + int64_t dims[] = {3, 3}; + OperatorRunner runner; + runner.setOp("BatchMatMulV2") + .addInput(formatedSrc, "x1") + .addInput(matrix, dims, 2, getACLType(CV_32F), "x2") + .addOutput(formatedDst, "y") + .addAttr(false, "adj_x1") + .addAttr(true, "adj_x2") + .run(stream); + + if (src.depth() != CV_32F) + { + AscendMat 
thresholdTempMat(formatedSrc.size(), formatedSrc.type()); + uint16_t thresh = (src.depth() == CV_8U ? (1 << 8) : (1 << 16)) - 1; + threshold(formatedDst, thresholdTempMat, thresh, 0, 2 /*THRESH_TRUNC*/, stream); + threshold(thresholdTempMat, formatedDst, 0, 0, 3 /*THRESH_TOZERO*/, stream); + } + + convertBack(formatedDst, dst, stream); +} + +// TODO: should deal with overflow. set 255 instead of cut off. +inline void cvtBGRtoXYZ(const AscendMat& src, AscendMat& dst, int, bool swapBlue, + AscendStream& stream) +{ + float coeffs[9]; + memcpy(coeffs, RGB2XYZ_D65, 9 * sizeof(float)); + if (!swapBlue) + { + std::swap(coeffs[0], coeffs[2]); + std::swap(coeffs[3], coeffs[5]); + std::swap(coeffs[6], coeffs[8]); + } + matMulRGB(src, dst, coeffs, stream); +} + +inline void cvtBGRtoXYZ(const InputArray& _src, OutputArray& _dst, int, bool swapBlue, + AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + cvtBGRtoXYZ(src, dst, 0, swapBlue, stream); + dst.download(_dst, stream); +} + +inline void cvtXYZtoBGR(const AscendMat& src, AscendMat& dst, int dcn, bool swapBlue, + AscendStream& stream) +{ + float coeffs[9]; + memcpy(coeffs, XYZ2RGB_D65, 9 * sizeof(float)); + if (!swapBlue) + { + std::swap(coeffs[0], coeffs[6]); + std::swap(coeffs[1], coeffs[7]); + std::swap(coeffs[2], coeffs[8]); + } + + if (dcn == 4) + { + AscendMat tempMat[2]; + matMulRGB(src, tempMat[0], coeffs, stream); + tempMat[1].create(tempMat[0].rows, tempMat[0].cols, CV_MAKE_TYPE(tempMat[0].depth(), 1)); + matAlphaSet(tempMat[1], -1, stream); + merge(tempMat, 2, dst, stream); + } + else + matMulRGB(src, dst, coeffs, stream); +} + +inline void cvtXYZtoBGR(const InputArray& _src, OutputArray& _dst, int dcn, bool swapBlue, + AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + cvtXYZtoBGR(src, dst, dcn, swapBlue, stream); + dst.download(_dst, stream); +} + +// TODO duplicated code +static const float YCRF = 0.713f; +static const float YCBF = 0.564f; +static const float 
R2VF = 0.877f; +static const float B2UF = 0.492f; +inline void cvtBGRtoYCrCb(const AscendMat& src, AscendMat& dst, float* coeffs, bool swapBlue, + bool yuvOrder, AscendStream& stream) +{ + checkImg(src); + CV_Assert(src.channels() == 3); + + int buleIdx = swapBlue ? 2 : 0; + int depth = src.depth(); + float delta = (depth == CV_8U) ? 128 : ((depth == CV_16U) ? 32768 : 0.5); + + dst.create(src.rows, src.cols, src.type()); + AscendMat formatedSrc = convertTo(src, CV_32F, stream); + AscendMat formatedDst = convertTo(dst, CV_32F, stream); + + AscendMat YCrCb[3], RGB[3]; + split(formatedSrc, RGB, stream); + cvtBGRtoGray(formatedSrc, YCrCb[0], 1, swapBlue, stream); + YCrCb[1].create(YCrCb[0].rows, YCrCb[0].cols, YCrCb[0].type()); + YCrCb[2].create(YCrCb[0].rows, YCrCb[0].cols, YCrCb[0].type()); + + AscendMat tempMat1(formatedSrc.size(), CV_MAKE_TYPE(formatedSrc.depth(), 1)), + tempMat2(formatedSrc.size(), CV_MAKE_TYPE(formatedSrc.depth(), 1)); + + arithm_op(RGB[buleIdx ^ 2], YCrCb[0], tempMat1, "Sub", stream); + arithm_op(tempMat1, coeffs[0], tempMat2, "Muls", stream); + arithm_op(tempMat2, delta, YCrCb[1], "Adds", stream); + + arithm_op(RGB[buleIdx], YCrCb[0], tempMat1, "Sub", stream); + arithm_op(tempMat1, coeffs[1], tempMat2, "Muls", stream); + arithm_op(tempMat2, delta, YCrCb[2], "Adds", stream); + + if (yuvOrder) + std::swap(YCrCb[1], YCrCb[2]); + + merge(YCrCb, 3, formatedDst, stream); + if (src.depth() != CV_32F) + { + AscendMat thresholdTempMat(formatedSrc.size(), formatedSrc.type()); + uint16_t thresh = (src.depth() == CV_8U ? 
(1 << 8) : (1 << 16)) - 1; + threshold(formatedDst, thresholdTempMat, thresh, 0, 2 /*THRESH_TRUNC*/, stream); + threshold(thresholdTempMat, formatedDst, 0, 0, 3 /*THRESH_TOZERO*/, stream); + } + + convertBack(formatedDst, dst, stream); +} + +inline void cvtBGRtoYCrCb(const InputArray& _src, OutputArray& _dst, float* coeffs, bool swapBlue, + bool yuvOrder, AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + cvtBGRtoYCrCb(src, dst, coeffs, swapBlue, yuvOrder, stream); + dst.download(_dst, stream); +} + +static const float CR2RF = 1.403f; +static const float CR2GF = -0.714f; +static const float CB2GF = -0.344f; +static const float CB2BF = 1.773f; + +static const float V2RF = 1.140f; +static const float V2GF = -0.581f; +static const float U2GF = -0.395f; +static const float U2BF = 2.032f; + +inline void cvtYCrCbtoBGR(const AscendMat& src, AscendMat& dst, int dcn, float* coeffs, + bool swapBlue, bool yuvOrder, AscendStream& stream) +{ + checkImg(src); + CV_Assert(src.channels() == 3); + + int buleIdx = swapBlue ? 2 : 0; + int depth = src.depth(); + float delta = (depth == CV_8U) ? 128 : ((depth == CV_16U) ? 
32768 : 0.5); + + dst.create(src.rows, src.cols, CV_MAKE_TYPE(src.depth(), dcn)); + AscendMat formatedSrc = convertTo(src, CV_32F, stream); + AscendMat formatedDst = convertTo(dst, CV_32F, stream); + + AscendMat YCrCb[3], RGB[4]; + split(formatedSrc, YCrCb, stream); + if (yuvOrder) + std::swap(YCrCb[1], YCrCb[2]); + + RGB[0].create(formatedSrc.rows, formatedSrc.cols, CV_MAKE_TYPE(formatedSrc.depth(), 1)); + RGB[1].create(formatedSrc.rows, formatedSrc.cols, CV_MAKE_TYPE(formatedSrc.depth(), 1)); + RGB[2].create(formatedSrc.rows, formatedSrc.cols, CV_MAKE_TYPE(formatedSrc.depth(), 1)); + AscendMat tempMat1(formatedSrc.size(), CV_MAKE_TYPE(formatedSrc.depth(), 1)), + tempMat2(formatedSrc.size(), CV_MAKE_TYPE(formatedSrc.depth(), 1)), + CbSubDelta(formatedSrc.size(), CV_MAKE_TYPE(formatedSrc.depth(), 1)), + CrSubDelta(formatedSrc.size(), CV_MAKE_TYPE(formatedSrc.depth(), 1)); + + arithm_op(YCrCb[1], (0.0f - delta), CrSubDelta, "Adds", stream); + arithm_op(YCrCb[2], (0.0f - delta), CbSubDelta, "Adds", stream); + arithm_op(CrSubDelta, coeffs[0], tempMat1, "Muls", stream); + arithm_op(YCrCb[0], tempMat1, RGB[buleIdx ^ 2], "Add", stream); + + arithm_op(CrSubDelta, coeffs[1], tempMat1, "Muls", stream); + arithm_op(YCrCb[0], tempMat1, tempMat2, "Add", stream); + arithm_op(CbSubDelta, coeffs[2], tempMat1, "Muls", stream); + arithm_op(tempMat2, tempMat1, RGB[1], "Add", stream); + + arithm_op(CbSubDelta, coeffs[3], tempMat1, "Muls", stream); + arithm_op(YCrCb[0], tempMat1, RGB[buleIdx], "Add", stream); + + if (dcn == 4) + { + RGB[3].create(RGB[0].rows, RGB[0].cols, RGB[0].type()); + matAlphaSet(RGB[3], src.depth(), stream); + } + + merge(RGB, dcn, formatedDst, stream); + if (src.depth() != CV_32F) + { + AscendMat thresholdTempMat(formatedSrc.size(), CV_MAKE_TYPE(formatedSrc.depth(), dcn)); + uint16_t thresh = (src.depth() == CV_8U ? 
(1 << 8) : (1 << 16)) - 1; + threshold(formatedDst, thresholdTempMat, thresh, 0, 2 /*THRESH_TRUNC*/, stream); + threshold(thresholdTempMat, formatedDst, 0, 0, 3 /*THRESH_TOZERO*/, stream); + } + + convertBack(formatedDst, dst, stream); +} + +inline void cvtYCrCbtoBGR(const InputArray& _src, OutputArray& _dst, int dcn, float* coeffs, + bool swapBlue, bool yuvOrder, AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + cvtYCrCbtoBGR(src, dst, dcn, coeffs, swapBlue, yuvOrder, stream); + dst.download(_dst, stream); +} + +// The input may be Input/OutputArray or AscendMat. Use templates to reduce duplicate code. +template +inline void BGR2BGRA(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoBGR(src, dst, 4, false, stream); +} + +template +inline void BGRA2BGR(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoBGR(src, dst, 3, false, stream); +} + +template +inline void BGR2RGBA(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoBGR(src, dst, 4, true, stream); +} + +template +inline void RGBA2BGR(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoBGR(src, dst, 3, true, stream); +} + +template +inline void BGR2RGB(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoBGR(src, dst, 3, true, stream); +} + +template +inline void BGRA2RGBA(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoBGR(src, dst, 4, true, stream); +} + +template +inline void BGR2GRAY(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoGray(src, dst, 1, false, stream); +} + +template +inline void RGB2GRAY(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoGray(src, dst, 1, true, stream); +} + +template +inline void GRAY2BGR(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtGraytoBGR(src, dst, 3, false, stream); +} + +template +inline void GRAY2BGRA(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtGraytoBGR(src, dst, 4, false, stream); +} + 
+template +inline void BGRA2GRAY(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoGray(src, dst, 1, false, stream); +} + +template +inline void RGBA2GRAY(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoGray(src, dst, 1, true, stream); +} + +template +inline void BGR2XYZ(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoXYZ(src, dst, 3, false, stream); +} + +template +inline void RGB2XYZ(const SRC& src, DST& dst, int, AscendStream& stream) +{ + cvtBGRtoXYZ(src, dst, 3, true, stream); +} + +template +inline void XYZ2BGR(const SRC& src, DST& dst, int dcn, AscendStream& stream) +{ + if (dcn <= 0) + dcn = 3; + cvtXYZtoBGR(src, dst, dcn, false, stream); +} + +template +inline void XYZ2RGB(const SRC& src, DST& dst, int dcn, AscendStream& stream) +{ + if (dcn <= 0) + dcn = 3; + cvtXYZtoBGR(src, dst, dcn, true, stream); +} + +template +inline void BGR2YCrCb(const SRC& src, DST& dst, int, AscendStream& stream) +{ + float coeffs[2]; + coeffs[0] = YCRF; + coeffs[1] = YCBF; + cvtBGRtoYCrCb(src, dst, coeffs, false, false, stream); +} + +template +inline void RGB2YCrCb(const SRC& src, DST& dst, int, AscendStream& stream) +{ + float coeffs[2]; + coeffs[0] = YCRF; + coeffs[1] = YCBF; + cvtBGRtoYCrCb(src, dst, coeffs, true, false, stream); +} + +template +inline void YCrCb2BGR(const SRC& src, DST& dst, int dcn, AscendStream& stream) +{ + float coeffs[4]; + coeffs[0] = CR2RF; + coeffs[1] = CR2GF; + coeffs[2] = CB2GF; + coeffs[3] = CB2BF; + if (dcn <= 0) + dcn = 3; + cvtYCrCbtoBGR(src, dst, dcn, coeffs, false, false, stream); +} + +template +inline void YCrCb2RGB(const SRC& src, DST& dst, int dcn, AscendStream& stream) +{ + float coeffs[4]; + coeffs[0] = CR2RF; + coeffs[1] = CR2GF; + coeffs[2] = CB2GF; + coeffs[3] = CB2BF; + if (dcn <= 0) + dcn = 3; + cvtYCrCbtoBGR(src, dst, dcn, coeffs, true, false, stream); +} + +template +inline void BGR2YUV(const SRC& src, DST& dst, int, AscendStream& stream) +{ + float coeffs[2]; + coeffs[0] = 
R2VF; + coeffs[1] = B2UF; + cvtBGRtoYCrCb(src, dst, coeffs, false, true, stream); +} + +template +inline void RGB2YUV(const SRC& src, DST& dst, int, AscendStream& stream) +{ + float coeffs[2]; + coeffs[0] = R2VF; + coeffs[1] = B2UF; + cvtBGRtoYCrCb(src, dst, coeffs, true, true, stream); +} + +template +inline void YUV2BGR(const SRC& src, DST& dst, int dcn, AscendStream& stream) +{ + float coeffs[4]; + coeffs[0] = V2RF; + coeffs[1] = V2GF; + coeffs[2] = U2GF; + coeffs[3] = U2BF; + if (dcn <= 0) + dcn = 3; + cvtYCrCbtoBGR(src, dst, dcn, coeffs, false, true, stream); +} + +template +inline void YUV2RGB(const SRC& src, DST& dst, int dcn, AscendStream& stream) +{ + float coeffs[4]; + coeffs[0] = V2RF; + coeffs[1] = V2GF; + coeffs[2] = U2GF; + coeffs[3] = U2BF; + if (dcn <= 0) + dcn = 3; + cvtYCrCbtoBGR(src, dst, dcn, coeffs, true, true, stream); +} + +template +void cvtColorDo(const SRC& src, DST& dst, int code, int dcn, AscendStream& stream) +{ + typedef void (*func_t)(const SRC& src, DST& dst, int dcn, AscendStream& stream); + static const func_t funcs[] = { + BGR2BGRA, // CV_BGR2BGRA =0 + BGRA2BGR, // CV_BGRA2BGR =1 + BGR2RGBA, // CV_BGR2RGBA =2 + RGBA2BGR, // CV_RGBA2BGR =3 + BGR2RGB, // CV_BGR2RGB =4 + BGRA2RGBA, // CV_BGRA2RGBA =5 + + BGR2GRAY, // CV_BGR2GRAY =6 + RGB2GRAY, // CV_RGB2GRAY =7 + GRAY2BGR, // CV_GRAY2BGR =8 + GRAY2BGRA, // CV_GRAY2BGRA =9 + BGRA2GRAY, // CV_BGRA2GRAY =10 + RGBA2GRAY, // CV_RGBA2GRAY =11 + + 0, // CV_BGR2BGR565 =12 + 0, // CV_RGB2BGR565 =13 + 0, // CV_BGR5652BGR =14 + 0, // CV_BGR5652RGB =15 + 0, // CV_BGRA2BGR565 =16 + 0, // CV_RGBA2BGR565 =17 + 0, // CV_BGR5652BGRA =18 + 0, // CV_BGR5652RGBA =19 + + 0, // CV_GRAY2BGR565 =20 + 0, // CV_BGR5652GRAY =21 + + 0, // CV_BGR2BGR555 =22 + 0, // CV_RGB2BGR555 =23 + 0, // CV_BGR5552BGR =24 + 0, // CV_BGR5552RGB =25 + 0, // CV_BGRA2BGR555 =26 + 0, // CV_RGBA2BGR555 =27 + 0, // CV_BGR5552BGRA =28 + 0, // CV_BGR5552RGBA =29 + + 0, // CV_GRAY2BGR555 =30 + 0, // CV_BGR5552GRAY =31 + + BGR2XYZ, // 
CV_BGR2XYZ =32 + RGB2XYZ, // CV_RGB2XYZ =33 + XYZ2BGR, // CV_XYZ2BGR =34 + XYZ2RGB, // CV_XYZ2RGB =35 + + BGR2YCrCb, // CV_BGR2YCrCb =36 + RGB2YCrCb, // CV_RGB2YCrCb =37 + YCrCb2BGR, // CV_YCrCb2BGR =38 + YCrCb2RGB, // CV_YCrCb2RGB =39 + + 0, // CV_BGR2HSV =40 + 0, // CV_RGB2HSV =41 + + 0, // =42 + 0, // =43 + + 0, // CV_BGR2Lab =44 + 0, // CV_RGB2Lab =45 + + 0, // CV_BayerBG2BGR =46 + 0, // CV_BayeRGB2BGR =47 + 0, // CV_BayerRG2BGR =48 + 0, // CV_BayerGR2BGR =49 + + 0, // CV_BGR2Luv =50 + 0, // CV_RGB2Luv =51 + + 0, // CV_BGR2HLS =52 + 0, // CV_RGB2HLS =53 + + 0, // CV_HSV2BGR =54 + 0, // CV_HSV2RGB =55 + + 0, // CV_Lab2BGR =56 + 0, // CV_Lab2RGB =57 + 0, // CV_Luv2BGR =58 + 0, // CV_Luv2RGB =59 + + 0, // CV_HLS2BGR =60 + 0, // CV_HLS2RGB =61 + + 0, // CV_BayerBG2BGR_VNG =62 + 0, // CV_BayeRGB2BGR_VNG =63 + 0, // CV_BayerRG2BGR_VNG =64 + 0, // CV_BayerGR2BGR_VNG =65 + + 0, // CV_BGR2HSV_FULL = 66 + 0, // CV_RGB2HSV_FULL = 67 + 0, // CV_BGR2HLS_FULL = 68 + 0, // CV_RGB2HLS_FULL = 69 + + 0, // CV_HSV2BGR_FULL = 70 + 0, // CV_HSV2RGB_FULL = 71 + 0, // CV_HLS2BGR_FULL = 72 + 0, // CV_HLS2RGB_FULL = 73 + + 0, // CV_LBGR2Lab = 74 + 0, // CV_LRGB2Lab = 75 + 0, // CV_LBGR2Luv = 76 + 0, // CV_LRGB2Luv = 77 + + 0, // CV_Lab2LBGR = 78 + 0, // CV_Lab2LRGB = 79 + 0, // CV_Luv2LBGR = 80 + 0, // CV_Luv2LRGB = 81 + + BGR2YUV, // CV_BGR2YUV = 82 + RGB2YUV, // CV_RGB2YUV = 83 + YUV2BGR, // CV_YUV2BGR = 84 + YUV2RGB, // CV_YUV2RGB = 85 + + 0, // CV_BayerBG2GRAY = 86 + 0, // CV_BayeRGB2GRAY = 87 + 0, // CV_BayerRG2GRAY = 88 + 0, // CV_BayerGR2GRAY = 89 + + // YUV 4:2:0 formats family + 0, // CV_YUV2RGB_NV12 = 90, + 0, // CV_YUV2BGR_NV12 = 91, + 0, // CV_YUV2RGB_NV21 = 92, + 0, // CV_YUV2BGR_NV21 = 93, + + 0, // CV_YUV2RGBA_NV12 = 94, + 0, // CV_YUV2BGRA_NV12 = 95, + 0, // CV_YUV2RGBA_NV21 = 96, + 0, // CV_YUV2BGRA_NV21 = 97, + + 0, // CV_YUV2RGB_YV12 = 98, + 0, // CV_YUV2BGR_YV12 = 99, + 0, // CV_YUV2RGB_IYUV = 100, + 0, // CV_YUV2BGR_IYUV = 101, + + 0, // CV_YUV2RGBA_YV12 = 102, + 
0, // CV_YUV2BGRA_YV12 = 103, + 0, // CV_YUV2RGBA_IYUV = 104, + 0, // CV_YUV2BGRA_IYUV = 105, + + 0, // CV_YUV2GRAY_420 = 106, + + // YUV 4:2:2 formats family + 0, // CV_YUV2RGB_UYVY = 107, + 0, // CV_YUV2BGR_UYVY = 108, + 0, // //CV_YUV2RGB_VYUY = 109, + 0, // //CV_YUV2BGR_VYUY = 110, + + 0, // CV_YUV2RGBA_UYVY = 111, + 0, // CV_YUV2BGRA_UYVY = 112, + 0, // //CV_YUV2RGBA_VYUY = 113, + 0, // //CV_YUV2BGRA_VYUY = 114, + + 0, // CV_YUV2RGB_YUY2 = 115, + 0, // CV_YUV2BGR_YUY2 = 116, + 0, // CV_YUV2RGB_YVYU = 117, + 0, // CV_YUV2BGR_YVYU = 118, + + 0, // CV_YUV2RGBA_YUY2 = 119, + 0, // CV_YUV2BGRA_YUY2 = 120, + 0, // CV_YUV2RGBA_YVYU = 121, + 0, // CV_YUV2BGRA_YVYU = 122, + + 0, // CV_YUV2GRAY_UYVY = 123, + 0, // CV_YUV2GRAY_YUY2 = 124, + + // alpha premultiplication + 0, // CV_RGBA2mRGBA = 125, + 0, // CV_mRGBA2RGBA = 126, + + 0, // CV_COLORCVT_MAX = 127 + }; + + CV_Assert(code < 128); + + func_t func = funcs[code]; + + if (func == 0) + CV_Error(Error::StsBadFlag, "Unknown/unsupported color conversion code"); + + func(src, dst, dcn, stream); +} + +// Instantiate templates to avoid confusion in python code generation +void cvtColor(const InputArray src, OutputArray dst, int code, int dcn, AscendStream& stream) +{ + cvtColorDo(src, dst, code, dcn, stream); +} + +void cvtColor(const AscendMat& src, AscendMat& dst, int code, int dcn, AscendStream& stream) +{ + cvtColorDo(src, dst, code, dcn, stream); +} + +} // namespace cann +} // namespace cv \ No newline at end of file diff --git a/modules/cannops/src/core.cpp b/modules/cannops/src/core.cpp new file mode 100644 index 00000000000..7d328915ef9 --- /dev/null +++ b/modules/cannops/src/core.cpp @@ -0,0 +1,310 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "precomp.hpp" + +namespace cv +{ +namespace cann +{ +// Transform data type from one to another. eg. 
from NCHW to NHWC. +void transData(const AscendMat& src, AscendMat& dst, const char* from, const char* to, + AscendStream& stream) +{ + OperatorRunner runner; + runner.setOp("TransData") + .addInput(src, "src") + .addOutput(dst, "dst") + .addAttr(from, "src_format") + .addAttr(to, "dst_format") + .run(stream); +} + +void merge(const AscendMat* src, size_t n, AscendMat& dst, AscendStream& stream) +{ + if (src == nullptr || n < 2) + return; + + int depth = src->depth(); + int rows = src->rows; + int cols = src->cols; + + // All matrix must have same size and type + for (size_t i = 1; i < n; i++) + { + CV_Assert(src[i].depth() == depth && src[i].channels() == 1); + CV_Assert(src[i].rows == rows && src[i].cols == cols); + } + + int cns = 0; + for (size_t i = 0; i < n; i++) + cns += src[i].channels(); + dst.create(src->rows, src->cols, CV_MAKE_TYPE(src->depth(), cns)); + + OperatorRunner runner; + runner.setOp("ConcatD"); + + for (size_t i = 0; i < n; i++) + { + runner.addInput(src[i], ("x" + std::to_string(i)).c_str()); + } + + runner.addOutput(dst, "output_data").addAttr(3, "concat_dim").run(stream); +} + +void merge(const std::vector& src, AscendMat& dst, AscendStream& stream) +{ + merge(&src[0], src.size(), dst, stream); +} + +void merge(const AscendMat* src, size_t n, OutputArray& _dst, AscendStream& stream) +{ + AscendMat dst; + merge(src, n, dst, stream); + dst.download(_dst, stream); +} +void merge(const std::vector& src, OutputArray& dst, AscendStream& stream) +{ + merge(&src[0], src.size(), dst, stream); +} + +void split(const AscendMat& src, AscendMat* dst, AscendStream& stream) +{ + if (src.empty() || dst == nullptr) + return; + + int cn = src.channels(); + + OperatorRunner runner; + runner.setOp("SplitD").addInput(src, "x"); + for (int i = 0; i < cn; i++) + { + dst[i].create(src.rows, src.cols, CV_MAKE_TYPE(src.depth(), 1)); + runner.addOutput(dst[i], ("y" + std::to_string(i)).c_str()); + } + runner.addAttr(3, "split_dim").addAttr(cn, 
"num_split").run(stream); +} + +void split(const AscendMat& src, std::vector& dst, AscendStream& stream) +{ + dst.resize(src.channels()); + split(src, &dst[0], stream); +} + +void split(const InputArray _src, AscendMat* dst, AscendStream& stream) +{ + AscendMat src; + src.upload(_src, stream); + split(src, dst, stream); +} +void split(const InputArray _src, std::vector& dst, AscendStream& stream) +{ + AscendMat src; + src.upload(_src, stream); + dst.resize(src.channels()); + split(_src, &dst[0], stream); +} + +void transpose(const AscendMat& src, int64_t* perm, AscendMat& dst, AscendStream& stream) +{ + OperatorRunner runner; + runner.setOp("TransposeD") + .addInput(src, "x") + .addOutput(dst, "y") + .addAttr(perm, 4, "perm") + .run(stream); +} + +void transpose(const AscendMat& src, AscendMat& dst, AscendStream& stream) +{ + int64_t perm[] = {0, 2, 1, 3}; + dst.create(src.cols, src.rows, src.type()); + transpose(src, perm, dst, stream); +} + +void transpose(InputArray _src, OutputArray _dst, AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + transpose(src, dst, stream); + dst.download(_dst, stream); +} + +void flip(const AscendMat& src, std::vector& asixs, AscendMat& dst, AscendStream& stream) +{ + int64_t dim = asixs.size(); + OperatorRunner runner; + runner.setOp("ReverseV2") + .addInput(src, "x") + .addInput(&asixs.at(0), &dim, 1, ACL_INT32, "axis") + .addOutput(dst, "y") + .run(stream); +} + +void flip(const AscendMat& src, AscendMat& dst, int flipCode, AscendStream& stream) +{ + std::vector asix; + if (flipCode == 0) + asix.push_back(1); + else if (flipCode > 0) + asix.push_back(2); + else + { + asix.push_back(1); + asix.push_back(2); + } + dst.create(src.rows, src.cols, src.type()); + flip(src, asix, dst, stream); +} + +void flip(const InputArray _src, OutputArray _dst, int flipCode, AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + flip(src, dst, flipCode, stream); + dst.download(_dst, stream); +} + +void 
rotate(const AscendMat& src, AscendMat& dst, int rotateMode, AscendStream& stream) +{ + AscendMat tempMat; + switch (rotateMode) + { + case ROTATE_90_CLOCKWISE: + { + dst.create(src.cols, src.rows, src.type()); + transpose(src, tempMat, stream); + flip(tempMat, dst, 1, stream); + break; + } + case ROTATE_180: + { + dst.create(src.rows, src.cols, src.type()); + flip(src, dst, -1, stream); + break; + } + case ROTATE_90_COUNTERCLOCKWISE: + { + dst.create(src.cols, src.rows, src.type()); + transpose(src, tempMat, stream); + flip(tempMat, dst, 0, stream); + break; + } + default: + break; + } +} + +void rotate(InputArray _src, OutputArray _dst, int rotateMode, AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + rotate(src, dst, rotateMode, stream); + dst.download(_dst, stream); +} + +void crop(const AscendMat& src, AscendMat& dst, const AscendMat& sizeSrcNpu, int64_t* offset, + AscendStream& stream) +{ + OperatorRunner runner; + runner.setOp("Crop") + .addInput(src, "x") + .addInput(sizeSrcNpu, "size") + .addAttr(1, "axis") + .addAttr(offset, 3, "offsets") + .addOutput(dst, "y") + .run(stream); +} + +AscendMat crop(const AscendMat& src, const Rect& rect, AscendStream& stream) +{ + AscendMat dst, sizeSrcNpu; + // left-up conner + int x = rect.x, y = rect.y, width = rect.width, height = rect.height; + int64_t offset[] = {y, x, 0}; + + CV_Assert(x + width <= src.cols && y + height <= src.rows); + int size1[] = {1, src.channels(), height, width}; + dst.create(height, width, src.type()); + + Mat sizeSrc(height, width, src.type(), size1); + sizeSrcNpu.upload(sizeSrc); + crop(src, dst, sizeSrcNpu, offset, stream); + + return dst; +} +AscendMat crop(InputArray _src, const Rect& rect, AscendStream& stream) +{ + AscendMat src; + src.upload(_src, stream); + return crop(src, rect, stream); +} + +void resize(const AscendMat& src, AscendMat& dst, int32_t* dstSize, int interpolation, + AscendStream& stream) +{ + OperatorRunner runner; + int64_t dims[] = {2}; + 
char const* mode = ""; + switch (interpolation) + { + case INTER_CUBIC: + mode = "ResizeBicubic"; + break; + case INTER_AREA: + mode = "ResizeArea"; + break; + default: + break; + } + + runner.setOp(mode) + .addInput(src, "images") + .addInput(dstSize, dims, 1, ACL_INT32, "size") + .addAttr(true, "half_pixel_centers") + .addOutput(dst, "y") + .run(stream); +} + +void resize(const AscendMat& src, AscendMat& dst, Size dsize, double inv_scale_x, + double inv_scale_y, int interpolation, AscendStream& stream) +{ + Size ssize = src.size(); + CV_Assert(!ssize.empty()); + float_t scaleX = (float_t)inv_scale_x; + float_t scaleY = (float_t)inv_scale_y; + CV_Assert(interpolation == INTER_CUBIC || interpolation == INTER_AREA); + + if (dsize.empty()) + { + CV_Assert(scaleX > 0); + CV_Assert(scaleY > 0); + dsize = Size(saturate_cast(ssize.width * inv_scale_x), + saturate_cast(ssize.height * inv_scale_y)); + CV_Assert(!dsize.empty()); + } + else + { + scaleX = (float_t)dsize.width / ssize.width; + scaleY = (float_t)dsize.height / ssize.height; + CV_Assert(scaleX > 0); + CV_Assert(scaleY > 0); + } + + int32_t dstSize[] = {dsize.width, dsize.height}; + dst.create(dstSize[0], dstSize[1], src.type()); + resize(src, dst, dstSize, interpolation, stream); +} + +void resize(InputArray _src, OutputArray _dst, Size dsize, double inv_scale_x, double inv_scale_y, + int interpolation, AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + resize(src, dst, dsize, inv_scale_x, inv_scale_y, interpolation, stream); + dst.download(_dst, stream); +} + +} // namespace cann +} // namespace cv diff --git a/modules/cannops/src/element_operations.cpp b/modules/cannops/src/element_operations.cpp new file mode 100644 index 00000000000..402658369b5 --- /dev/null +++ b/modules/cannops/src/element_operations.cpp @@ -0,0 +1,499 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "precomp.hpp" +namespace cv +{ +namespace cann +{ + +static inline void applyMask(const AscendMat& src, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) +{ + int mtype = mask.type(); + CV_Assert((mtype == CV_8UC1 || mtype == CV_8SC1) && mask.size() == src.size()); + AscendMat onesMask, castedMask; + onesMask.create(mask.rows, mask.cols, mask.type()); + + OperatorRunner runner; + runner.setOp("Div") + .addInput(mask, "x1") + .addInput(mask, "x2") + .addOutput(onesMask, "y") + .run(stream); + + onesMask.convertTo(castedMask, dst.depth(), stream); + arithm_op(src, castedMask, dst, "Mul", stream); +} + +static inline void applyScale(const AscendMat& src, AscendMat& dst, float scale, + AscendStream& stream) +{ + OperatorRunner runner; + arithm_op(src, scale, dst, "Muls", stream); +} + +void arithm_op(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const char* op, + AscendStream& stream) +{ + if (src2.empty()) + arithm_op(src1, dst, op, stream); + else + { + OperatorRunner runner; + runner.setOp(op).addInput(src1, "x1").addInput(src2, "x2").addOutput(dst, "y").run(stream); + } +} + +void arithm_op(const AscendMat& src, const Scalar& sc, AscendMat& dst, const char* op, + AscendStream& stream) +{ + OperatorRunner runner; + runner.setOp(op) + .addInput(src, "x1") + .addInput(sc, src.type(), "x2") + .addOutput(dst, "y") + .run(stream); +} + +void arithm_op(const Scalar& sc, const AscendMat& src, AscendMat& dst, const char* op, + AscendStream& stream) +{ + OperatorRunner runner; + runner.setOp(op) + .addInput(sc, src.type(), "x1") + .addInput(src, "x2") + .addOutput(dst, "y") + .run(stream); +} + +void arithm_op(const AscendMat& src, AscendMat& dst, const char* op, AscendStream& stream) +{ + OperatorRunner runner; + runner.setOp(op).addInput(src, "x").addOutput(dst, "y").run(stream); 
+} + +void arithm_op(const AscendMat& src, float scalar, AscendMat& dst, const char* op, + AscendStream& stream) +{ + OperatorRunner runner; + runner.setOp(op).addInput(src, "x").addAttr(scalar, "value").addOutput(dst, "y").run(stream); +} + +// Helper function for template arithm_op. all function called in template arithm_op should be +// done in both AscendMat and Scalar. +static void getInputInfo(const AscendMat& src, int& depth, int& cn, Size& size) +{ + depth = src.depth(); + cn = src.channels(); + size = src.size(); +} + +static void getInputInfo(const Scalar& src, int& depth, int& cn, Size& size) +{ + CV_UNUSED(src); + depth = -1; + cn = -1; + size = {-1, -1}; +} + +static void convert(const AscendMat& src, AscendMat& dst, AscendStream& stream) +{ + src.convertTo(dst, CV_32F, stream); +} + +static void convert(const Scalar& src, Scalar& dst, AscendStream& stream) +{ + CV_UNUSED(stream); + dst = src; +} + +template +static void arithm_op(const T1& src1, const T2& src2, AscendMat& dst, const AscendMat& mask, float scale, + int dtype, const char* op, AscendStream& stream) +{ + T1 castedSrc1; + T2 castedSrc2; + AscendMat castedRet; + + int sdepth1, sdepth2, scn1, scn2; + Size ssize1, ssize2; + getInputInfo(src1, sdepth1, scn1, ssize1); + getInputInfo(src2, sdepth2, scn2, ssize2); + + int sdepth = sdepth1 == -1 ? sdepth2 : sdepth1; + int cn = scn1 == -1 ? scn2 : scn1; + Size size = sdepth1 == -1 ? ssize2 : ssize1; + + if (sdepth1 != -1 && sdepth2 != -1 && !ssize1.empty() && !ssize2.empty()) + CV_Assert(sdepth1 == sdepth2 && scn1 == scn2 && ssize1 == ssize2); + + if (dtype < 0) + dtype = sdepth; + const int ddepth = CV_MAT_DEPTH(dtype); + CV_Assert(sdepth <= CV_16F && ddepth <= CV_16F); + + dst.create(size.height, size.width, CV_MAKE_TYPE(ddepth, cn)); + + // In order to achieve high accuracy, convert integers to float for calculation. 
+ if (scale != 1 && dtype < CV_32F) + { + convert(src1, castedSrc1, stream); + convert(src2, castedSrc2, stream); + castedRet.create(size.height, size.width, CV_MAKE_TYPE(CV_32F, cn)); + } + else + { + castedSrc1 = src1; + castedSrc2 = src2; + castedRet = dst; + } + + // step1, calculate operator. + OperatorRunner runner; + arithm_op(castedSrc1, castedSrc2, castedRet, op, stream); + + // step2, apply mask if need. + if (!mask.empty()) + applyMask(castedRet, castedRet, mask, stream); + + // step3, apply scale if need. + if (scale != 1) + applyScale(castedRet, castedRet, scale, stream); + + // After rounding the result, convert the type to the original type. + if (castedRet.depth() != dst.depth()) + { + runner.setOp("Round").addInput(castedRet, "x").addOutput(castedRet, "y").run(stream); + castedRet.convertTo(dst, stream); + } +} + +static void arithm_op(const InputArray _src1, const InputArray _src2, OutputArray _dst, const InputArray _mask, + float scale, int dtype, const char* op, AscendStream& stream) +{ + const bool isScalar1 = (_src1.kind() == _InputArray::MATX); + const bool isScalar2 = (_src2.kind() == _InputArray::MATX); + + if (isScalar1 && isScalar2) + CV_Error(Error::StsBadArg, "At list one matrix parameter shoule be passwd."); + + AscendMat src1, src2, dst, mask; + Mat scalar; + + if (!isScalar1 && !_src1.empty()) + src1.upload(_src1, stream); + if (!isScalar2 && !_src2.empty()) + src2.upload(_src2, stream); + + if (!_mask.empty()) + mask.upload(_mask, stream); + + Scalar val; + if (isScalar1) + scalar = _src1.getMat(); + else if (isScalar2) + scalar = _src2.getMat(); + + if (!scalar.empty()) + { + CV_Assert(scalar.total() <= 4); + scalar.convertTo(Mat_(scalar.rows, scalar.cols, &val[0]), CV_64F); + } + + if (isScalar1) + arithm_op(val, src2, dst, mask, scale, dtype, op, stream); + else if (isScalar2) + arithm_op(src1, val, dst, mask, scale, dtype, op, stream); + else + arithm_op(src1, src2, dst, mask, scale, dtype, op, stream); + + dst.download(_dst, 
stream); +} + +// In order to supply more interfaces, differnet function declaration shoule be done. +void add(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream); +} + +void add(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream); +} + +void add(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream); +} + +void add(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream); +} + + +void subtract(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream); +} + +void subtract(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream); +} + +void subtract(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream); +} + +void subtract(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream); +} + + +void multiply(const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, noArray(), scale, dtype, "Mul", stream); +} + +void multiply(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, float 
scale, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, AscendMat(), scale, dtype, "Mul", stream); +} + +void multiply(const AscendMat& src1, const Scalar& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, AscendMat(), scale, dtype, "Mul", stream); +} + +void multiply(const Scalar& src1, const AscendMat& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, AscendMat(), scale, dtype, "Mul", stream); +} + + +void divide(const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, noArray(), scale, dtype, "RealDiv", stream); +} + +void divide(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, AscendMat(), scale, dtype, "RealDiv", stream); +} + +void divide(const AscendMat& src1, const Scalar& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, AscendMat(), scale, dtype, "RealDiv", stream); +} + +void divide(const Scalar& src1, const AscendMat& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, AscendMat(), scale, dtype, "RealDiv", stream); +} + + +void bitwise_and(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseAnd", stream); +} + +void bitwise_and(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseAnd", stream); +} + +void bitwise_and(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseAnd", stream); +} + +void bitwise_and(const Scalar& src1, const 
AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseAnd", stream); +} + + +void bitwise_or(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseOr", stream); +} + +void bitwise_or(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseOr", stream); +} + +void bitwise_or(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseOr", stream); +} + +void bitwise_or(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseOr", stream); +} + + +void bitwise_xor(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseXor", stream); +} + +void bitwise_xor(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseXor", stream); +} + +void bitwise_xor(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseXor", stream); +} + +void bitwise_xor(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) +{ + arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseXor", stream); +} + + +void bitwise_not(const InputArray src, OutputArray dst, const InputArray mask, AscendStream& stream) +{ + arithm_op(src, noArray(), dst, mask, 1, -1, "Invert", stream); +} + +void bitwise_not(const AscendMat& src, AscendMat& dst, const AscendMat& mask, 
AscendStream& stream) +{ + arithm_op(src, AscendMat(), dst, mask, 1, -1, "Invert", stream); +} + + +void addWeighted(const AscendMat& src1, double alpha, const AscendMat& src2, double beta, double gamma, + AscendMat& dst, int dtype, AscendStream& stream) +{ + if (dtype < 0) + dtype = src1.depth(); + + CV_Assert(src2.depth() == src1.depth() && src2.size() == src1.size() && + src1.channels() == src2.channels()); + + int type = CV_MAKE_TYPE(dtype, src1.channels()); + dst.create(src1.rows, src1.cols, type); + + // TODO: Consider overflow, should extend type or not? + AscendMat src1Weighted(src1.size(), type), src2Weighted(src1.size(), type), + srcWeightedSumRet(src1.size(), type); + + arithm_op(src1, (float)alpha, src1Weighted, "Muls", stream); + arithm_op(src2, (float)beta, src2Weighted, "Muls", stream); + arithm_op(src1Weighted, src2Weighted, srcWeightedSumRet, "Add", stream); + arithm_op(srcWeightedSumRet, (float)gamma, dst, "Adds", stream); +} + +void addWeighted(const InputArray _src1, double alpha, const InputArray _src2, double beta, double gamma, + OutputArray _dst, int dtype, AscendStream& stream) +{ + AscendMat src1, src2, dst; + src1.upload(_src1, stream); + src2.upload(_src2, stream); + addWeighted(src1, alpha, src2, beta, gamma, dst, dtype, stream); + dst.download(_dst, stream); +} + +double threshold(const AscendMat& src, AscendMat& dst, double thresh, double maxval, int type, + AscendStream& stream) +{ + // ThresholdTypes is defined in opencv2/imgproc, This type is the only Symbol we need. + // Add imgproc to dependence is too heavy, use magic number instead. + CV_Assert(type <= 4 /*THRESH_TOZERO_INV*/); + + AscendMat threshMat(src.size(), src.type()); + + dst.create(src.rows, src.cols, src.type()); + + OperatorRunner runner; + runner.setOp("Threshold") + .addInput(src, "x") + .addOutput(threshMat, "y") + .addAttr((float)thresh, "threshold") + .run(stream); + + // THRESH_*_INV, THRESH_TRUNC need a inverse threshMat. 
+ // THRESH_BINARY_INV = 1, THRESH_TRUNC = 2, THRESH_TOZERO_INV = 4, + if (type == 1 || type == 2 || type == 4) + { + AscendMat threshInvMat(src.size(), src.type()); + AscendMat ones(src.size(), src.type()); + Scalar s(1, 1, 1, 1); + ones.setTo(s, stream); + arithm_op(ones, threshMat, threshInvMat, "Sub", stream); + + if (type == 1) + arithm_op(threshInvMat, (float)maxval, dst, "Muls", stream); + else if (type == 2) + { + AscendMat ToZeroInvMat(src.size(), src.type()); + AscendMat TruncMat(src.size(), src.type()); + arithm_op(threshInvMat, src, ToZeroInvMat, "Mul", stream); + arithm_op(threshMat, (float)thresh, TruncMat, "Muls", stream); + arithm_op(ToZeroInvMat, TruncMat, dst, "Add", stream); + } + else + arithm_op(threshInvMat, src, dst, "Mul", stream); + } + else + { + if (type == 0) /* THRESH_BINARY = 0 */ + arithm_op(threshMat, (float)maxval, dst, "Muls", stream); + else if (type == 3) /* THRESH_TOZERO = 3 */ + arithm_op(threshMat, src, dst, "Mul", stream); + else + CV_Error(Error::StsError, "Unknown/unsupported threshold type"); + } + return thresh; +} + +double threshold(const InputArray _src, OutputArray _dst, double thresh, double maxval, int type, + AscendStream& stream) +{ + AscendMat src, dst; + src.upload(_src, stream); + dst.create(src.rows, src.cols, src.type()); + double ret = threshold(src, dst, thresh, maxval, type, stream); + dst.download(_dst, stream); + return ret; +} + +} // namespace cann +} // namespace cv diff --git a/modules/cannops/src/precomp.hpp b/modules/cannops/src/precomp.hpp new file mode 100644 index 00000000000..8411cc40407 --- /dev/null +++ b/modules/cannops/src/precomp.hpp @@ -0,0 +1,14 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#ifndef __OPENCV_PRECOMP_H__ +#define __OPENCV_PRECOMP_H__ + +#include "opencv2/cann.hpp" +#include "opencv2/stream_accessor.hpp" +#include "opencv2/cann_call.hpp" +#include "opencv2/cann_interface.hpp" +#include "opencv2/cann_private.hpp" + +#endif /* __OPENCV_PRECOMP_H__ */ diff --git a/modules/cannops/test/test_core.cpp b/modules/cannops/test/test_core.cpp new file mode 100644 index 00000000000..6b63a8cf061 --- /dev/null +++ b/modules/cannops/test/test_core.cpp @@ -0,0 +1,217 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "test_precomp.hpp" +#include + +namespace opencv_test +{ +namespace +{ +TEST(CORE, MERGE) +{ + Mat m1 = (Mat_(2, 2) << 1, 4, 7, 10); + Mat m2 = (Mat_(2, 2) << 2, 5, 8, 11); + Mat m3 = (Mat_(2, 2) << 3, 6, 9, 12); + Mat channels[3] = {m1, m2, m3}; + Mat m; + cv::merge(channels, 3, m); + + cv::cann::setDevice(0); + + AscendMat a1, a2, a3; + a1.upload(m1); + a2.upload(m2); + a3.upload(m3); + AscendMat aclChannels[3] = {a1, a2, a3}; + std::vector aclChannelsVector; + aclChannelsVector.push_back(a1); + aclChannelsVector.push_back(a2); + aclChannelsVector.push_back(a3); + + Mat checker1, checker2; + cv::cann::merge(aclChannels, 3, checker1); + cv::cann::merge(aclChannelsVector, checker2); + + EXPECT_MAT_NEAR(m, checker1, 0.0); + EXPECT_MAT_NEAR(m, checker2, 0.0); + + cv::cann::resetDevice(); +} + +TEST(CORE, SPLIT) +{ + char d[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + Mat m(2, 2, CV_8UC3, d); + Mat channels[3]; + cv::split(m, channels); + + cv::cann::setDevice(0); + + AscendMat aclChannels[3]; + std::vector aclChannelsVector; + + cv::cann::split(m, aclChannels); + cv::cann::split(m, aclChannelsVector); + + Mat checker1[3], checker2[3]; + aclChannels[0].download(checker1[0]); + aclChannels[1].download(checker1[1]); + aclChannels[2].download(checker1[2]); + + 
aclChannelsVector[0].download(checker2[0]); + aclChannelsVector[1].download(checker2[1]); + aclChannelsVector[2].download(checker2[2]); + + EXPECT_MAT_NEAR(channels[0], checker1[0], 0.0); + EXPECT_MAT_NEAR(channels[1], checker1[1], 0.0); + EXPECT_MAT_NEAR(channels[2], checker1[2], 0.0); + + EXPECT_MAT_NEAR(channels[0], checker2[0], 0.0); + EXPECT_MAT_NEAR(channels[1], checker2[1], 0.0); + EXPECT_MAT_NEAR(channels[2], checker2[2], 0.0); + + AscendMat npuM; + npuM.upload(m); + cv::cann::split(npuM, aclChannels); + cv::cann::split(npuM, aclChannelsVector); + + aclChannels[0].download(checker1[0]); + aclChannels[1].download(checker1[1]); + aclChannels[2].download(checker1[2]); + + aclChannelsVector[0].download(checker2[0]); + aclChannelsVector[1].download(checker2[1]); + aclChannelsVector[2].download(checker2[2]); + + EXPECT_MAT_NEAR(channels[0], checker1[0], 0.0); + EXPECT_MAT_NEAR(channels[1], checker1[1], 0.0); + EXPECT_MAT_NEAR(channels[2], checker1[2], 0.0); + + EXPECT_MAT_NEAR(channels[0], checker2[0], 0.0); + EXPECT_MAT_NEAR(channels[1], checker2[1], 0.0); + EXPECT_MAT_NEAR(channels[2], checker2[2], 0.0); + cv::cann::resetDevice(); +} + +TEST(CORE, TRANSPOSE) +{ + Mat cpuMat = randomMat(10, 10, CV_32SC3), cpuRetMat, checker; + cv::transpose(cpuMat, cpuRetMat); + cv::cann::transpose(cpuMat, checker); + EXPECT_MAT_NEAR(cpuRetMat, checker, 0.0); + + AscendMat npuMat, npuChecker; + npuMat.upload(cpuMat); + cv::cann::transpose(npuMat, npuChecker); + npuChecker.download(checker); + EXPECT_MAT_NEAR(cpuRetMat, checker, 0.0); +} + +TEST(CORE, FLIP) +{ + Mat cpuMat = randomMat(10, 10, CV_32SC3), cpuRetMat, checker; + + int flipMode; + + for (flipMode = -1; flipMode < 2; flipMode++) + { + cv::flip(cpuMat, cpuRetMat, flipMode); + cv::cann::flip(cpuMat, checker, flipMode); + EXPECT_MAT_NEAR(cpuRetMat, checker, 0.0); + } + + AscendMat npuMat, npuChecker; + npuMat.upload(cpuMat); + for (flipMode = -1; flipMode < 2; flipMode++) + { + cv::flip(cpuMat, cpuRetMat, flipMode); + 
cv::cann::flip(npuMat, npuChecker, flipMode); + npuChecker.download(checker); + EXPECT_MAT_NEAR(cpuRetMat, checker, 0.0); + } +} + +TEST(CORE, ROTATE) +{ + Mat cpuRetMat, checker, cpuMat = randomMat(3, 5, CV_16S, 0.0, 255.0); + + int rotateMode; + for (rotateMode = 0; rotateMode < 3; rotateMode++) + { + cv::rotate(cpuMat, cpuRetMat, rotateMode); + cv::cann::rotate(cpuMat, checker, rotateMode); + EXPECT_MAT_NEAR(cpuRetMat, checker, 0.0); + } + + AscendMat npuMat, npuChecker; + npuMat.upload(cpuMat); + for (rotateMode = 0; rotateMode < 3; rotateMode++) + { + cv::rotate(cpuMat, cpuRetMat, rotateMode); + cv::cann::rotate(npuMat, npuChecker, rotateMode); + npuChecker.download(checker); + EXPECT_MAT_NEAR(cpuRetMat, checker, 0.0); + } +} + +TEST(CORE, CROP) +{ + Mat cpuOpRet, checker, cpuMat = randomMat(6, 6, CV_32SC3, 0.0, 255.0); + Rect b(1, 2, 4, 4); + Mat cropped_cv(cpuMat, b); + AscendMat cropped_cann(cpuMat, b); + cropped_cann.download(checker); + EXPECT_MAT_NEAR(cropped_cv, checker, 1e-10); +} + +TEST(CORE, CROP_OVERLOAD) +{ + Mat cpuOpRet, checker, cpuMat = randomMat(6, 6, CV_16SC3, 0.0, 255.0); + const Rect b(1, 2, 4, 4); + Mat cropped_cv = cpuMat(b); + AscendMat cropped_cann = cv::cann::crop(cpuMat, b); + cropped_cann.download(checker); + EXPECT_MAT_NEAR(cropped_cv, checker, 1e-10); + + AscendMat npuMat; + npuMat.upload(cpuMat); + cropped_cann = cv::cann::crop(npuMat, b); + cropped_cann.download(checker); + EXPECT_MAT_NEAR(cropped_cv, checker, 1e-10); +} + +TEST(CORE, RESIZE) +{ + Mat resized_cv, checker, cpuMat = randomMat(10, 10, CV_32F, 100.0, 255.0); + Size dsize = Size(6, 6); + // only support {2 INTER_CUBIC} and {3 INTER_AREA} + // only the resize result of INTER_AREA is close to CV's. 
+ int flags = 3; + cv::cann::setDevice(0); + cv::resize(cpuMat, resized_cv, dsize, 0, 0, flags); + cv::cann::resize(cpuMat, checker, dsize, 0, 0, flags); + EXPECT_MAT_NEAR(resized_cv, checker, 1e-4); + + cv::resize(cpuMat, resized_cv, Size(), 0.5, 0.5, flags); + cv::cann::resize(cpuMat, checker, Size(), 0.5, 0.5, flags); + EXPECT_MAT_NEAR(resized_cv, checker, 1e-4); + + AscendMat npuMat, npuChecker; + npuMat.upload(cpuMat); + cv::resize(cpuMat, resized_cv, dsize, 0, 0, flags); + cv::cann::resize(npuMat, npuChecker, dsize, 0, 0, flags); + npuChecker.download(checker); + EXPECT_MAT_NEAR(resized_cv, checker, 1e-4); + + cv::resize(cpuMat, resized_cv, Size(), 0.5, 0.5, flags); + cv::cann::resize(npuMat, npuChecker, Size(), 0.5, 0.5, flags); + npuChecker.download(checker); + EXPECT_MAT_NEAR(resized_cv, checker, 1e-4); + cv::cann::resetDevice(); +} + + +} // namespace +} // namespace opencv_test diff --git a/modules/cannops/test/test_cvtcolor.cpp b/modules/cannops/test/test_cvtcolor.cpp new file mode 100644 index 00000000000..27a92298961 --- /dev/null +++ b/modules/cannops/test/test_cvtcolor.cpp @@ -0,0 +1,89 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include "test_precomp.hpp" + +namespace opencv_test +{ +namespace +{ + +void cvtColorTest(int code, int cn, int dcn = 3, float diff = 0.0f) +{ + cv::cann::setDevice(DEVICE_ID); + Mat cpuRet, npuRet; + + Mat img8U = randomMat(512, 512, CV_MAKETYPE(CV_8U, cn), 0.0f, 255.0f); + Mat img16U = randomMat(512, 512, CV_MAKETYPE(CV_16U, cn), 0.0f, 65535.0f); + Mat img32F = randomMat(512, 512, CV_MAKETYPE(CV_32F, cn), 0.0f, 65535.0f); + + cv::cvtColor(img8U, cpuRet, code, dcn); + cv::cann::cvtColor(img8U, npuRet, code, dcn); + EXPECT_MAT_NEAR(cpuRet, npuRet, diff); + + cv::cvtColor(img16U, cpuRet, code, dcn); + cv::cann::cvtColor(img16U, npuRet, code, dcn); + EXPECT_MAT_NEAR(cpuRet, npuRet, diff); + + cv::cvtColor(img32F, cpuRet, code, dcn); + cv::cann::cvtColor(img32F, npuRet, code, dcn); + EXPECT_MAT_NEAR(cpuRet, npuRet, diff); + cv::cann::resetDevice(); +} + +TEST(CVT_COLOR, BGR2BGRA) { cvtColorTest(COLOR_BGR2BGRA, 3, 4); } +TEST(CVT_COLOR, BGRA2BGR) { cvtColorTest(COLOR_BGRA2BGR, 4); } +TEST(CVT_COLOR, BGR2RGBA) { cvtColorTest(COLOR_BGR2RGBA, 3, 4); } +TEST(CVT_COLOR, RGBA2BGR) { cvtColorTest(COLOR_RGBA2BGR, 4); } +TEST(CVT_COLOR, BGR2RGB) { cvtColorTest(COLOR_BGR2RGB, 3); } +TEST(CVT_COLOR, BGRA2RGBA) { cvtColorTest(COLOR_BGRA2RGBA, 4, 4); } + +// Due to parameter accuracy issues, the calculation results have certain accuracy differences. 
+TEST(CVT_COLOR, BGR2GRAY) { cvtColorTest(COLOR_BGR2GRAY, 3, 1, 10.0f); } +TEST(CVT_COLOR, RGB2GRAY) { cvtColorTest(COLOR_BGR2GRAY, 3, 1, 10.0f); } +TEST(CVT_COLOR, GRAY2BGR) { cvtColorTest(COLOR_GRAY2BGR, 1); } +TEST(CVT_COLOR, GRAY2BGRA) { cvtColorTest(COLOR_GRAY2BGRA, 1, 4); } +TEST(CVT_COLOR, BGRA2GRAY) { cvtColorTest(COLOR_BGRA2GRAY, 4, 1, 10.0f); } +TEST(CVT_COLOR, RGBA2GRAY) { cvtColorTest(COLOR_RGBA2GRAY, 4, 1, 10.0f); } + +TEST(CVT_COLOR, RGB2XYZ) { cvtColorTest(COLOR_RGB2XYZ, 3, 3, 50.0f); } +TEST(CVT_COLOR, BGR2XYZ) { cvtColorTest(COLOR_BGR2XYZ, 3, 3, 50.0f); } +TEST(CVT_COLOR, XYZ2BGR) { cvtColorTest(COLOR_XYZ2BGR, 3, 3, 150.0f); } +TEST(CVT_COLOR, XYZ2RGB) { cvtColorTest(COLOR_XYZ2RGB, 3, 3, 150.0f); } +TEST(CVT_COLOR, XYZ2BGR_DC4) { cvtColorTest(COLOR_XYZ2BGR, 3, 4, 150.0f); } +TEST(CVT_COLOR, XYZ2RGB_DC4) { cvtColorTest(COLOR_XYZ2RGB, 3, 4, 150.0f); } + +TEST(CVT_COLOR, BGR2YCrCb) { cvtColorTest(COLOR_BGR2YCrCb, 3, 3, 10.0f); } +TEST(CVT_COLOR, RGB2YCrCb) { cvtColorTest(COLOR_RGB2YCrCb, 3, 3, 10.0f); } +TEST(CVT_COLOR, YCrCb2BGR) { cvtColorTest(COLOR_YCrCb2BGR, 3, 3, 10.0f); } +TEST(CVT_COLOR, YCrCb2RGB) { cvtColorTest(COLOR_YCrCb2RGB, 3, 3, 10.0f); } +TEST(CVT_COLOR, YCrCb2BGR_DC4) { cvtColorTest(COLOR_YCrCb2BGR, 3, 4, 10.0f); } +TEST(CVT_COLOR, YCrCb2RGB_DC4) { cvtColorTest(COLOR_YCrCb2RGB, 3, 4, 10.0f); } + +TEST(CVT_COLOR, BGR2YUV) { cvtColorTest(COLOR_BGR2YUV, 3, 3, 10.0f); } +TEST(CVT_COLOR, RGB2YUV) { cvtColorTest(COLOR_RGB2YUV, 3, 3, 10.0f); } +TEST(CVT_COLOR, YUV2BGR) { cvtColorTest(COLOR_YUV2BGR, 3, 3, 10.0f); } +TEST(CVT_COLOR, YUV2RGB) { cvtColorTest(COLOR_YUV2RGB, 3, 3, 10.0f); } +TEST(CVT_COLOR, YUV2BGR_DC4) { cvtColorTest(COLOR_YUV2BGR, 3, 4, 10.0f); } +TEST(CVT_COLOR, YUV2RGB_DC4) { cvtColorTest(COLOR_YUV2RGB, 3, 4, 10.0f); } + +// Test of AscendMat. Since the logic is the same, only interface test is needed. 
+TEST(CVT_COLOR, COLOR_BGR2BGRA_ASCENDMAT) +{ + cv::cann::setDevice(DEVICE_ID); + Mat cpuRet, npuRet; + + Mat img8U = randomMat(512, 512, CV_8UC3, 0.0f, 255.0f); + cv::cvtColor(img8U, cpuRet, COLOR_BGR2BGRA, 4); + + AscendMat npuImg8U, npuChecker; + npuImg8U.upload(img8U); + cv::cann::cvtColor(npuImg8U, npuChecker, COLOR_BGR2BGRA, 4); + npuChecker.download(npuRet); + EXPECT_MAT_NEAR(cpuRet, npuRet, 10.0f); + cv::cann::resetDevice(); +} + +} // namespace +} // namespace opencv_test diff --git a/modules/cannops/test/test_element_operations.cpp b/modules/cannops/test/test_element_operations.cpp new file mode 100644 index 00000000000..76c103a65f4 --- /dev/null +++ b/modules/cannops/test/test_element_operations.cpp @@ -0,0 +1,697 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "test_precomp.hpp" +#include + +namespace opencv_test +{ +namespace +{ +template +void testMatOpMat(FCV cvFunc, FCANN cannFunc, PARAMS... param) +{ + cv::cann::setDevice(DEVICE_ID); + Mat mat1 = randomMat(10, 10, CV_32SC3); + Mat mat2 = randomMat(10, 10, CV_32SC3); + Mat cpuDst, check; + + cvFunc(mat1, mat2, cpuDst, param...); + cannFunc(mat1, mat2, check, param..., AscendStream::Null()); + EXPECT_MAT_NEAR(cpuDst, check, 1.0); + + AscendStream stream; + cannFunc(mat1, mat2, check, param..., stream); + stream.waitForCompletion(); + EXPECT_MAT_NEAR(cpuDst, check, 1.0); + + cv::cann::resetDevice(); +} + +template +void testAscendMatOpAscendMatMask(FCV cvFunc, FCANN cannFunc, DTMASK mask = AscendMat(), + PARAMS... 
param) +{ + cv::cann::setDevice(DEVICE_ID); + Mat mat1 = randomMat(10, 10, CV_32SC3); + Mat mat2 = randomMat(10, 10, CV_32SC3); + Mat cpuDst, check, cpuMask; + AscendMat npuMat1, npuMat2, npuCheck; + npuMat1.upload(mat1); + npuMat2.upload(mat2); + if (mask.empty()) + { + cvFunc(mat1, mat2, cpuDst, noArray(), param...); + } + else + { + mask.download(cpuMask); + cvFunc(mat1, mat2, cpuDst, cpuMask, param...); + } + + cannFunc(npuMat1, npuMat2, npuCheck, mask, param..., AscendStream::Null()); + npuCheck.download(check); + EXPECT_MAT_NEAR(cpuDst, check, 1.0); + + AscendStream stream; + cannFunc(npuMat1, npuMat2, npuCheck, mask, param..., stream); + npuCheck.download(check); + stream.waitForCompletion(); + EXPECT_MAT_NEAR(cpuDst, check, 1.0); + + cv::cann::resetDevice(); +} + +template +void testAscendMatOpAscendMat(FCV cvFunc, FCANN cannFunc, PARAMS... param) +{ + cv::cann::setDevice(DEVICE_ID); + Mat mat1 = randomMat(10, 10, CV_32SC3); + Mat mat2 = randomMat(10, 10, CV_32SC3); + Mat cpuDst, check; + AscendMat npuMat1, npuMat2, npuCheck; + npuMat1.upload(mat1); + npuMat2.upload(mat2); + cvFunc(mat1, mat2, cpuDst, param...); + cannFunc(npuMat1, npuMat2, npuCheck, param..., AscendStream::Null()); + npuCheck.download(check); + EXPECT_MAT_NEAR(cpuDst, check, 1.0); + + AscendStream stream; + cannFunc(npuMat1, npuMat2, npuCheck, param..., stream); + npuCheck.download(check); + stream.waitForCompletion(); + EXPECT_MAT_NEAR(cpuDst, check, 1.0); + + cv::cann::resetDevice(); +} + +TEST(ELEMENTWISE_OP, MAT_ADD_MAT) +{ + testMatOpMat( + cv::add, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + int dtype, AscendStream& stream) + { cv::cann::add(src1, src2, dst, mask, dtype, stream); }, + noArray(), -1); + testAscendMatOpAscendMatMask( + cv::add, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + int dtype, AscendStream& stream) + { cv::cann::add(src1, src2, dst, mask, dtype, stream); }, + 
AscendMat(), -1); +} + +TEST(ELEMENTWISE_OP, MAT_SUB_MAT) +{ + testMatOpMat( + cv::subtract, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + int dtype, AscendStream& stream) + { cv::cann::subtract(src1, src2, dst, mask, dtype, stream); }, + noArray(), -1); + testAscendMatOpAscendMatMask( + cv::subtract, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + int dtype, AscendStream& stream) + { cv::cann::subtract(src1, src2, dst, mask, dtype, stream); }, + AscendMat(), -1); +} + +TEST(ELEMENTWISE_OP, MAT_MUL_MAT) +{ + testMatOpMat( + cv::multiply, + [](const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype, + AscendStream& stream) { cv::cann::multiply(src1, src2, dst, scale, dtype, stream); }, + 1, -1); + testAscendMatOpAscendMat( + cv::multiply, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) { cv::cann::multiply(src1, src2, dst, scale, dtype, stream); }, + 1, -1); +} + +TEST(ELEMENTWISE_OP, MAT_DIV_MAT) +{ + testMatOpMat([](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale, int dtype) + { cv::divide(src1, src2, dst, scale, dtype); }, + [](const InputArray src1, const InputArray src2, OutputArray dst, float scale, + int dtype, AscendStream& stream) + { cv::cann::divide(src1, src2, dst, scale, dtype, stream); }, + 1, -1); + testAscendMatOpAscendMat( + [](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale, int dtype) + { cv::divide(src1, src2, dst, scale, dtype); }, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) { cv::cann::divide(src1, src2, dst, scale, dtype, stream); }, + 1, -1); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_AND_MAT) +{ + testMatOpMat( + cv::bitwise_and, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { 
cv::cann::bitwise_and(src1, src2, dst, mask, stream); }, + noArray()); + testAscendMatOpAscendMatMask( + cv::bitwise_and, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_and(src1, src2, dst, mask, stream); }, + AscendMat()); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_OR_MAT) +{ + testMatOpMat( + cv::bitwise_or, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_or(src1, src2, dst, mask, stream); }, + noArray()); + testAscendMatOpAscendMatMask( + cv::bitwise_or, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_or(src1, src2, dst, mask, stream); }, + AscendMat()); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_XOR_MAT) +{ + testMatOpMat( + cv::bitwise_xor, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_xor(src1, src2, dst, mask, stream); }, + noArray()); + testAscendMatOpAscendMatMask( + cv::bitwise_xor, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_xor(src1, src2, dst, mask, stream); }, + AscendMat()); +} + +TEST(ELEMENTWISE_OP, MAT_ADD_MAT_WITH_MASK_AND_DTYPE) +{ + testMatOpMat( + cv::add, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + int dtype, AscendStream& stream) + { cv::cann::add(src1, src2, dst, mask, dtype, stream); }, + genMask(), CV_32SC3); + testAscendMatOpAscendMatMask( + cv::add, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + int dtype, AscendStream& stream) + { cv::cann::add(src1, src2, dst, mask, dtype, stream); }, + AscendMat(), CV_32SC3); +} + +TEST(ELEMENTWISE_OP, MAT_SUB_MAT_WITH_MASK_AND_DTYPE) +{ + testMatOpMat( + cv::subtract, + [](const 
InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + int dtype, AscendStream& stream) + { cv::cann::subtract(src1, src2, dst, mask, dtype, stream); }, + genMask(), CV_32SC3); + testAscendMatOpAscendMatMask( + cv::subtract, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + int dtype, AscendStream& stream) + { cv::cann::subtract(src1, src2, dst, mask, dtype, stream); }, + AscendMat(), CV_32SC3); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_AND_MAT_WITH_MASK) +{ + testMatOpMat( + cv::bitwise_and, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_and(src1, src2, dst, mask, stream); }, + genMask()); + testAscendMatOpAscendMatMask( + cv::bitwise_and, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_and(src1, src2, dst, mask, stream); }, + genNpuMask()); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_OR_MAT_WITH_MASK) +{ + testMatOpMat( + cv::bitwise_or, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_or(src1, src2, dst, mask, stream); }, + genMask()); + testAscendMatOpAscendMatMask( + cv::bitwise_or, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_or(src1, src2, dst, mask, stream); }, + genNpuMask()); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_XOR_MAT_WITH_MASK) +{ + testMatOpMat( + cv::bitwise_xor, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_xor(src1, src2, dst, mask, stream); }, + genMask()); + testAscendMatOpAscendMatMask( + cv::bitwise_xor, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_xor(src1, src2, 
dst, mask, stream); }, + genNpuMask()); +} + +float randomScale = randomNum(); +TEST(ELEMENTWISE_OP, MAT_MUL_MAT_WITH_SCALE) +{ + testMatOpMat( + cv::multiply, + [](const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype, + AscendStream& stream) { cv::cann::multiply(src1, src2, dst, scale, dtype, stream); }, + randomScale, -1); + testAscendMatOpAscendMat( + cv::multiply, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) { cv::cann::multiply(src1, src2, dst, scale, dtype, stream); }, + randomScale, -1); +} + +TEST(ELEMENTWISE_OP, MAT_DIV_MAT_WITH_SCALE) +{ + testMatOpMat([](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale, int dtype) + { cv::divide(src1, src2, dst, scale, dtype); }, + [](const InputArray src1, const InputArray src2, OutputArray dst, float scale, + int dtype, AscendStream& stream) + { cv::cann::divide(src1, src2, dst, scale, dtype, stream); }, + randomScale, -1); + testAscendMatOpAscendMat( + [](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale, int dtype) + { cv::divide(src1, src2, dst, scale, dtype); }, + [](const AscendMat& src1, const AscendMat& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) { cv::cann::divide(src1, src2, dst, scale, dtype, stream); }, + randomScale, -1); +} + +template +void testMatOpScalar(FCV cvFunc, FCANN cannFunc, PARAMS... 
param) +{ + Scalar scalar = randomScalar(); + Mat mat(10, 10, CV_32SC3, randomScalar()); + Mat cpuDst1, cpuDst2, checker1, checker2; + + cvFunc(Mat(10, 10, CV_32SC3, scalar), mat, cpuDst1, param...); + cvFunc(mat, Mat(10, 10, CV_32SC3, scalar), cpuDst2, param...); + cv::cann::setDevice(DEVICE_ID); + + cannFunc(scalar, mat, checker1, param..., AscendStream::Null()); + cannFunc(mat, scalar, checker2, param..., AscendStream::Null()); + + EXPECT_MAT_NEAR(cpuDst1, checker1, 1.0); + EXPECT_MAT_NEAR(cpuDst2, checker2, 1.0); + + AscendStream stream; + cannFunc(scalar, mat, checker1, param..., stream); + cannFunc(mat, scalar, checker2, param..., stream); + stream.waitForCompletion(); + EXPECT_MAT_NEAR(cpuDst1, checker1, 1.0); + EXPECT_MAT_NEAR(cpuDst2, checker2, 1.0); + + cv::cann::resetDevice(); +} + +template +void testAscendMatOpScalarMask(FCV cvFunc, FCANN cannFunc, DTMASK mask, PARAMS... param) +{ + Scalar scalar = randomScalar(); + Mat mat(10, 10, CV_32SC3, randomScalar()); + Mat cpuDst, checker, cpuMask; + AscendMat npuMat, npuChecker; + npuMat.upload(mat); + if (mask.empty()) + { + cvFunc(mat, Mat(10, 10, CV_32SC3, scalar), cpuDst, noArray(), param...); + } + else + { + mask.download(cpuMask); + cvFunc(mat, Mat(10, 10, CV_32SC3, scalar), cpuDst, cpuMask, param...); + } + cv::cann::setDevice(DEVICE_ID); + + cannFunc(npuMat, scalar, npuChecker, mask, param..., AscendStream::Null()); + npuChecker.download(checker); + EXPECT_MAT_NEAR(cpuDst, checker, 1.0); + + AscendStream stream; + cannFunc(npuMat, scalar, npuChecker, mask, param..., stream); + stream.waitForCompletion(); + npuChecker.download(checker); + EXPECT_MAT_NEAR(cpuDst, checker, 1.0); + + cv::cann::resetDevice(); +} +template +void testScalarOpAscendMatMask(FCV cvFunc, FCANN cannFunc, DTMASK mask, PARAMS... 
param) +{ + Scalar scalar = randomScalar(); + Mat mat(10, 10, CV_32SC3, randomScalar()); + Mat cpuDst, checker, cpuMask; + AscendMat npuMat, npuChecker; + npuMat.upload(mat); + if (mask.empty()) + { + cvFunc(Mat(10, 10, CV_32SC3, scalar), mat, cpuDst, noArray(), param...); + } + else + { + mask.download(cpuMask); + cvFunc(Mat(10, 10, CV_32SC3, scalar), mat, cpuDst, cpuMask, param...); + } + cv::cann::setDevice(DEVICE_ID); + + cannFunc(scalar, npuMat, npuChecker, mask, param..., AscendStream::Null()); + npuChecker.download(checker); + EXPECT_MAT_NEAR(cpuDst, checker, 1.0); + + AscendStream stream; + cannFunc(scalar, npuMat, npuChecker, mask, param..., stream); + stream.waitForCompletion(); + npuChecker.download(checker); + EXPECT_MAT_NEAR(cpuDst, checker, 1.0); + + cv::cann::resetDevice(); +} +template +void testAscendMatOpScalar(FCV cvFunc, FCANN cannFunc, PARAMS... param) +{ + Scalar scalar = randomScalar(); + Mat mat(10, 10, CV_32SC3, randomScalar()); + Mat cpuDst, checker; + AscendMat npuMat, npuChecker; + npuMat.upload(mat); + + cvFunc(mat, Mat(10, 10, CV_32SC3, scalar), cpuDst, param...); + cv::cann::setDevice(DEVICE_ID); + + cannFunc(npuMat, scalar, npuChecker, param..., AscendStream::Null()); + npuChecker.download(checker); + EXPECT_MAT_NEAR(cpuDst, checker, 1.0); + + AscendStream stream; + cannFunc(npuMat, scalar, npuChecker, param..., stream); + stream.waitForCompletion(); + npuChecker.download(checker); + + cv::cann::resetDevice(); +} + +TEST(ELEMENTWISE_OP, MAT_ADD_SCALAR) +{ + testMatOpScalar( + cv::add, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + int dtype, AscendStream& stream) + { cv::cann::add(src1, src2, dst, mask, dtype, stream); }, + noArray(), -1); + testAscendMatOpScalarMask( + cv::add, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + int dtype, AscendStream& stream) + { cv::cann::add(src1, src2, dst, mask, dtype, stream); }, + AscendMat(), -1); + 
testScalarOpAscendMatMask( + cv::add, + [](const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, + int dtype, AscendStream& stream) + { cv::cann::add(src1, src2, dst, mask, dtype, stream); }, + AscendMat(), -1); +} + +TEST(ELEMENTWISE_OP, MAT_SUB_SCALAR) +{ + testMatOpScalar( + cv::subtract, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + int dtype, AscendStream& stream) + { cv::cann::subtract(src1, src2, dst, mask, dtype, stream); }, + noArray(), -1); + testAscendMatOpScalarMask( + cv::subtract, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + int dtype, AscendStream& stream) + { cv::cann::subtract(src1, src2, dst, mask, dtype, stream); }, + AscendMat(), -1); +} + +TEST(ELEMENTWISE_OP, MAT_MUL_SCALAR) +{ + testMatOpScalar( + cv::multiply, + [](const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype, + AscendStream& stream) { cv::cann::multiply(src1, src2, dst, scale, dtype, stream); }, + 1, -1); + testAscendMatOpScalar( + cv::multiply, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) { cv::cann::multiply(src1, src2, dst, scale, dtype, stream); }, + 1, -1); +} + +TEST(ELEMENTWISE_OP, MAT_DIV_SCALAR) +{ + testMatOpScalar( + [](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale, int dtype) + { cv::divide(src1, src2, dst, scale, dtype); }, + [](const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype, + AscendStream& stream) { cv::cann::divide(src1, src2, dst, scale, dtype, stream); }, + 1, -1); + testAscendMatOpScalar( + [](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale, int dtype) + { cv::divide(src1, src2, dst, scale, dtype); }, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, float scale, int dtype, + AscendStream& stream) { cv::cann::divide(src1, src2, dst, scale, dtype, 
stream); }, + 1, -1); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_AND_SCALAR) +{ + testMatOpScalar( + cv::bitwise_and, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_and(src1, src2, dst, mask, stream); }, + noArray()); + testAscendMatOpScalarMask( + cv::bitwise_and, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_and(src1, src2, dst, mask, stream); }, + AscendMat()); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_OR_SCALAR) +{ + testMatOpScalar( + cv::bitwise_or, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_or(src1, src2, dst, mask, stream); }, + noArray()); + testAscendMatOpScalarMask( + cv::bitwise_or, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_or(src1, src2, dst, mask, stream); }, + AscendMat()); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_XOR_SCALAR) +{ + testMatOpScalar( + cv::bitwise_xor, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_xor(src1, src2, dst, mask, stream); }, + noArray()); + testAscendMatOpScalarMask( + cv::bitwise_xor, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_xor(src1, src2, dst, mask, stream); }, + AscendMat()); +} + +TEST(ELEMENTWISE_OP, MAT_ADD_SCALAR_WITH_MASK_AND_DETYPE) +{ + testMatOpScalar( + cv::add, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + int dtype, AscendStream& stream) + { cv::cann::add(src1, src2, dst, mask, dtype, stream); }, + genMask(), CV_32SC3); + testAscendMatOpScalarMask( + cv::add, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + int 
dtype, AscendStream& stream) + { cv::cann::add(src1, src2, dst, mask, dtype, stream); }, + genNpuMask(), CV_32SC3); +} + +TEST(ELEMENTWISE_OP, MAT_SUB_SCALAR_WITH_MASK_AND_DETYPE) +{ + testMatOpScalar( + cv::subtract, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + int dtype, AscendStream& stream) + { cv::cann::subtract(src1, src2, dst, mask, dtype, stream); }, + genMask(), CV_32SC3); + testAscendMatOpScalarMask( + cv::subtract, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + int dtype, AscendStream& stream) + { cv::cann::subtract(src1, src2, dst, mask, dtype, stream); }, + genNpuMask(), CV_32SC3); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_AND_SCALAR_WITH_MASK) +{ + testMatOpScalar( + cv::bitwise_and, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_and(src1, src2, dst, mask, stream); }, + genMask()); + testAscendMatOpScalarMask( + cv::bitwise_and, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_and(src1, src2, dst, mask, stream); }, + genNpuMask()); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_OR_SCALAR_WITH_MASK) +{ + testMatOpScalar( + cv::bitwise_or, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_or(src1, src2, dst, mask, stream); }, + genMask()); + testAscendMatOpScalarMask( + cv::bitwise_or, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_or(src1, src2, dst, mask, stream); }, + genNpuMask()); +} + +TEST(ELEMENTWISE_OP, MAT_BITWISE_XOR_SCALAR_WITH_MASK) +{ + testMatOpScalar( + cv::bitwise_xor, + [](const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, + AscendStream& stream) { cv::cann::bitwise_xor(src1, src2, dst, 
mask, stream); }, + genMask()); + testAscendMatOpScalarMask( + cv::bitwise_xor, + [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, + AscendStream& stream) { cv::cann::bitwise_xor(src1, src2, dst, mask, stream); }, + genNpuMask()); +} + +// TODO: I think the cv result is wrong, which has truncated middle result. +// Disable these two test cases because they are not stable. +// TEST(ELEMENTWISE_OP, MAT_MUL_SCALAR_WITH_SCALE) +// { +// testMatOpScalar( +// cv::multiply, +// [](const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype, +// AscendStream& stream) { cv::cann::multiply(src1, src2, dst, scale, dtype, stream); }, +// randomScale, CV_32SC3); +// testAscendMatOpScalar( +// [](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale, int dtype) +// { cv::divide(src1, src2, dst, scale, dtype); }, +// [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, float scale, int dtype, +// AscendStream& stream) { cv::cann::divide(src1, src2, dst, scale, dtype, stream); }, +// randomScale, -1); +// } + +// TEST(ELEMENTWISE_OP, MAT_DIV_SCALAR_WITH_SCALE) +// { +// testMatOpScalar( +// [](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale, int dtype) +// { cv::divide(src1, src2, dst, scale, dtype); }, +// [](const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype, +// AscendStream& stream) { cv::cann::divide(src1, src2, dst, scale, dtype, stream); }, +// randomScale, -1); +// testAscendMatOpScalar( +// [](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale, int dtype) +// { cv::divide(src1, src2, dst, scale, dtype); }, +// [](const AscendMat& src1, const Scalar& src2, AscendMat& dst, float scale, int dtype, +// AscendStream& stream) { cv::cann::divide(src1, src2, dst, scale, dtype, stream); }, +// randomScale, -1); +// } + +TEST(ELEMENTWISE_OP, MAT_BITWISE_NOT) +{ + Mat cpuOpRet, checker, cpuMat = randomMat(10, 10, CV_32SC3); + 
cv::cann::setDevice(DEVICE_ID); + cv::bitwise_not(cpuMat, cpuOpRet); + cv::cann::bitwise_not(cpuMat, checker); + EXPECT_MAT_NEAR(cpuOpRet, checker, 0.0); + + AscendMat npuMat, npuOpRet; + npuMat.upload(cpuMat); + cv::cann::bitwise_not(npuMat, npuOpRet); + npuOpRet.download(checker); + EXPECT_MAT_NEAR(cpuOpRet, checker, 0.0); + + cv::cann::resetDevice(); +} + +// TODO random test matrix +TEST(ELEMENTWISE_OP, MAT_ADD_WEIGHTED) +{ + Mat cpuOpRet, checker, cpuMat1 = Mat::ones(5, 5, CV_32S), cpuMat2 = Mat::ones(5, 5, CV_32S); + + cv::cann::setDevice(DEVICE_ID); + cv::addWeighted(cpuMat1, 2, cpuMat2, 3, 5, cpuOpRet); + cv::cann::addWeighted(cpuMat1, 2, cpuMat2, 3, 5, checker); + EXPECT_MAT_NEAR(cpuOpRet, checker, 0.0); + + AscendMat npuOpRet, npuMat1, npuMat2; + npuMat1.upload(cpuMat1); + npuMat2.upload(cpuMat2); + cv::cann::addWeighted(npuMat1, 2, npuMat2, 3, 5, npuOpRet); + npuOpRet.download(checker); + EXPECT_MAT_NEAR(cpuOpRet, checker, 0.0); + + cv::cann::resetDevice(); +} + +TEST(ELEMENTWISE_OP, MAT_THRESHOLD) +{ + Mat cpuOpRet, checker, cpuMat = randomMat(10, 10, CV_16SC3, 0.0, 255.0); + + AscendMat ascendMat, ascendMat16F, aclOpRet, aclOpRet16S; + cv::cann::setDevice(DEVICE_ID); + ascendMat.upload(cpuMat); + ascendMat.convertTo(ascendMat16F, CV_16F); + + Mat cpuMat16F, checker16F; + cpuMat.convertTo(cpuMat16F, CV_16F); + + for (int i = 0; i <= 4; i++) + { + cv::threshold(cpuMat, cpuOpRet, 128, 250, i); + // TODO find the reason empty AscendMat is not continuous. 
+ cv::cann::threshold(ascendMat16F, aclOpRet, 128, 250, i); + aclOpRet.convertTo(aclOpRet16S, CV_16S); + aclOpRet16S.download(checker); + + EXPECT_MAT_NEAR(cpuOpRet, checker, 1e-10); + + cv::cann::threshold(cpuMat16F, checker16F, 128, 250, i); + checker16F.convertTo(checker, CV_16S); + EXPECT_MAT_NEAR(cpuOpRet, checker, 1e-10); + } + + cv::cann::resetDevice(); +} + +} // namespace +} // namespace opencv_test diff --git a/modules/cannops/test/test_main.cpp b/modules/cannops/test/test_main.cpp new file mode 100644 index 00000000000..202c6af27ee --- /dev/null +++ b/modules/cannops/test/test_main.cpp @@ -0,0 +1,21 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "test_precomp.hpp" + +class CannEnvironment : public ::testing::Environment +{ +public: + virtual ~CannEnvironment() = default; + virtual void SetUp() CV_OVERRIDE { initAcl(); } + virtual void TearDown() CV_OVERRIDE { finalizeAcl(); } +}; + +static void initTests() +{ + CannEnvironment* cannEnv = new CannEnvironment(); + ::testing::AddGlobalTestEnvironment(cannEnv); +} + +CV_TEST_MAIN("cannops", initTests()); diff --git a/modules/cannops/test/test_npumat.cpp b/modules/cannops/test/test_npumat.cpp new file mode 100644 index 00000000000..1ff445399f8 --- /dev/null +++ b/modules/cannops/test/test_npumat.cpp @@ -0,0 +1,146 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include "test_precomp.hpp" + +namespace opencv_test +{ +namespace +{ + +class DummyAllocator : public AscendMat::Allocator +{ +public: + std::shared_ptr allocate(size_t size) CV_OVERRIDE + { + CV_UNUSED(size); + return std::shared_ptr(); + } + bool allocate(cv::cann::AscendMat* mat, int rows, int cols, size_t elemSize) CV_OVERRIDE + { + CV_UNUSED(rows); + CV_UNUSED(cols); + CV_UNUSED(elemSize); + mat->data = std::shared_ptr((uchar*)0x12345, [](void* ptr) { CV_UNUSED(ptr); }); + return true; + } +}; + +TEST(AscendMat, Construct) +{ + cv::cann::setDevice(0); + // 1 Default constructor. + AscendMat defaultAscendMat; + AscendMat::Allocator* defaultAllocator = AscendMat::defaultAllocator(); + ASSERT_EQ(defaultAscendMat.allocator, defaultAllocator); + + // 2 get & set allocator. + DummyAllocator dummyAllocator; + AscendMat::setDefaultAllocator(&dummyAllocator); + ASSERT_EQ(defaultAscendMat.defaultAllocator(), &dummyAllocator); + AscendMat::setDefaultAllocator(defaultAllocator); + + // 3 constructs AscendMat of the specified size and type + AscendMat specifiedSizeAscendMat1(5, 6, CV_8UC3); + AscendMat specifiedSizeAscendMat2(Size(300, 200), CV_64F); + + ASSERT_EQ(specifiedSizeAscendMat1.rows, 5); + ASSERT_EQ(specifiedSizeAscendMat1.cols, 6); + ASSERT_EQ(specifiedSizeAscendMat1.depth(), CV_8U); + ASSERT_EQ(specifiedSizeAscendMat1.channels(), 3); + + ASSERT_EQ(specifiedSizeAscendMat2.cols, 300); + ASSERT_EQ(specifiedSizeAscendMat2.rows, 200); + ASSERT_EQ(specifiedSizeAscendMat2.depth(), CV_64F); + ASSERT_EQ(specifiedSizeAscendMat2.channels(), 1); + + // 4 constructs AscendMat and fills it with the specified value s + srand((unsigned int)(time(NULL))); + Scalar sc(rand() % 256, rand() % 256, rand() % 256, rand() % 256); + + Mat scalarToMat(7, 8, CV_8UC3, sc); + AscendMat scalarToAscendMat1(7, 8, CV_8UC3, sc); + Mat scalarToMatChecker; + scalarToAscendMat1.download(scalarToMatChecker); + + EXPECT_MAT_NEAR(scalarToMat, scalarToMatChecker, 0.0); + + AscendMat 
scalarToAscendMat2(Size(123, 345), CV_32S); + + ASSERT_EQ(scalarToAscendMat1.rows, 7); + ASSERT_EQ(scalarToAscendMat1.cols, 8); + ASSERT_EQ(scalarToAscendMat1.depth(), CV_8U); + ASSERT_EQ(scalarToAscendMat1.channels(), 3); + + ASSERT_EQ(scalarToAscendMat2.cols, 123); + ASSERT_EQ(scalarToAscendMat2.rows, 345); + ASSERT_EQ(scalarToAscendMat2.depth(), CV_32S); + ASSERT_EQ(scalarToAscendMat2.channels(), 1); + + // 6 builds AscendMat from host memory + Scalar sc2(rand() % 256, rand() % 256, rand() % 256, rand() % 256); + Mat randomMat(7, 8, CV_8UC3, sc2); + InputArray arr = randomMat; + + AscendMat fromInputArray(arr, AscendStream::Null()); + Mat randomMatChecker; + fromInputArray.download(randomMatChecker); + EXPECT_MAT_NEAR(randomMat, randomMatChecker, 0.0); + + cv::cann::resetDevice(); +} + +TEST(AscendMat, Assignment) +{ + DummyAllocator dummyAllocator; + AscendMat mat1; + AscendMat mat2(3, 4, CV_8SC1, &dummyAllocator); + mat1 = mat2; + + ASSERT_EQ(mat1.rows, 3); + ASSERT_EQ(mat1.cols, 4); + ASSERT_EQ(mat1.depth(), CV_8S); + ASSERT_EQ(mat1.channels(), 1); + ASSERT_EQ(mat1.data.get(), (uchar*)0x12345); +} + +TEST(AscendMat, SetTo) +{ + cv::cann::setDevice(0); + + srand((unsigned int)(time(NULL))); + Scalar sc(rand() % 256, rand() % 256, rand() % 256, rand() % 256); + + AscendMat ascendMat(2, 2, CV_8UC4); + ascendMat.setTo(sc); + Mat mat(2, 2, CV_8UC4, sc); + Mat checker; + ascendMat.download(checker); + + EXPECT_MAT_NEAR(mat, checker, 0.0); + + cv::cann::resetDevice(); +} + +TEST(AscendMat, ConvertTo) +{ + cv::cann::setDevice(0); + + srand((unsigned int)(time(NULL))); + Scalar sc(rand() % 256, rand() % 256, rand() % 256, rand() % 256); + + AscendMat ascendMat(2, 2, CV_8UC4, sc); + AscendMat convertedAscendMat; + ascendMat.convertTo(convertedAscendMat, CV_16S); + Mat mat(2, 2, CV_16SC4, sc); + Mat checker; + convertedAscendMat.download(checker); + + EXPECT_MAT_NEAR(mat, checker, 0.0); + + cv::cann::resetDevice(); +} + +} // namespace +} // namespace opencv_test diff 
--git a/modules/cannops/test/test_precomp.hpp b/modules/cannops/test/test_precomp.hpp new file mode 100644 index 00000000000..f7bdbea0b08 --- /dev/null +++ b/modules/cannops/test/test_precomp.hpp @@ -0,0 +1,28 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef __OPENCV_TEST_PRECOMP_HPP__ +#define __OPENCV_TEST_PRECOMP_HPP__ + +#include "opencv2/ts.hpp" +#include "opencv2/cann.hpp" +#include "opencv2/ts/cuda_test.hpp" +#include "opencv2/cann_interface.hpp" + +using namespace cv; +using namespace cv::cann; +#undef EXPECT_MAT_NEAR +#define EXPECT_MAT_NEAR(m1, m2, eps) EXPECT_PRED_FORMAT3(cvtest::assertMatNear, m1, m2, eps) +#define ASSERT_MAT_NEAR(m1, m2, eps) ASSERT_PRED_FORMAT3(cvtest::assertMatNear, m1, m2, eps) + +#define DEVICE_ID 0 + +Mat randomMat(int w, int h, int dtype, float min = 1.0f, float max = 10.0f); +Scalar randomScalar(); +float randomNum(); +int randomInterger(); +Mat genMask(); +AscendMat genNpuMask(); + +#endif //__OPENCV_TEST_PRECOMP_HPP__ diff --git a/modules/cannops/test/test_utils.cpp b/modules/cannops/test/test_utils.cpp new file mode 100644 index 00000000000..d2bd31647b7 --- /dev/null +++ b/modules/cannops/test/test_utils.cpp @@ -0,0 +1,49 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+ +#include "test_precomp.hpp" + +// Random Generator +Mat randomMat(int w, int h, int dtype, float min, float max) +{ + Mat rnMat(w, h, dtype); + RNG rng(getTickCount()); + rng.fill(rnMat, RNG::UNIFORM, min, max); + return rnMat; +} +Scalar randomScalar() +{ + RNG rng(getTickCount()); + Scalar sc; + rng.fill(sc, RNG::UNIFORM, 1.0, 5.0); + return sc; +} +float randomNum() +{ + RNG rng(getTickCount()); + float rdnNum = float(rng.uniform(1.0, 5.0)); + return rdnNum; +} + +int randomInterger() +{ + RNG rng(getTickCount()); + float rdnNum = float(rng.uniform(1, 5)); + return rdnNum; +} + +Mat genMask() +{ + Mat mask = Mat::zeros(Size(10, 10), CV_8UC1); + rectangle(mask, cv::Rect(5, 5, 3, 3), Scalar(255), -1); + return mask; +} + +AscendMat genNpuMask() +{ + cv::Mat mask = genMask(); + cv::cann::AscendMat npuMask; + npuMask.upload(mask); + return npuMask; +} diff --git a/modules/cannops/tutorials/ascend_npu_image_processing.markdown b/modules/cannops/tutorials/ascend_npu_image_processing.markdown new file mode 100644 index 00000000000..ed905831d31 --- /dev/null +++ b/modules/cannops/tutorials/ascend_npu_image_processing.markdown @@ -0,0 +1,130 @@ +Ascend NPU Image Processing {#tutorial_ascend_npu_image_processing} +========================================================== + +## Goal + +In this guide, you will gain insights into the thread safety of Ascend operators already in use, as well as discover how to effectively employ Ascend operators for image preprocessing and understand their usage limitations. + +## Preface + +We provide a suite of common matrix operation operators that support the [Ascend NPU](https://www.hiascend.com/en/) within OpenCV. For user convenience, the new 'AscendMat' structure and its associated operators maintain compatibility with the 'Mat' interface in OpenCV. These operators encompass a wide range of frequently used functions, including arithmetic operations, image processing operations, and image color space conversion. 
All of these operators are implemented utilizing [CANN](https://www.hiascend.com/en/software/cann)(Compute Architecture of Neural Networks). The Ascend operator facilitates accelerated operations on the NPU by making use of CANN. This acceleration effect is particularly noticeable when working with larger images, such as those with dimensions like 2048x2048, 3840x2160, 7680x4320, etc. + + +## Instructions on Thread Safety + +Our stream function is implemented by invoking the CANN operators. In the same stream, tasks are executed sequentially, while across different streams, tasks are executed in parallel. The use of event mechanisms ensures synchronization of tasks between streams, please refer to the [**Stream Management**](https://www.hiascend.com/document/detail/en/CANNCommunityEdition/600alphaX/infacldevg/aclcppdevg/aclcppdevg_000147.html) documentation for details. + + +## Example for Image Preprocessing + +In this section, you will discover how to use Ascend operators for image preprocessing, including functions below: + +- Add +- Rotate +- Flip + + +### code + +@add_toggle_cpp +@include opencv_contrib/modules/cannops/samples/image_processing.cpp +@end_toggle + +@add_toggle_python +@include opencv_contrib/modules/cannops/samples/image_processing.py +@end_toggle + +### Explanation + +**Input Image** + +@add_toggle_cpp +@snippet opencv_contrib/modules/cannops/samples/image_processing.cpp input_noise +@end_toggle + +@add_toggle_python + +```python +# Read the input image +img = cv2.imread("/path/to/img") +# Generate Gaussian noise that will be added into the input image +gaussNoise = np.random.normal(loc=0, scale=25, size=(img.shape[0], img.shape[1], img.shape[2])).astype(img.dtype) +``` + +@end_toggle + +**Setup CANN** + +@add_toggle_cpp + +@snippet opencv_contrib/modules/cannops/samples/image_processing.cpp setup + +@end_toggle + +@add_toggle_python + +@snippet opencv_contrib/modules/cannops/samples/image_processing.py setup + +@end_toggle +**Image Preprocessing 
Example** + +@add_toggle_cpp + +@snippet opencv_contrib/modules/cannops/samples/image_processing.cpp image-process + +@end_toggle + +@add_toggle_python + +@snippet opencv_contrib/modules/cannops/samples/image_processing.py image-process + +@end_toggle + +**Tear down CANN** + +@add_toggle_cpp +@snippet opencv_contrib/modules/cannops/samples/image_processing.cpp tear-down-cann + +@end_toggle + +@add_toggle_python + +@snippet opencv_contrib/modules/cannops/samples/image_processing.py tear-down-cann + +@end_toggle +Results + +1. The original RGB input image with dimensions of (480, 640, 3): + + ![puppy](./puppy.jpg) + +2. After introducing Gaussian noise, we obtain the following result: + + ![puppy_noisy](./puppy_noisy.jpg) + +3. When applying the rotate operation with a rotation code of 0 (90 degrees clockwise), we obtain this result: + + ![puppy_noisy_rotate](./puppy_noisy_rotate.jpg) + +4. Upon applying the flip operation with a flip code of 0 (flipping around the x-axis), we achieve the final result: + + ![puppy_processed_normalized](./puppy_processed.jpg) + + + +## Usage Limitations + +While Ascend supports most commonly used operators, there are still some limitations that need to be addressed. + +- There is no strict limit on the size of the input image used for encoding; however, it depends on the available RAM size of your device. +- Please note that not all data types (dtypes) are supported by every operator. The current dtype limitations are outlined in the following table. We are actively working on addressing these limitations through automatic dtype conversion in an upcoming commit. 
+ + +| Operator | Supported Dtype | +| ---------------------- | ------------------------------------------------------------ | +| multiply (with scale) | float16,float32,int32 | +| divide (with scale) | float16,float,int32,int8,uint8 | +| bitwise and/or/xor/not | int32,int16,uint16 | +| flip | float16,float,int64,int32,int16,uint16 | +| transpose | float16,float,int64,int32,int16,int8,uint64,uint32,uint16,uint8,bool | +| rotate | float16,float,int64,int32,int16,uint16 | diff --git a/modules/cannops/tutorials/puppy.jpg b/modules/cannops/tutorials/puppy.jpg new file mode 100644 index 00000000000..b0f0595e5ce Binary files /dev/null and b/modules/cannops/tutorials/puppy.jpg differ diff --git a/modules/cannops/tutorials/puppy_noisy.jpg b/modules/cannops/tutorials/puppy_noisy.jpg new file mode 100644 index 00000000000..e90cadb1720 Binary files /dev/null and b/modules/cannops/tutorials/puppy_noisy.jpg differ diff --git a/modules/cannops/tutorials/puppy_noisy_rotate.jpg b/modules/cannops/tutorials/puppy_noisy_rotate.jpg new file mode 100644 index 00000000000..e62b04834dc Binary files /dev/null and b/modules/cannops/tutorials/puppy_noisy_rotate.jpg differ diff --git a/modules/cannops/tutorials/puppy_processed.jpg b/modules/cannops/tutorials/puppy_processed.jpg new file mode 100644 index 00000000000..296b47aefea Binary files /dev/null and b/modules/cannops/tutorials/puppy_processed.jpg differ diff --git a/modules/ccalib/src/omnidir.cpp b/modules/ccalib/src/omnidir.cpp index ccca0b57a8f..5e5f8cc2bf4 100644 --- a/modules/ccalib/src/omnidir.cpp +++ b/modules/ccalib/src/omnidir.cpp @@ -2138,7 +2138,7 @@ void cv::omnidir::internal::flags2idxStereo(int flags, std::vector& idx, in } } -// fill in zerso for fixed parameters +// fill in zeros for fixed parameters void cv::omnidir::internal::fillFixed(Mat&G, int flags, int n) { Mat tmp = G.clone(); diff --git a/modules/cudaarithm/test/test_event.cpp b/modules/cudaarithm/test/test_event.cpp index 375c51d0d35..ffe0f7b681e 100644 --- 
a/modules/cudaarithm/test/test_event.cpp +++ b/modules/cudaarithm/test/test_event.cpp @@ -91,7 +91,7 @@ CUDA_TEST_P(AsyncEvent, Timing) const double elTimeMs = Event::elapsedTime(startEvent, stopEvent); ASSERT_GT(elTimeMs, 0); } - catch (cv::Exception ex) { + catch (const cv::Exception& ex) { failed = true; } ASSERT_EQ(failed, shouldFail.at(i)); diff --git a/modules/cudacodec/include/opencv2/cudacodec.hpp b/modules/cudacodec/include/opencv2/cudacodec.hpp index 42325c64613..163417108e7 100644 --- a/modules/cudacodec/include/opencv2/cudacodec.hpp +++ b/modules/cudacodec/include/opencv2/cudacodec.hpp @@ -184,18 +184,18 @@ struct CV_EXPORTS_W_SIMPLE EncoderParams public: CV_WRAP EncoderParams() : nvPreset(ENC_PRESET_P3), tuningInfo(ENC_TUNING_INFO_HIGH_QUALITY), encodingProfile(ENC_CODEC_PROFILE_AUTOSELECT), rateControlMode(ENC_PARAMS_RC_VBR), multiPassEncoding(ENC_MULTI_PASS_DISABLED), constQp({ 0,0,0 }), averageBitRate(0), maxBitRate(0), - targetQuality(30), gopLength(0) {}; - + targetQuality(30), gopLength(250), idrPeriod(250) {}; CV_PROP_RW EncodePreset nvPreset; CV_PROP_RW EncodeTuningInfo tuningInfo; CV_PROP_RW EncodeProfile encodingProfile; CV_PROP_RW EncodeParamsRcMode rateControlMode; CV_PROP_RW EncodeMultiPass multiPassEncoding; - CV_PROP_RW EncodeQp constQp; //!< QP's for ENC_PARAMS_RC_CONSTQP. - CV_PROP_RW int averageBitRate; //!< target bitrate for ENC_PARAMS_RC_VBR and ENC_PARAMS_RC_CBR. - CV_PROP_RW int maxBitRate; //!< upper bound on bitrate for ENC_PARAMS_RC_VBR and ENC_PARAMS_RC_CONSTQP. - CV_PROP_RW uint8_t targetQuality; //!< value 0 - 51 where video quality decreases as targetQuality increases, used with ENC_PARAMS_RC_VBR. - CV_PROP_RW int gopLength; + CV_PROP_RW EncodeQp constQp; //!< QP's for \ref ENC_PARAMS_RC_CONSTQP. + CV_PROP_RW int averageBitRate; //!< target bitrate for \ref ENC_PARAMS_RC_VBR and \ref ENC_PARAMS_RC_CBR. + CV_PROP_RW int maxBitRate; //!< upper bound on bitrate for \ref ENC_PARAMS_RC_VBR and \ref ENC_PARAMS_RC_CONSTQP. 
+ CV_PROP_RW uint8_t targetQuality; //!< value 0 - 51 where video quality decreases as targetQuality increases, used with \ref ENC_PARAMS_RC_VBR. + CV_PROP_RW int gopLength; //!< the number of pictures in one GOP, ensuring \ref idrPeriod >= \ref gopLength. + CV_PROP_RW int idrPeriod; //!< IDR interval, ensuring \ref idrPeriod >= \ref gopLength. }; CV_EXPORTS bool operator==(const EncoderParams& lhs, const EncoderParams& rhs); @@ -209,7 +209,7 @@ class CV_EXPORTS_W EncoderCallback { @param vPacket The raw bitstream for one or more frames. */ - virtual void onEncoded(std::vector> vPacket) = 0; + virtual void onEncoded(const std::vector>& vPacket) = 0; /** @brief Callback function to that the encoding has finished. * */ @@ -218,14 +218,14 @@ class CV_EXPORTS_W EncoderCallback { virtual ~EncoderCallback() {} }; -/** @brief Video writer interface. +/** @brief Video writer interface, see createVideoWriter(). -Available when built with WITH_NVCUVENC=ON while Nvidia's Video Codec SDK is installed. +Available if Nvidia's Video Codec SDK is installed. -Encoding support is dependent on the GPU, refer to the Nvidia Video Codec SDK Video Encode and Decode GPU Support Matrix for details. +Only Codec::H264 and Codec::HEVC are supported with encoding support dependent on the GPU, refer to the Nvidia Video Codec SDK Video Encode and Decode GPU Support Matrix for details. @note - - An example on how to use the videoWriter class can be found at + - An example on how to use the VideoWriter class can be found at opencv_source_code/samples/gpu/video_writer.cpp */ class CV_EXPORTS_W VideoWriter @@ -253,9 +253,9 @@ class CV_EXPORTS_W VideoWriter /** @brief Creates video writer. -@param fileName Name of the output video file. Only raw h264 or hevc files are supported. +@param fileName Name of the output video file. @param frameSize Size of the input video frames. -@param codec Codec. +@param codec Supports Codec::H264 and Codec::HEVC. @param fps Framerate of the created video stream. 
@param colorFormat OpenCv color format of the frames to be encoded. @param encoderCallback Callbacks for video encoder. See cudacodec::EncoderCallback. Required for working with the encoded video stream. @@ -266,9 +266,9 @@ CV_EXPORTS_W Ptr createVideoWriter(const String& fileNam /** @brief Creates video writer. -@param fileName Name of the output video file. Only raw h264 or hevc files are supported. +@param fileName Name of the output video file. @param frameSize Size of the input video frames. -@param codec Codec. +@param codec Supports Codec::H264 and Codec::HEVC. @param fps Framerate of the created video stream. @param colorFormat OpenCv color format of the frames to be encoded. @param params Additional encoding parameters. @@ -361,14 +361,14 @@ enum class VideoReaderProps { #endif }; -/** @brief Video reader interface. +/** @brief Video reader interface, see createVideoReader(). -Available when built with WITH_NVCUVID=ON while Nvidia's Video Codec SDK is installed. +Available if Nvidia's Video Codec SDK is installed. Decoding support is dependent on the GPU, refer to the Nvidia Video Codec SDK Video Encode and Decode GPU Support Matrix for details. @note - - An example on how to use the videoReader class can be found at + - An example on how to use the VideoReader interface can be found at opencv_source_code/samples/gpu/video_reader.cpp */ class CV_EXPORTS_W VideoReader @@ -544,6 +544,14 @@ class CV_EXPORTS_W RawVideoSource @return `true` unless the property is unset set or not supported. */ virtual bool get(const int propertyId, double& propertyVal) const = 0; + + /** @brief Retrieve the index of the first frame that will be returned after construction. + + @return the index of the first frame that will be returned after construction. + + @note To reduce the decoding overhead when initializing VideoReader to start its decoding from frame N, RawVideoSource should seek to the first valid key frame less than or equal to N and return that index here. 
+ */ + virtual int getFirstFrameIdx() const = 0; }; /** @brief VideoReader initialization parameters @@ -561,9 +569,10 @@ but it cannot go below the number determined by NVDEC. @param targetRoi Region of interest (x/width should be multiples of 4 and y/height multiples of 2) within the output frame to copy and resize the decoded frame to, defaults to the full frame. @param enableHistogram Request output of decoded luma histogram \a hist from VideoReader::nextFrame(GpuMat& frame, GpuMat& hist, Stream& stream), if hardware supported. +@param firstFrameIdx Index of the first frame to seek to on initialization of the VideoReader. */ struct CV_EXPORTS_W_SIMPLE VideoReaderInitParams { - CV_WRAP VideoReaderInitParams() : udpSource(false), allowFrameDrop(false), minNumDecodeSurfaces(0), rawMode(0), enableHistogram(false){}; + CV_WRAP VideoReaderInitParams() : udpSource(false), allowFrameDrop(false), minNumDecodeSurfaces(0), rawMode(0), enableHistogram(false), firstFrameIdx(0){}; CV_PROP_RW bool udpSource; CV_PROP_RW bool allowFrameDrop; CV_PROP_RW int minNumDecodeSurfaces; @@ -572,6 +581,7 @@ struct CV_EXPORTS_W_SIMPLE VideoReaderInitParams { CV_PROP_RW cv::Rect srcRoi; CV_PROP_RW cv::Rect targetRoi; CV_PROP_RW bool enableHistogram; + CV_PROP_RW int firstFrameIdx; }; /** @brief Creates video reader. 
diff --git a/modules/cudacodec/src/NvEncoder.cpp b/modules/cudacodec/src/NvEncoder.cpp index 249f6f1c61e..aa9d2a67c17 100644 --- a/modules/cudacodec/src/NvEncoder.cpp +++ b/modules/cudacodec/src/NvEncoder.cpp @@ -7,16 +7,6 @@ #include "NvEncoder.h" namespace cv { namespace cudacodec { -#ifndef _WIN32 -#include -static inline bool operator==(const GUID& guid1, const GUID& guid2) { - return !memcmp(&guid1, &guid2, sizeof(GUID)); -} - -static inline bool operator!=(const GUID& guid1, const GUID& guid2) { - return !(guid1 == guid2); -} -#endif NvEncoder::NvEncoder(NV_ENC_DEVICE_TYPE eDeviceType, void* pDevice, uint32_t nWidth, uint32_t nHeight, NV_ENC_BUFFER_FORMAT eBufferFormat, uint32_t nExtraOutputDelay) : diff --git a/modules/cudacodec/src/NvEncoder.h b/modules/cudacodec/src/NvEncoder.h index dd13d2c1501..c5a53712e14 100644 --- a/modules/cudacodec/src/NvEncoder.h +++ b/modules/cudacodec/src/NvEncoder.h @@ -15,6 +15,17 @@ namespace cv { namespace cudacodec { +#ifndef _WIN32 +#include + static inline bool operator==(const GUID& guid1, const GUID& guid2) { + return !memcmp(&guid1, &guid2, sizeof(GUID)); + } + + static inline bool operator!=(const GUID& guid1, const GUID& guid2) { + return !(guid1 == guid2); + } +#endif + #define NVENC_THROW_ERROR( errorStr, errorCode ) \ do \ { \ diff --git a/modules/cudacodec/src/ffmpeg_video_source.cpp b/modules/cudacodec/src/ffmpeg_video_source.cpp index 20a02f84b55..87b7ef149e2 100644 --- a/modules/cudacodec/src/ffmpeg_video_source.cpp +++ b/modules/cudacodec/src/ffmpeg_video_source.cpp @@ -169,19 +169,21 @@ bool ParamSetsExist(unsigned char* parameterSets, const int szParameterSets, uns return paramSetStartCodeLen != 0 && packetStartCodeLen != 0 && parameterSets[paramSetStartCodeLen] == data[packetStartCodeLen]; } -cv::cudacodec::detail::FFmpegVideoSource::FFmpegVideoSource(const String& fname, const std::vector& _videoCaptureParams) +cv::cudacodec::detail::FFmpegVideoSource::FFmpegVideoSource(const String& fname, const 
std::vector& _videoCaptureParams, const int iMaxStartFrame) : videoCaptureParams(_videoCaptureParams) { if (!videoio_registry::hasBackend(CAP_FFMPEG)) CV_Error(Error::StsNotImplemented, "FFmpeg backend not found"); - cap.open(fname, CAP_FFMPEG, videoCaptureParams); - if (!cap.isOpened()) + videoCaptureParams.push_back(CAP_PROP_FORMAT); + videoCaptureParams.push_back(-1); + if (!cap.open(fname, CAP_FFMPEG, videoCaptureParams)) CV_Error(Error::StsUnsupportedFormat, "Unsupported video source"); - - if (!cap.set(CAP_PROP_FORMAT, -1)) // turn off video decoder (extract stream) - CV_Error(Error::StsUnsupportedFormat, "Fetching of RAW video streams is not supported"); CV_Assert(cap.get(CAP_PROP_FORMAT) == -1); + if (iMaxStartFrame) { + CV_Assert(cap.set(CAP_PROP_POS_FRAMES, iMaxStartFrame)); + firstFrameIdx = static_cast(cap.get(CAP_PROP_POS_FRAMES)); + } const int codecExtradataIndex = static_cast(cap.get(CAP_PROP_CODEC_EXTRADATA_INDEX)); Mat tmpExtraData; diff --git a/modules/cudacodec/src/ffmpeg_video_source.hpp b/modules/cudacodec/src/ffmpeg_video_source.hpp index ce8582f6503..b2c25817a4c 100644 --- a/modules/cudacodec/src/ffmpeg_video_source.hpp +++ b/modules/cudacodec/src/ffmpeg_video_source.hpp @@ -51,7 +51,7 @@ namespace cv { namespace cudacodec { namespace detail { class FFmpegVideoSource : public RawVideoSource { public: - FFmpegVideoSource(const String& fname, const std::vector& params); + FFmpegVideoSource(const String& fname, const std::vector& params, const int iMaxStartFrame); ~FFmpegVideoSource(); bool getNextPacket(unsigned char** data, size_t* size) CV_OVERRIDE; @@ -66,12 +66,15 @@ class FFmpegVideoSource : public RawVideoSource bool get(const int propertyId, double& propertyVal) const; + int getFirstFrameIdx() const { return firstFrameIdx; } + private: FormatInfo format_; VideoCapture cap; Mat rawFrame, extraData, dataWithHeader; int iFrame = 0; std::vector videoCaptureParams; + int firstFrameIdx = 0; }; }}} diff --git 
a/modules/cudacodec/src/video_reader.cpp b/modules/cudacodec/src/video_reader.cpp index b6ef2ca5376..6d71e544fa0 100644 --- a/modules/cudacodec/src/video_reader.cpp +++ b/modules/cudacodec/src/video_reader.cpp @@ -112,7 +112,7 @@ namespace { public: explicit VideoReaderImpl(const Ptr& source, const int minNumDecodeSurfaces, const bool allowFrameDrop = false , const bool udpSource = false, - const Size targetSz = Size(), const Rect srcRoi = Rect(), const Rect targetRoi = Rect(), const bool enableHistogram = false); + const Size targetSz = Size(), const Rect srcRoi = Rect(), const Rect targetRoi = Rect(), const bool enableHistogram = false, const int firstFrameIdx = 0); ~VideoReaderImpl(); bool nextFrame(GpuMat& frame, Stream& stream) CV_OVERRIDE; @@ -135,6 +135,9 @@ namespace bool get(const int propertyId, double& propertyVal) const CV_OVERRIDE; private: + bool skipFrame(); + bool aquireFrameInfo(std::pair& frameInfo, Stream& stream = Stream::Null()); + void releaseFrameInfo(const std::pair& frameInfo); bool internalGrab(GpuMat & frame, GpuMat & histogram, Stream & stream); void waitForDecoderInit(); @@ -154,6 +157,7 @@ namespace static const int rawPacketsBaseIdx = 2; ColorFormat colorFormat = ColorFormat::BGRA; static const String errorMsg; + int iFrame = 0; }; const String VideoReaderImpl::errorMsg = "Parsing/Decoding video source failed, check GPU memory is available and GPU supports requested functionality."; @@ -173,7 +177,7 @@ namespace } VideoReaderImpl::VideoReaderImpl(const Ptr& source, const int minNumDecodeSurfaces, const bool allowFrameDrop, const bool udpSource, - const Size targetSz, const Rect srcRoi, const Rect targetRoi, const bool enableHistogram) : + const Size targetSz, const Rect srcRoi, const Rect targetRoi, const bool enableHistogram, const int firstFrameIdx) : videoSource_(source), lock_(0) { @@ -190,6 +194,8 @@ namespace videoSource_->setVideoParser(videoParser_); videoSource_->start(); waitForDecoderInit(); + for(iFrame = 
videoSource_->getFirstFrameIdx(); iFrame < firstFrameIdx; iFrame++) + CV_Assert(skipFrame()); videoSource_->updateFormat(videoDecoder_->format()); } @@ -209,10 +215,7 @@ namespace CUvideoctxlock m_lock; }; - bool VideoReaderImpl::internalGrab(GpuMat& frame, GpuMat& histogram, Stream& stream) { - if (videoParser_->hasError()) - CV_Error(Error::StsError, errorMsg); - cudacodec::FormatInfo fmt; + bool VideoReaderImpl::aquireFrameInfo(std::pair& frameInfo, Stream& stream) { if (frames_.empty()) { CUVIDPARSERDISPINFO displayInfo; @@ -234,8 +237,6 @@ namespace bool isProgressive = displayInfo.progressive_frame != 0; const int num_fields = isProgressive ? 1 : 2 + displayInfo.repeat_first_field; - fmt = videoDecoder_->format(); - videoSource_->updateFormat(fmt); for (int active_field = 0; active_field < num_fields; ++active_field) { @@ -243,25 +244,46 @@ namespace std::memset(&videoProcParams, 0, sizeof(CUVIDPROCPARAMS)); videoProcParams.progressive_frame = displayInfo.progressive_frame; - videoProcParams.second_field = active_field; - videoProcParams.top_field_first = displayInfo.top_field_first; - videoProcParams.unpaired_field = (num_fields == 1); + videoProcParams.second_field = active_field; + videoProcParams.top_field_first = displayInfo.top_field_first; + videoProcParams.unpaired_field = (num_fields == 1); videoProcParams.output_stream = StreamAccessor::getStream(stream); frames_.push_back(std::make_pair(displayInfo, videoProcParams)); } } + else { + for (auto& frame : frames_) + frame.second.output_stream = StreamAccessor::getStream(stream); + } if (frames_.empty()) return false; - std::pair frameInfo = frames_.front(); + frameInfo = frames_.front(); frames_.pop_front(); + return true; + } + + void VideoReaderImpl::releaseFrameInfo(const std::pair& frameInfo) { + // release the frame, so it can be re-used in decoder + if (frames_.empty()) + frameQueue_->releaseFrame(frameInfo.first); + } + + bool VideoReaderImpl::internalGrab(GpuMat& frame, GpuMat& histogram, 
Stream& stream) { + if (videoParser_->hasError()) + CV_Error(Error::StsError, errorMsg); + + std::pair frameInfo; + if (!aquireFrameInfo(frameInfo, stream)) + return false; { VideoCtxAutoLock autoLock(lock_); unsigned long long cuHistogramPtr = 0; + const cudacodec::FormatInfo fmt = videoDecoder_->format(); if (fmt.enableHistogram) frameInfo.second.histogram_dptr = &cuHistogramPtr; @@ -281,10 +303,16 @@ namespace videoDecoder_->unmapFrame(decodedFrame); } - // release the frame, so it can be re-used in decoder - if (frames_.empty()) - frameQueue_->releaseFrame(frameInfo.first); + releaseFrameInfo(frameInfo); + iFrame++; + return true; + } + bool VideoReaderImpl::skipFrame() { + std::pair frameInfo; + if (!aquireFrameInfo(frameInfo)) + return false; + releaseFrameInfo(frameInfo); return true; } @@ -399,6 +427,10 @@ namespace } bool VideoReaderImpl::get(const int propertyId, double& propertyVal) const { + if (propertyId == cv::VideoCaptureProperties::CAP_PROP_POS_FRAMES) { + propertyVal = static_cast(iFrame); + return true; + } return videoSource_->get(propertyId, propertyVal); } @@ -421,11 +453,10 @@ Ptr cv::cudacodec::createVideoReader(const String& filename, const CV_Assert(!filename.empty()); Ptr videoSource; - try { // prefer ffmpeg to cuvidGetSourceVideoFormat() which doesn't always return the corrct raw pixel format - Ptr source(new FFmpegVideoSource(filename, sourceParams)); + Ptr source(new FFmpegVideoSource(filename, sourceParams, params.firstFrameIdx)); videoSource.reset(new RawVideoSourceWrapper(source, params.rawMode)); } catch (...) 
@@ -433,16 +464,15 @@ Ptr cv::cudacodec::createVideoReader(const String& filename, const if (sourceParams.size()) throw; videoSource.reset(new CuvidVideoSource(filename)); } - return makePtr(videoSource, params.minNumDecodeSurfaces, params.allowFrameDrop, params.udpSource, params.targetSz, - params.srcRoi, params.targetRoi, params.enableHistogram); + params.srcRoi, params.targetRoi, params.enableHistogram, params.firstFrameIdx); } Ptr cv::cudacodec::createVideoReader(const Ptr& source, const VideoReaderInitParams params) { Ptr videoSource(new RawVideoSourceWrapper(source, params.rawMode)); return makePtr(videoSource, params.minNumDecodeSurfaces, params.allowFrameDrop, params.udpSource, params.targetSz, - params.srcRoi, params.targetRoi, params.enableHistogram); + params.srcRoi, params.targetRoi, params.enableHistogram, params.firstFrameIdx); } void cv::cudacodec::MapHist(const GpuMat& hist, Mat& histFull) { diff --git a/modules/cudacodec/src/video_source.cpp b/modules/cudacodec/src/video_source.cpp index a81b75e366d..169ffbb9bce 100644 --- a/modules/cudacodec/src/video_source.cpp +++ b/modules/cudacodec/src/video_source.cpp @@ -76,6 +76,10 @@ bool cv::cudacodec::detail::RawVideoSourceWrapper::get(const int propertyId, dou return source_->get(propertyId, propertyVal); } +int cv::cudacodec::detail::RawVideoSourceWrapper::getFirstFrameIdx() const { + return source_->getFirstFrameIdx(); +} + void cv::cudacodec::detail::RawVideoSourceWrapper::start() { stop_ = false; diff --git a/modules/cudacodec/src/video_source.hpp b/modules/cudacodec/src/video_source.hpp index 8c96a34f2d5..f7e4c0bd15b 100644 --- a/modules/cudacodec/src/video_source.hpp +++ b/modules/cudacodec/src/video_source.hpp @@ -58,6 +58,7 @@ class VideoSource virtual FormatInfo format() const = 0; virtual void updateFormat(const FormatInfo& videoFormat) = 0; virtual bool get(const int propertyId, double& propertyVal) const { return false; } + virtual int getFirstFrameIdx() const { return 0; } virtual void 
start() = 0; virtual void stop() = 0; virtual bool isStarted() const = 0; @@ -91,6 +92,7 @@ class RawVideoSourceWrapper : public VideoSource FormatInfo format() const CV_OVERRIDE; void updateFormat(const FormatInfo& videoFormat) CV_OVERRIDE; bool get(const int propertyId, double& propertyVal) const CV_OVERRIDE; + int getFirstFrameIdx() const CV_OVERRIDE; void start() CV_OVERRIDE; void stop() CV_OVERRIDE; bool isStarted() const CV_OVERRIDE; diff --git a/modules/cudacodec/src/video_writer.cpp b/modules/cudacodec/src/video_writer.cpp index db3e2e36306..8b5c703f759 100644 --- a/modules/cudacodec/src/video_writer.cpp +++ b/modules/cudacodec/src/video_writer.cpp @@ -59,7 +59,6 @@ GUID CodecGuid(const Codec codec); void FrameRate(const double fps, uint32_t& frameRateNum, uint32_t& frameRateDen); GUID EncodingProfileGuid(const EncodeProfile encodingProfile); GUID EncodingPresetGuid(const EncodePreset nvPreset); -bool Equal(const GUID& g1, const GUID& g2); bool operator==(const EncoderParams& lhs, const EncoderParams& rhs) { @@ -68,12 +67,48 @@ bool operator==(const EncoderParams& lhs, const EncoderParams& rhs) rhs.averageBitRate, rhs.maxBitRate, rhs.targetQuality, rhs.gopLength); }; +class FFmpegVideoWriter : public EncoderCallback +{ +public: + FFmpegVideoWriter(const String& fileName, const Codec codec, const int fps, const Size sz, const int idrPeriod); + ~FFmpegVideoWriter(); + void onEncoded(const std::vector>& vPacket); + void onEncodingFinished(); +private: + cv::VideoWriter writer; +}; + +FFmpegVideoWriter::FFmpegVideoWriter(const String& fileName, const Codec codec, const int fps, const Size sz, const int idrPeriod) { + if (!videoio_registry::hasBackend(CAP_FFMPEG)) + CV_Error(Error::StsNotImplemented, "FFmpeg backend not found"); + const int fourcc = codec == Codec::H264 ? 
cv::VideoWriter::fourcc('a', 'v', 'c', '1') : cv::VideoWriter::fourcc('h', 'e', 'v', '1'); + writer.open(fileName, fourcc, fps, sz, { VideoWriterProperties::VIDEOWRITER_PROP_RAW_VIDEO, 1, VideoWriterProperties::VIDEOWRITER_PROP_KEY_INTERVAL, idrPeriod }); + if (!writer.isOpened()) + CV_Error(Error::StsUnsupportedFormat, "Unsupported video sink"); +} + +void FFmpegVideoWriter::onEncodingFinished() { + writer.release(); +} + +FFmpegVideoWriter::~FFmpegVideoWriter() { + onEncodingFinished(); +} + +void FFmpegVideoWriter::onEncoded(const std::vector>& vPacket) { + for (auto& packet : vPacket) { + Mat wrappedPacket(1, packet.size(), CV_8UC1, (void*)packet.data()); + writer.write(wrappedPacket); + } +} + + class RawVideoWriter : public EncoderCallback { public: - RawVideoWriter(String fileName); + RawVideoWriter(const String fileName); ~RawVideoWriter(); - void onEncoded(std::vector> vPacket); + void onEncoded(const std::vector>& vPacket); void onEncodingFinished(); private: std::ofstream fpOut; @@ -93,9 +128,9 @@ RawVideoWriter::~RawVideoWriter() { onEncodingFinished(); } -void RawVideoWriter::onEncoded(std::vector> vPacket) { +void RawVideoWriter::onEncoded(const std::vector>& vPacket) { for (auto& packet : vPacket) - fpOut.write(reinterpret_cast(packet.data()), packet.size()); + fpOut.write(reinterpret_cast(packet.data()), packet.size()); } class VideoWriterImpl : public VideoWriter @@ -172,12 +207,6 @@ VideoWriterImpl::VideoWriterImpl(const Ptr& encoderCallBack_, c Init(codec, fps, frameSz); } -VideoWriterImpl::VideoWriterImpl(const Ptr& encoderCallback, const Size frameSz, const Codec codec, const double fps, - const ColorFormat colorFormat, const Stream& stream) : - VideoWriterImpl(encoderCallback, frameSz, codec, fps, colorFormat, EncoderParams(), stream) -{ -} - void VideoWriterImpl::release() { pEnc->EndEncode(vPacket); encoderCallback->onEncoded(vPacket); @@ -271,12 +300,6 @@ GUID EncodingPresetGuid(const EncodePreset nvPreset) { 
CV_Error(Error::StsUnsupportedFormat, msg); } -bool Equal(const GUID& g1, const GUID& g2) { - if (std::tie(g1.Data1, g1.Data2, g1.Data3, g1.Data4) == std::tie(g2.Data1, g2.Data2, g2.Data3, g2.Data4)) - return true; - return false; -} - void VideoWriterImpl::InitializeEncoder(const GUID codec, const double fps) { NV_ENC_INITIALIZE_PARAMS initializeParams = {}; @@ -293,10 +316,10 @@ void VideoWriterImpl::InitializeEncoder(const GUID codec, const double fps) initializeParams.encodeConfig->rcParams.maxBitRate = encoderParams.maxBitRate; initializeParams.encodeConfig->rcParams.targetQuality = encoderParams.targetQuality; initializeParams.encodeConfig->gopLength = encoderParams.gopLength; - if (Equal(codec, NV_ENC_CODEC_H264_GUID)) - initializeParams.encodeConfig->encodeCodecConfig.h264Config.idrPeriod = encoderParams.gopLength; - else if (Equal(codec, NV_ENC_CODEC_HEVC_GUID)) - initializeParams.encodeConfig->encodeCodecConfig.hevcConfig.idrPeriod = encoderParams.gopLength; + if (codec == NV_ENC_CODEC_H264_GUID) + initializeParams.encodeConfig->encodeCodecConfig.h264Config.idrPeriod = encoderParams.idrPeriod; + else if (codec == NV_ENC_CODEC_HEVC_GUID) + initializeParams.encodeConfig->encodeCodecConfig.hevcConfig.idrPeriod = encoderParams.idrPeriod; pEnc->CreateEncoder(&initializeParams); } @@ -371,14 +394,25 @@ EncoderParams VideoWriterImpl::getEncoderParams() const { Ptr createVideoWriter(const String& fileName, const Size frameSize, const Codec codec, const double fps, const ColorFormat colorFormat, Ptr encoderCallback, const Stream& stream) { - encoderCallback = encoderCallback ? 
encoderCallback : new RawVideoWriter(fileName); - return makePtr(encoderCallback, frameSize, codec, fps, colorFormat, stream); + return createVideoWriter(fileName, frameSize, codec, fps, colorFormat, EncoderParams(), encoderCallback, stream); } Ptr createVideoWriter(const String& fileName, const Size frameSize, const Codec codec, const double fps, const ColorFormat colorFormat, const EncoderParams& params, Ptr encoderCallback, const Stream& stream) { - encoderCallback = encoderCallback ? encoderCallback : new RawVideoWriter(fileName); + CV_Assert(params.idrPeriod >= params.gopLength); + if (!encoderCallback) { + // required until PR for raw video encapsulation is merged and windows dll is updated +#ifndef WIN32 // remove #define and keep code once merged + try { + encoderCallback = new FFmpegVideoWriter(fileName, codec, fps, frameSize, params.idrPeriod); + } + catch (...) +#endif + { + encoderCallback = new RawVideoWriter(fileName); + } + } return makePtr(encoderCallback, frameSize, codec, fps, colorFormat, params, stream); } diff --git a/modules/cudacodec/test/test_video.cpp b/modules/cudacodec/test/test_video.cpp index ead5fa944ca..88df2fb1afb 100644 --- a/modules/cudacodec/test/test_video.cpp +++ b/modules/cudacodec/test/test_video.cpp @@ -113,6 +113,10 @@ struct CheckParams : SetDevice { }; +struct Seek : SetDevice +{ +}; + #if defined(HAVE_NVCUVID) ////////////////////////////////////////////////////// // VideoReader @@ -542,36 +546,22 @@ CUDA_TEST_P(CheckParams, Reader) ASSERT_TRUE(reader->get(cv::VideoCaptureProperties::CAP_PROP_OPEN_TIMEOUT_MSEC, msActual)); ASSERT_EQ(msActual, msReference); } - - { - std::vector exceptionsThrown = { false,true }; - std::vector capPropFormats = { -1,0 }; - for (int i = 0; i < capPropFormats.size(); i++) { - bool exceptionThrown = false; - try { - cv::Ptr reader = cv::cudacodec::createVideoReader(inputFile, { - cv::VideoCaptureProperties::CAP_PROP_FORMAT, capPropFormats.at(i) }); - } - catch (cv::Exception &ex) { - if 
(ex.code == Error::StsUnsupportedFormat) - exceptionThrown = true; - } - ASSERT_EQ(exceptionThrown, exceptionsThrown.at(i)); - } - } } CUDA_TEST_P(CheckParams, CaptureProps) { std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../highgui/video/big_buck_bunny.mp4"; cv::Ptr reader = cv::cudacodec::createVideoReader(inputFile); - double width, height, fps; + double width, height, fps, iFrame; ASSERT_TRUE(reader->get(cv::VideoCaptureProperties::CAP_PROP_FRAME_WIDTH, width)); ASSERT_EQ(672, width); ASSERT_TRUE(reader->get(cv::VideoCaptureProperties::CAP_PROP_FRAME_HEIGHT, height)); ASSERT_EQ(384, height); ASSERT_TRUE(reader->get(cv::VideoCaptureProperties::CAP_PROP_FPS, fps)); ASSERT_EQ(24, fps); + ASSERT_TRUE(reader->grab()); + ASSERT_TRUE(reader->get(cv::VideoCaptureProperties::CAP_PROP_POS_FRAMES, iFrame)); + ASSERT_EQ(iFrame, 1.); } CUDA_TEST_P(CheckDecodeSurfaces, Reader) @@ -619,6 +609,37 @@ CUDA_TEST_P(CheckInitParams, Reader) ASSERT_TRUE(reader->get(cv::cudacodec::VideoReaderProps::PROP_RAW_MODE, rawMode) && static_cast(rawMode) == params.rawMode); } +CUDA_TEST_P(Seek, Reader) +{ +#if defined(WIN32) + throw SkipTestException("Test disabled on Windows until the FFMpeg wrapper is updated to include PR24012."); +#endif + std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../highgui/video/big_buck_bunny.mp4"; + // seek to a non key frame + const int firstFrameIdx = 18; + + GpuMat frameGs; + { + cv::Ptr readerGs = cv::cudacodec::createVideoReader(inputFile); + ASSERT_TRUE(readerGs->set(cudacodec::ColorFormat::GRAY)); + for (int i = 0; i <= firstFrameIdx; i++) + ASSERT_TRUE(readerGs->nextFrame(frameGs)); + } + + cudacodec::VideoReaderInitParams params; + params.firstFrameIdx = firstFrameIdx; + cv::Ptr reader = cv::cudacodec::createVideoReader(inputFile, {}, params); + double iFrame = 0.; + ASSERT_TRUE(reader->get(cv::VideoCaptureProperties::CAP_PROP_POS_FRAMES, iFrame)); + ASSERT_EQ(iFrame, static_cast(firstFrameIdx)); + 
ASSERT_TRUE(reader->set(cudacodec::ColorFormat::GRAY)); + GpuMat frame; + ASSERT_TRUE(reader->nextFrame(frame)); + ASSERT_EQ(cuda::norm(frameGs, frame, NORM_INF), 0.0); + ASSERT_TRUE(reader->get(cv::VideoCaptureProperties::CAP_PROP_POS_FRAMES, iFrame)); + ASSERT_EQ(iFrame, static_cast(firstFrameIdx+1)); +} + #endif // HAVE_NVCUVID #if defined(HAVE_NVCUVID) && defined(HAVE_NVCUVENC) @@ -639,7 +660,13 @@ CUDA_TEST_P(TransCode, H264ToH265) constexpr cv::cudacodec::ColorFormat colorFormat = cv::cudacodec::ColorFormat::NV_NV12; constexpr double fps = 25; const cudacodec::Codec codec = cudacodec::Codec::HEVC; - const std::string ext = ".h265"; + // required until PR for raw video encapsulation is merged and windows dll is updated +#ifdef WIN32 + const std::string ext = ".hevc"; +#else + // use this after update + const std::string ext = ".mp4"; +#endif const std::string outputFile = cv::tempfile(ext.c_str()); constexpr int nFrames = 5; Size frameSz; @@ -716,7 +743,13 @@ CUDA_TEST_P(Write, Writer) const cudacodec::Codec codec = GET_PARAM(2); const double fps = GET_PARAM(3); const cv::cudacodec::ColorFormat colorFormat = GET_PARAM(4); + // required until PR for raw video encapsulation is merged and windows dll is updated +#ifdef WIN32 const std::string ext = codec == cudacodec::Codec::H264 ? 
".h264" : ".hevc"; +#else + // use this after update + const std::string ext = ".mp4"; +#endif const std::string outputFile = cv::tempfile(ext.c_str()); constexpr int nFrames = 5; Size frameSz; @@ -750,7 +783,7 @@ CUDA_TEST_P(Write, Writer) const int width = static_cast(cap.get(CAP_PROP_FRAME_WIDTH)); const int height = static_cast(cap.get(CAP_PROP_FRAME_HEIGHT)); ASSERT_EQ(frameSz, Size(width, height)); - ASSERT_TRUE(abs(fps - cap.get(CAP_PROP_FPS)) < 0.5); + ASSERT_EQ(fps, cap.get(CAP_PROP_FPS)); Mat frame; for (int i = 0; i < nFrames; ++i) { cap >> frame; @@ -761,24 +794,22 @@ CUDA_TEST_P(Write, Writer) } #define DEVICE_SRC true, false -#define FPS 10, 29.7 +#define FPS 10, 29 #define CODEC cv::cudacodec::Codec::H264, cv::cudacodec::Codec::HEVC #define COLOR_FORMAT cv::cudacodec::ColorFormat::BGR, cv::cudacodec::ColorFormat::RGB, cv::cudacodec::ColorFormat::BGRA, \ cv::cudacodec::ColorFormat::RGBA, cv::cudacodec::ColorFormat::GRAY INSTANTIATE_TEST_CASE_P(CUDA_Codec, Write, testing::Combine(ALL_DEVICES, testing::Values(DEVICE_SRC), testing::Values(CODEC), testing::Values(FPS), testing::Values(COLOR_FORMAT))); - -struct EncoderParams : testing::TestWithParam +PARAM_TEST_CASE(EncoderParams, cv::cuda::DeviceInfo, int) { cv::cuda::DeviceInfo devInfo; cv::cudacodec::EncoderParams params; virtual void SetUp() { - devInfo = GetParam(); + devInfo = GET_PARAM(0); cv::cuda::setDevice(devInfo.deviceID()); // Fixed params for CBR test - params.nvPreset = cv::cudacodec::EncodePreset::ENC_PRESET_P7; params.tuningInfo = cv::cudacodec::EncodeTuningInfo::ENC_TUNING_INFO_HIGH_QUALITY; params.encodingProfile = cv::cudacodec::EncodeProfile::ENC_H264_PROFILE_MAIN; params.rateControlMode = cv::cudacodec::EncodeParamsRcMode::ENC_PARAMS_RC_CBR; @@ -787,19 +818,25 @@ struct EncoderParams : testing::TestWithParam params.maxBitRate = 0; params.targetQuality = 0; params.gopLength = 5; + params.idrPeriod = GET_PARAM(1); } }; - CUDA_TEST_P(EncoderParams, Writer) { const std::string inputFile 
= std::string(cvtest::TS::ptr()->get_data_path()) + "../highgui/video/big_buck_bunny.mp4"; constexpr double fps = 25.0; constexpr cudacodec::Codec codec = cudacodec::Codec::H264; + // required until PR for raw video encapsulation is merged and windows dll is updated +#ifdef WIN32 const std::string ext = ".h264"; +#else + // use this after update + const std::string ext = ".mp4"; +#endif const std::string outputFile = cv::tempfile(ext.c_str()); Size frameSz; - constexpr int nFrames = 5; + const int nFrames = max(params.gopLength, params.idrPeriod) + 1; { cv::VideoCapture reader(inputFile); ASSERT_TRUE(reader.isOpened()); @@ -829,20 +866,36 @@ CUDA_TEST_P(EncoderParams, Writer) const int height = static_cast(cap.get(CAP_PROP_FRAME_HEIGHT)); ASSERT_EQ(frameSz, Size(width, height)); ASSERT_EQ(fps, cap.get(CAP_PROP_FPS)); - const bool checkGop = videoio_registry::hasBackend(CAP_FFMPEG); - Mat frame; + const bool checkFrameType = videoio_registry::hasBackend(CAP_FFMPEG); + VideoCapture capRaw; + int idrPeriod = 0; + if (checkFrameType) { + capRaw.open(outputFile, CAP_FFMPEG, { CAP_PROP_FORMAT, -1 }); + ASSERT_TRUE(capRaw.isOpened()); + idrPeriod = params.idrPeriod == 0 ? 
params.gopLength : params.idrPeriod; + } + const double frameTypeIAsciiCode = 73.0; // see CAP_PROP_FRAME_TYPE + Mat frame, frameRaw; for (int i = 0; i < nFrames; ++i) { cap >> frame; ASSERT_FALSE(frame.empty()); - if (checkGop && (cap.get(CAP_PROP_FRAME_TYPE) == 73)) { - ASSERT_TRUE(i % params.gopLength == 0); + if (checkFrameType) { + capRaw >> frameRaw; + ASSERT_FALSE(frameRaw.empty()); + const bool intraFrameReference = cap.get(CAP_PROP_FRAME_TYPE) == frameTypeIAsciiCode; + const bool intraFrameActual = i % params.gopLength == 0; + ASSERT_EQ(intraFrameActual, intraFrameReference); + const bool keyFrameActual = capRaw.get(CAP_PROP_LRF_HAS_KEY_FRAME) == 1.0; + const bool keyFrameReference = i % idrPeriod == 0; + ASSERT_EQ(keyFrameActual, keyFrameReference); } } } ASSERT_EQ(0, remove(outputFile.c_str())); } -INSTANTIATE_TEST_CASE_P(CUDA_Codec, EncoderParams, ALL_DEVICES); +#define IDR_PERIOD testing::Values(5,10) +INSTANTIATE_TEST_CASE_P(CUDA_Codec, EncoderParams, testing::Combine(ALL_DEVICES, IDR_PERIOD)); #endif // HAVE_NVCUVENC @@ -926,5 +979,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_Codec, CheckInitParams, testing::Combine( testing::Values("highgui/video/big_buck_bunny.mp4"), testing::Values(true,false), testing::Values(true,false), testing::Values(true,false))); +INSTANTIATE_TEST_CASE_P(CUDA_Codec, Seek, ALL_DEVICES); + #endif // HAVE_NVCUVID || HAVE_NVCUVENC }} // namespace diff --git a/modules/datasets/src/tinyxml2/tinyxml2.h b/modules/datasets/src/tinyxml2/tinyxml2.h index 95ae3bcc057..89a16c65b75 100644 --- a/modules/datasets/src/tinyxml2/tinyxml2.h +++ b/modules/datasets/src/tinyxml2/tinyxml2.h @@ -212,7 +212,7 @@ template class DynArray { public: - DynArray< T, INIT >() { + DynArray() { _mem = _pool; _allocated = INIT; _size = 0; diff --git a/modules/face/samples/landmarks_demo.py b/modules/face/samples/landmarks_demo.py index 4c6c4935975..84893a77a25 100644 --- a/modules/face/samples/landmarks_demo.py +++ b/modules/face/samples/landmarks_demo.py @@ -18,7 
+18,11 @@ print("cascade not found") exit() faces = cascade.detectMultiScale(frame, 1.05, 3, cv.CASCADE_SCALE_IMAGE, (30, 30)) -ok, landmarks = facemark.fit(frame, faces=faces) +if len(faces) == 0: + print('no faces detected') + landmarks = [] +else: + ok, landmarks = facemark.fit(frame, faces=faces) cv.imshow("Image", frame) for marks in landmarks: couleur = (random.randint(0,255), diff --git a/modules/freetype/include/opencv2/freetype.hpp b/modules/freetype/include/opencv2/freetype.hpp index e62d058a876..90007badd1d 100644 --- a/modules/freetype/include/opencv2/freetype.hpp +++ b/modules/freetype/include/opencv2/freetype.hpp @@ -76,7 +76,7 @@ class CV_EXPORTS_W FreeType2 : public Algorithm public: /** @brief Load font data. -The function loadFontData loads font data. +The function loadFontData loads font data from file. @param fontFileName FontFile Name @param idx face_index to select a font faces in a single file. @@ -84,6 +84,19 @@ The function loadFontData loads font data. CV_WRAP virtual void loadFontData(String fontFileName, int idx) = 0; +/** @brief Load font data. + +The function loadFontData loads font data from memory. +The data is not copied, the user needs to make sure the data lives at least as long as FreeType2. +After the FreeType2 object is destroyed, the buffer can be safely deallocated. + +@param pBuf pointer to buffer containing font data +@param bufSize size of buffer +@param idx face_index to select a font faces in a single file. +*/ + + CV_WRAP virtual void loadFontData(char* pBuf, size_t bufSize, int idx) = 0; + /** @brief Set Split Number from Bezier-curve to line The function setSplitNumber set the number of split points from bezier-curve to line. 
diff --git a/modules/freetype/src/freetype.cpp b/modules/freetype/src/freetype.cpp index b8e605e5104..d8934e361a2 100644 --- a/modules/freetype/src/freetype.cpp +++ b/modules/freetype/src/freetype.cpp @@ -67,6 +67,7 @@ class CV_EXPORTS_W FreeType2Impl CV_FINAL : public FreeType2 FreeType2Impl(); ~FreeType2Impl(); void loadFontData(String fontFileName, int idx) CV_OVERRIDE; + void loadFontData(char* pBuf, size_t bufSize, int idx) CV_OVERRIDE; void setSplitNumber( int num ) CV_OVERRIDE; void putText( InputOutputArray img, const String& text, Point org, @@ -87,6 +88,8 @@ class CV_EXPORTS_W FreeType2Impl CV_FINAL : public FreeType2 int mCtoL; hb_font_t *mHb_font; + void loadFontData(FT_Open_Args &args, int idx); + void putTextBitmapMono( InputOutputArray img, const String& text, Point org, int fontHeight, Scalar color, @@ -179,18 +182,54 @@ FreeType2Impl::~FreeType2Impl() } void FreeType2Impl::loadFontData(String fontFileName, int idx) +{ + FT_Open_Args args + { + FT_OPEN_PATHNAME, + nullptr, // memory_base + 0, // memory_size + const_cast(fontFileName.c_str()), + nullptr, // stream + nullptr, // driver + 0, // num_params + nullptr // params + }; + + this->loadFontData(args, idx); +} + +void FreeType2Impl::loadFontData(char* pBuf, size_t bufSize, int idx) +{ + CV_Assert( pBuf != nullptr ); + + FT_Open_Args args + { + FT_OPEN_MEMORY, + reinterpret_cast(pBuf), + static_cast(bufSize), + nullptr, // pathname + nullptr, // stream + nullptr, // driver + 0, // num_params + nullptr // params + }; + + this->loadFontData(args, idx); +} + +void FreeType2Impl::loadFontData(FT_Open_Args &args, int idx) { CV_Assert( idx >= 0 ); - if( mIsFaceAvailable == true ) + if ( mIsFaceAvailable == true ) { - hb_font_destroy (mHb_font); + hb_font_destroy(mHb_font); CV_Assert(!FT_Done_Face(mFace)); } mIsFaceAvailable = false; - CV_Assert( !FT_New_Face( mLibrary, fontFileName.c_str(), static_cast(idx), &(mFace) ) ); + CV_Assert( !FT_Open_Face(mLibrary, &args, idx, &mFace) ); - mHb_font = 
hb_ft_font_create (mFace, NULL); + mHb_font = hb_ft_font_create(mFace, NULL); if ( mHb_font == NULL ) { CV_Assert(!FT_Done_Face(mFace)); diff --git a/modules/freetype/test/test_basic.cpp b/modules/freetype/test/test_basic.cpp index 4c4e0c3d7ce..5a646db45f5 100644 --- a/modules/freetype/test/test_basic.cpp +++ b/modules/freetype/test/test_basic.cpp @@ -55,6 +55,39 @@ TEST(Freetype_Basic, success ) EXPECT_NO_THROW( ft2->putText(dst, "Basic,success", Point( 0, 50), 50, col, -1, LINE_AA, true ) ); } +TEST(Freetype_Basic, in_memory_font ) +{ + const string root = cvtest::TS::ptr()->get_data_path(); + const string font_path = root + "freetype/mplus/Mplus1-Regular.ttf"; + + cv::Ptr ft2; + EXPECT_NO_THROW( ft2 = cv::freetype::createFreeType2() ); + EXPECT_NO_THROW( ft2->loadFontData( font_path, 0 ) ); + + Mat dst(600,600, CV_8UC3, Scalar::all(255) ); + Scalar col(128,64,255,192); + EXPECT_NO_THROW( ft2->putText(dst, "Basic,success", Point( 0, 50), 50, col, -1, LINE_AA, true ) ); + + FILE* fp = fopen(font_path.c_str(), "rb"); + ASSERT_TRUE(fp != NULL); + fseek(fp, 0, SEEK_END); + const size_t file_size = ftell(fp); + fseek(fp, 0, SEEK_SET); + + std::vector font_buffer(file_size); + const size_t actual_read = fread(&font_buffer[0], 1, file_size, fp); + fclose(fp); + ASSERT_EQ(file_size, actual_read); + + cv::Ptr ft2_in_memory; + EXPECT_NO_THROW( ft2_in_memory = cv::freetype::createFreeType2() ); + EXPECT_NO_THROW( ft2_in_memory->loadFontData( &font_buffer[0], file_size, 0 ) ); + Mat dst_in_memory(600,600, CV_8UC3, Scalar::all(255) ); + EXPECT_NO_THROW( ft2_in_memory->putText(dst_in_memory, "Basic,success", Point( 0, 50), 50, col, -1, LINE_AA, true ) ); + + EXPECT_PRED_FORMAT2(cvtest::MatComparator(0, 0), dst, dst_in_memory); +} + /****************** * loadFontData() *****************/ @@ -105,6 +138,37 @@ TEST(Freetype_loadFontData, call_multiple) EXPECT_NO_THROW( ft2->putText(dst, "call_mutilple", Point( 0, 50), 50, col, -1, LINE_AA, true ) ); } 
+TEST(Freetype_loadFontDataMemory, nullptr ) +{ + cv::Ptr ft2; + EXPECT_NO_THROW( ft2 = cv::freetype::createFreeType2() ); + EXPECT_ANY_THROW( ft2->loadFontData( nullptr, 0, 0 ) ); +} + +TEST(Freetype_loadFontDataMemory, broken_data ) +{ + const string root = cvtest::TS::ptr()->get_data_path(); + const string font_path = root + "freetype/mplus/Mplus1-Regular.ttf"; + + FILE* fp = fopen(font_path.c_str(), "rb"); + ASSERT_TRUE(fp != NULL); + fseek(fp, 0, SEEK_END); + const size_t file_size = ftell(fp); + fseek(fp, 0, SEEK_SET); + + std::vector font_buffer(file_size); + const size_t actual_read = fread(&font_buffer[0], 1, file_size, fp); + fclose(fp); + ASSERT_EQ(file_size, actual_read); + + cv::Ptr ft2_in_memory; + EXPECT_NO_THROW( ft2_in_memory = cv::freetype::createFreeType2() ); + + font_buffer[0] = ~font_buffer[0]; // font buffer was broken. + + EXPECT_ANY_THROW( ft2_in_memory->loadFontData( &font_buffer[0], file_size, 0 ) ); +} + typedef testing::TestWithParam idx_range; TEST_P(idx_range, failed ) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 8fde6e457c1..e51f8091cbe 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -296,7 +296,7 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -304,35 +304,35 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 
= t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); diff0 = v_pack(t0, t1); // I*gain.x + gain.x v_int16x8 diff[4] = { - ((v11 << 5) - vI) & vmask, - ((v01 << 5) - vI) & vmask, - ((v10 << 5) - vI) & vmask, - ((v00 << 5) - vI) & vmask + v_and(v_sub(v_shl<5>(v11), vI), vmask), + v_and(v_sub(v_shl<5>(v01), vI), vmask), + v_and(v_sub(v_shl<5>(v10), vI), vmask), + v_and(v_sub(v_shl<5>(v00), vI), vmask) }; - diff0 = diff0 - vI; - diff0 = diff0 & vmask; + diff0 = v_sub(diff0, vI); + diff0 = v_and(diff0, vmask); - v_int16x8 vscale_diff_is_pos = diff0 > vscale; - veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); + v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale); + veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1)); // since there is no abs vor int16x8 we have to do this hack v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vset2, vset1; // |It| < sigma1 ? - vset2 = vabs_diff < vparam1; + vset2 = v_lt(vabs_diff, vparam1); // It > 0 ? - v_int16x8 vdiff_is_pos = diff0 > vzero; + v_int16x8 vdiff_is_pos = v_gt(diff0, vzero); // sigma0 < |It| < sigma1 ? - vset1 = vset2 & (vabs_diff > vparam0); + vset1 = v_and(vset2, v_gt(vabs_diff, vparam0)); // val = |It| -/+ sigma1 - v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); + v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1)); v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8)); @@ -342,7 +342,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for (unsigned int mmi = 0; mmi < 4; mmi++) { // It == 0 ? |It| > sigma13 - diff0 = vset2 & diff[mmi]; + diff0 = v_and(vset2, diff[mmi]); // It == val ? 
sigma0 < |It| < sigma1 diff0 = v_select(vset1, vtmp_param1, diff0); @@ -350,16 +350,16 @@ class TrackerInvoker : public cv::ParallelLoopBody // diff = diff * sigma2 v_int32x4 diff_int_0, diff_int_1; v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); - v_int32x4 diff0_0 = diff_int_0 >> s2bitShift; - v_int32x4 diff0_1 = diff_int_1 >> s2bitShift; + v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift); + v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift); diff0 = v_pack(diff0_0, diff0_1); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); - vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); + vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10))); + vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11))); } if (j == 0) { @@ -387,8 +387,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v_float32x4 fx = v_cvt_f32(t1); // A11 - A22 - v_float32x4 fxtale = fx * vtale_0; - v_float32x4 fytale = fy * vtale_0; + v_float32x4 fxtale = v_mul(fx, vtale_0); + v_float32x4 fytale = v_mul(fy, vtale_0); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); @@ -402,8 +402,8 @@ class TrackerInvoker : public cv::ParallelLoopBody fx = v_cvt_f32(t1); // A11 - A22 - fxtale = fx * vtale_1; - fytale = fy * vtale_1; + fxtale = v_mul(fx, vtale_1); + fytale = v_mul(fy, vtale_1); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); @@ -544,7 +544,7 @@ class TrackerInvoker : public cv::ParallelLoopBody float CV_DECL_ALIGNED(16) bbuf[4]; for (int mmi = 0; mmi < 4; mmi++) { - v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); + v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi])); _b0[mmi] = bbuf[0] + bbuf[2]; _b1[mmi] = bbuf[1] + bbuf[3]; } @@ -960,7 +960,7 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 
v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -968,38 +968,38 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); diff0 = v_pack(t0, t1); // I*gain.x + gain.x v_mul_expand(vI, vgain_value, t0, t1); - v_int16x8 diff_value = v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value - vI; + v_int16x8 diff_value = v_sub(v_add(v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift)), vconst_value), vI); v_int16x8 diff[4] = { - ((v11 << 5) + diff_value) & vmask, - ((v01 << 5) + diff_value) & vmask, - ((v10 << 5) + diff_value) & vmask, - ((v00 << 5) + diff_value) & vmask + v_and(v_add(v_shl<5>(v11), diff_value), vmask), + v_and(v_add(v_shl<5>(v01), diff_value), vmask), + v_and(v_add(v_shl<5>(v10), diff_value), vmask), + v_and(v_add(v_shl<5>(v00), diff_value), vmask) }; - diff0 = diff0 + diff_value; - diff0 = diff0 & vmask; + diff0 = v_add(diff0, diff_value); + diff0 = v_and(diff0, vmask); - v_int16x8 vscale_diff_is_pos = diff0 > vscale; - veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); + v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale); + veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1)); // since there is no abs vor int16x8 we have to do this hack v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vset2, vset1; // |It| < sigma1 ? 
- vset2 = vabs_diff < vparam1; + vset2 = v_lt(vabs_diff, vparam1); // It > 0 ? - v_int16x8 vdiff_is_pos = diff0 > vzero; + v_int16x8 vdiff_is_pos = v_gt(diff0, vzero); // sigma0 < |It| < sigma1 ? - vset1 = vset2 & (vabs_diff > vparam0); + vset1 = v_and(vset2, v_gt(vabs_diff, vparam0)); // val = |It| -/+ sigma1 - v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); + v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1)); v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8)); @@ -1009,7 +1009,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for (unsigned int mmi = 0; mmi < 4; mmi++) { // It == 0 ? |It| > sigma13 - diff0 = vset2 & diff[mmi]; + diff0 = v_and(vset2, diff[mmi]); // It == val ? sigma0 < |It| < sigma1 diff0 = v_select(vset1, vtmp_param1, diff0); @@ -1017,22 +1017,22 @@ class TrackerInvoker : public cv::ParallelLoopBody // diff = diff * sigma2 v_int32x4 diff_int_0, diff_int_1; v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); - v_int32x4 diff0_0 = diff_int_0 >> s2bitShift; - v_int32x4 diff0_1 = diff_int_1 >> s2bitShift; + v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift); + v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift); diff0 = v_pack(diff0_0, diff0_1); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... 
v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); - vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); + vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10))); + vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11))); - vqb2[mmi] += v_cvt_f32(diff0_0 * vI0); - vqb2[mmi] += v_cvt_f32(diff0_1 * vI1); + vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_0, vI0))); + vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_1, vI1))); - vqb3[mmi] += v_cvt_f32(diff0_0); - vqb3[mmi] += v_cvt_f32(diff0_1); + vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_0)); + vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_1)); } if (j == 0) { @@ -1060,29 +1060,29 @@ class TrackerInvoker : public cv::ParallelLoopBody v_float32x4 fx = v_cvt_f32(t1); // A11 - A22 - v_float32x4 fxtale = fx * vtale_0; - v_float32x4 fytale = fy * vtale_0; + v_float32x4 fxtale = v_mul(fx, vtale_0); + v_float32x4 fytale = v_mul(fy, vtale_0); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); vAxx = v_muladd(fx, fxtale, vAxx); // sumIx und sumIy - vsumIx += fxtale; - vsumIy += fytale; + vsumIx = v_add(vsumIx, fxtale); + vsumIy = v_add(vsumIy, fytale); - vsumW1 += vI_ps * fxtale; - vsumW2 += vI_ps * fytale; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale)); // sumI - v_float32x4 vI_tale = vI_ps * vtale_0; - vsumI += vI_tale; + v_float32x4 vI_tale = v_mul(vI_ps, vtale_0); + vsumI = v_add(vsumI, vI_tale); // sumW - vsumW += vtale_0; + vsumW = v_add(vsumW, vtale_0); // sumDI - vsumDI += vI_ps * vI_tale; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale)); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v_expand(v01, t1, t0); @@ -1092,29 +1092,29 @@ class TrackerInvoker : public cv::ParallelLoopBody fx = v_cvt_f32(t1); // A11 - A22 - fxtale = fx * vtale_1; - fytale = fy * vtale_1; + fxtale = v_mul(fx, vtale_1); + fytale = v_mul(fy, vtale_1); vAyy = 
v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); vAxx = v_muladd(fx, fxtale, vAxx); // sumIx und sumIy - vsumIx += fxtale; - vsumIy += fytale; + vsumIx = v_add(vsumIx, fxtale); + vsumIy = v_add(vsumIy, fytale); - vsumW1 += vI_ps * fxtale; - vsumW2 += vI_ps * fytale; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale)); // sumI - vI_tale = vI_ps * vtale_1; - vsumI += vI_tale; + vI_tale = v_mul(vI_ps, vtale_1); + vsumI = v_add(vsumI, vI_tale); // sumW - vsumW += vtale_1; + vsumW = v_add(vsumW, vtale_1); // sumDI - vsumDI += vI_ps * vI_tale; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale)); } } @@ -1304,7 +1304,7 @@ class TrackerInvoker : public cv::ParallelLoopBody float CV_DECL_ALIGNED(16) bbuf[4]; for(int mmi = 0; mmi < 4; mmi++) { - v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); + v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi])); _b0[mmi] = bbuf[0] + bbuf[2]; _b1[mmi] = bbuf[1] + bbuf[3]; _b2[mmi] = v_reduce_sum(vqb2[mmi]); @@ -1655,14 +1655,14 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int16x8 diff[4] = { - ((v00 << 5) - vI) & vmask, - ((v01 << 5) - vI) & vmask, - ((v10 << 5) - vI) & vmask, - ((v11 << 5) - vI) & vmask, + v_and(v_sub(v_shl<5>(v00), vI), vmask), + v_and(v_sub(v_shl<5>(v01), vI), vmask), + v_and(v_sub(v_shl<5>(v10), vI), vmask), + v_and(v_sub(v_shl<5>(v11), vI), vmask), }; v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
@@ -1672,8 +1672,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(diff[mmi], diff[mmi], diff1, diff0); v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff1, diff0, v00, v01); - vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); - vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); + vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10))); + vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11))); } } #else @@ -1704,7 +1704,7 @@ class TrackerInvoker : public cv::ParallelLoopBody float CV_DECL_ALIGNED(16) bbuf[4]; for (int mmi = 0; mmi < 4; mmi++) { - v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); + v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi])); _b1[mmi] = bbuf[0] + bbuf[2]; _b2[mmi] = bbuf[1] + bbuf[3]; } @@ -2071,7 +2071,7 @@ namespace radial { v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -2079,21 +2079,21 @@ namespace radial { v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); diff0 = v_pack(t0, t1); // I*gain.x + gain.x v_mul_expand(vI, vgain_value, t0, t1); - v_int16x8 diff_value = v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value - vI; + v_int16x8 diff_value = v_sub(v_add(v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift)), vconst_value), vI); v_int16x8 diff[4] = { - ((v11 << 5) + diff_value) & vmask, - ((v01 << 5) + 
diff_value) & vmask, - ((v10 << 5) + diff_value) & vmask, - ((v00 << 5) + diff_value) & vmask + v_and(v_add(v_shl<5>(v11), diff_value), vmask), + v_and(v_add(v_shl<5>(v01), diff_value), vmask), + v_and(v_add(v_shl<5>(v10), diff_value), vmask), + v_and(v_add(v_shl<5>(v00), diff_value), vmask) }; v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8)); @@ -2109,14 +2109,14 @@ namespace radial { v_zip(diff[mmi], diff[mmi], diff2, diff1); v_zip(diff2, diff1, v00, v01); - vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); - vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); + vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10))); + vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11))); - vqb2[mmi] += v_cvt_f32(diff0_0 * vI0); - vqb2[mmi] += v_cvt_f32(diff0_1 * vI1); + vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_0, vI0))); + vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_1, vI1))); - vqb3[mmi] += v_cvt_f32(diff0_0); - vqb3[mmi] += v_cvt_f32(diff0_1); + vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_0)); + vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_1)); } if (j == 0) { @@ -2133,17 +2133,17 @@ namespace radial { vAxx = v_muladd(fx, fx, vAxx); // sumIx und sumIy - vsumIx += fx; - vsumIy += fy; + vsumIx = v_add(vsumIx, fx); + vsumIy = v_add(vsumIy, fy); - vsumW1 += vI_ps * fx; - vsumW2 += vI_ps * fy; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy)); // sumI - vsumI += vI_ps; + vsumI = v_add(vsumI, vI_ps); // sumDI - vsumDI += vI_ps * vI_ps; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps)); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v_expand(v01, t1, t0); @@ -2158,17 +2158,17 @@ namespace radial { vAxx = v_muladd(fx, fx, vAxx); // sumIx und sumIy - vsumIx += fx; - vsumIy += fy; + vsumIx = v_add(vsumIx, fx); + vsumIy = v_add(vsumIy, fy); - vsumW1 += vI_ps * fx; - vsumW2 += vI_ps * fy; + vsumW1 = 
v_add(vsumW1, v_mul(vI_ps, fx)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy)); // sumI - vsumI += vI_ps; + vsumI = v_add(vsumI, vI_ps); // sumDI - vsumDI += vI_ps * vI_ps; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps)); } } @@ -2299,7 +2299,7 @@ namespace radial { float CV_DECL_ALIGNED(16) bbuf[4]; for (int mmi = 0; mmi < 4; mmi++) { - v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); + v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi])); _b0[mmi] = bbuf[0] + bbuf[2]; _b1[mmi] = bbuf[1] + bbuf[3]; _b2[mmi] = v_reduce_sum(vqb2[mmi]); diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 5ea85de889e..71cf50c8205 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -229,7 +229,7 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -237,17 +237,17 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); // diff = J - I - diff0 = v_pack(t0, t1) - vI; + diff0 = v_sub(v_pack(t0, t1), vI); // I*gain.x + gain.x v_mul_expand(vI, vgain_value, t0, t1); - diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value; - diff0 = diff0 & vmask; + diff0 = 
v_add(v_add(diff0, v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift))), vconst_value); + diff0 = v_and(diff0, vmask); v_zip(diff0, diff0, diff2, diff1); v_int32x4 diff0_0; @@ -259,16 +259,16 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0 += v_cvt_f32(v_dotprod(v00, v10)); - vqb1 += v_cvt_f32(v_dotprod(v01, v11)); + vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10))); + vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11))); v_int32x4 vI0, vI1; v_expand(vI, vI0, vI1); - vqb2 += v_cvt_f32(diff0_0 * vI0); - vqb2 += v_cvt_f32(diff0_1 * vI1); + vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_0, vI0))); + vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_1, vI1))); - vqb3 += v_cvt_f32(diff0_0); - vqb3 += v_cvt_f32(diff0_1); + vqb3 = v_add(vqb3, v_cvt_f32(diff0_0)); + vqb3 = v_add(vqb3, v_cvt_f32(diff0_1)); if (j == 0) { @@ -285,17 +285,17 @@ class TrackerInvoker : public cv::ParallelLoopBody vAxx = v_muladd(fx, fx, vAxx); // sumIx und sumIy - vsumIx += fx; - vsumIy += fy; + vsumIx = v_add(vsumIx, fx); + vsumIy = v_add(vsumIy, fy); - vsumW1 += vI_ps * fx; - vsumW2 += vI_ps * fy; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy)); // sumI - vsumI += vI_ps; + vsumI = v_add(vsumI, vI_ps); // sumDI - vsumDI += vI_ps * vI_ps; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps)); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v_expand(v01, t1, t0); @@ -309,17 +309,17 @@ class TrackerInvoker : public cv::ParallelLoopBody vAxx = v_muladd(fx, fx, vAxx); // sumIx und sumIy - vsumIx += fx; - vsumIy += fy; + vsumIx = v_add(vsumIx, fx); + vsumIy = v_add(vsumIy, fy); - vsumW1 += vI_ps * fx; - vsumW2 += vI_ps * fy; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy)); // sumI - vsumI += vI_ps; + vsumI = v_add(vsumI, vI_ps); // sumDI - vsumDI += vI_ps * vI_ps; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps)); } } #else @@ 
-388,7 +388,7 @@ class TrackerInvoker : public cv::ParallelLoopBody #if CV_SIMD128 float CV_DECL_ALIGNED(16) bbuf[4]; - v_store_aligned(bbuf, vqb0 + vqb1); + v_store_aligned(bbuf, v_add(vqb0, vqb1)); b1 = bbuf[0] + bbuf[2]; b2 = bbuf[1] + bbuf[3]; b3 = v_reduce_sum(vqb2); @@ -696,19 +696,19 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); - diff0 = v_pack(t0, t1) - diff0; - diff0 = diff0 & vmask; + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); + diff0 = v_sub(v_pack(t0, t1), diff0); + diff0 = v_and(diff0, vmask); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... 
@@ -717,8 +717,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0 += v_cvt_f32(v_dotprod(v00, v10)); - vqb1 += v_cvt_f32(v_dotprod(v01, v11)); + vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10))); + vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11))); } #else for( ; x < winSize.width*cn; x++, dIptr += 2 ) @@ -737,7 +737,7 @@ class TrackerInvoker : public cv::ParallelLoopBody #if CV_SIMD128 float CV_DECL_ALIGNED(16) bbuf[4]; - v_store_aligned(bbuf, vqb0 + vqb1); + v_store_aligned(bbuf, v_add(vqb0, vqb1)); b1 = bbuf[0] + bbuf[2]; b2 = bbuf[1] + bbuf[3]; #endif diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 9bee35fc6a3..5597d882491 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -246,35 +246,35 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); - diff0 = v_pack(t0, t1) - diff0; - diff0 = diff0 & vmask; + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); + diff0 = v_sub(v_pack(t0, t1), diff0); + diff0 = v_and(diff0, vmask); - v_int16x8 vscale_diff_is_pos = diff0 > vscale; - veta = veta + 
(vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); + v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale); + veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1)); // since there is no abs vor int16x8 we have to do this hack v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vset2, vset1; // |It| < sigma1 ? - vset2 = vabs_diff < vparam1; + vset2 = v_lt(vabs_diff, vparam1); // It > 0 ? - v_int16x8 vdiff_is_pos = diff0 > vzero; + v_int16x8 vdiff_is_pos = v_gt(diff0, vzero); // sigma0 < |It| < sigma1 ? - vset1 = vset2 & (vabs_diff > vparam0); + vset1 = v_and(vset2, v_gt(vabs_diff, vparam0)); // val = |It| -/+ sigma1 - v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); + v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1)); // It == 0 ? |It| > sigma13 - diff0 = vset2 & diff0; + diff0 = v_and(vset2, diff0); // It == val ? sigma0 < |It| < sigma1 diff0 = v_select(vset1, vtmp_param1, diff0); @@ -282,7 +282,7 @@ class TrackerInvoker : public cv::ParallelLoopBody // diff = diff * sigma2 v_int32x4 diff_int_0, diff_int_1; v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); - diff0 = v_pack(diff_int_0 >> s2bitShift, diff_int_1 >> s2bitShift); + diff0 = v_pack(v_shr(diff_int_0, s2bitShift), v_shr(diff_int_1, s2bitShift)); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
@@ -290,8 +290,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0 += v_cvt_f32(v_dotprod(v00, v10)); - vqb1 += v_cvt_f32(v_dotprod(v01, v11)); + vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10))); + vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11))); if (j == 0) { v_int32x4 vset1_0, vset1_1, vset2_0, vset2_1; @@ -316,8 +316,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v_float32x4 fx = v_cvt_f32(t1); // A11 - A22 - v_float32x4 fxtale = fx * vtale_0; - v_float32x4 fytale = fy * vtale_0; + v_float32x4 fxtale = v_mul(fx, vtale_0); + v_float32x4 fytale = v_mul(fy, vtale_0); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); @@ -330,8 +330,8 @@ class TrackerInvoker : public cv::ParallelLoopBody fx = v_cvt_f32(t1); // A11 - A22 - fxtale = fx * vtale_1; - fytale = fy * vtale_1; + fxtale = v_mul(fx, vtale_1); + fytale = v_mul(fy, vtale_1); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); @@ -431,7 +431,7 @@ class TrackerInvoker : public cv::ParallelLoopBody #if CV_SIMD128 float CV_DECL_ALIGNED(16) bbuf[4]; - v_store_aligned(bbuf, vqb0 + vqb1); + v_store_aligned(bbuf, v_add(vqb0, vqb1)); b1 += bbuf[0] + bbuf[2]; b2 += bbuf[1] + bbuf[3]; #endif @@ -769,7 +769,7 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -777,33 +777,33 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + 
v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); // diff = J - I - diff0 = v_pack(t0, t1) - vI; + diff0 = v_sub(v_pack(t0, t1), vI); // I*gain.x + gain.x v_mul_expand(vI, vgain_value, t0, t1); - diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value; - diff0 = diff0 & vmask; + diff0 = v_add(v_add(diff0, v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift))), vconst_value); + diff0 = v_and(diff0, vmask); - v_int16x8 vscale_diff_is_pos = diff0 > vscale; - veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); + v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale); + veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1)); // since there is no abs vor int16x8 we have to do this hack v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vset2, vset1; // |It| < sigma1 ? - vset2 = vabs_diff < vparam1; + vset2 = v_lt(vabs_diff, vparam1); // It > 0 ? - v_int16x8 vdiff_is_pos = diff0 > vzero; + v_int16x8 vdiff_is_pos = v_gt(diff0, vzero); // sigma0 < |It| < sigma1 ? - vset1 = vset2 & (vabs_diff > vparam0); + vset1 = v_and(vset2, v_gt(vabs_diff, vparam0)); // val = |It| -/+ sigma1 - v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); + v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1)); // It == 0 ? |It| > sigma13 - diff0 = vset2 & diff0; + diff0 = v_and(vset2, diff0); // It == val ? 
sigma0 < |It| < sigma1 diff0 = v_select(vset1, vtmp_param1, diff0); @@ -811,8 +811,8 @@ class TrackerInvoker : public cv::ParallelLoopBody // diff = diff * sigma2 v_int32x4 diff_int_0, diff_int_1; v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); - v_int32x4 diff0_0 = diff_int_0 >> s2bitShift; - v_int32x4 diff0_1 = diff_int_1 >> s2bitShift; + v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift); + v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift); diff0 = v_pack(diff0_0, diff0_1); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... @@ -821,16 +821,16 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0 += v_cvt_f32(v_dotprod(v00, v10)); - vqb1 += v_cvt_f32(v_dotprod(v01, v11)); + vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10))); + vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11))); v_int32x4 vI0, vI1; v_expand(vI, vI0, vI1); - vqb2 += v_cvt_f32(diff0_0 * vI0); - vqb2 += v_cvt_f32(diff0_1 * vI1); + vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_0, vI0))); + vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_1, vI1))); - vqb3 += v_cvt_f32(diff0_0); - vqb3 += v_cvt_f32(diff0_1); + vqb3 = v_add(vqb3, v_cvt_f32(diff0_0)); + vqb3 = v_add(vqb3, v_cvt_f32(diff0_1)); if (j == 0) { @@ -858,29 +858,29 @@ class TrackerInvoker : public cv::ParallelLoopBody v_float32x4 fx = v_cvt_f32(t1); // A11 - A22 - v_float32x4 fxtale = fx * vtale_0; - v_float32x4 fytale = fy * vtale_0; + v_float32x4 fxtale = v_mul(fx, vtale_0); + v_float32x4 fytale = v_mul(fy, vtale_0); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); vAxx = v_muladd(fx, fxtale, vAxx); // sumIx und sumIy - vsumIx += fxtale; - vsumIy += fytale; + vsumIx = v_add(vsumIx, fxtale); + vsumIy = v_add(vsumIy, fytale); - vsumW1 += vI_ps * fxtale; - vsumW2 += vI_ps * fytale; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale)); // sumI - v_float32x4 vI_tale = vI_ps * vtale_0; - vsumI += vI_tale; + v_float32x4 
vI_tale = v_mul(vI_ps, vtale_0); + vsumI = v_add(vsumI, vI_tale); // sumW - vsumW += vtale_0; + vsumW = v_add(vsumW, vtale_0); // sumDI - vsumDI += vI_ps * vI_tale; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale)); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v_expand(v01, t1, t0); @@ -890,29 +890,29 @@ class TrackerInvoker : public cv::ParallelLoopBody fx = v_cvt_f32(t1); // A11 - A22 - fxtale = fx * vtale_1; - fytale = fy * vtale_1; + fxtale = v_mul(fx, vtale_1); + fytale = v_mul(fy, vtale_1); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); vAxx = v_muladd(fx, fxtale, vAxx); // sumIx und sumIy - vsumIx += fxtale; - vsumIy += fytale; + vsumIx = v_add(vsumIx, fxtale); + vsumIy = v_add(vsumIy, fytale); - vsumW1 += vI_ps * fxtale; - vsumW2 += vI_ps * fytale; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale)); // sumI - vI_tale = vI_ps * vtale_1; - vsumI += vI_tale; + vI_tale = v_mul(vI_ps, vtale_1); + vsumI = v_add(vsumI, vI_tale); // sumW - vsumW += vtale_1; + vsumW = v_add(vsumW, vtale_1); // sumDI - vsumDI += vI_ps * vI_tale; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale)); } } #else @@ -1017,7 +1017,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } #if CV_SIMD128 float CV_DECL_ALIGNED(16) bbuf[4]; - v_store_aligned(bbuf, vqb0 + vqb1); + v_store_aligned(bbuf, v_add(vqb0, vqb1)); b1 = bbuf[0] + bbuf[2]; b2 = bbuf[1] + bbuf[3]; b3 = v_reduce_sum(vqb2); diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index c6f77f6d62c..2db4234ecd8 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -71,15 +71,15 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, for (; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2) { - v_int32x4 vmask0 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + 
x)) * vmax_val_32; - v_int32x4 vmask1 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)) * vmax_val_32; + v_int32x4 vmask0 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)), vmax_val_32); + v_int32x4 vmask1 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)), vmax_val_32); if (x + 4 > winSize.width) { - vmask0 = vmask0 & vmask_border_0; + vmask0 = v_and(vmask0, vmask_border_0); } if (x + 8 > winSize.width) { - vmask1 = vmask1 & vmask_border_1; + vmask1 = v_and(vmask1, vmask_border_1); } v_int32x4 t0, t1; @@ -91,10 +91,10 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5) & vmask0; - t1 = t1 >> (W_BITS - 5) & vmask1; + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_and(v_shr(t0, W_BITS - 5), vmask0); + t1 = v_and(v_shr(t1, W_BITS - 5), vmask1); v_store(Iptr + x, v_pack(t0, t1)); v00 = v_reinterpret_as_s16(v_load(dsrc)); @@ -105,12 +105,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); - t0 = t0 >> W_BITS; - t1 = t1 >> W_BITS; + t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS); + t1 = v_shr(t1, W_BITS); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- v00 = v00 & v_reinterpret_as_s16(vmask0); + v00 = v_and(v00, v_reinterpret_as_s16(vmask0)); v_store(dIptr, v00); v00 = v_reinterpret_as_s16(v_load(dsrc + 4 * 2)); @@ -121,12 +121,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); - t0 = t0 >> W_BITS; - t1 = t1 >> W_BITS; + t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS); + t1 = v_shr(t1, W_BITS); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... - v00 = v00 & v_reinterpret_as_s16(vmask1); + v00 = v_and(v00, v_reinterpret_as_s16(vmask1)); v_store(dIptr + 4 * 2, v00); } #else @@ -187,15 +187,15 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, #if CV_SIMD128 for (int x = 0; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2) { - v_int32x4 vmask0 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)) * vmax_val_32; - v_int32x4 vmask1 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)) * vmax_val_32; + v_int32x4 vmask0 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)), vmax_val_32); + v_int32x4 vmask1 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)), vmax_val_32); if (x + 4 > winSize.width) { - vmask0 = vmask0 & vmask_border0; + vmask0 = v_and(vmask0, vmask_border0); } if (x + 8 > winSize.width) { - vmask1 = vmask1 & vmask_border1; + vmask1 = v_and(vmask1, vmask_border1); } v_int32x4 t0, t1; @@ -207,12 +207,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); - t0 = t0 & vmask0; - 
t1 = t1 & vmask1; + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); + t0 = v_and(t0, vmask0); + t1 = v_and(t1, vmask1); v_store(Iptr + x, v_pack(t0, t1)); v00 = v_reinterpret_as_s16(v_load(dsrc)); @@ -223,12 +223,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); - t0 = t0 >> W_BITS; - t1 = t1 >> W_BITS; + t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS); + t1 = v_shr(t1, W_BITS); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... - v00 = v00 & v_reinterpret_as_s16(vmask0); + v00 = v_and(v00, v_reinterpret_as_s16(vmask0)); v_store(dIptr, v00); v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00)))); @@ -249,12 +249,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); - t0 = t0 >> W_BITS; - t1 = t1 >> W_BITS; + t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS); + t1 = v_shr(t1, W_BITS); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- v00 = v00 & v_reinterpret_as_s16(vmask1); + v00 = v_and(v00, v_reinterpret_as_s16(vmask1)); v_store(dIptr + 4 * 2, v00); v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00)))); diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index 8f3c728201a..3bc264f3e34 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -52,8 +52,8 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x)); v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x)); - v_int16x8 t1 = s2 - s0; - v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10); + v_int16x8 t1 = v_sub(s2, s0); + v_int16x8 t0 = v_add(v_mul_wrap(v_add(s0, s2), c3), v_mul_wrap(s1, c10)); v_store(trow0 + x, t0); v_store(trow1 + x, t1); @@ -90,8 +90,8 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) v_int16x8 s3 = v_load(trow1 + x); v_int16x8 s4 = v_load(trow1 + x + cn); - v_int16x8 t0 = s1 - s0; - v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10); + v_int16x8 t0 = v_sub(s1, s0); + v_int16x8 t1 = v_add(v_mul_wrap(v_add(s2, s4), c3), v_mul_wrap(s3, c10)); v_store_interleave((drow + x * 2), t0, t1); } diff --git a/modules/rgbd/src/colored_tsdf.cpp b/modules/rgbd/src/colored_tsdf.cpp index 0247f66d010..7ce2c7428d0 100644 --- a/modules/rgbd/src/colored_tsdf.cpp +++ b/modules/rgbd/src/colored_tsdf.cpp @@ -194,21 +194,21 @@ inline float ColoredTSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const { // tx, ty, tz = floor(p) v_int32x4 ip = v_floor(p); - v_float32x4 t = p - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(p, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float ty = t.get0(); + float ty = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float tz = t.get0(); 
+ float tz = v_get0(t); int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const RGBTsdfVoxel* volData = volume.ptr(); - int ix = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int iz = v_get0(ip); int coordBase = ix * xdim + iy * ydim + iz * zdim; @@ -218,15 +218,15 @@ inline float ColoredTSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const v_float32x4 v0246 = tsdfToFloat_INTR(v_int32x4(vx[0], vx[2], vx[4], vx[6])); v_float32x4 v1357 = tsdfToFloat_INTR(v_int32x4(vx[1], vx[3], vx[5], vx[7])); - v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); return v0 + tx * (v1 - v0); } @@ -276,27 +276,27 @@ inline Point3f ColoredTSDFVolumeCPU::getNormalVoxel(const Point3f& _p) const inline v_float32x4 ColoredTSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const { - if (v_check_any(p < v_float32x4(1.f, 1.f, 1.f, 0.f)) || - v_check_any(p >= v_float32x4((float)(volResolution.x - 2), + if (v_check_any(v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) || + v_check_any(v_ge(p, v_float32x4((float)(volResolution.x - 2), (float)(volResolution.y - 2), - (float)(volResolution.z - 2), 1.f)) + (float)(volResolution.z - 2), 1.f))) ) return nanv; v_int32x4 ip = v_floor(p); - v_float32x4 t = p - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(p, v_cvt_f32(ip)); + float tx = v_get0(t); t = 
v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float ty = t.get0(); + float ty = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float tz = t.get0(); + float tz = v_get0(t); const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const RGBTsdfVoxel* volData = volume.ptr(); - int ix = ip.get0(); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); + int iz = v_get0(ip); int coordBase = ix * xdim + iy * ydim + iz * zdim; @@ -314,23 +314,23 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) co v_float32x4 v0246(vx[0], vx[2], vx[4], vx[6]); v_float32x4 v1357(vx[1], vx[3], vx[5], vx[7]); - v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); nv = v0 + tx * (v1 - v0); } v_float32x4 n = v_load_aligned(an); - v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(n * n))); + v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(v_mul(n, n)))); - return Norm.get0() < 0.0001f ? nanv : n / Norm; + return v_get0(Norm) < 0.0001f ? 
nanv : v_div(n, Norm); } #else inline Point3f ColoredTSDFVolumeCPU::getNormalVoxel(const Point3f& p) const @@ -388,15 +388,15 @@ inline float ColoredTSDFVolumeCPU::interpolateColor(float tx, float ty, float tz v_float32x4 v0246, v1357; v_load_deinterleave(vx, v0246, v1357); - v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); return v0 + tx * (v1 - v0); } @@ -427,10 +427,10 @@ inline Point3f ColoredTSDFVolumeCPU::getColorVoxel(const Point3f& _p) const } inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) const { - if (v_check_any(p < v_float32x4(1.f, 1.f, 1.f, 0.f)) || - v_check_any(p >= v_float32x4((float)(volResolution.x - 2), + if (v_check_any(v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) || + v_check_any(v_ge(p, v_float32x4((float)(volResolution.x - 2), (float)(volResolution.y - 2), - (float)(volResolution.z - 2), 1.f)) + (float)(volResolution.z - 2), 1.f))) ) return nanv; @@ -439,9 +439,9 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) con const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const RGBTsdfVoxel* volData = volume.ptr(); - int ix = ip.get0(); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); + int iz = v_get0(ip); int coordBase = ix * xdim + iy * ydim + iz * zdim; float CV_DECL_ALIGNED(16) rgb[4]; @@ 
-456,12 +456,12 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) con } v_float32x4 vsi(voxelSizeInv, voxelSizeInv, voxelSizeInv, voxelSizeInv); - v_float32x4 ptVox = p * vsi; + v_float32x4 ptVox = v_mul(p, vsi); v_int32x4 iptVox = v_floor(ptVox); - v_float32x4 t = ptVox - v_cvt_f32(iptVox); - float tx = t.get0(); t = v_rotate_right<1>(t); - float ty = t.get0(); t = v_rotate_right<1>(t); - float tz = t.get0(); + v_float32x4 t = v_sub(ptVox, v_cvt_f32(iptVox)); + float tx = v_get0(t); t = v_rotate_right<1>(t); + float ty = v_get0(t); t = v_rotate_right<1>(t); + float tz = v_get0(t); rgb[0] = interpolateColor(tx, ty, tz, r); rgb[1] = interpolateColor(tx, ty, tz, g); rgb[2] = interpolateColor(tx, ty, tz, b); @@ -583,21 +583,21 @@ struct ColorRaycastInvoker : ParallelLoopBody // get direction through pixel in volume space: // 1. reproject (x, y) on projecting plane where z = 1.f - v_float32x4 planed = (v_float32x4((float)x, (float)y, 0.f, 0.f) - vcxy) * vfxy; + v_float32x4 planed = v_mul(v_sub(v_float32x4((float)x, (float)y, 0.F, 0.F), vcxy), vfxy); planed = v_combine_low(planed, v_float32x4(1.f, 0.f, 0.f, 0.f)); // 2. rotate to volume space planed = v_matmuladd(planed, camRot0, camRot1, camRot2, v_setzero_f32()); // 3. 
normalize - v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(planed * planed))); - v_float32x4 dir = planed * invNorm; + v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(v_mul(planed, planed)))); + v_float32x4 dir = v_mul(planed, invNorm); // compute intersection of ray with all six bbox planes - v_float32x4 rayinv = v_setall_f32(1.f) / dir; + v_float32x4 rayinv = v_div(v_setall_f32(1.F), dir); // div by zero should be eliminated by these products - v_float32x4 tbottom = rayinv * (boxDown - orig); - v_float32x4 ttop = rayinv * (boxUp - orig); + v_float32x4 tbottom = v_mul(rayinv, v_sub(boxDown, orig)); + v_float32x4 ttop = v_mul(rayinv, v_sub(boxUp, orig)); // re-order intersections to find smallest and largest on each axis v_float32x4 minAx = v_min(ttop, tbottom); @@ -618,14 +618,14 @@ struct ColorRaycastInvoker : ParallelLoopBody if (tmin < tmax) { // interpolation optimized a little - orig *= invVoxelSize; - dir *= invVoxelSize; + orig = v_mul(orig, invVoxelSize); + dir = v_mul(dir, invVoxelSize); int xdim = volume.volDims[0]; int ydim = volume.volDims[1]; int zdim = volume.volDims[2]; - v_float32x4 rayStep = dir * v_setall_f32(tstep); - v_float32x4 next = (orig + dir * v_setall_f32(tmin)); + v_float32x4 rayStep = v_mul(dir, v_setall_f32(this->tstep)); + v_float32x4 next = (v_add(orig, v_mul(dir, v_setall_f32(tmin)))); float f = volume.interpolateVoxel(next), fnext = f; //raymarch @@ -633,11 +633,11 @@ struct ColorRaycastInvoker : ParallelLoopBody int nSteps = cvFloor((tmax - tmin) / tstep); for (; steps < nSteps; steps++) { - next += rayStep; + next = v_add(next, rayStep); v_int32x4 ip = v_round(next); - int ix = ip.get0(); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); + int iz = v_get0(ip); int coord = ix * xdim + iy * ydim + iz * zdim; fnext = tsdfToFloat(volume.volume.at(coord).tsdf); @@ 
-657,7 +657,7 @@ struct ColorRaycastInvoker : ParallelLoopBody // linearly interpolate t between two f values if (f > 0.f && fnext < 0.f) { - v_float32x4 tp = next - rayStep; + v_float32x4 tp = v_sub(next, rayStep); float ft = volume.interpolateVoxel(tp); float ftdt = volume.interpolateVoxel(next); float ts = tmin + tstep * (steps - ft / (ftdt - ft)); @@ -665,7 +665,7 @@ struct ColorRaycastInvoker : ParallelLoopBody // avoid division by zero if (!cvIsNaN(ts) && !cvIsInf(ts)) { - v_float32x4 pv = (orig + dir * v_setall_f32(ts)); + v_float32x4 pv = (v_add(orig, v_mul(dir, v_setall_f32(ts)))); v_float32x4 nv = volume.getNormalVoxel(pv); v_float32x4 cv = volume.getColorVoxel(pv); @@ -675,9 +675,7 @@ struct ColorRaycastInvoker : ParallelLoopBody //convert pv and nv to camera space normal = v_matmuladd(nv, volRot0, volRot1, volRot2, v_setzero_f32()); // interpolation optimized a little - point = v_matmuladd(pv * v_float32x4(volume.voxelSize, - volume.voxelSize, - volume.voxelSize, 1.f), + point = v_matmuladd(v_mul(pv, v_float32x4(this->volume.voxelSize, this->volume.voxelSize, this->volume.voxelSize, 1.F)), volRot0, volRot1, volRot2, volTrans); } } diff --git a/modules/rgbd/src/fast_icp.cpp b/modules/rgbd/src/fast_icp.cpp index 07eb84d19fd..5f3ab3f0a19 100644 --- a/modules/rgbd/src/fast_icp.cpp +++ b/modules/rgbd/src/fast_icp.cpp @@ -138,7 +138,7 @@ bool ICPImpl::estimateTransformT(cv::Affine3f& transform, #if USE_INTRINSICS static inline bool fastCheck(const v_float32x4& p0, const v_float32x4& p1) { - float check = (p0.get0() + p1.get0()); + float check = (v_get0(p0) + v_get0(p1)); return !cvIsNaN(check); } @@ -160,7 +160,7 @@ static inline v_float32x4 crossProduct(const v_float32x4& a, const v_float32x4& v_float32x4 ayzx, azxy, byzx, bzxy; getCrossPerm(a, ayzx, azxy); getCrossPerm(b, byzx, bzxy); - return ayzx*bzxy - azxy*byzx; + return v_sub(v_mul(ayzx, bzxy), v_mul(azxy, byzx)); } #else static inline bool fastCheck(const Point3f& p) @@ -235,11 +235,11 @@ struct 
GetAbInvoker : ParallelLoopBody //find correspondence by projecting the point v_float32x4 oldCoords; - float pz = (v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(newP))).get0()); + float pz = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(newP)))); // x, y, 0, 0 - oldCoords = v_muladd(newP/v_setall_f32(pz), vfxy, vcxy); + oldCoords = v_muladd(v_div(newP, v_setall_f32(pz)), vfxy, vcxy); - if(!v_check_all((oldCoords >= v_setzero_f32()) & (oldCoords < vframe))) + if(!v_check_all(v_and(v_ge(oldCoords, v_setzero_f32()), v_lt(oldCoords, vframe)))) continue; // bilinearly interpolate oldPts and oldNrm under oldCoords point @@ -247,12 +247,12 @@ struct GetAbInvoker : ParallelLoopBody v_float32x4 oldN; { v_int32x4 ixy = v_floor(oldCoords); - v_float32x4 txy = oldCoords - v_cvt_f32(ixy); - int xi = ixy.get0(); - int yi = v_rotate_right<1>(ixy).get0(); - v_float32x4 tx = v_setall_f32(txy.get0()); + v_float32x4 txy = v_sub(oldCoords, v_cvt_f32(ixy)); + int xi = v_get0(ixy); + int yi = v_get0(v_rotate_right<1>(ixy)); + v_float32x4 tx = v_setall_f32(v_get0(txy)); txy = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(txy))); - v_float32x4 ty = v_setall_f32(txy.get0()); + v_float32x4 ty = v_setall_f32(v_get0(txy)); const float* prow0 = (const float*)oldPts[yi+0]; const float* prow1 = (const float*)oldPts[yi+1]; @@ -275,23 +275,23 @@ struct GetAbInvoker : ParallelLoopBody // NaN check is done later - v_float32x4 p0 = p00 + tx*(p01 - p00); - v_float32x4 p1 = p10 + tx*(p11 - p10); - oldP = p0 + ty*(p1 - p0); + v_float32x4 p0 = v_add(p00, v_mul(tx, v_sub(p01, p00))); + v_float32x4 p1 = v_add(p10, v_mul(tx, v_sub(p11, p10))); + oldP = v_add(p0, v_mul(ty, v_sub(p1, p0))); - v_float32x4 n0 = n00 + tx*(n01 - n00); - v_float32x4 n1 = n10 + tx*(n11 - n10); - oldN = n0 + ty*(n1 - n0); + v_float32x4 n0 = v_add(n00, v_mul(tx, v_sub(n01, n00))); + v_float32x4 n1 = v_add(n10, v_mul(tx, v_sub(n11, n10))); + oldN = v_add(n0, v_mul(ty, v_sub(n1, n0))); } 
bool oldPNcheck = fastCheck(oldP, oldN); //filter by distance - v_float32x4 diff = newP - oldP; - bool distCheck = !(v_reduce_sum(diff*diff) > sqThresh); + v_float32x4 diff = v_sub(newP, oldP); + bool distCheck = !(v_reduce_sum(v_mul(diff, diff)) > sqThresh); //filter by angle - bool angleCheck = !(abs(v_reduce_sum(newN*oldN)) < cosThresh); + bool angleCheck = !(abs(v_reduce_sum(v_mul(newN, oldN))) < cosThresh); if(!(oldPNcheck && distCheck && angleCheck)) continue; @@ -299,17 +299,17 @@ struct GetAbInvoker : ParallelLoopBody // build point-wise vector ab = [ A | b ] v_float32x4 VxNv = crossProduct(newP, oldN); Point3f VxN; - VxN.x = VxNv.get0(); - VxN.y = v_reinterpret_as_f32(v_extract<1>(v_reinterpret_as_u32(VxNv), v_setzero_u32())).get0(); - VxN.z = v_reinterpret_as_f32(v_extract<2>(v_reinterpret_as_u32(VxNv), v_setzero_u32())).get0(); + VxN.x = v_get0(VxNv); + VxN.y = v_get0(v_reinterpret_as_f32(v_extract<1>(v_reinterpret_as_u32(VxNv), v_setzero_u32()))); + VxN.z = v_get0(v_reinterpret_as_f32(v_extract<2>(v_reinterpret_as_u32(VxNv), v_setzero_u32()))); - float dotp = -v_reduce_sum(oldN*diff); + float dotp = -v_reduce_sum(v_mul(oldN, diff)); // build point-wise upper-triangle matrix [ab^T * ab] w/o last row // which is [A^T*A | A^T*b] // and gather sum - v_float32x4 vd = VxNv | v_float32x4(0, 0, 0, dotp); + v_float32x4 vd = v_or(VxNv, v_float32x4(0, 0, 0, dotp)); v_float32x4 n = oldN; v_float32x4 nyzx; { diff --git a/modules/rgbd/src/hash_tsdf.cpp b/modules/rgbd/src/hash_tsdf.cpp index 9cf05e55691..e47af731dba 100644 --- a/modules/rgbd/src/hash_tsdf.cpp +++ b/modules/rgbd/src/hash_tsdf.cpp @@ -425,15 +425,15 @@ inline float interpolate(float tx, float ty, float tz, float vx[8]) v_float32x4 v0246, v1357; v_load_deinterleave(vx, v0246, v1357); - v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = 
v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); return v0 + tx * (v1 - v0); } @@ -598,9 +598,9 @@ Point3f HashTSDFVolumeCPU::getNormalVoxel(const Point3f &point) const v_float32x8 czp = v_lut(vals, idxzp); v_float32x8 czn = v_lut(vals, idxzn); - v_float32x8 vcxv = cxn - cxp; - v_float32x8 vcyv = cyn - cyp; - v_float32x8 vczv = czn - czp; + v_float32x8 vcxv = v_sub(cxn, cxp); + v_float32x8 vcyv = v_sub(cyn, cyp); + v_float32x8 vczv = v_sub(czn, czp); v_store(cxv, vcxv); v_store(cyv, vcyv); @@ -615,9 +615,9 @@ Point3f HashTSDFVolumeCPU::getNormalVoxel(const Point3f &point) const v_float32x4 czp0 = v_lut(vals, idxzp + 0); v_float32x4 czp1 = v_lut(vals, idxzp + 4); v_float32x4 czn0 = v_lut(vals, idxzn + 0); v_float32x4 czn1 = v_lut(vals, idxzn + 4); - v_float32x4 cxv0 = cxn0 - cxp0; v_float32x4 cxv1 = cxn1 - cxp1; - v_float32x4 cyv0 = cyn0 - cyp0; v_float32x4 cyv1 = cyn1 - cyp1; - v_float32x4 czv0 = czn0 - czp0; v_float32x4 czv1 = czn1 - czp1; + v_float32x4 cxv0 = v_sub(cxn0, cxp0); v_float32x4 cxv1 = v_sub(cxn1, cxp1); + v_float32x4 cyv0 = v_sub(cyn0, cyp0); v_float32x4 cyv1 = v_sub(cyn1, cyp1); + v_float32x4 czv0 = v_sub(czn0, czp0); v_float32x4 czv1 = v_sub(czn1, czp1); v_store(cxv + 0, cxv0); v_store(cxv + 4, cxv1); v_store(cyv + 0, cyv0); v_store(cyv + 4, cyv1); @@ -1523,9 +1523,9 @@ Point3f HashTSDFVolumeGPU::getNormalVoxel(const Point3f& point) const v_float32x8 czp = v_lut(vals, idxzp); v_float32x8 czn = v_lut(vals, idxzn); - v_float32x8 vcxv = cxn - cxp; - v_float32x8 vcyv = cyn - cyp; - v_float32x8 vczv = czn - czp; + v_float32x8 vcxv = v_sub(cxn, cxp); + v_float32x8 vcyv = v_sub(cyn, cyp); + v_float32x8 vczv = 
v_sub(czn, czp); v_store(cxv, vcxv); v_store(cyv, vcyv); @@ -1540,9 +1540,9 @@ Point3f HashTSDFVolumeGPU::getNormalVoxel(const Point3f& point) const v_float32x4 czp0 = v_lut(vals, idxzp + 0); v_float32x4 czp1 = v_lut(vals, idxzp + 4); v_float32x4 czn0 = v_lut(vals, idxzn + 0); v_float32x4 czn1 = v_lut(vals, idxzn + 4); - v_float32x4 cxv0 = cxn0 - cxp0; v_float32x4 cxv1 = cxn1 - cxp1; - v_float32x4 cyv0 = cyn0 - cyp0; v_float32x4 cyv1 = cyn1 - cyp1; - v_float32x4 czv0 = czn0 - czp0; v_float32x4 czv1 = czn1 - czp1; + v_float32x4 cxv0 = v_sub(cxn0, cxp0); v_float32x4 cxv1 = v_sub(cxn1, cxp1); + v_float32x4 cyv0 = v_sub(cyn0, cyp0); v_float32x4 cyv1 = v_sub(cyn1, cyp1); + v_float32x4 czv0 = v_sub(czn0, czp0); v_float32x4 czv1 = v_sub(czn1, czp1); v_store(cxv + 0, cxv0); v_store(cxv + 4, cxv1); v_store(cyv + 0, cyv0); v_store(cyv + 4, cyv1); diff --git a/modules/rgbd/src/tsdf.cpp b/modules/rgbd/src/tsdf.cpp index 7b76985eb43..73a39a65929 100644 --- a/modules/rgbd/src/tsdf.cpp +++ b/modules/rgbd/src/tsdf.cpp @@ -146,21 +146,21 @@ inline float TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const { // tx, ty, tz = floor(p) v_int32x4 ip = v_floor(p); - v_float32x4 t = p - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(p, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float ty = t.get0(); + float ty = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float tz = t.get0(); + float tz = v_get0(t); int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const TsdfVoxel* volData = volume.ptr(); - int ix = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int iz = v_get0(ip); int coordBase = ix*xdim + iy*ydim + iz*zdim; @@ -170,15 +170,15 @@ inline float TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const v_float32x4 v0246 = 
tsdfToFloat_INTR(v_int32x4(vx[0], vx[2], vx[4], vx[6])); v_float32x4 v1357 = tsdfToFloat_INTR(v_int32x4(vx[1], vx[3], vx[5], vx[7])); - v_float32x4 vxx = v0246 + v_setall_f32(tz)*(v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty)*(v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); return v0 + tx*(v1 - v0); } @@ -228,27 +228,27 @@ inline Point3f TSDFVolumeCPU::getNormalVoxel(const Point3f& _p) const inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const { - if(v_check_any (p < v_float32x4(1.f, 1.f, 1.f, 0.f)) || - v_check_any (p >= v_float32x4((float)(volResolution.x-2), + if(v_check_any (v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) || + v_check_any (v_ge(p, v_float32x4((float)(volResolution.x-2), (float)(volResolution.y-2), - (float)(volResolution.z-2), 1.f)) + (float)(volResolution.z-2), 1.f))) ) return nanv; v_int32x4 ip = v_floor(p); - v_float32x4 t = p - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(p, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float ty = t.get0(); + float ty = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float tz = t.get0(); + float tz = v_get0(t); const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const TsdfVoxel* volData = volume.ptr(); - int ix = ip.get0(); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); + int iy = v_get0(ip); ip = 
v_rotate_right<1>(ip); + int iz = v_get0(ip); int coordBase = ix*xdim + iy*ydim + iz*zdim; @@ -266,23 +266,23 @@ inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const v_float32x4 v0246 (vx[0], vx[2], vx[4], vx[6]); v_float32x4 v1357 (vx[1], vx[3], vx[5], vx[7]); - v_float32x4 vxx = v0246 + v_setall_f32(tz)*(v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty)*(v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); nv = v0 + tx*(v1 - v0); } v_float32x4 n = v_load_aligned(an); - v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(n*n))); + v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(v_mul(n, n)))); - return Norm.get0() < 0.0001f ? nanv : n/Norm; + return v_get0(Norm) < 0.0001f ? nanv : v_div(n, Norm); } #else inline Point3f TSDFVolumeCPU::getNormalVoxel(const Point3f& p) const @@ -394,21 +394,21 @@ struct RaycastInvoker : ParallelLoopBody // get direction through pixel in volume space: // 1. reproject (x, y) on projecting plane where z = 1.f - v_float32x4 planed = (v_float32x4((float)x, (float)y, 0.f, 0.f) - vcxy)*vfxy; + v_float32x4 planed = v_mul(v_sub(v_float32x4((float)x, (float)y, 0.F, 0.F), vcxy), vfxy); planed = v_combine_low(planed, v_float32x4(1.f, 0.f, 0.f, 0.f)); // 2. rotate to volume space planed = v_matmuladd(planed, camRot0, camRot1, camRot2, v_setzero_f32()); // 3. 
normalize - v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(planed*planed))); - v_float32x4 dir = planed*invNorm; + v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(v_mul(planed, planed)))); + v_float32x4 dir = v_mul(planed, invNorm); // compute intersection of ray with all six bbox planes - v_float32x4 rayinv = v_setall_f32(1.f)/dir; + v_float32x4 rayinv = v_div(v_setall_f32(1.F), dir); // div by zero should be eliminated by these products - v_float32x4 tbottom = rayinv*(boxDown - orig); - v_float32x4 ttop = rayinv*(boxUp - orig); + v_float32x4 tbottom = v_mul(rayinv, v_sub(boxDown, orig)); + v_float32x4 ttop = v_mul(rayinv, v_sub(boxUp, orig)); // re-order intersections to find smallest and largest on each axis v_float32x4 minAx = v_min(ttop, tbottom); @@ -429,14 +429,14 @@ struct RaycastInvoker : ParallelLoopBody if(tmin < tmax) { // interpolation optimized a little - orig *= invVoxelSize; - dir *= invVoxelSize; + orig = v_mul(orig, invVoxelSize); + dir = v_mul(dir, invVoxelSize); int xdim = volume.volDims[0]; int ydim = volume.volDims[1]; int zdim = volume.volDims[2]; - v_float32x4 rayStep = dir * v_setall_f32(tstep); - v_float32x4 next = (orig + dir * v_setall_f32(tmin)); + v_float32x4 rayStep = v_mul(dir, v_setall_f32(this->tstep)); + v_float32x4 next = (v_add(orig, v_mul(dir, v_setall_f32(tmin)))); float f = volume.interpolateVoxel(next), fnext = f; //raymarch @@ -444,11 +444,11 @@ struct RaycastInvoker : ParallelLoopBody int nSteps = cvFloor((tmax - tmin)/tstep); for(; steps < nSteps; steps++) { - next += rayStep; + next = v_add(next, rayStep); v_int32x4 ip = v_round(next); - int ix = ip.get0(); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); + int iz = v_get0(ip); int coord = ix*xdim + iy*ydim + iz*zdim; fnext = tsdfToFloat(volume.volume.at(coord).tsdf); @@ -468,7 +468,7 @@ struct 
RaycastInvoker : ParallelLoopBody // linearly interpolate t between two f values if(f > 0.f && fnext < 0.f) { - v_float32x4 tp = next - rayStep; + v_float32x4 tp = v_sub(next, rayStep); float ft = volume.interpolateVoxel(tp); float ftdt = volume.interpolateVoxel(next); float ts = tmin + tstep*(steps - ft/(ftdt - ft)); @@ -476,7 +476,7 @@ struct RaycastInvoker : ParallelLoopBody // avoid division by zero if(!cvIsNaN(ts) && !cvIsInf(ts)) { - v_float32x4 pv = (orig + dir*v_setall_f32(ts)); + v_float32x4 pv = (v_add(orig, v_mul(dir, v_setall_f32(ts)))); v_float32x4 nv = volume.getNormalVoxel(pv); if(!isNaN(nv)) @@ -484,9 +484,7 @@ struct RaycastInvoker : ParallelLoopBody //convert pv and nv to camera space normal = v_matmuladd(nv, volRot0, volRot1, volRot2, v_setzero_f32()); // interpolation optimized a little - point = v_matmuladd(pv*v_float32x4(volume.voxelSize, - volume.voxelSize, - volume.voxelSize, 1.f), + point = v_matmuladd(v_mul(pv, v_float32x4(this->volume.voxelSize, this->volume.voxelSize, this->volume.voxelSize, 1.F)), volRot0, volRot1, volRot2, volTrans); } } diff --git a/modules/rgbd/src/tsdf_functions.cpp b/modules/rgbd/src/tsdf_functions.cpp index 3f8bc26f011..09ca986a1bf 100644 --- a/modules/rgbd/src/tsdf_functions.cpp +++ b/modules/rgbd/src/tsdf_functions.cpp @@ -235,35 +235,33 @@ void integrateVolumeUnit( // optimization of the following: //Point3f volPt = Point3f(x, y, z)*voxelSize; //Point3f camSpacePt = vol2cam * volPt; - camSpacePt += zStep; + camSpacePt = v_add(camSpacePt, zStep); - float zCamSpace = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))).get0(); + float zCamSpace = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt)))); if (zCamSpace <= 0.f) continue; - v_float32x4 camPixVec = camSpacePt / v_setall_f32(zCamSpace); + v_float32x4 camPixVec = v_div(camSpacePt, v_setall_f32(zCamSpace)); v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy); // leave only first 2 lanes - projected = 
v_reinterpret_as_f32(v_reinterpret_as_u32(projected) & - v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)); + projected = v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(4294967295U, 4294967295U, 0, 0))); depthType v; // bilinearly interpolate depth at projected { const v_float32x4& pt = projected; // check coords >= 0 and < imgSize - v_uint32x4 limits = v_reinterpret_as_u32(pt < v_setzero_f32()) | - v_reinterpret_as_u32(pt >= upLimits); - limits = limits | v_rotate_right<1>(limits); - if (limits.get0()) + v_uint32x4 limits = v_or(v_reinterpret_as_u32(v_lt(pt, v_setzero_f32())), v_reinterpret_as_u32(v_ge(pt, upLimits))); + limits = v_or(limits, v_rotate_right<1>(limits)); + if (v_get0(limits)) continue; // xi, yi = floor(pt) v_int32x4 ip = v_floor(pt); v_int32x4 ipshift = ip; - int xi = ipshift.get0(); + int xi = v_get0(ipshift); ipshift = v_rotate_right<1>(ipshift); - int yi = ipshift.get0(); + int yi = v_get0(ipshift); const depthType* row0 = depth[yi + 0]; const depthType* row1 = depth[yi + 1]; @@ -277,17 +275,17 @@ void integrateVolumeUnit( // assume correct depth is positive // don't fix missing data - if (v_check_all(vall > v_setzero_f32())) + if (v_check_all(v_gt(vall, v_setzero_f32()))) { - v_float32x4 t = pt - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(pt, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - v_float32x4 ty = v_setall_f32(t.get0()); + v_float32x4 ty = v_setall_f32(v_get0(t)); // vx is y-interpolated between rows 0 and 1 - v_float32x4 vx = v001 + ty * (v101 - v001); - float v0 = vx.get0(); + v_float32x4 vx = v_add(v001, v_mul(ty, v_sub(v101, v001))); + float v0 = v_get0(vx); vx = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vx))); - float v1 = vx.get0(); + float v1 = v_get0(vx); v = v0 + tx * (v1 - v0); } else @@ -295,8 +293,8 @@ void integrateVolumeUnit( } // norm(camPixVec) produces double which is too slow - int _u = 
(int)projected.get0(); - int _v = (int)v_rotate_right<1>(projected).get0(); + int _u = (int)v_get0(projected); + int _v = (int)v_get0(v_rotate_right<1>(projected)); if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows)) continue; float pixNorm = pixNorms.at(_v, _u); @@ -500,35 +498,33 @@ void integrateRGBVolumeUnit( // optimization of the following: //Point3f volPt = Point3f(x, y, z)*voxelSize; //Point3f camSpacePt = vol2cam * volPt; - camSpacePt += zStep; + camSpacePt = v_add(camSpacePt, zStep); - float zCamSpace = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))).get0(); + float zCamSpace = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt)))); if (zCamSpace <= 0.f) continue; - v_float32x4 camPixVec = camSpacePt / v_setall_f32(zCamSpace); + v_float32x4 camPixVec = v_div(camSpacePt, v_setall_f32(zCamSpace)); v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy); // leave only first 2 lanes - projected = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) & - v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)); + projected = v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(4294967295U, 4294967295U, 0, 0))); depthType v; // bilinearly interpolate depth at projected { const v_float32x4& pt = projected; // check coords >= 0 and < imgSize - v_uint32x4 limits = v_reinterpret_as_u32(pt < v_setzero_f32()) | - v_reinterpret_as_u32(pt >= upLimits); - limits = limits | v_rotate_right<1>(limits); - if (limits.get0()) + v_uint32x4 limits = v_or(v_reinterpret_as_u32(v_lt(pt, v_setzero_f32())), v_reinterpret_as_u32(v_ge(pt, upLimits))); + limits = v_or(limits,v_rotate_right<1>(limits)); + if (v_get0(limits)) continue; // xi, yi = floor(pt) v_int32x4 ip = v_floor(pt); v_int32x4 ipshift = ip; - int xi = ipshift.get0(); + int xi = v_get0(ipshift); ipshift = v_rotate_right<1>(ipshift); - int yi = ipshift.get0(); + int yi = v_get0(ipshift); const depthType* row0 = depth[yi + 0]; const depthType* row1 = 
depth[yi + 1]; @@ -542,17 +538,17 @@ void integrateRGBVolumeUnit( // assume correct depth is positive // don't fix missing data - if (v_check_all(vall > v_setzero_f32())) + if (v_check_all(v_gt(vall, v_setzero_f32()))) { - v_float32x4 t = pt - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(pt, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - v_float32x4 ty = v_setall_f32(t.get0()); + v_float32x4 ty = v_setall_f32(v_get0(t)); // vx is y-interpolated between rows 0 and 1 - v_float32x4 vx = v001 + ty * (v101 - v001); - float v0 = vx.get0(); + v_float32x4 vx = v_add(v001, v_mul(ty, v_sub(v101, v001))); + float v0 = v_get0(vx); vx = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vx))); - float v1 = vx.get0(); + float v1 = v_get0(vx); v = v0 + tx * (v1 - v0); } else @@ -561,14 +557,13 @@ void integrateRGBVolumeUnit( v_float32x4 projectedRGB = v_muladd(camPixVec, rgb_vfxy, rgb_vcxy); // leave only first 2 lanes - projectedRGB = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) & - v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)); + projectedRGB = v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0))); // norm(camPixVec) produces double which is too slow - int _u = (int)projected.get0(); - int _v = (int)v_rotate_right<1>(projected).get0(); - int rgb_u = (int)projectedRGB.get0(); - int rgb_v = (int)v_rotate_right<1>(projectedRGB).get0(); + int _u = (int)v_get0(projected); + int _v = (int)v_get0(v_rotate_right<1>(projected)); + int rgb_u = (int)v_get0(projectedRGB); + int rgb_v = (int)v_get0(v_rotate_right<1>(projectedRGB)); if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows && rgb_v >= 0 && rgb_v < color.rows && rgb_u >= 0 && rgb_u < color.cols)) diff --git a/modules/rgbd/src/tsdf_functions.hpp b/modules/rgbd/src/tsdf_functions.hpp index 1031474d0b7..6038ab60c05 100644 --- a/modules/rgbd/src/tsdf_functions.hpp +++ 
b/modules/rgbd/src/tsdf_functions.hpp @@ -16,11 +16,13 @@ namespace cv namespace kinfu { +#if USE_INTRINSICS inline v_float32x4 tsdfToFloat_INTR(const v_int32x4& num) { v_float32x4 num128 = v_setall_f32(-1.f / 128.f); - return v_cvt_f32(num) * num128; + return v_mul(v_cvt_f32(num), num128); } +#endif inline TsdfType floatToTsdf(float num) { diff --git a/modules/rgbd/src/utils.hpp b/modules/rgbd/src/utils.hpp index 0b963675390..2bb69713d6a 100644 --- a/modules/rgbd/src/utils.hpp +++ b/modules/rgbd/src/utils.hpp @@ -68,7 +68,7 @@ inline bool isNaN(cv::Point3f p) #if USE_INTRINSICS static inline bool isNaN(const cv::v_float32x4& p) { - return cv::v_check_any(p != p); + return cv::v_check_any(v_ne(p, p)); } #endif diff --git a/modules/sfm/src/libmv_light/libmv/numeric/numeric.h b/modules/sfm/src/libmv_light/libmv/numeric/numeric.h index 9e7927e0bbc..41a55634dcd 100644 --- a/modules/sfm/src/libmv_light/libmv/numeric/numeric.h +++ b/modules/sfm/src/libmv_light/libmv/numeric/numeric.h @@ -32,6 +32,7 @@ #include #include #include +#include #if !defined(__MINGW32__) # if defined(_WIN32) || defined(__APPLE__) || \ diff --git a/modules/wechat_qrcode/src/zxing/qrcode/detector/detector.cpp b/modules/wechat_qrcode/src/zxing/qrcode/detector/detector.cpp index d9f3f99f3e5..0d536e7e9bc 100644 --- a/modules/wechat_qrcode/src/zxing/qrcode/detector/detector.cpp +++ b/modules/wechat_qrcode/src/zxing/qrcode/detector/detector.cpp @@ -117,7 +117,7 @@ Ref Detector::getResultViaAlignment(int patternIdx, int alignmen Ref bits(sampleGrid(image_, possibleDimension, transform, err_handler)); if (err_handler.ErrCode()) return Ref(); - ArrayRef > corrners(new Array >(4)); + ArrayRef > corners(new Array >(4)); vector points(8, 0.0f); points[0] = 0.0f; points[1] = possibleDimension; // bottomLeft @@ -128,12 +128,12 @@ Ref Detector::getResultViaAlignment(int patternIdx, int alignmen points[6] = possibleDimension; points[7] = possibleDimension; // bottomRight transform->transformPoints(points); - 
corrners[0].reset(Ref(new FinderPattern(points[0], points[1], 0))); - corrners[1].reset(Ref(new FinderPattern(points[2], points[3], 0))); - corrners[2].reset(Ref(new FinderPattern(points[4], points[5], 0))); - corrners[3].reset(Ref(new FinderPattern(points[6], points[7], 0))); + corners[0].reset(Ref(new FinderPattern(points[0], points[1], 0))); + corners[1].reset(Ref(new FinderPattern(points[2], points[3], 0))); + corners[2].reset(Ref(new FinderPattern(points[4], points[5], 0))); + corners[3].reset(Ref(new FinderPattern(points[6], points[7], 0))); - Ref result(new DetectorResult(bits, corrners, possibleDimension)); + Ref result(new DetectorResult(bits, corners, possibleDimension)); return result; } diff --git a/modules/ximgproc/src/anisodiff.cpp b/modules/ximgproc/src/anisodiff.cpp index 996b4ac5b77..2b230a71242 100644 --- a/modules/ximgproc/src/anisodiff.cpp +++ b/modules/ximgproc/src/anisodiff.cpp @@ -74,8 +74,8 @@ inline v_uint8x16 v_finalize_pix_ch(const v_int16x8& c0, const v_int16x8& c1, v_expand_f32(c0, f0, f1); v_expand_f32(c1, f2, f3); - v_int16x8 d0 = v_pack(v_round(s0*alpha + f0), v_round(s1*alpha + f1)); - v_int16x8 d1 = v_pack(v_round(s2*alpha + f2), v_round(s3*alpha + f3)); + v_int16x8 d0 = v_pack(v_round(v_add(v_mul(s0, alpha), f0)), v_round(v_add(v_mul(s1, alpha), f1))); + v_int16x8 d1 = v_pack(v_round(v_add(v_mul(s2, alpha), f2)), v_round(v_add(v_mul(s3, alpha), f3))); return v_pack_u(d0, d1); } @@ -135,12 +135,12 @@ class ADBody : public ParallelLoopBody v_expand_s(p1, p10, p11); v_expand_s(p2, p20, p21); - v_int16x8 d00 = p00 - c00, d01 = p01 - c01; - v_int16x8 d10 = p10 - c10, d11 = p11 - c11; - v_int16x8 d20 = p20 - c20, d21 = p21 - c21; + v_int16x8 d00 = v_sub(p00, c00), d01 = v_sub(p01, c01); + v_int16x8 d10 = v_sub(p10, c10), d11 = v_sub(p11, c11); + v_int16x8 d20 = v_sub(p20, c20), d21 = v_sub(p21, c21); - v_uint16x8 n0 = v_abs(d00) + v_abs(d10) + v_abs(d20); - v_uint16x8 n1 = v_abs(d01) + v_abs(d11) + v_abs(d21); + v_uint16x8 n0 = 
v_add(v_add(v_abs(d00), v_abs(d10)), v_abs(d20)); + v_uint16x8 n1 = v_add(v_add(v_abs(d01), v_abs(d11)), v_abs(d21)); ushort CV_DECL_ALIGNED(16) nbuf[16]; v_store(nbuf, n0); @@ -153,13 +153,13 @@ class ADBody : public ParallelLoopBody v_expand_f32(d00, fd0, fd1); v_expand_f32(d01, fd2, fd3); - s00 += fd0*w0; s01 += fd1*w1; s02 += fd2*w2; s03 += fd3*w3; + s00 = v_add(s00, v_mul(fd0, w0)); s01 = v_add(s01, v_mul(fd1, w1)); s02 = v_add(s02, v_mul(fd2, w2)); s03 = v_add(s03, v_mul(fd3, w3)); v_expand_f32(d10, fd0, fd1); v_expand_f32(d11, fd2, fd3); - s10 += fd0*w0; s11 += fd1*w1; s12 += fd2*w2; s13 += fd3*w3; + s10 = v_add(s10, v_mul(fd0, w0)); s11 = v_add(s11, v_mul(fd1, w1)); s12 = v_add(s12, v_mul(fd2, w2)); s13 = v_add(s13, v_mul(fd3, w3)); v_expand_f32(d20, fd0, fd1); v_expand_f32(d21, fd2, fd3); - s20 += fd0*w0; s21 += fd1*w1; s22 += fd2*w2; s23 += fd3*w3; + s20 = v_add(s20, v_mul(fd0, w0)); s21 = v_add(s21, v_mul(fd1, w1)); s22 = v_add(s22, v_mul(fd2, w2)); s23 = v_add(s23, v_mul(fd3, w3)); } c0 = v_finalize_pix_ch(c00, c01, s00, s01, s02, s03, v_alpha); diff --git a/modules/ximgproc/src/fgs_filter.cpp b/modules/ximgproc/src/fgs_filter.cpp index 5e168da5dad..804e9f00a02 100644 --- a/modules/ximgproc/src/fgs_filter.cpp +++ b/modules/ximgproc/src/fgs_filter.cpp @@ -303,15 +303,15 @@ void FastGlobalSmootherFilterImpl::process_4row_block(Mat* cur,int i) v_float32x4 aux0,aux1,aux2,aux3; #define PROC4(Chor_in,cur_in,coef_prev_in,interD_prev_in,cur_prev_in,interD_out,cur_out,coef_cur_out)\ - coef_cur_out = lambda_reg*Chor_in;\ - aux0 = interD_prev_in*coef_prev_in;\ - aux1 = coef_cur_out+coef_prev_in;\ - aux1 = one_reg-aux1;\ - aux0 = aux1-aux0;\ - interD_out = coef_cur_out/aux0;\ - aux1 = cur_prev_in*coef_prev_in;\ - aux1 = cur_in - aux1;\ - cur_out = aux1/aux0; + coef_cur_out = v_mul(lambda_reg, Chor_in);\ + aux0 = v_mul(interD_prev_in, coef_prev_in);\ + aux1 = v_add(coef_cur_out, coef_prev_in);\ + aux1 = v_sub(one_reg, aux1);\ + aux0 = v_sub(aux1, aux0);\ + 
interD_out = v_div(coef_cur_out, aux0);\ + aux1 = v_mul(cur_prev_in, coef_prev_in);\ + aux1 = v_sub(cur_in, aux1);\ + cur_out = v_div(aux1, aux0); for(;j &med, Point2f &ce slopes.push_back(num / den); } - nth_element(slopes.begin(), slopes.begin() + quarterSize, slopes.end()); - nth_element(xx.begin(), xx.begin() + halfSize, xx.end()); - nth_element(yy.begin(), yy.begin() + halfSize, yy.end()); + std::nth_element(slopes.begin(), slopes.begin() + quarterSize, slopes.end()); + std::nth_element(xx.begin(), xx.begin() + halfSize, xx.end()); + std::nth_element(yy.begin(), yy.begin() + halfSize, yy.end()); centers.x = xx[halfSize]; centers.y = yy[halfSize]; @@ -390,7 +390,7 @@ void EllipseDetectorImpl::getFastCenter(std::vector &e1, std::vector &e1, std::vector reverse 1 VP rev_i(edge_i.size()); - reverse_copy(edge_i.begin(), edge_i.end(), rev_i.begin()); + std::reverse_copy(edge_i.begin(), edge_i.end(), rev_i.begin()); // for each edge j for (ushort j = 0; j < sz_j; ++j) { @@ -936,7 +936,7 @@ void EllipseDetectorImpl::getTriplets231(VVP &pi, VVP &pj, VVP &pk, // 2 -> reverse 2 VP rev_i(edge_i.size()); - reverse_copy(edge_i.begin(), edge_i.end(), rev_i.begin()); + std::reverse_copy(edge_i.begin(), edge_i.end(), rev_i.begin()); // for each edge j for (ushort j = 0; j < sz_j; ++j) { @@ -958,7 +958,7 @@ void EllipseDetectorImpl::getTriplets231(VVP &pi, VVP &pj, VVP &pk, // 3 -> reverse 3 VP rev_j(edge_j.size()); - reverse_copy(edge_j.begin(), edge_j.end(), rev_j.begin()); + std::reverse_copy(edge_j.begin(), edge_j.end(), rev_j.begin()); uint key_ij = generateKey(PAIR_23, i, j); @@ -991,7 +991,7 @@ void EllipseDetectorImpl::getTriplets231(VVP &pi, VVP &pj, VVP &pk, if (data.count(key_ik) == 0) { // 1 -> reverse 1 VP rev_k(edge_k.size()); - reverse_copy(edge_k.begin(), edge_k.end(), rev_k.begin()); + std::reverse_copy(edge_k.begin(), edge_k.end(), rev_k.begin()); getFastCenter(edge_i, rev_k, data_ik); data.insert(std::pair(key_ik, data_ik)); @@ -1038,7 +1038,7 @@ void 
EllipseDetectorImpl::getTriplets342(VVP &pi, VVP &pj, VVP &pk, // 3 -> reverse 3 VP rev_i(edge_i.size()); - reverse_copy(edge_i.begin(), edge_i.end(), rev_i.begin()); + std::reverse_copy(edge_i.begin(), edge_i.end(), rev_i.begin()); // for each edge j for (ushort j = 0; j < sz_j; ++j) { @@ -1060,7 +1060,7 @@ void EllipseDetectorImpl::getTriplets342(VVP &pi, VVP &pj, VVP &pk, // 4 -> reverse 4 VP rev_j(edge_j.size()); - reverse_copy(edge_j.begin(), edge_j.end(), rev_j.begin()); + std::reverse_copy(edge_j.begin(), edge_j.end(), rev_j.begin()); uint key_ij = generateKey(PAIR_34, i, j); @@ -1093,7 +1093,7 @@ void EllipseDetectorImpl::getTriplets342(VVP &pi, VVP &pj, VVP &pk, if (data.count(key_ik) == 0) { // 2 -> reverse 2 VP rev_k(edge_k.size()); - reverse_copy(edge_k.begin(), edge_k.end(), rev_k.begin()); + std::reverse_copy(edge_k.begin(), edge_k.end(), rev_k.begin()); getFastCenter(rev_i, rev_k, data_ik); data.insert(std::pair(key_ik, data_ik)); @@ -1141,7 +1141,7 @@ void EllipseDetectorImpl::getTriplets413(VVP &pi, VVP &pj, VVP &pk, // 4 -> reverse 4 VP rev_i(edge_i.size()); - reverse_copy(edge_i.begin(), edge_i.end(), rev_i.begin()); + std::reverse_copy(edge_i.begin(), edge_i.end(), rev_i.begin()); // for each edge j for (ushort j = 0; j < sz_j; ++j) { @@ -1507,8 +1507,8 @@ void EllipseDetectorImpl::detect(Mat1b &image, std::vector &ellipses) { getTriplets342(points_3, points_4, points_2, centers, ellipses); getTriplets413(points_4, points_1, points_3, centers, ellipses); - // sort by score - sort(ellipses.begin(), ellipses.end()); + // std::sort by score + std::sort(ellipses.begin(), ellipses.end()); // free accumulator memory delete[]accN; diff --git a/modules/ximgproc/src/lsc.cpp b/modules/ximgproc/src/lsc.cpp index 9ef507c8fcd..1a5539b4fb3 100644 --- a/modules/ximgproc/src/lsc.cpp +++ b/modules/ximgproc/src/lsc.cpp @@ -821,7 +821,7 @@ inline void SuperpixelLSCImpl::PostEnforceLabelConnectivity( int threshold ) (*Stmp).Neighbor.insert( (*Stmp).Neighbor.end(), 
(*S).Neighbor.begin(), (*S).Neighbor.end() ); - sort( (*Stmp).Neighbor.begin(), (*Stmp).Neighbor.end() ); + std::sort( (*Stmp).Neighbor.begin(), (*Stmp).Neighbor.end() ); I = unique( (*Stmp).Neighbor.begin(), (*Stmp).Neighbor.end() ); (*Stmp).Neighbor.erase( I, (*Stmp).Neighbor.end() ); diff --git a/modules/ximgproc/src/run_length_morphology.cpp b/modules/ximgproc/src/run_length_morphology.cpp index 3cf88048469..2819f51ec4a 100644 --- a/modules/ximgproc/src/run_length_morphology.cpp +++ b/modules/ximgproc/src/run_length_morphology.cpp @@ -388,7 +388,7 @@ static void convertInputArrayToRuns(InputArray& theArray, rlVec& runs, Size& the static void sortChords(rlVec& lChords) { - sort(lChords.begin(), lChords.end()); + std::sort(lChords.begin(), lChords.end()); } static void mergeNeighbouringChords(rlVec& rlIn, rlVec& rlOut) diff --git a/modules/ximgproc/src/sparse_match_interpolators.cpp b/modules/ximgproc/src/sparse_match_interpolators.cpp index 517573946af..85261715bbb 100644 --- a/modules/ximgproc/src/sparse_match_interpolators.cpp +++ b/modules/ximgproc/src/sparse_match_interpolators.cpp @@ -198,7 +198,7 @@ void EdgeAwareInterpolatorImpl::interpolate(InputArray from_image, InputArray fr matches_vector[i] = SparseMatch(from_mat.at(i), to_mat.at(i)); - sort(matches_vector.begin(),matches_vector.end()); + std::sort(matches_vector.begin(),matches_vector.end()); match_num = (int)matches_vector.size(); CV_Assert(match_num v_mul_wrap(v_thresh, v_max1)); - v_m2 = ~(v_mul_wrap(v_max2 - v_min2, v_255) > v_mul_wrap(v_thresh, v_max2)); + v_m1 = v_not(v_gt(v_mul_wrap(v_sub(v_max1, v_min1), v_255), v_mul_wrap(v_thresh, v_max1))); + v_m2 = v_not(v_gt(v_mul_wrap(v_sub(v_max2, v_min2), v_255), v_mul_wrap(v_thresh, v_max2))); // Apply masks - v_iB1 = (v_iB1 & v_m1) + (v_iB2 & v_m2); - v_iG1 = (v_iG1 & v_m1) + (v_iG2 & v_m2); - v_iR1 = (v_iR1 & v_m1) + (v_iR2 & v_m2); + v_iB1 = v_add(v_and(v_iB1, v_m1), v_and(v_iB2, v_m2)); + v_iG1 = v_add(v_and(v_iG1, v_m1), v_and(v_iG2, v_m2)); + 
v_iR1 = v_add(v_and(v_iR1, v_m1), v_and(v_iR2, v_m2)); // Split and add to the sums: v_expand(v_iB1, v_uint1, v_uint2); - v_SB += v_uint1 + v_uint2; + v_SB = v_add(v_SB, v_add(v_uint1, v_uint2)); v_expand(v_iG1, v_uint1, v_uint2); - v_SG += v_uint1 + v_uint2; + v_SG = v_add(v_SG, v_add(v_uint1, v_uint2)); v_expand(v_iR1, v_uint1, v_uint2); - v_SR += v_uint1 + v_uint2; + v_SR = v_add(v_SR, v_add(v_uint1, v_uint2)); } sumB = v_reduce_sum(v_SB); @@ -197,21 +197,21 @@ void calculateChannelSums(uint64 &sumB, uint64 &sumG, uint64 &sumR, ushort *src_ v_expand(v_max_val, v_max1, v_max2); // Calculate masks - v_m1 = ~((v_max1 - v_min1) * v_65535 > v_thresh * v_max1); - v_m2 = ~((v_max2 - v_min2) * v_65535 > v_thresh * v_max2); + v_m1 = v_not(v_gt(v_mul(v_sub(v_max1, v_min1), v_65535), v_mul(v_thresh, v_max1))); + v_m2 = v_not(v_gt(v_mul(v_sub(v_max2, v_min2), v_65535), v_mul(v_thresh, v_max2))); // Apply masks - v_iB1 = (v_iB1 & v_m1) + (v_iB2 & v_m2); - v_iG1 = (v_iG1 & v_m1) + (v_iG2 & v_m2); - v_iR1 = (v_iR1 & v_m1) + (v_iR2 & v_m2); + v_iB1 = v_add(v_and(v_iB1, v_m1), v_and(v_iB2, v_m2)); + v_iG1 = v_add(v_and(v_iG1, v_m1), v_and(v_iG2, v_m2)); + v_iR1 = v_add(v_and(v_iR1, v_m1), v_and(v_iR2, v_m2)); // Split and add to the sums: v_expand(v_iB1, v_u64_1, v_u64_2); - v_SB += v_u64_1 + v_u64_2; + v_SB = v_add(v_SB, v_add(v_u64_1, v_u64_2)); v_expand(v_iG1, v_u64_1, v_u64_2); - v_SG += v_u64_1 + v_u64_2; + v_SG = v_add(v_SG, v_add(v_u64_1, v_u64_2)); v_expand(v_iR1, v_u64_1, v_u64_2); - v_SR += v_u64_1 + v_u64_2; + v_SR = v_add(v_SR, v_add(v_u64_1, v_u64_2)); } // Perform final reduction @@ -282,12 +282,12 @@ void applyChannelGains(InputArray _src, OutputArray _dst, float gainB, float gai v_expand(v_inR, v_sR1, v_sR2); // Multiply by gains - v_sB1 = v_mul_wrap(v_sB1, v_gainB) >> 8; - v_sB2 = v_mul_wrap(v_sB2, v_gainB) >> 8; - v_sG1 = v_mul_wrap(v_sG1, v_gainG) >> 8; - v_sG2 = v_mul_wrap(v_sG2, v_gainG) >> 8; - v_sR1 = v_mul_wrap(v_sR1, v_gainR) >> 8; - v_sR2 = 
v_mul_wrap(v_sR2, v_gainR) >> 8; + v_sB1 = v_shr(v_mul_wrap(v_sB1, v_gainB), 8); + v_sB2 = v_shr(v_mul_wrap(v_sB2, v_gainB), 8); + v_sG1 = v_shr(v_mul_wrap(v_sG1, v_gainG), 8); + v_sG2 = v_shr(v_mul_wrap(v_sG2, v_gainG), 8); + v_sR1 = v_shr(v_mul_wrap(v_sR1, v_gainR), 8); + v_sR2 = v_shr(v_mul_wrap(v_sR2, v_gainR), 8); // Pack into vectors of v_uint8x16 v_store_interleave(&dst_data[i], v_pack(v_sB1, v_sB2), v_pack(v_sG1, v_sG2), v_pack(v_sR1, v_sR2)); @@ -325,12 +325,12 @@ void applyChannelGains(InputArray _src, OutputArray _dst, float gainB, float gai v_expand(v_inR, v_sR1, v_sR2); // Multiply by scaling factors - v_sB1 = (v_sB1 * v_gainB) >> 16; - v_sB2 = (v_sB2 * v_gainB) >> 16; - v_sG1 = (v_sG1 * v_gainG) >> 16; - v_sG2 = (v_sG2 * v_gainG) >> 16; - v_sR1 = (v_sR1 * v_gainR) >> 16; - v_sR2 = (v_sR2 * v_gainR) >> 16; + v_sB1 = v_shr(v_mul(v_sB1, v_gainB), 16); + v_sB2 = v_shr(v_mul(v_sB2, v_gainB), 16); + v_sG1 = v_shr(v_mul(v_sG1, v_gainG), 16); + v_sG2 = v_shr(v_mul(v_sG2, v_gainG), 16); + v_sR1 = v_shr(v_mul(v_sR1, v_gainR), 16); + v_sR2 = v_shr(v_mul(v_sR2, v_gainR), 16); // Pack into vectors of v_uint16x8 v_store_interleave(&dst_data[i], v_pack(v_sB1, v_sB2), v_pack(v_sG1, v_sG2), v_pack(v_sR1, v_sR2)); diff --git a/modules/xphoto/src/learning_based_color_balance.cpp b/modules/xphoto/src/learning_based_color_balance.cpp index bd408e6cb49..de1958dcc60 100644 --- a/modules/xphoto/src/learning_based_color_balance.cpp +++ b/modules/xphoto/src/learning_based_color_balance.cpp @@ -192,7 +192,7 @@ void LearningBasedWBImpl::preprocessing(Mat &src) v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_local_max = v_max(v_inB, v_max(v_inG, v_inR)); v_global_max = v_max(v_local_max, v_global_max); - v_mask = (v_local_max < v_thresh); + v_mask = (v_lt(v_local_max, v_thresh)); v_store(mask_ptr + i, v_mask); } uchar global_max[16]; @@ -225,7 +225,7 @@ void LearningBasedWBImpl::preprocessing(Mat &src) v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); 
v_local_max = v_max(v_inB, v_max(v_inG, v_inR)); v_global_max = v_max(v_local_max, v_global_max); - v_mask = (v_local_max < v_thresh); + v_mask = (v_lt(v_local_max, v_thresh)); v_pack_store(mask_ptr + i, v_mask); } ushort global_max[8]; @@ -270,9 +270,9 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_uint8x16 v_mask = v_load(mask_ptr + i); - v_inB &= v_mask; - v_inG &= v_mask; - v_inR &= v_mask; + v_inB = v_and(v_inB, v_mask); + v_inG = v_and(v_inG, v_mask); + v_inR = v_and(v_inR, v_mask); v_uint16x8 v_sR1, v_sR2, v_sG1, v_sG2, v_sB1, v_sB2; v_expand(v_inB, v_sB1, v_sB2); @@ -280,33 +280,33 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average v_expand(v_inR, v_sR1, v_sR2); // update the brightest (R,G,B) tuple (process left half): - v_uint16x8 v_sum = v_sB1 + v_sG1 + v_sR1; - v_uint16x8 v_max_mask = (v_sum > v_max_sum); + v_uint16x8 v_sum = v_add(v_add(v_sB1, v_sG1), v_sR1); + v_uint16x8 v_max_mask = (v_gt(v_sum, v_max_sum)); v_max_sum = v_max(v_sum, v_max_sum); - v_brightestB = (v_sB1 & v_max_mask) + (v_brightestB & (~v_max_mask)); - v_brightestG = (v_sG1 & v_max_mask) + (v_brightestG & (~v_max_mask)); - v_brightestR = (v_sR1 & v_max_mask) + (v_brightestR & (~v_max_mask)); + v_brightestB = v_add(v_and(v_sB1, v_max_mask), v_and(v_brightestB, v_not(v_max_mask))); + v_brightestG = v_add(v_and(v_sG1, v_max_mask), v_and(v_brightestG, v_not(v_max_mask))); + v_brightestR = v_add(v_and(v_sR1, v_max_mask), v_and(v_brightestR, v_not(v_max_mask))); // update the brightest (R,G,B) tuple (process right half): - v_sum = v_sB2 + v_sG2 + v_sR2; - v_max_mask = (v_sum > v_max_sum); + v_sum = v_add(v_add(v_sB2, v_sG2), v_sR2); + v_max_mask = (v_gt(v_sum, v_max_sum)); v_max_sum = v_max(v_sum, v_max_sum); - v_brightestB = (v_sB2 & v_max_mask) + (v_brightestB & (~v_max_mask)); - v_brightestG = (v_sG2 & v_max_mask) + (v_brightestG & (~v_max_mask)); - 
v_brightestR = (v_sR2 & v_max_mask) + (v_brightestR & (~v_max_mask)); + v_brightestB = v_add(v_and(v_sB2, v_max_mask), v_and(v_brightestB, v_not(v_max_mask))); + v_brightestG = v_add(v_and(v_sG2, v_max_mask), v_and(v_brightestG, v_not(v_max_mask))); + v_brightestR = v_add(v_and(v_sR2, v_max_mask), v_and(v_brightestR, v_not(v_max_mask))); // update sums: - v_sB1 = v_sB1 + v_sB2; - v_sG1 = v_sG1 + v_sG2; - v_sR1 = v_sR1 + v_sR2; + v_sB1 = v_add(v_sB1, v_sB2); + v_sG1 = v_add(v_sG1, v_sG2); + v_sR1 = v_add(v_sR1, v_sR2); v_uint32x4 v_uint1, v_uint2; v_expand(v_sB1, v_uint1, v_uint2); - v_SB += v_uint1 + v_uint2; + v_SB = v_add(v_SB, v_add(v_uint1, v_uint2)); v_expand(v_sG1, v_uint1, v_uint2); - v_SG += v_uint1 + v_uint2; + v_SG = v_add(v_SG, v_add(v_uint1, v_uint2)); v_expand(v_sR1, v_uint1, v_uint2); - v_SR += v_uint1 + v_uint2; + v_SR = v_add(v_SR, v_add(v_uint1, v_uint2)); } sumB = v_reduce_sum(v_SB); sumG = v_reduce_sum(v_SG); @@ -361,11 +361,11 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average v_uint16x8 v_inB, v_inG, v_inR; v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_uint16x8 v_mask = v_load_expand(mask_ptr + i); - v_mask = v_mask | ((v_mask & v_mask_lower) << 8); + v_mask = v_or(v_mask, v_shl<8>(v_and(v_mask, v_mask_lower))); - v_inB &= v_mask; - v_inG &= v_mask; - v_inR &= v_mask; + v_inB = v_and(v_inB, v_mask); + v_inG = v_and(v_inG, v_mask); + v_inR = v_and(v_inR, v_mask); v_uint32x4 v_iR1, v_iR2, v_iG1, v_iG2, v_iB1, v_iB2; v_expand(v_inB, v_iB1, v_iB2); @@ -373,32 +373,32 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average v_expand(v_inR, v_iR1, v_iR2); // update the brightest (R,G,B) tuple (process left half): - v_uint32x4 v_sum = v_iB1 + v_iG1 + v_iR1; - v_uint32x4 v_max_mask = (v_sum > v_max_sum); + v_uint32x4 v_sum = v_add(v_add(v_iB1, v_iG1), v_iR1); + v_uint32x4 v_max_mask = (v_gt(v_sum, v_max_sum)); v_max_sum = v_max(v_sum, v_max_sum); - v_brightestB = (v_iB1 & 
v_max_mask) + (v_brightestB & (~v_max_mask)); - v_brightestG = (v_iG1 & v_max_mask) + (v_brightestG & (~v_max_mask)); - v_brightestR = (v_iR1 & v_max_mask) + (v_brightestR & (~v_max_mask)); + v_brightestB = v_add(v_and(v_iB1, v_max_mask), v_and(v_brightestB, v_not(v_max_mask))); + v_brightestG = v_add(v_and(v_iG1, v_max_mask), v_and(v_brightestG, v_not(v_max_mask))); + v_brightestR = v_add(v_and(v_iR1, v_max_mask), v_and(v_brightestR, v_not(v_max_mask))); // update the brightest (R,G,B) tuple (process right half): - v_sum = v_iB2 + v_iG2 + v_iR2; - v_max_mask = (v_sum > v_max_sum); + v_sum = v_add(v_add(v_iB2, v_iG2), v_iR2); + v_max_mask = (v_gt(v_sum, v_max_sum)); v_max_sum = v_max(v_sum, v_max_sum); - v_brightestB = (v_iB2 & v_max_mask) + (v_brightestB & (~v_max_mask)); - v_brightestG = (v_iG2 & v_max_mask) + (v_brightestG & (~v_max_mask)); - v_brightestR = (v_iR2 & v_max_mask) + (v_brightestR & (~v_max_mask)); + v_brightestB = v_add(v_and(v_iB2, v_max_mask), v_and(v_brightestB, v_not(v_max_mask))); + v_brightestG = v_add(v_and(v_iG2, v_max_mask), v_and(v_brightestG, v_not(v_max_mask))); + v_brightestR = v_add(v_and(v_iR2, v_max_mask), v_and(v_brightestR, v_not(v_max_mask))); // update sums: - v_iB1 = v_iB1 + v_iB2; - v_iG1 = v_iG1 + v_iG2; - v_iR1 = v_iR1 + v_iR2; + v_iB1 = v_add(v_iB1, v_iB2); + v_iG1 = v_add(v_iG1, v_iG2); + v_iR1 = v_add(v_iR1, v_iR2); v_uint64x2 v_uint64_1, v_uint64_2; v_expand(v_iB1, v_uint64_1, v_uint64_2); - v_SB += v_uint64_1 + v_uint64_2; + v_SB = v_add(v_SB, v_add(v_uint64_1, v_uint64_2)); v_expand(v_iG1, v_uint64_1, v_uint64_2); - v_SG += v_uint64_1 + v_uint64_2; + v_SG = v_add(v_SG, v_add(v_uint64_1, v_uint64_2)); v_expand(v_iR1, v_uint64_1, v_uint64_2); - v_SR += v_uint64_1 + v_uint64_2; + v_SR = v_add(v_SR, v_add(v_uint64_1, v_uint64_2)); } uint64 sum_arr[2]; v_store(sum_arr, v_SB); diff --git a/modules/xphoto/src/oilpainting.cpp b/modules/xphoto/src/oilpainting.cpp index 21e62414c32..daeffd386a7 100644 --- 
a/modules/xphoto/src/oilpainting.cpp +++ b/modules/xphoto/src/oilpainting.cpp @@ -58,7 +58,7 @@ class ParallelOilPainting : public ParallelLoopBody int dynRatio; public: - ParallelOilPainting(Mat& img, Mat &d, Mat &iLuminance, int r,int k) : + ParallelOilPainting(Mat& img, Mat &d, Mat &iLuminance, int r,int k) : imgSrc(img), dst(d), imgLuminance(iLuminance),