From b07acec3f18b7d10778d588f9814a645d79f32aa Mon Sep 17 00:00:00 2001
From: huafengchun <huafengchun@gmail.com>
Date: Fri, 28 Jul 2023 03:14:34 +0000
Subject: [PATCH] Add operators support for Ascend NPU (CANN backend)

CANN (Compute Architecture of Neural Networks), developed by Huawei, is
a heterogeneous computing architecture for AI. OpenCV DNN already
supports the CANN backend [#22634](https://github.com/opencv/opencv/pull/22634).

There are more and more users using [Ascend NPU](https://www.hiascend.com/)
and programming with CANN, and the number is still growing rapidly.
AI training and inference are inseparable from data preprocessing.
When users use OpenCV to work with CANN backend, data preprocessing can
only run on CPUs, resulting in inefficiency.

The purpose of this commit is to enable OpenCV operators on CANN backend.

The usage of the CANN backend is consistent with OpenCV DNN; please refer to the OpenCV DNN [CANN backend manual]
(https://gist.github.com/fengyuentau/083f7f339592545c1f1d2c1fde6a53dc#file-a_ocv_cann-md):
1. [Install dependencies]
   (https://gist.github.com/fengyuentau/083f7f339592545c1f1d2c1fde6a53dc#install-dependencies)
2. [Install CANN]
   (https://gist.github.com/fengyuentau/083f7f339592545c1f1d2c1fde6a53dc#install-cann)
3. [Compile OpenCV with CANN]
   (https://gist.github.com/fengyuentau/083f7f339592545c1f1d2c1fde6a53dc#build-opencv-with-cann)

The CANN backend is used in a similar way to CUDA:
| Object    | CANN      | CUDA     |
| --------- | --------- | -------- |
| Namespace | cv::cann  | cv::cuda |
| Matrix    | AclMat    | GpuMat   |
| Stream    | AclStream | Stream   |
| Event     | AclEvent  | Event    |

The current commit provides the CANN backend operator support framework. In
order to keep the code easy to review, only 7 basic interfaces are
implemented. All of the following operators are tested and their results are
compared with the CPU backend:
- [x] Add
- [x] subtract
- [x] multiply
- [x] divide
- [x] bitwise_and
- [x] bitwise_or
- [x] bitwise_xor

More operators will be implemented in separate follow-up commits.

Co-authored-by: CaoMengqing <cmq0113@163.com>
---
 modules/cannarithm/CMakeLists.txt             |  16 +
 .../include/opencv2/acl_stream_accessor.hpp   |  40 ++
 modules/cannarithm/include/opencv2/cann.hpp   | 335 ++++++++++
 .../cannarithm/include/opencv2/cann.inl.hpp   | 111 ++++
 .../include/opencv2/cann_arithm.hpp           | 176 +++++
 .../cannarithm/include/opencv2/cann_call.hpp  |  52 ++
 .../include/opencv2/cann_common.hpp           |  43 ++
 .../include/opencv2/cann_prepare.hpp          |  96 +++
 .../cannarithm/misc/python/pyopencv_cann.hpp  |  23 +
 .../perf/perf_element_operations.cpp          |  81 +++
 modules/cannarithm/perf/perf_main.cpp         |  36 ++
 modules/cannarithm/perf/perf_precomp.hpp      |  20 +
 modules/cannarithm/samples/sample.cpp         |  32 +
 modules/cannarithm/samples/sample.py          |  20 +
 modules/cannarithm/src/aclmat.cpp             | 605 ++++++++++++++++++
 modules/cannarithm/src/cann_call.cpp          | 140 ++++
 modules/cannarithm/src/element_operations.cpp | 165 +++++
 modules/cannarithm/src/precomp.hpp            |  16 +
 modules/cannarithm/test/test_cann.cpp         | 227 +++++++
 .../test/test_element_operation.cpp           | 137 ++++
 modules/cannarithm/test/test_main.cpp         |  21 +
 modules/cannarithm/test/test_precomp.hpp      |  16 +
 22 files changed, 2408 insertions(+)
 create mode 100644 modules/cannarithm/CMakeLists.txt
 create mode 100644 modules/cannarithm/include/opencv2/acl_stream_accessor.hpp
 create mode 100644 modules/cannarithm/include/opencv2/cann.hpp
 create mode 100644 modules/cannarithm/include/opencv2/cann.inl.hpp
 create mode 100644 modules/cannarithm/include/opencv2/cann_arithm.hpp
 create mode 100644 modules/cannarithm/include/opencv2/cann_call.hpp
 create mode 100644 modules/cannarithm/include/opencv2/cann_common.hpp
 create mode 100644 modules/cannarithm/include/opencv2/cann_prepare.hpp
 create mode 100644 modules/cannarithm/misc/python/pyopencv_cann.hpp
 create mode 100644 modules/cannarithm/perf/perf_element_operations.cpp
 create mode 100644 modules/cannarithm/perf/perf_main.cpp
 create mode 100644 modules/cannarithm/perf/perf_precomp.hpp
 create mode 100644 modules/cannarithm/samples/sample.cpp
 create mode 100644 modules/cannarithm/samples/sample.py
 create mode 100644 modules/cannarithm/src/aclmat.cpp
 create mode 100644 modules/cannarithm/src/cann_call.cpp
 create mode 100644 modules/cannarithm/src/element_operations.cpp
 create mode 100644 modules/cannarithm/src/precomp.hpp
 create mode 100644 modules/cannarithm/test/test_cann.cpp
 create mode 100644 modules/cannarithm/test/test_element_operation.cpp
 create mode 100644 modules/cannarithm/test/test_main.cpp
 create mode 100644 modules/cannarithm/test/test_precomp.hpp

diff --git a/modules/cannarithm/CMakeLists.txt b/modules/cannarithm/CMakeLists.txt
new file mode 100644
index 00000000000..55bcc028510
--- /dev/null
+++ b/modules/cannarithm/CMakeLists.txt
@@ -0,0 +1,16 @@
+ if(IOS OR WINRT OR ANDROID OR APPLE OR WIN32 OR (NOT HAVE_CANN))
+   ocv_module_disable(cannarithm)  # skipped on these platforms or when HAVE_CANN is unset
+ endif()
+
+set(the_description "Ascend-accelerated Operations on Matrices")
+
+ocv_add_module(cannarithm opencv_core WRAP python)  # depends on opencv_core; wrapped for Python
+ocv_module_include_directories(${CANN_INCLUDE_DIRS})
+ocv_glob_module_sources()
+ocv_install_used_external_targets(${CANN_LIBRARIES})
+ocv_create_module(${CANN_LIBRARIES})  # CANN_LIBRARIES are passed on to the module target
+
+ocv_include_directories(${CMAKE_SOURCE_DIR}/modules/ts/include)  # ts headers, presumably for tests
+
+ocv_add_accuracy_tests(DEPENDS_ON opencv_cannarithm)
+ocv_add_perf_tests(DEPENDS_ON opencv_cannarithm)
diff --git a/modules/cannarithm/include/opencv2/acl_stream_accessor.hpp b/modules/cannarithm/include/opencv2/acl_stream_accessor.hpp
new file mode 100644
index 00000000000..27118d807e3
--- /dev/null
+++ b/modules/cannarithm/include/opencv2/acl_stream_accessor.hpp
@@ -0,0 +1,40 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CANN_STREAM_ACCESSOR_HPP
+#define OPENCV_CANN_STREAM_ACCESSOR_HPP
+
+#include <acl/acl.h>
+#include "opencv2/cann.hpp"
+
+namespace cv
+{
+namespace cann
+{
+
+//! @addtogroup cann_struct
+//! @{
+
+/** @brief Accessor that converts between cann::AclStream and the native aclrtStream handle.
+ */
+struct AclStreamAccessor
+{
+    CV_EXPORTS static aclrtStream getStream(const AclStream& stream);
+    CV_EXPORTS static AclStream wrapStream(aclrtStream stream);
+};
+
+/** @brief Accessor that converts between cann::AclEvent and the native aclrtEvent handle.
+ */
+struct AclEventAccessor
+{
+    CV_EXPORTS static aclrtEvent getEvent(const AclEvent& event);
+    CV_EXPORTS static AclEvent wrapEvent(aclrtEvent event);
+};
+
+//! @} cann_struct
+
+} // namespace cann
+} // namespace cv
+
+#endif // OPENCV_CANN_STREAM_ACCESSOR_HPP
diff --git a/modules/cannarithm/include/opencv2/cann.hpp b/modules/cannarithm/include/opencv2/cann.hpp
new file mode 100644
index 00000000000..6b79f045c0e
--- /dev/null
+++ b/modules/cannarithm/include/opencv2/cann.hpp
@@ -0,0 +1,335 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CANN_HPP
+#define OPENCV_CANN_HPP
+
+#include "opencv2/core.hpp"
+
+/**
+  @defgroup cann Ascend-accelerated Computer Vision
+  @{
+    @defgroup canncore Core part
+    @{
+      @defgroup cann_struct Data Structures
+      @defgroup cann_init Initialization and Information
+    @}
+  @}
+ */
+
+namespace cv
+{
+namespace cann
+{
+class AclStream;
+
+//! @addtogroup cann_struct
+//! @{
+
+//===================================================================================
+// AclMat
+//===================================================================================
+
+/** @brief Base storage class for NPU memory with reference counting.
+ * AclMat class has a similar interface with Mat and cuda::GpuMat, and works on the [Ascend
+ * NPU](https://www.hiascend.com/) backend.
+ * @sa Mat cuda::GpuMat
+ */
+
+class CV_EXPORTS_W AclMat
+{
+public:
+    class CV_EXPORTS_W Allocator
+    {
+    public:
+        virtual ~Allocator() {}
+
+        // allocator must fill data, step and refcount fields
+        virtual bool allocate(AclMat* mat, int rows, int cols, size_t elemSize) = 0;
+        virtual void free(AclMat* mat) = 0;
+    };
+
+    /**
+     * @brief Create default allocator for AclMat. This allocator allocates device memory of the
+     * requested size.
+     */
+    CV_WRAP static AclMat::Allocator* defaultAllocator();
+
+    /**
+     * @brief Set the default allocator for AclMat.
+     * @param allocator Allocator to use as the default.
+     */
+    CV_WRAP static void setDefaultAllocator(AclMat::Allocator* allocator);
+
+    //! default constructor
+    CV_WRAP explicit AclMat(AclMat::Allocator* allocator_ = AclMat::defaultAllocator());
+
+    //! constructs AclMat of the specified size and type
+    CV_WRAP AclMat(int rows, int cols, int type,
+                   AclMat::Allocator* allocator = AclMat::defaultAllocator());
+    //! constructs AclMat of the specified size and type
+    CV_WRAP AclMat(Size size, int type, AclMat::Allocator* allocator = AclMat::defaultAllocator());
+
+    //! constructs AclMat and fills it with the specified value s
+    CV_WRAP AclMat(int rows, int cols, int type, Scalar& s,
+                   AclMat::Allocator* allocator = AclMat::defaultAllocator());
+    //! constructs AclMat and fills it with the specified value s
+    CV_WRAP AclMat(Size size, int type, Scalar& s,
+                   AclMat::Allocator* allocator = AclMat::defaultAllocator());
+
+    //! copy constructor
+    CV_WRAP AclMat(const AclMat& m);
+
+    //! constructor for AclMat headers pointing to user-allocated data
+    AclMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP);
+    //! constructor for AclMat headers pointing to user-allocated data
+    AclMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP);
+
+    //! builds AclMat from host memory (Blocking call)
+    CV_WRAP explicit AclMat(InputArray arr,
+                            AclMat::Allocator* allocator = AclMat::defaultAllocator());
+
+    //! assignment operators
+    AclMat& operator=(const AclMat& m);
+
+    //! destructor - calls release()
+    ~AclMat();
+
+    //! sets some of the AclMat elements to s (Blocking call)
+    CV_WRAP AclMat& setTo(Scalar s);
+    //! sets some of the AclMat elements to s (Non-Blocking call)
+    CV_WRAP AclMat& setTo(Scalar s, AclStream& stream);
+
+    //! swaps with another AclMat header
+    CV_WRAP void swap(AclMat& mat);
+
+    //! allocates new AclMat data unless the AclMat already has specified size and type
+    CV_WRAP void create(int rows, int cols, int type);
+
+    //! upload host memory data to AclMat (Blocking call)
+    CV_WRAP void upload(InputArray arr);
+    //! upload host memory data to AclMat (Non-Blocking call)
+    CV_WRAP void upload(InputArray arr, AclStream& stream);
+
+    //! download data from AclMat to host (Blocking call)
+    CV_WRAP void download(OutputArray dst) const;
+    //! download data from AclMat to host (Non-Blocking call)
+    CV_WRAP void download(OutputArray dst, AclStream& stream) const;
+
+    //! converts AclMat to another datatype (Blocking call)
+    CV_WRAP void convertTo(CV_OUT AclMat& dst, int rtype) const;
+
+    //! converts AclMat to another datatype (Non-Blocking call)
+    CV_WRAP void convertTo(CV_OUT AclMat& dst, int rtype, AclStream& stream) const;
+
+    //! decreases reference counter, deallocate the data when reference counter reaches 0
+    CV_WRAP void release();
+
+    //! returns element size in bytes
+    CV_WRAP size_t elemSize() const;
+
+    //! returns the size of element channel in bytes
+    CV_WRAP size_t elemSize1() const;
+
+    //! returns element type
+    CV_WRAP int type() const;
+
+    //! returns element depth
+    CV_WRAP int depth() const;
+
+    //! returns number of channels
+    CV_WRAP int channels() const;
+
+    //! returns step/elemSize1()
+    CV_WRAP size_t step1() const;
+
+    //! returns AclMat size : width == number of columns, height == number of rows
+    CV_WRAP Size size() const;
+
+    //! returns true if AclMat data is NULL
+    CV_WRAP bool empty() const;
+
+    //! internal use method: updates the continuity flag
+    CV_WRAP void updateContinuityFlag();
+
+    //! expand one channel mat to multi-channels (Blocking call)
+    //! @note, source mat must only have one channel, copy value to all channels.
+    CV_WRAP void expandTo(CV_OUT AclMat& dst, int channels) const;
+
+    //! expand one channel mat to multi-channels (Non-Blocking call)
+    //! @note, source mat must only have one channel, copy value to all channels.
+    CV_WRAP void expandTo(CV_OUT AclMat& dst, int channels, AclStream& stream) const;
+
+    /*! includes several bit-fields:
+     - the magic signature
+     - continuity flag
+     - depth
+     - number of channels
+     */
+    int flags;
+
+    //! the number of rows and columns
+    int rows, cols;
+
+    //! a distance between successive rows in bytes; includes the gap if any
+    CV_PROP size_t step;
+
+    //! pointer to the data
+    uchar* data;
+
+    //! pointer to the reference counter;
+    //! when AclMat points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    //! helper fields used in locateROI and adjustROI
+    uchar* datastart;
+    const uchar* dataend;
+
+    //! allocator
+    Allocator* allocator;
+};
+
+class AclStream;
+struct AclStreamAccessor;  // defined as a struct in opencv2/acl_stream_accessor.hpp
+class AclEvent;
+struct AclEventAccessor;  // defined as a struct in opencv2/acl_stream_accessor.hpp
+class DefaultDeviceInitializer;
+
+//===================================================================================
+// AclStream
+//===================================================================================
+
+/** @brief In AscendCL Stream(AclStream) is a task queue. Stream is used to manage the parallelism
+ * of tasks. The tasks inside a Stream are executed sequentially, that is, the Stream executes
+ * sequentially according to the sent tasks; the tasks in different Streams are executed in
+ * parallel.
+ *
+ * All non-blocking functions take a stream parameter. These functions return immediately after
+ * the task is submitted; the caller should wait on the stream until completion.
+ *
+ * Blocking functions implicitly use the default stream, and synchronize the stream before the
+ * function returns.
+ * @sa cuda::Stream
+ */
+
+// TODO: Stream is defined in namespace cuda, and pybind code does not use a namespace of stream,
+// change stream name to AclStream to avoid conflict.
+class CV_EXPORTS_W AclStream
+{
+public:
+    CV_WRAP AclStream();
+
+    //! blocks the current CPU thread until all operations in the stream are complete.
+    CV_WRAP void waitForCompletion();
+
+    //! blocks the current CPU thread until event trigger.
+    CV_WRAP void waitAclEvent(const cv::cann::AclEvent& event);
+
+    /**
+     * @brief return default AclStream object for default Acl stream.
+     */
+    CV_WRAP static AclStream& Null();
+
+    // acl symbols CANNOT be used in any hpp files. Use an inner class to avoid acl symbols being
+    // defined in hpp.
+    class Impl;
+
+    // add temporary mat for async release.
+    void addToAsyncRelease(const AclMat& mat);
+
+private:
+    Ptr<Impl> impl_;
+    AclStream(const Ptr<Impl>& impl);
+
+    friend struct AclStreamAccessor;  // class-key matches the struct definition of the accessor
+    friend class DefaultDeviceInitializer;
+};
+
+/**
+ * @brief AclEvent to synchronize between different streams.
+ */
+class CV_EXPORTS_W AclEvent
+{
+public:
+    CV_WRAP AclEvent();
+
+    //! records an event on the given stream (AclStream::Null() when omitted)
+    CV_WRAP void record(AclStream& stream = AclStream::Null());
+
+    //! waits for an event to complete
+    CV_WRAP void waitForComplete() const;
+
+    class Impl;
+
+private:
+    Ptr<Impl> impl_;
+    AclEvent(const Ptr<Impl>& impl);
+
+    friend struct AclEventAccessor;  // class-key matches the struct definition of the accessor
+};
+
+/** @brief Bindings overload to create a Stream object from the address stored in an existing CANN
+ * Runtime API stream pointer (aclrtStream).
+ * @param aclStreamAddress Memory address stored in a CANN Runtime API stream pointer
+ * (aclrtStream). The created Stream object does not perform any allocation or deallocation and simply
+ * wraps existing raw CANN Runtime API stream pointer.
+ * @note Overload for generation of bindings only, not exported or intended for use internally from C++.
+ */
+CV_EXPORTS_W AclStream wrapStream(size_t aclStreamAddress);
+
+//! @} cann_struct
+
+//===================================================================================
+// Initialization & Info
+//===================================================================================
+
+//! @addtogroup cann_init
+//! @{
+
+//! Get Ascend matrix object from Input array, upload matrix memory if needed. (Blocking call)
+AclMat getInputMat(InputArray src);
+//! Get Ascend matrix object from Input array, upload matrix memory if needed. (Non-Blocking call)
+AclMat getInputMat(InputArray src, AclStream& stream);
+
+//! Get Ascend matrix object from Output array, upload matrix memory if needed.
+AclMat getOutputMat(OutputArray dst, int rows, int cols, int type);
+
+//! Sync output matrix to Output array, download matrix memory if needed.
+void syncOutput(const AclMat& dst, OutputArray _dst);
+
+/**
+ * @brief Choose Ascend NPU device.
+ */
+CV_EXPORTS_W void setDevice(int device);
+
+/**
+ * @brief Clear all context created in current Ascend device.
+ */
+CV_EXPORTS_W void resetDevice();
+
+/**
+ * @brief Get current Ascend device.
+ */
+CV_EXPORTS_W int32_t getDevice();
+
+/**
+ * @brief Initialize AscendCL.
+ */
+CV_EXPORTS_W void initAcl();
+
+/**
+ * @brief Finalize AscendCL.
+ * @note finalizeAcl can only be called once per process. Call this function after all AscendCL
+ * operations have finished.
+ */
+CV_EXPORTS_W void finalizeAcl();
+
+//! @} cann_init
+
+} // namespace cann
+} // namespace cv
+
+#include "opencv2/cann.inl.hpp"
+
+#endif /* OPENCV_CANN_HPP */
diff --git a/modules/cannarithm/include/opencv2/cann.inl.hpp b/modules/cannarithm/include/opencv2/cann.inl.hpp
new file mode 100644
index 00000000000..0c85e8dcc7a
--- /dev/null
+++ b/modules/cannarithm/include/opencv2/cann.inl.hpp
@@ -0,0 +1,111 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CANNINL_HPP
+#define OPENCV_CANNINL_HPP
+
+#include "opencv2/cann.hpp"
+
+namespace cv
+{
+namespace cann
+{
+inline AclMat::AclMat(AclMat::Allocator* allocator_)
+    : flags(0), rows(0), cols(0), step(0), data(nullptr), refcount(nullptr), datastart(nullptr),
+      dataend(nullptr), allocator(allocator_)
+{  // Empty header: no buffer and no reference counter yet.
+}
+
+inline AclMat::AclMat(int rows_, int cols_, int type_, AclMat::Allocator* allocator_)
+    : flags(0), rows(0), cols(0), step(0), data(nullptr), refcount(nullptr), datastart(nullptr),
+      dataend(nullptr), allocator(allocator_)
+{
+    const bool hasArea = rows_ > 0 && cols_ > 0;  // otherwise the header stays empty
+    if (hasArea) create(rows_, cols_, type_);
+}
+
+inline AclMat::AclMat(Size size_, int type_, AclMat::Allocator* allocator_)
+    : flags(0), rows(0), cols(0), step(0), data(nullptr), refcount(nullptr), datastart(nullptr),
+      dataend(nullptr), allocator(allocator_)
+{
+    const bool hasArea = size_.height > 0 && size_.width > 0;  // otherwise the header stays empty
+    if (hasArea) create(size_.height, size_.width, type_);
+}
+
+inline AclMat::AclMat(InputArray arr, AclMat::Allocator* allocator_)
+    : flags(0), rows(0), cols(0), step(0), data(nullptr), refcount(nullptr), datastart(nullptr),
+      dataend(nullptr), allocator(allocator_)
+{
+    upload(arr);  // single-argument upload: the overload declared as a blocking call
+}
+
+inline AclMat::AclMat(const AclMat& m)
+    : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data),
+      refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), allocator(m.allocator)
+{
+    // Shallow copy: the buffer is shared, so bump the counter when the source has one.
+    if (refcount != nullptr) CV_XADD(refcount, 1);
+}
+
+inline AclMat::~AclMat() { release(); }  // gives up this header's reference via release()
+
+inline AclMat& AclMat::operator=(const AclMat& m)
+{
+    // Self-assignment is a no-op.
+    if (this == &m)
+        return *this;
+    // Copy-and-swap: `replacement` takes over the old header and releases it on scope exit.
+    AclMat replacement(m);
+    swap(replacement);
+    return *this;
+}
+
+inline void AclMat::swap(AclMat& b)  // exchanges every header field; buffers are not copied
+{
+    std::swap(allocator, b.allocator);
+    std::swap(refcount, b.refcount);
+    std::swap(dataend, b.dataend);
+    std::swap(datastart, b.datastart);
+    std::swap(data, b.data);
+    std::swap(step, b.step);
+    std::swap(cols, b.cols);
+    std::swap(rows, b.rows);
+    std::swap(flags, b.flags);
+}
+
+inline void AclMat::release()
+{
+    CV_DbgAssert(allocator != 0);
+    // Free the buffer only when CV_XADD reports 1 for the counter (presumably the last owner).
+    if (refcount && CV_XADD(refcount, -1) == 1)
+        allocator->free(this);
+    // Reset the header to the empty state; flags and allocator are left untouched.
+    dataend = data = datastart = 0;
+    step = rows = cols = 0;
+    refcount = 0;
+}
+
+inline size_t AclMat::elemSize() const { return CV_ELEM_SIZE(flags); }
+
+inline size_t AclMat::elemSize1() const { return CV_ELEM_SIZE1(flags); }
+
+inline int AclMat::type() const { return CV_MAT_TYPE(flags); }
+
+inline int AclMat::depth() const { return CV_MAT_DEPTH(flags); }
+
+inline int AclMat::channels() const { return CV_MAT_CN(flags); }
+
+inline size_t AclMat::step1() const { return step / elemSize1(); }  // step itself is in bytes
+
+inline Size AclMat::size() const { return Size(cols, rows); }  // Size takes (width, height)
+
+inline bool AclMat::empty() const { return data == 0; }  // judged by data only, not rows/cols
+
+inline AclStream::AclStream(const Ptr<AclStream::Impl>& impl) : impl_(impl) {}  // stores impl
+
+inline AclEvent::AclEvent(const Ptr<AclEvent::Impl>& impl) : impl_(impl) {}  // stores impl
+} // namespace cann
+} // namespace cv
+
+#endif // OPENCV_CANNINL_HPP
diff --git a/modules/cannarithm/include/opencv2/cann_arithm.hpp b/modules/cannarithm/include/opencv2/cann_arithm.hpp
new file mode 100644
index 00000000000..9a0f3f1655f
--- /dev/null
+++ b/modules/cannarithm/include/opencv2/cann_arithm.hpp
@@ -0,0 +1,176 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CANNARITHM_HPP
+#define OPENCV_CANNARITHM_HPP
+
+#include "opencv2/cann.hpp"
+
+namespace cv
+{
+namespace cann
+{
+
+/**
+  @addtogroup cann
+  @{
+    @defgroup cannarithm Operations on Matrices
+    @{
+        @defgroup cannarithm_elem Per-element Operations
+    @}
+  @}
+ */
+
+//! @addtogroup cannarithm_elem
+//! @{
+
+/** @brief Computes a matrix-matrix or matrix-scalar sum.
+ * @param src1 First source matrix or scalar.
+ * @param src2 Second source matrix or scalar. Matrix should have the same size and type as src1 .
+ * @param dst Destination matrix that has the same size and number of channels as the input
+ * array(s). The depth is defined by dtype or src1 depth.
+ * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
+ * destination array to be changed. The mask can be used only with single channel images.
+ * @param dtype Optional depth of the output array.
+ * @param stream AclStream for the asynchronous version.
+ * @sa cv::add cuda::add
+ */
+CV_EXPORTS_W void add(InputArray src1, InputArray src2, OutputArray dst,
+                      InputArray mask = noArray(), int dtype = -1,
+                      AclStream& stream = AclStream::Null());
+// This code should be neither compiled nor analyzed by doxygen. This interface exists only for
+// Python binding code generation. add(InputArray, InputArray ...) can accept a Scalar as its
+// parameter (Scalar -> Mat -> InputArray).
+#ifdef NEVER_DEFINED
+CV_EXPORTS_W void add(InputArray src1, Scalar src2, OutputArray dst, InputArray mask = noArray(),
+                      int dtype = -1, AclStream& stream = AclStream::Null());
+CV_EXPORTS_W void add(Scalar src1, InputArray src2, OutputArray dst, InputArray mask = noArray(),
+                      int dtype = -1, AclStream& stream = AclStream::Null());
+#endif
+
+/** @brief Computes a matrix-matrix or matrix-scalar difference.
+ * @param src1 First source matrix or scalar.
+ * @param src2 Second source matrix or scalar. Matrix should have the same size and type as src1 .
+ * @param dst Destination matrix that has the same size and number of channels as the input
+ * array(s). The depth is defined by dtype or src1 depth.
+ * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
+ * destination array to be changed. The mask can be used only with single channel images.
+ * @param dtype Optional depth of the output array.
+ * @param stream AclStream for the asynchronous version.
+ * @sa cv::subtract cuda::subtract
+ */
+CV_EXPORTS_W void subtract(InputArray src1, InputArray src2, OutputArray dst,
+                           InputArray mask = noArray(), int dtype = -1,
+                           AclStream& stream = AclStream::Null());
+#ifdef NEVER_DEFINED
+CV_EXPORTS_W void subtract(InputArray src1, Scalar src2, OutputArray dst,
+                           InputArray mask = noArray(), int dtype = -1,
+                           AclStream& stream = AclStream::Null());
+CV_EXPORTS_W void subtract(Scalar src1, InputArray src2, OutputArray dst,
+                           InputArray mask = noArray(), int dtype = -1,
+                           AclStream& stream = AclStream::Null());
+#endif
+
+/** @brief Computes a matrix-matrix or matrix-scalar per-element product.
+ * @param src1 First source matrix or scalar.
+ * @param src2 Second source matrix or scalar. Matrix should have the same size and type as src1 .
+ * @param dst Destination matrix that has the same size and number of channels as the input
+ * array(s). The depth is defined by dtype or src1 depth.
+ * @param scale Scale factor; must be passed explicitly (it has no default value).
+ * @param dtype Optional depth of the output array.
+ * @param stream AclStream for the asynchronous version.
+ * @sa cv::multiply cuda::multiply
+ */
+CV_EXPORTS_W void multiply(InputArray src1, InputArray src2, OutputArray dst, float scale,
+                           int dtype = -1, AclStream& stream = AclStream::Null());
+#ifdef NEVER_DEFINED
+CV_EXPORTS_W void multiply(InputArray src1, Scalar src2, OutputArray dst, float scale,
+                           int dtype = -1, AclStream& stream = AclStream::Null());
+CV_EXPORTS_W void multiply(Scalar src1, InputArray src2, OutputArray dst, float scale,
+                           int dtype = -1, AclStream& stream = AclStream::Null());
+#endif
+
+/** @brief Computes a matrix-matrix or matrix-scalar division.
+ * @param src1 First source matrix or scalar.
+ * @param src2 Second source matrix or scalar. Matrix should have the same size and type as src1 .
+ * @param dst Destination matrix that has the same size and number of channels as the input
+ * array(s). The depth is defined by dtype or src1 depth.
+ * @param scale Scale factor; must be passed explicitly (it has no default value).
+ * @param dtype Optional depth of the output array.
+ * @param stream AclStream for the asynchronous version.
+ * @sa cv::divide cuda::divide
+ */
+CV_EXPORTS_W void divide(InputArray src1, InputArray src2, OutputArray dst, float scale,
+                         int dtype = -1, AclStream& stream = AclStream::Null());
+#ifdef NEVER_DEFINED
+CV_EXPORTS_W void divide(InputArray src1, Scalar src2, OutputArray dst, float scale, int dtype = -1,
+                         AclStream& stream = AclStream::Null());
+CV_EXPORTS_W void divide(Scalar src1, InputArray src2, OutputArray dst, float scale, int dtype = -1,
+                         AclStream& stream = AclStream::Null());
+#endif
+
+/** @brief Performs a per-element bitwise conjunction of two matrices (or of matrix and scalar).
+ * @param src1 First source matrix or scalar.
+ * @param src2 Second source matrix or scalar.
+ * @param dst Destination matrix that has the same size and number of channels as the input
+ * array(s). The depth is defined by src1 depth.
+ * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
+ * destination array to be changed. The mask can be used only with single channel images.
+ * @param stream AclStream for the asynchronous version.
+ * @sa cv::bitwise_and cuda::bitwise_and
+ */
+CV_EXPORTS_W void bitwise_and(InputArray src1, InputArray src2, OutputArray dst,
+                              InputArray mask = noArray(), AclStream& stream = AclStream::Null());
+#ifdef NEVER_DEFINED
+CV_EXPORTS_W void bitwise_and(InputArray src1, Scalar src2, OutputArray dst,
+                              InputArray mask = noArray(), AclStream& stream = AclStream::Null());
+CV_EXPORTS_W void bitwise_and(Scalar src1, InputArray src2, OutputArray dst,
+                              InputArray mask = noArray(), AclStream& stream = AclStream::Null());
+#endif
+
+/** @brief Performs a per-element bitwise disjunction of two matrices (or of matrix and scalar).
+ * @param src1 First source matrix or scalar.
+ * @param src2 Second source matrix or scalar.
+ * @param dst Destination matrix that has the same size and number of channels as the input
+ * array(s). The depth is defined by src1 depth.
+ * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
+ * destination array to be changed. The mask can be used only with single channel images.
+ * @param stream AclStream for the asynchronous version.
+ * @sa cv::bitwise_or cuda::bitwise_or
+ */
+CV_EXPORTS_W void bitwise_or(InputArray src1, InputArray src2, OutputArray dst,
+                             InputArray mask = noArray(), AclStream& stream = AclStream::Null());
+#ifdef NEVER_DEFINED
+CV_EXPORTS_W void bitwise_or(InputArray src1, Scalar src2, OutputArray dst,
+                             InputArray mask = noArray(), AclStream& stream = AclStream::Null());
+CV_EXPORTS_W void bitwise_or(Scalar src1, InputArray src2, OutputArray dst,
+                             InputArray mask = noArray(), AclStream& stream = AclStream::Null());
+#endif
+
+/** @brief Performs a per-element bitwise exclusive or operation of two matrices (or of matrix and
+ * scalar).
+ * @param src1 First source matrix or scalar.
+ * @param src2 Second source matrix or scalar.
+ * @param dst Destination matrix that has the same size and number of channels as the input
+ * array(s). The depth is defined by src1 depth.
+ * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
+ * destination array to be changed. The mask can be used only with single channel images.
+ * @param stream AclStream for the asynchronous version.
+ * @sa cv::bitwise_xor cuda::bitwise_xor
+ */
+CV_EXPORTS_W void bitwise_xor(InputArray src1, InputArray src2, OutputArray dst,
+                              InputArray mask = noArray(), AclStream& stream = AclStream::Null());
+#ifdef NEVER_DEFINED
+CV_EXPORTS_W void bitwise_xor(InputArray src1, Scalar src2, OutputArray dst,
+                              InputArray mask = noArray(), AclStream& stream = AclStream::Null());
+CV_EXPORTS_W void bitwise_xor(Scalar src1, InputArray src2, OutputArray dst,
+                              InputArray mask = noArray(), AclStream& stream = AclStream::Null());
+#endif
+
+//! @} cannarithm_elem
+
+} // namespace cann
+} // namespace cv
+
+#endif /* OPENCV_CANNARITHM_HPP */
diff --git a/modules/cannarithm/include/opencv2/cann_call.hpp b/modules/cannarithm/include/opencv2/cann_call.hpp
new file mode 100644
index 00000000000..6afdd266a21
--- /dev/null
+++ b/modules/cannarithm/include/opencv2/cann_call.hpp
@@ -0,0 +1,52 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CANNCALL_HPP
+#define OPENCV_CANNCALL_HPP
+
+#include <vector>
+#include <acl/acl.h>
+#include "opencv2/cann.hpp"
+
+namespace cv
+{
+namespace cann
+{
+struct AclAttribute // Interface for one operator attribute that can be written to an aclopAttr.
+{
+    virtual ~AclAttribute() = default;
+    virtual void addAttr(aclopAttr* opAttr) = 0; // Implementations set their attribute on opAttr.
+};
+
+#define DEFINE_ATTR(FUNC, TYPE)                                                              \
+    class Acl##FUNC##Attribute : public AclAttribute                                         \
+    {                                                                                        \
+        const char* name; /* stored as a pointer; the string is not copied */                \
+        TYPE value;                                                                          \
+                                                                                             \
+    public:                                                                                  \
+        Acl##FUNC##Attribute(const char* _name, TYPE _value) : name(_name), value(_value) {} \
+        void addAttr(aclopAttr* opAttr) override                                             \
+        {                                                                                    \
+            CV_ACL_SAFE_CALL(aclopSetAttr##FUNC(opAttr, name, value));                       \
+        }                                                                                    \
+    }
+
+DEFINE_ATTR(Float, float);
+DEFINE_ATTR(String, const char*);
+
+static std::vector<AclAttribute*> emptyattr; // default for `attrs` below; one copy per including file
+void aclOneInput(const AclMat& src, AclMat& dst, const char* op,
+                 AclStream& stream = AclStream::Null(),
+                 std::vector<AclAttribute*>& attrs = emptyattr);
+
+void aclTwoInputs(const AclMat& src1, const AclMat& src2, AclMat& dst, const char* op,
+                  AclStream& stream = AclStream::Null());
+
+void transNCHWToNHWC(const AclMat& src, AclMat& dst, AclStream& stream = AclStream::Null());
+
+} // namespace cann
+} // namespace cv
+
+#endif // OPENCV_CANNCALL_HPP
diff --git a/modules/cannarithm/include/opencv2/cann_common.hpp b/modules/cannarithm/include/opencv2/cann_common.hpp
new file mode 100644
index 00000000000..ecff9f07589
--- /dev/null
+++ b/modules/cannarithm/include/opencv2/cann_common.hpp
@@ -0,0 +1,43 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CANN_COMMON_HPP
+#define OPENCV_CANN_COMMON_HPP
+#include <acl/acl.h>
+#include "opencv2/core.hpp"
+
+namespace cv
+{
+namespace cann
+{
+static inline void checkAclError(aclError err, const char* file, const int line, const char* func)
+{
+    if (err == ACL_SUCCESS)
+        return;
+    // Report the failure through cv::error, attaching the most recent ACL message if there is one.
+    const char* recentMsg = aclGetRecentErrMsg();
+    cv::error(cv::Error::AscendApiCallError, recentMsg ? recentMsg : "", func, file, line);
+}
+
+static inline void checkAclPtr(void* ptr, const char* file, const int line, const char* func)
+{
+    if (ptr != nullptr)
+        return;
+    // A null handle is reported through cv::error with the most recent ACL message, if any.
+    const char* recentMsg = aclGetRecentErrMsg();
+    cv::error(cv::Error::AscendApiCallError, recentMsg ? recentMsg : "", func, file, line);
+}
+
+} // namespace cann
+} // namespace cv
+
+#define CV_ACL_SAFE_CALL(expr) cv::cann::checkAclError((expr), __FILE__, __LINE__, CV_Func)
+#define CV_ACL_SAFE_CALL_PTR(expr)                               \
+    ({                                                           \
+        auto ptr = (expr); /* evaluate expr exactly once */      \
+        cv::cann::checkAclPtr(ptr, __FILE__, __LINE__, CV_Func); \
+        ptr; /* value of the ({ ... }) statement expression */   \
+    })
+
+#endif // OPENCV_CANN_COMMON_HPP
diff --git a/modules/cannarithm/include/opencv2/cann_prepare.hpp b/modules/cannarithm/include/opencv2/cann_prepare.hpp
new file mode 100644
index 00000000000..cc1aba25618
--- /dev/null
+++ b/modules/cannarithm/include/opencv2/cann_prepare.hpp
@@ -0,0 +1,96 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CANNPREPARE_HPP
+#define OPENCV_CANNPREPARE_HPP
+
+#include <vector>
+#include <acl/acl.h>
+#include "opencv2/core.hpp"
+#include "opencv2/cann_common.hpp"
+
+namespace cv
+{
+namespace cann
+{
+// Collects the ACL tensor descriptors, data buffers and attribute handle of one operator call and
+// destroys all of them in the destructor.
+struct CannPreparation
+{
+    CannPreparation() { opAttr_ = CV_ACL_SAFE_CALL_PTR(aclopCreateAttr()); }
+
+    // Non-copyable: a copy would hold the same raw ACL handles and the destructor would destroy
+    // them a second time.
+    CannPreparation(const CannPreparation&) = delete;
+    CannPreparation& operator=(const CannPreparation&) = delete;
+
+    virtual ~CannPreparation()
+    {
+        for (auto desc : inputDesc_)
+            aclDestroyTensorDesc(desc);
+
+        for (auto desc : outputDesc_)
+            aclDestroyTensorDesc(desc);
+
+        for (auto buf : inputBuffers_)
+            aclDestroyDataBuffer(buf);
+
+        for (auto buf : outputBuffers_)
+            aclDestroyDataBuffer(buf);
+
+        aclopDestroyAttr(opAttr_);
+    }
+
+    // Handles are appended by the CANN_PREPARE_* macros; every stored handle is destroyed above.
+    std::vector<aclDataBuffer*> inputBuffers_;
+    std::vector<aclDataBuffer*> outputBuffers_;
+    std::vector<aclTensorDesc*> inputDesc_;
+    std::vector<aclTensorDesc*> outputDesc_;
+    aclopAttr* opAttr_;
+};
+
+#define CANN_PREPARE_ADD_ATTR(var, type, ...)                           \
+    do                                                                  \
+    { /* Sets one attribute on var.opAttr_ via aclopSetAttr<type>. */   \
+        CV_ACL_SAFE_CALL(aclopSetAttr##type(var.opAttr_, __VA_ARGS__)); \
+    } while (0)
+
+#define CANN_PREPARE_INPUTDESC(var, ...)                                     \
+    do                                                                       \
+    { /* Creates a tensor descriptor and appends it to var.inputDesc_. */    \
+        auto _rPtr = CV_ACL_SAFE_CALL_PTR(aclCreateTensorDesc(__VA_ARGS__)); \
+        if (_rPtr != nullptr)                                                \
+            var.inputDesc_.push_back(_rPtr);                                 \
+    } while (0)
+
+#define CANN_PREPARE_OUTPUTDESC(var, ...)                                    \
+    do                                                                       \
+    { /* Creates a tensor descriptor and appends it to var.outputDesc_. */   \
+        auto _rPtr = CV_ACL_SAFE_CALL_PTR(aclCreateTensorDesc(__VA_ARGS__)); \
+        if (_rPtr != nullptr)                                                \
+            var.outputDesc_.push_back(_rPtr);                                \
+    } while (0)
+
+#define CANN_PREPARE_INPUTBUFFER(var, ...)                                   \
+    do                                                                       \
+    { /* Creates a data buffer and appends it to var.inputBuffers_. */       \
+        auto _rPtr = CV_ACL_SAFE_CALL_PTR(aclCreateDataBuffer(__VA_ARGS__)); \
+        if (_rPtr != nullptr)                                                \
+            var.inputBuffers_.push_back(_rPtr);                              \
+    } while (0)
+
+#define CANN_PREPARE_OUTPUTBUFFER(var, ...)                                  \
+    do                                                                       \
+    { /* Creates a data buffer and appends it to var.outputBuffers_. */      \
+        auto _rPtr = CV_ACL_SAFE_CALL_PTR(aclCreateDataBuffer(__VA_ARGS__)); \
+        if (_rPtr != nullptr)                                                \
+            var.outputBuffers_.push_back(_rPtr);                             \
+    } while (0)
+
+aclDataType getACLType(int opencvdepth);
+
+} // namespace cann
+} // namespace cv
+
+#endif // OPENCV_CANNPREPARE_HPP
diff --git a/modules/cannarithm/misc/python/pyopencv_cann.hpp b/modules/cannarithm/misc/python/pyopencv_cann.hpp
new file mode 100644
index 00000000000..61dc824c886
--- /dev/null
+++ b/modules/cannarithm/misc/python/pyopencv_cann.hpp
@@ -0,0 +1,23 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifdef HAVE_OPENCV_CORE
+
+#include "opencv2/cann.hpp"
+
+typedef std::vector<cann::AclMat> vector_AclMat;
+typedef cann::AclMat::Allocator AclMat_Allocator;
+
+CV_PY_TO_CLASS(cann::AclMat);
+CV_PY_TO_CLASS(cann::AclStream);
+
+CV_PY_TO_CLASS_PTR(cann::AclMat);
+CV_PY_TO_CLASS_PTR(cann::AclMat::Allocator);
+
+CV_PY_FROM_CLASS(cann::AclMat);
+CV_PY_FROM_CLASS(cann::AclStream);
+
+CV_PY_FROM_CLASS_PTR(cann::AclMat::Allocator);
+
+#endif
diff --git a/modules/cannarithm/perf/perf_element_operations.cpp b/modules/cannarithm/perf/perf_element_operations.cpp
new file mode 100644
index 00000000000..5299f4b3c78
--- /dev/null
+++ b/modules/cannarithm/perf/perf_element_operations.cpp
@@ -0,0 +1,81 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "perf_precomp.hpp"
+#include "opencv2/cann_arithm.hpp"
+
+namespace opencv_test
+{
+namespace
+{
+
+#define ARITHM_MAT_DEPTH Values(CV_32S, CV_32SC3)
+#define TYPICAL_ACL_MAT_SIZES ::perf::sz1080p, ::perf::sz2K, ::perf::sz2160p, ::perf::sz4320p
+#define DEVICE_ID 0
+#define DEF_PARAM_TEST(name, ...) \
+    typedef ::perf::TestBaseWithParam<testing::tuple<__VA_ARGS__>> name
+
+// NPU Perf Test
+DEF_PARAM_TEST(NPU, cv::Size, perf::MatDepth);
+#define TEST_NPU_OP_MAT(idx, op, ...)                                                       \
+    PERF_TEST_P(NPU, MAT_##op##_MAT_##idx,                                                  \
+                testing::Combine(testing::Values(TYPICAL_ACL_MAT_SIZES), ARITHM_MAT_DEPTH)) \
+    {                                                                                       \
+        Size size = GET_PARAM(0);                                                           \
+        int depth = GET_PARAM(1);                                                           \
+                                                                                            \
+        Mat src1(size, depth), src2(size, depth);                                           \
+        declare.in(src1, WARMUP_RNG);                                                       \
+        declare.in(src2, WARMUP_RNG);                                                       \
+        cv::cann::setDevice(DEVICE_ID);                                                     \
+                                                                                            \
+        AclMat npu_src1, npu_src2, dst;                                                     \
+        npu_src1.upload(src1);                                                              \
+        npu_src2.upload(src2);                                                              \
+        /* Only the device-side operator call is timed; uploads happen before the cycle. */ \
+        TEST_CYCLE() { cv::cann::op(npu_src1, npu_src2, dst, __VA_ARGS__); }                \
+        SANITY_CHECK_NOTHING();                                                             \
+        cv::cann::resetDevice();                                                            \
+    }
+
+// CPU Perf Test
+DEF_PARAM_TEST(CPU, cv::Size, perf::MatDepth);
+#define TEST_CPU_OP_MAT(idx, op, ...)                                                       \
+    PERF_TEST_P(CPU, MAT_##op##_MAT_##idx,                                                  \
+                testing::Combine(testing::Values(TYPICAL_ACL_MAT_SIZES), ARITHM_MAT_DEPTH)) \
+    {                                                                                       \
+        Size size = GET_PARAM(0);                                                           \
+        int depth = GET_PARAM(1);                                                           \
+                                                                                            \
+        Mat src1(size, depth), src2(size, depth), dst(size, depth);                         \
+        declare.in(src1, WARMUP_RNG);                                                       \
+        declare.in(src2, WARMUP_RNG);                                                       \
+        /* CPU reference timing: the cv:: function with the same name and arguments. */     \
+        TEST_CYCLE() cv::op(src1, src2, dst, __VA_ARGS__);                                  \
+        SANITY_CHECK_NOTHING();                                                             \
+    }
+
+TEST_NPU_OP_MAT(1, add, noArray(), -1);
+TEST_CPU_OP_MAT(1, add, noArray(), -1);
+
+TEST_NPU_OP_MAT(1, subtract, noArray(), -1);
+TEST_CPU_OP_MAT(1, subtract, noArray(), -1);
+
+TEST_NPU_OP_MAT(1, multiply, 1, -1);
+TEST_CPU_OP_MAT(1, multiply, 1, -1);
+
+TEST_NPU_OP_MAT(1, divide, 1, -1);
+TEST_CPU_OP_MAT(1, divide, 1, -1);
+
+TEST_NPU_OP_MAT(1, bitwise_and, noArray());
+TEST_CPU_OP_MAT(1, bitwise_and, noArray());
+
+TEST_NPU_OP_MAT(1, bitwise_or, noArray());
+TEST_CPU_OP_MAT(1, bitwise_or, noArray());
+
+TEST_NPU_OP_MAT(1, bitwise_xor, noArray());
+TEST_CPU_OP_MAT(1, bitwise_xor, noArray());
+
+} // namespace
+} // namespace opencv_test
diff --git a/modules/cannarithm/perf/perf_main.cpp b/modules/cannarithm/perf/perf_main.cpp
new file mode 100644
index 00000000000..13cde8f491e
--- /dev/null
+++ b/modules/cannarithm/perf/perf_main.cpp
@@ -0,0 +1,36 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "perf_precomp.hpp"
+#include "opencv2/cann_arithm.hpp"
+using namespace perf;
+
+class CannEnvironment : public ::testing::Environment
+{
+public:
+    virtual ~CannEnvironment() = default;
+    virtual void SetUp() CV_OVERRIDE
+    {
+        cv::cann::initAcl();
+        cv::cann::setDevice(0);
+
+        // Device warm-up: run one small add before any test is measured.
+        Scalar valA(1, 2, 3), valB(4, 5, 6);
+        Mat hostA(10, 10, CV_32SC3, valA), hostB(10, 10, CV_32SC3, valB);
+        cv::cann::AclMat devA, devB, devSum;
+        devA.upload(hostA);
+        devB.upload(hostB);
+        cv::cann::add(devA, devB, devSum);
+        cv::cann::resetDevice();
+    }
+    virtual void TearDown() CV_OVERRIDE { cv::cann::finalizeAcl(); }
+};
+
+static void initTests()
+{
+    // Register the ACL set-up/tear-down environment with googletest, which takes ownership of it.
+    ::testing::AddGlobalTestEnvironment(new CannEnvironment());
+}
+
+CV_PERF_TEST_MAIN("cannarithm", initTests())
diff --git a/modules/cannarithm/perf/perf_precomp.hpp b/modules/cannarithm/perf/perf_precomp.hpp
new file mode 100644
index 00000000000..d0ff9533235
--- /dev/null
+++ b/modules/cannarithm/perf/perf_precomp.hpp
@@ -0,0 +1,20 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef __OPENCV_PERF_PRECOMP_HPP__
+#define __OPENCV_PERF_PRECOMP_HPP__
+
+#include "opencv2/ts.hpp"
+#include "opencv2/ts/ts_perf.hpp"
+#include "opencv2/cann.hpp"
+
+namespace opencv_test
+{
+using namespace perf;
+using namespace testing;
+using namespace cv;
+using namespace cv::cann;
+} // namespace opencv_test
+
+#endif
diff --git a/modules/cannarithm/samples/sample.cpp b/modules/cannarithm/samples/sample.cpp
new file mode 100644
index 00000000000..772ca96f54f
--- /dev/null
+++ b/modules/cannarithm/samples/sample.cpp
@@ -0,0 +1,32 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+//g++ -o sample sample.cpp -I opencv/include/opencv4/ -L opencv/build/install/lib/ -l opencv_cannarithm -l opencv_core -l opencv_imgcodecs
+
+#include <iostream>
+#include <opencv2/imgcodecs.hpp>
+#include <opencv2/cann.hpp>
+#include <opencv2/cann_arithm.hpp>
+
+int main()
+{
+    cv::Mat img = cv::imread("/path/to/img");
+    if (img.empty())
+        return 1; // imread returned no data, so there is nothing to upload
+    cv::cann::initAcl();
+    cv::cann::setDevice(0);
+    {
+        // Device matrices stay in this scope so they are gone before resetDevice()/finalizeAcl().
+        cv::cann::AclMat aclMat, aclMatSum;
+        aclMat.upload(img);
+        cv::cann::add(aclMat, aclMat, aclMatSum);
+
+        cv::Mat imgResult;
+        aclMatSum.download(imgResult);
+        std::cout << imgResult << std::endl;
+    }
+    cv::cann::resetDevice();
+    cv::cann::finalizeAcl();
+    return 0;
+}
diff --git a/modules/cannarithm/samples/sample.py b/modules/cannarithm/samples/sample.py
new file mode 100644
index 00000000000..b769e83ad44
--- /dev/null
+++ b/modules/cannarithm/samples/sample.py
@@ -0,0 +1,20 @@
+# This file is part of OpenCV project.
+# It is subject to the license terms in the LICENSE file found in the top-level directory
+# of this distribution and at http://opencv.org/license.html.
+
+import numpy as np
+import cv2
+
+img = cv2.imread("/path/to/img")
+assert img is not None, "failed to read the input image"
+
+cv2.cann.initAcl()
+cv2.cann.setDevice(0)
+
+aclMat = cv2.cann.AclMat()
+aclMat.upload(img)
+aclMatSum = cv2.cann.add(aclMat, aclMat)
+imgResult = aclMatSum.download()
+print(imgResult)
+cv2.cann.resetDevice()  # same shutdown order as sample.cpp: reset the device, then finalize ACL
+cv2.cann.finalizeAcl()
diff --git a/modules/cannarithm/src/aclmat.cpp b/modules/cannarithm/src/aclmat.cpp
new file mode 100644
index 00000000000..a7d0dced4d0
--- /dev/null
+++ b/modules/cannarithm/src/aclmat.cpp
@@ -0,0 +1,605 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "precomp.hpp"
+
+namespace
+{
+/********************************************AclMat********************************************/
+class DefaultAllocator : public cv::cann::AclMat::Allocator
+{
+public:
+    bool allocate(cv::cann::AclMat* mat, int rows, int cols, size_t elemSize) CV_OVERRIDE;
+    void free(cv::cann::AclMat* mat) CV_OVERRIDE;
+};
+
+bool DefaultAllocator::allocate(cv::cann::AclMat* mat, int rows, int cols, size_t elemSize)
+{
+    CV_ACL_SAFE_CALL(
+        aclrtMalloc((void**)(&mat->data), elemSize * cols * rows, ACL_MEM_MALLOC_HUGE_FIRST));
+    // The buffer holds rows * cols elements with no row padding, so step is one packed row.
+    mat->step = cols * elemSize;
+    mat->refcount = (int*)cv::fastMalloc(sizeof(int));
+    // Always true: a failed aclrtMalloc is reported through CV_ACL_SAFE_CALL instead.
+    return true;
+}
+
+void DefaultAllocator::free(cv::cann::AclMat* mat)
+{
+    aclrtFree(mat->datastart);   // device buffer (AclMat::create sets datastart to the allocation)
+    cv::fastFree(mat->refcount); // reference counter obtained with cv::fastMalloc in allocate()
+}
+
+DefaultAllocator cannDefaultAllocator;
+cv::cann::AclMat::Allocator* g_defaultAllocator = &cannDefaultAllocator;
+} // namespace
+
+namespace cv
+{
+namespace cann
+{
+AclMat::Allocator* AclMat::defaultAllocator() { return g_defaultAllocator; }
+
+void AclMat::setDefaultAllocator(AclMat::Allocator* allocator)
+{
+    CV_Assert(allocator != nullptr); // a null default allocator is rejected
+    g_defaultAllocator = allocator;
+}
+
+// TODO: this function is copied from matrix.cpp, where it is a local symbol that cannot be
+// referenced from here.
+static int updateContinuityFlag(int flags, int dims, const int* size, const size_t* step)
+{
+    int i, j;
+    for (i = 0; i < dims; i++)
+    {
+        if (size[i] > 1)
+            break;
+    }
+    // i is now the first dimension whose extent is larger than 1 (or dims if there is none).
+    uint64 t = (uint64)size[std::min(i, dims - 1)] * CV_MAT_CN(flags);
+    for (j = dims - 1; j > i; j--)
+    {
+        t *= size[j];
+        if (step[j] * size[j] < step[j - 1])
+            break;
+    }
+    // Continuous only if no gap was found between dimensions and the total t fits in an int.
+    if (j <= i && t == (uint64)(int)t)
+        return flags | Mat::CONTINUOUS_FLAG;
+    return flags & ~Mat::CONTINUOUS_FLAG;
+}
+
+void AclMat::updateContinuityFlag()
+{
+    int sz[] = {rows, cols};             // extents of the two dimensions
+    size_t steps[] = {step, elemSize()}; // byte strides: row stride, then element size
+    flags = cv::cann::updateContinuityFlag(flags, 2, sz, steps);
+}
+
+AclMat::AclMat(int rows_, int cols_, int type_, void* data_, size_t step_)
+    : flags(Mat::MAGIC_VAL + (type_ & Mat::TYPE_MASK)), rows(rows_), cols(cols_), step(step_),
+      data((uchar*)data_), refcount(0), datastart((uchar*)data_), dataend((const uchar*)data_),
+      allocator(defaultAllocator())
+{
+    size_t minstep = cols * elemSize();
+    // Wraps the caller's memory as is: nothing is allocated and refcount starts as 0.
+    if (step == Mat::AUTO_STEP)
+    {
+        step = minstep;
+    }
+    else
+    {
+        if (rows == 1)
+            step = minstep;
+        // An explicit step has to cover at least one packed row.
+        CV_DbgAssert(step >= minstep);
+    }
+    // dataend ends up just past the last element of the last row.
+    dataend += step * (rows - 1) + minstep;
+    updateContinuityFlag();
+}
+
+AclMat::AclMat(Size size_, int type_, void* data_, size_t step_)
+    : flags(Mat::MAGIC_VAL + (type_ & Mat::TYPE_MASK)), rows(size_.height), cols(size_.width),
+      step(step_), data((uchar*)data_), refcount(0), datastart((uchar*)data_),
+      dataend((const uchar*)data_), allocator(defaultAllocator())
+{
+    size_t minstep = cols * elemSize();
+    // Wraps the caller's memory as is: nothing is allocated and refcount starts as 0.
+    if (step == Mat::AUTO_STEP)
+    {
+        step = minstep;
+    }
+    else
+    {
+        if (rows == 1)
+            step = minstep;
+        // An explicit step has to cover at least one packed row.
+        CV_DbgAssert(step >= minstep);
+    }
+    // dataend ends up just past the last element of the last row.
+    dataend += step * (rows - 1) + minstep;
+    updateContinuityFlag();
+}
+
+void AclMat::create(int _rows, int _cols, int _type)
+{
+    CV_DbgAssert(_rows >= 0 && _cols >= 0);
+
+    _type &= Mat::TYPE_MASK;
+    // Already allocated with the requested geometry and type: keep the existing buffer.
+    if (rows == _rows && cols == _cols && type() == _type && data)
+        return;
+    // Otherwise drop the current buffer; a request with 0 rows or cols allocates nothing below.
+    if (data)
+        release();
+
+    if (_rows > 0 && _cols > 0)
+    {
+        flags = Mat::MAGIC_VAL + _type;
+        rows = _rows;
+        cols = _cols;
+
+        const size_t esz = elemSize();
+
+        bool allocSuccess = allocator->allocate(this, rows, cols, esz);
+
+        if (!allocSuccess)
+        {
+            // custom allocator fails, try default allocator
+            allocator = defaultAllocator();
+            allocSuccess = allocator->allocate(this, rows, cols, esz);
+            CV_Assert(allocSuccess);
+        }
+        // The allocator chooses step; rows without padding make the matrix continuous.
+        if (esz * cols == step)
+            flags |= Mat::CONTINUOUS_FLAG;
+
+        datastart = data;
+        dataend = data + step * (rows - 1) + cols * esz;
+        // When the allocator provided a reference counter, start it at one owner.
+        if (refcount)
+            *refcount = 1;
+    }
+}
+
+void AclMat::upload(InputArray arr)
+{
+    Mat mat = arr.getMat();
+    CV_DbgAssert(!mat.empty());
+    create(mat.rows, mat.cols, mat.type()); // device storage with the same size and type
+    CV_ACL_SAFE_CALL(aclrtMemcpy2d(data, step, mat.data, mat.step[0], cols * elemSize(), rows,
+                                   ACL_MEMCPY_HOST_TO_DEVICE)); // host -> device, using both steps
+}
+
+void AclMat::upload(InputArray arr, AclStream& _stream)
+{
+    Mat mat = arr.getMat();
+    CV_DbgAssert(!mat.empty());
+    create(mat.rows, mat.cols, mat.type()); // device storage with the same size and type
+    aclrtStream stream = AclStreamAccessor::getStream(_stream); // raw handle of the given stream
+    CV_ACL_SAFE_CALL(aclrtMemcpy2dAsync(data, step, mat.data, mat.step[0], cols * elemSize(), rows,
+                                        ACL_MEMCPY_HOST_TO_DEVICE, stream)); // issued on stream
+}
+
+void AclMat::download(OutputArray _dst) const
+{
+    CV_DbgAssert(!empty());
+    // Give the destination the same size and type, then copy device -> host using both steps.
+    _dst.create(size(), type());
+    Mat dst = _dst.getMat();
+    CV_ACL_SAFE_CALL(aclrtMemcpy2d(dst.data, dst.step[0], data, step, cols * elemSize(), rows,
+                                   ACL_MEMCPY_DEVICE_TO_HOST));
+}
+
+void AclMat::download(OutputArray _dst, AclStream& _stream) const
+{
+    CV_DbgAssert(!empty());
+    // Same as above, but the copy is issued on the raw handle of _stream via the Async call.
+    _dst.create(size(), type());
+    Mat dst = _dst.getMat();
+    aclrtStream stream = AclStreamAccessor::getStream(_stream);
+    CV_ACL_SAFE_CALL(aclrtMemcpy2dAsync(dst.data, dst.step[0], data, step, cols * elemSize(), rows,
+                                        ACL_MEMCPY_DEVICE_TO_HOST, stream));
+}
+
+AclMat::AclMat(int rows_, int cols_, int type_, Scalar& s_, AclMat::Allocator* allocator_)
+    : flags(0), rows(rows_), cols(cols_), step(0), data(0), refcount(0), datastart(0), dataend(0),
+      allocator(allocator_)
+{
+    create(rows_, cols_, type_);
+    setTo(s_);
+}
+
+AclMat::AclMat(Size size_, int type_, Scalar& s_, AclMat::Allocator* allocator_)
+    : flags(0), rows(size_.height), cols(size_.width), step(0), data(0), refcount(0), datastart(0),
+      dataend(0), allocator(allocator_)
+{
+    create(size_.height, size_.width, type_);
+    setTo(s_);
+}
+
+AclMat& AclMat::setTo(Scalar s_) { return setTo(s_, AclStream::Null()); }
+
+AclMat& AclMat::setTo(Scalar s_, AclStream& stream_)
+{
+    size_t totalBytes = (size_t)rows * cols * elemSize();
+    if (totalBytes == 0)
+        return *this;
+    // Zero rows * cols * elemSize() bytes first; this memset is issued without stream_.
+    CV_ACL_SAFE_CALL(aclrtMemset(data, totalBytes, 0, totalBytes));
+    // The scalar is uploaded as a 1x1 matrix of the same type (stream-less upload).
+    Mat scMat(1, 1, type(), s_);
+    AclMat scAclMat;
+    scAclMat.upload(scMat);
+    // "Add" writes zeroed data + 1x1 matrix into a fresh matrix, which swap() then adopts.
+    AclMat dst(rows, cols, type());
+    // TODO use AssignAdd to avoid memcpy, or use broadcast.
+    aclTwoInputs(*this, scAclMat, dst, "Add", stream_);
+    swap(dst);
+
+    return *this;
+}
+
+void AclMat::convertTo(AclMat& dst, int rtype) const { convertTo(dst, rtype, AclStream::Null()); }
+
+void AclMat::convertTo(AclMat& dst, int _rtype, AclStream& _stream) const
+{
+    // rtype < 0 keeps the source depth (as in cv::Mat::convertTo); the channel count is unchanged.
+    dst.create(rows, cols, CV_MAKE_TYPE(_rtype < 0 ? depth() : CV_MAT_DEPTH(_rtype), channels()));
+    aclOneInput(*this, dst, "Cast", _stream);
+}
+
+void AclMat::expandTo(CV_OUT AclMat& dst, int chs) const { expandTo(dst, chs, AclStream::Null()); }
+
+void AclMat::expandTo(CV_OUT AclMat& dst, int chs, AclStream& stream) const
+{
+    CV_Assert(channels() == 1);
+    // Every plane is copied as one block of rows * step bytes, so rows must not be padded.
+    CV_Assert(step == cols * elemSize());
+    // TODO use inplace expand.
+    AclMat NCHW_mat;
+    NCHW_mat.create(rows, cols, CV_MAKE_TYPE(depth(), chs));
+
+    aclrtStream rawStream = AclStreamAccessor::getStream(stream);
+    const size_t planeSize = rows * step;
+    size_t remaining = planeSize * chs; // bytes left in NCHW_mat from dataptr onwards
+    uchar* dataptr = (uchar*)NCHW_mat.data;
+    for (int ch = 0; ch < chs; ch++, dataptr += planeSize, remaining -= planeSize)
+    {
+        // Replicate the single source plane into plane `ch` of the planar buffer.
+        if (rawStream == nullptr)
+        {
+            CV_ACL_SAFE_CALL(
+                aclrtMemcpy(dataptr, remaining, data, planeSize, ACL_MEMCPY_DEVICE_TO_DEVICE));
+        }
+        else
+        {
+            CV_ACL_SAFE_CALL(aclrtMemcpyAsync(dataptr, remaining, data, planeSize,
+                                              ACL_MEMCPY_DEVICE_TO_DEVICE, rawStream));
+        }
+    }
+
+    dst.create(rows, cols, CV_MAKE_TYPE(depth(), chs));
+    transNCHWToNHWC(NCHW_mat, dst, stream);
+}
+
+AclStream wrapStream(size_t aclStreamAddress)
+{
+    return AclStreamAccessor::wrapStream(reinterpret_cast<aclrtStream>(aclStreamAddress));
+}
+
+static AclMat getAclMat(InputArray arr)
+{
+    // Two kinds are accepted: an AclMat (returned by value) or no array at all (returned as a
+    // default-constructed AclMat). Every other kind is rejected.
+    switch (arr.kind())
+    {
+    case _InputArray::ACL_MAT:
+        return *static_cast<const cann::AclMat*>(arr.getObj());
+    case _InputArray::NONE:
+        return cann::AclMat();
+    default:
+        CV_Error(cv::Error::StsNotImplemented, "getAclMat is available only for cann::AclMat");
+    }
+}
+
+AclMat getInputMat(InputArray _src)
+{
+    // An AclMat input is used directly; no upload is needed.
+    if (_src.kind() == _InputArray::ACL_MAT)
+        return getAclMat(_src);
+
+    // Any other non-empty input is uploaded into a new AclMat.
+    AclMat uploaded;
+    if (!_src.empty())
+        uploaded.upload(_src);
+    // An empty input leaves the default-constructed AclMat untouched.
+    return uploaded;
+}
+
+AclMat getInputMat(InputArray _src, AclStream& stream)
+{
+    // An AclMat input is used directly; no upload is needed.
+    if (_src.kind() == _InputArray::ACL_MAT)
+        return getAclMat(_src);
+
+    AclMat uploaded;
+    // Nothing to upload for an empty input.
+    if (_src.empty())
+        return uploaded;
+
+    // A stream whose raw handle is null uses the stream-less upload; any other stream gets the
+    // upload issued on it.
+    const bool useSyncCopy = (AclStreamAccessor::getStream(stream) == nullptr);
+    if (useSyncCopy)
+        uploaded.upload(_src);
+    else
+        uploaded.upload(_src, stream);
+
+    return uploaded;
+}
+
+AclMat getOutputMat(OutputArray _dst, int rows, int cols, int type)
+{
+    // A destination that is not an AclMat gets a separate device matrix of the requested size;
+    // syncOutput() can download it into the real destination afterwards.
+    if (_dst.kind() != _InputArray::ACL_MAT)
+    {
+        AclMat scratch;
+        scratch.create(rows, cols, type);
+        return scratch;
+    }
+    // An AclMat destination is (re)allocated through create() and then returned by value.
+    static_cast<cann::AclMat*>(_dst.getObj())->create(rows, cols, type);
+    return getAclMat(_dst);
+}
+
+void syncOutput(const AclMat& dst, OutputArray _dst)
+{
+    // An AclMat destination needs no copy here; any other kind receives a download of dst.
+    if (_dst.kind() == _InputArray::ACL_MAT)
+        return;
+    dst.download(_dst);
+}
+
+/********************************************Device********************************************/
+
+void setDevice(int device_id)
+{
+    aclrtContext context; // NOTE(review): never stored or destroyed here - confirm who releases it
+    CV_ACL_SAFE_CALL(aclrtSetDevice(device_id));
+    CV_ACL_SAFE_CALL(aclrtCreateContext(&context, device_id));
+}
+
+void resetDevice() { CV_ACL_SAFE_CALL(aclrtResetDevice(getDevice())); }
+
+int32_t getDevice()
+{
+    int32_t deviceId;
+    CV_ACL_SAFE_CALL(aclrtGetDevice(&deviceId));
+    return deviceId;
+}
+
+void initAcl() { CV_ACL_SAFE_CALL(aclInit(nullptr)); }
+
+void finalizeAcl() { CV_ACL_SAFE_CALL(aclFinalize()); }
+
+class DefaultDeviceInitializer
+{
+public:
+    DefaultDeviceInitializer();
+    ~DefaultDeviceInitializer();
+
+    AclStream& getNullAclStream(int deviceId);
+
+private:
+    std::vector<Ptr<AclStream>> streams_;
+    Mutex streams_mtx_;
+};
+
+DefaultDeviceInitializer::DefaultDeviceInitializer() {}
+
+DefaultDeviceInitializer::~DefaultDeviceInitializer() { streams_.clear(); }
+
+AclStream& DefaultDeviceInitializer::getNullAclStream(int deviceId)
+{
+    AutoLock lock(streams_mtx_);
+    // Size the per-device table on first use.
+    if (streams_.empty())
+    {
+        uint32_t deviceCount;
+        CV_ACL_SAFE_CALL(aclrtGetDeviceCount(&deviceCount));
+
+        if (deviceCount > 0)
+            streams_.resize(deviceCount);
+    }
+    // Checked in release builds too: an out-of-range id would index past the end of streams_.
+    CV_Assert(deviceId >= 0 && deviceId < static_cast<int>(streams_.size()));
+
+    if (streams_[deviceId].empty())
+    {
+        aclrtStream stream = nullptr; // the per-device entry wraps a null raw handle
+        Ptr<AclStream::Impl> impl = makePtr<AclStream::Impl>(stream);
+        streams_[deviceId] = Ptr<AclStream>(new AclStream(impl));
+    }
+
+    return *streams_[deviceId];
+}
+
+DefaultDeviceInitializer initializer;
+
+/********************************************AclEvent********************************************/
+class AclEvent::Impl
+{
+public:
+    aclrtEvent event; // raw ACL event handle
+    bool ownEvent;    // true when this object created the event and has to destroy it
+
+    Impl();                          // creates a new event (owned)
+    explicit Impl(aclrtEvent event); // wraps an existing event (not owned)
+
+    ~Impl();
+};
+
+AclEvent::Impl::Impl() : event(nullptr), ownEvent(true)
+{
+    CV_ACL_SAFE_CALL(aclrtCreateEvent(&event));
+}
+
+AclEvent::Impl::Impl(aclrtEvent e) : event(e), ownEvent(false) {}
+
+AclEvent::Impl::~Impl()
+{
+    // The ACL status is ignored on purpose: CV_ACL_SAFE_CALL may throw, and an exception leaving
+    // a destructor terminates the program.
+    if (event && ownEvent)
+        (void)aclrtDestroyEvent(event);
+}
+
+aclrtEvent AclEventAccessor::getEvent(const AclEvent& event) { return event.impl_->event; }
+
+AclEvent AclEventAccessor::wrapEvent(aclrtEvent event)
+{
+    return AclEvent(makePtr<AclEvent::Impl>(event));
+}
+
+AclEvent::AclEvent() { impl_ = makePtr<Impl>(); }
+
+void AclEvent::record(AclStream& stream)
+{
+    CV_ACL_SAFE_CALL(aclrtRecordEvent(impl_->event, AclStreamAccessor::getStream(stream)));
+}
+
+void AclEvent::waitForComplete() const { CV_ACL_SAFE_CALL(aclrtSynchronizeEvent(impl_->event)); }
+
+/******************************************AclStream********************************************/
+struct AsyncThdArgs
+{
+    bool isExit;           // set by AclStream::Impl::~Impl so processReportLoop leaves its loop
+    void* context;         // ACL context captured in bindThread and made current in the thread
+    pthread_mutex_t mutex; // held by bindThread while the thread is created and subscribed
+    AsyncThdArgs() : isExit(false), context(nullptr), mutex(PTHREAD_MUTEX_INITIALIZER) {}
+};
+
+class AclStream::Impl
+{
+public:
+    aclrtStream stream;        // raw ACL stream handle (may be nullptr when wrapping one)
+    bool ownStream;            // true when this object created the stream and has to destroy it
+    AsyncThdArgs asyncThdArgs; // state shared with the report-processing thread
+    pthread_t asyncThdId;      // 0 until bindThread() has started that thread
+
+    void bindThread();
+    void addToAsyncRelease(const AclMat& mat);
+
+    Impl();                            // creates a new stream (owned)
+    explicit Impl(aclrtStream stream); // wraps an existing stream handle (not owned)
+
+    ~Impl();
+};
+
+AclStream::Impl::Impl() : stream(nullptr), ownStream(true), asyncThdId(0)
+{
+    CV_ACL_SAFE_CALL(aclrtCreateStream(&stream));
+}
+
+AclStream::Impl::Impl(aclrtStream s) : stream(s), ownStream(false), asyncThdId(0) {}
+
+AclStream::Impl::~Impl()
+{
+    // ACL errors are ignored here: CV_ACL_SAFE_CALL may throw, which a destructor must never do.
+    if (!stream || !ownStream)
+        return;
+    (void)aclrtSynchronizeStream(stream);
+    if (asyncThdId != 0)
+    {
+        asyncThdArgs.isExit = true;
+        (void)aclrtUnSubscribeReport(asyncThdId, stream);
+        (void)pthread_join(asyncThdId, nullptr);
+    }
+    (void)aclrtDestroyStream(stream);
+}
+
+static void* processReportLoop(void* args_)
+{
+    AsyncThdArgs* args = (AsyncThdArgs*)args_;
+    CV_ACL_SAFE_CALL(aclrtSetCurrentContext(args->context));
+    // bindThread() holds the mutex until this thread has been subscribed to the stream, so
+    // taking and releasing it here waits for that subscription.
+    pthread_mutex_lock(&args->mutex);
+    pthread_mutex_unlock(&args->mutex);
+    // NOTE(review): isExit is a plain bool written by ~Impl on another thread - confirm it is safe.
+    while (!args->isExit)
+    {
+        aclError ret = aclrtProcessReport(-1);
+        // Skip error check if exiting. aclrtProcessReport will report a timeout error when
+        // unsubscribing.
+        if (!args->isExit)
+            CV_ACL_SAFE_CALL(ret);
+    }
+
+    return (nullptr);
+}
+
+void AclStream::Impl::bindThread()
+{
+    // Only one thread is created per Impl; the lock serialises concurrent callers.
+    pthread_mutex_lock(&asyncThdArgs.mutex);
+    if (asyncThdId == 0)
+    {
+        CV_ACL_SAFE_CALL(aclrtGetCurrentContext(&asyncThdArgs.context));
+        (void)pthread_create(&asyncThdId, nullptr, processReportLoop, &asyncThdArgs);
+        CV_ACL_SAFE_CALL(aclrtSubscribeReport(asyncThdId, stream));
+    }
+    pthread_mutex_unlock(&asyncThdArgs.mutex); // also lets the new thread pass its initial lock
+}
+
+static void releaseAclMatCB(void* releaseHandle)
+{
+    // releaseHandle is the heap-allocated AclMat passed in by addToAsyncRelease. Deleting a null
+    // pointer is a no-op, so no separate null check is required.
+    AclMat* pending = static_cast<AclMat*>(releaseHandle);
+    delete pending;
+}
+
+void AclStream::Impl::addToAsyncRelease(const AclMat& mat)
+{
+    // Nothing is queued for a null stream handle.
+    if (stream == nullptr)
+        return;
+    if (asyncThdId == 0)
+        bindThread();
+    // The heap copy made here is deleted by releaseAclMatCB when the stream runs the callback.
+    AclMat* keepAlive = new AclMat(mat);
+    CV_ACL_SAFE_CALL(aclrtLaunchCallback(releaseAclMatCB, keepAlive, ACL_CALLBACK_BLOCK, stream));
+}
+
+aclrtStream AclStreamAccessor::getStream(const AclStream& stream) { return stream.impl_->stream; }
+
+AclStream AclStreamAccessor::wrapStream(aclrtStream stream)
+{
+    return AclStream(makePtr<AclStream::Impl>(stream));
+}
+
+AclStream::AclStream() { impl_ = makePtr<Impl>(); }
+
+void AclStream::waitForCompletion() { CV_ACL_SAFE_CALL(aclrtSynchronizeStream(impl_->stream)); }
+
+void AclStream::waitAclEvent(const AclEvent& event)
+{
+    CV_ACL_SAFE_CALL(aclrtStreamWaitEvent(impl_->stream, AclEventAccessor::getEvent(event)));
+}
+
+AclStream& AclStream::Null()
+{
+    // The null stream is kept per device; look it up for the device reported by getDevice().
+    return initializer.getNullAclStream(getDevice());
+}
+
+void AclStream::addToAsyncRelease(const AclMat& mat) { impl_->addToAsyncRelease(mat); }
+
+} // namespace cann
+} // namespace cv
diff --git a/modules/cannarithm/src/cann_call.cpp b/modules/cannarithm/src/cann_call.cpp
new file mode 100644
index 00000000000..0e9ad8036bb
--- /dev/null
+++ b/modules/cannarithm/src/cann_call.cpp
@@ -0,0 +1,140 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "precomp.hpp"
+
+namespace cv
+{
+namespace cann
+{
+void aclOneInput(const AclMat& src, AclMat& dst, const char* op, AclStream& stream,
+                 std::vector<AclAttribute*>& attrs)
+{
+    CannPreparation prepare; // gathers attributes, descriptors and buffers for one launch
+    for (auto& attrIterator : attrs)
+    {
+        attrIterator->addAttr(prepare.opAttr_);
+    }
+    // Both tensors are described as 4-D NHWC shapes with a leading dimension of 1.
+    int64_t dimSrc[] = {1, src.rows, src.cols, src.channels()};
+    int64_t dimDst[] = {1, dst.rows, dst.cols, dst.channels()};
+    CANN_PREPARE_INPUTDESC(prepare, getACLType(src.depth()), sizeof(dimSrc) / sizeof(dimSrc[0]),
+                           dimSrc, ACL_FORMAT_NHWC);
+    CANN_PREPARE_OUTPUTDESC(prepare, getACLType(dst.depth()), sizeof(dimDst) / sizeof(dimDst[0]),
+                            dimDst, ACL_FORMAT_NHWC);
+    // rows * step is passed as the length of each buffer.
+    CANN_PREPARE_INPUTBUFFER(prepare, const_cast<uchar*>(src.data), src.rows * src.step);
+    CANN_PREPARE_OUTPUTBUFFER(prepare, const_cast<uchar*>(dst.data), dst.rows * dst.step);
+
+    aclrtStream rawStream = AclStreamAccessor::getStream(stream);
+    // Run the single-input operator `op` with everything collected in `prepare`.
+    CV_ACL_SAFE_CALL(aclopCompileAndExecute(
+        op, prepare.inputDesc_.size(), prepare.inputDesc_.data(), prepare.inputBuffers_.data(),
+        prepare.outputDesc_.size(), prepare.outputDesc_.data(), prepare.outputBuffers_.data(),
+        prepare.opAttr_, ACL_ENGINE_SYS, ACL_COMPILE_SYS, NULL, rawStream));
+    if (rawStream == nullptr)
+        CV_ACL_SAFE_CALL(aclrtSynchronizeStream(rawStream));
+    else // non-null stream: queue src and dst for release via the stream instead of waiting
+    {
+        stream.addToAsyncRelease(src);
+        stream.addToAsyncRelease(dst);
+    }
+}
+// Runs the two-input ACL operator `op` on src1 and src2, writing into dst on `stream`.
+void aclTwoInputs(const AclMat& src1, const AclMat& src2, AclMat& dst, const char* op,
+                  AclStream& stream)
+{
+    CannPreparation prepare;
+    aclrtStream rawStream = AclStreamAccessor::getStream(stream);
+    // Shapes are {1, rows, cols, channels}, matching the NHWC format given below.
+    int64_t dimSrc1[] = {1, src1.rows, src1.cols, src1.channels()};
+    int64_t dimSrc2[] = {1, src2.rows, src2.cols, src2.channels()};
+
+    int64_t dimDst[] = {1, dst.rows, dst.cols, dst.channels()};
+
+    CANN_PREPARE_INPUTDESC(prepare, getACLType(src1.depth()), sizeof(dimSrc1) / sizeof(dimSrc1[0]),
+                           dimSrc1, ACL_FORMAT_NHWC);
+
+    CANN_PREPARE_INPUTDESC(prepare, getACLType(src2.depth()), sizeof(dimSrc2) / sizeof(dimSrc2[0]),
+                           dimSrc2, ACL_FORMAT_NHWC);
+
+    CANN_PREPARE_OUTPUTDESC(prepare, getACLType(dst.depth()), sizeof(dimDst) / sizeof(dimDst[0]),
+                            dimDst, ACL_FORMAT_NHWC);
+    // rows * step is passed as the length of each buffer; buffers follow descriptor order.
+    CANN_PREPARE_INPUTBUFFER(prepare, const_cast<uchar*>(src1.data), src1.rows * src1.step);
+    CANN_PREPARE_INPUTBUFFER(prepare, const_cast<uchar*>(src2.data), src2.rows * src2.step);
+    CANN_PREPARE_OUTPUTBUFFER(prepare, const_cast<uchar*>(dst.data), dst.rows * dst.step);
+
+    CV_ACL_SAFE_CALL(aclopCompileAndExecute(
+        op, prepare.inputDesc_.size(), prepare.inputDesc_.data(), prepare.inputBuffers_.data(),
+        prepare.outputDesc_.size(), prepare.outputDesc_.data(), prepare.outputBuffers_.data(),
+        prepare.opAttr_, ACL_ENGINE_SYS, ACL_COMPILE_SYS, NULL, rawStream));
+    if (rawStream == nullptr)
+        CV_ACL_SAFE_CALL(aclrtSynchronizeStream(rawStream));
+    else // non-null stream: queue all three matrices for release via the stream
+    {
+        stream.addToAsyncRelease(src1);
+        stream.addToAsyncRelease(src2);
+        stream.addToAsyncRelease(dst);
+    }
+}
+// Runs the ACL "TransData" operator with src described as NCHW and dst described as NHWC.
+void transNCHWToNHWC(const AclMat& src, AclMat& dst, AclStream& stream)
+{
+    CannPreparation prepare;
+    CANN_PREPARE_ADD_ATTR(prepare, String, "src_format", "NCHW");
+    CANN_PREPARE_ADD_ATTR(prepare, String, "dst_format", "NHWC");
+    // The source shape lists channels first; the destination shape lists them last.
+    int64_t dimSrc[] = {1, src.channels(), src.rows, src.cols};
+    int64_t dimDst[] = {1, dst.rows, dst.cols, dst.channels()};
+
+    CANN_PREPARE_INPUTDESC(prepare, getACLType(src.depth()), sizeof(dimSrc) / sizeof(dimSrc[0]),
+                           dimSrc, ACL_FORMAT_NCHW);
+    CANN_PREPARE_OUTPUTDESC(prepare, getACLType(dst.depth()), sizeof(dimDst) / sizeof(dimDst[0]),
+                            dimDst, ACL_FORMAT_NHWC);
+
+    CANN_PREPARE_INPUTBUFFER(prepare, const_cast<uchar*>(src.data), src.rows * src.step);
+    CANN_PREPARE_OUTPUTBUFFER(prepare, const_cast<uchar*>(dst.data), dst.rows * dst.step);
+
+    aclrtStream rawStream = AclStreamAccessor::getStream(stream);
+
+    CV_ACL_SAFE_CALL(aclopCompileAndExecute("TransData", prepare.inputDesc_.size(),
+                                            prepare.inputDesc_.data(), prepare.inputBuffers_.data(),
+                                            prepare.outputDesc_.size(), prepare.outputDesc_.data(),
+                                            prepare.outputBuffers_.data(), prepare.opAttr_,
+                                            ACL_ENGINE_SYS, ACL_COMPILE_SYS, NULL, rawStream));
+    if (rawStream == nullptr)
+        CV_ACL_SAFE_CALL(aclrtSynchronizeStream(rawStream));
+    else // non-null stream: queue src and dst for release via the stream
+    {
+        stream.addToAsyncRelease(src);
+        stream.addToAsyncRelease(dst);
+    }
+}
+
+// Translates an OpenCV depth constant into the matching ACL element type.
+//
+// opencvdepth: a depth code such as the value of AclMat::depth().
+// Returns ACL_DT_UNDEFINED for any depth not listed in the switch.
+aclDataType getACLType(int opencvdepth)
+{
+    switch (opencvdepth)
+    {
+        case CV_8U:  return ACL_UINT8;
+        case CV_8S:  return ACL_INT8;
+        case CV_16U: return ACL_UINT16;
+        case CV_16S: return ACL_INT16;
+        case CV_32S: return ACL_INT32;
+        // Single-precision float needs its own entry; without it every CV_32F matrix
+        // would be described to ACL as an undefined type.
+        case CV_32F: return ACL_FLOAT;
+        case CV_64F: return ACL_DOUBLE;
+        case CV_16F: return ACL_FLOAT16;
+        default:
+            return ACL_DT_UNDEFINED;
+    }
+}
+
+} // namespace cann
+} // namespace cv
diff --git a/modules/cannarithm/src/element_operations.cpp b/modules/cannarithm/src/element_operations.cpp
new file mode 100644
index 00000000000..f23323bf384
--- /dev/null
+++ b/modules/cannarithm/src/element_operations.cpp
@@ -0,0 +1,165 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "precomp.hpp"
+#include <map>
+
+namespace cv
+{
+namespace cann
+{
+// Mat-with-Mat element-wise operation: hands both operands straight to aclTwoInputs.
+// The first line is the declaration carrying the default stream; the definition follows.
+void opMatMat(AclMat&, AclMat&, AclMat&, const char*, AclStream& stream = AclStream::Null());
+void opMatMat(AclMat& src1, AclMat& src2, AclMat& dst, const char* op, AclStream& stream)
+{ aclTwoInputs(src1, src2, dst, op, stream); }
+// Mat-with-Scalar operation: the scalar becomes a 1x1 matrix of src's type, is uploaded,
+// and is passed to aclTwoInputs as the first operand when `inv` is true, else the second.
+void opMatScalar(AclMat&, AclMat&, bool, Scalar, const char*,
+                 AclStream& stream = AclStream::Null());
+void opMatScalar(AclMat& src, AclMat& dst, bool inv, Scalar s, const char* op, AclStream& stream)
+{
+    Mat hostScalar(1, 1, src.type(), s);
+    AclMat deviceScalar;
+    deviceScalar.upload(hostScalar);
+    const AclMat& first = inv ? deviceScalar : src;
+    const AclMat& second = inv ? src : deviceScalar;
+    aclTwoInputs(first, second, dst, op, stream);
+}
+// Common implementation behind add/subtract/multiply/divide/bitwise_*: runs ACL operator `op`.
+void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, InputArray _mask, float scale, int dtype,
+               const char* op, AclStream& stream = AclStream::Null());
+void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, InputArray _mask, float scale,  int dtype,
+               const char* op, AclStream& stream)
+{
+    const int kind1 = _src1.kind();
+    const int kind2 = _src2.kind();
+    // An operand whose kind is MATX is treated as a scalar rather than a matrix.
+    const bool isScalar1 = (kind1 == _InputArray::MATX);
+    const bool isScalar2 = (kind2 == _InputArray::MATX);
+
+    AclMat src1, src2;
+
+    if (!isScalar1)
+        src1 = getInputMat(_src1, stream);
+
+    if (!isScalar2)
+        src2 = getInputMat(_src2, stream);
+    // Fetch whichever operand is the scalar as a host Mat (src1 takes precedence).
+    Mat scalar;
+    if (isScalar1)
+        scalar = _src1.getMat();
+    else if (isScalar2)
+        scalar = _src2.getMat();
+
+    Scalar val;
+    if (!scalar.empty())
+    {
+        CV_Assert(scalar.total() <= 4);
+        scalar.convertTo(Mat_<double>(scalar.rows, scalar.cols, &val[0]), CV_64F);
+    }
+    // Depth, channel count and size come from src1 unless it is empty, then from src2.
+    const int sdepth = src1.empty() ? src2.depth() : src1.depth();
+    const int cn = src1.empty() ? src2.channels() : src1.channels();
+    const Size size = src1.empty() ? src2.size() : src1.size();
+    // A negative dtype means "use the source depth".
+    if (dtype < 0)
+        dtype = sdepth;
+
+    const int ddepth = CV_MAT_DEPTH(dtype);
+
+    CV_Assert(sdepth <= CV_64F && ddepth <= CV_64F);
+    CV_Assert(!scalar.empty() || (src2.depth() == src1.depth() && src2.size() == src1.size()));
+
+    AclMat dst = getOutputMat(_dst, size.height, size.width, CV_MAKE_TYPE(ddepth, cn));
+    // A scalar first operand is forwarded with inv=true so it stays the first operand.
+    if (isScalar1)
+        opMatScalar(src2, dst, true, val, op, stream);
+    else if (isScalar2)
+        opMatScalar(src1, dst, false, val, op, stream);
+    else
+        opMatMat(src1, src2, dst, op, stream);
+
+    // TODO implement empty for AclMat in InputArray
+    AclMat mask = getInputMat(_mask, stream);
+    if (!mask.empty())
+    {
+        int mtype = mask.type();
+
+        CV_Assert((mtype == CV_8UC1 || mtype == CV_8SC1) && mask.size() == size);
+        // TODO use MaskSelect?
+        AclMat formatedMask;
+        if (mask.depth() != dst.depth())
+            mask.convertTo(formatedMask, dst.depth());
+        else
+            formatedMask = mask;
+
+        AclMat expandedMask;
+        if (dst.channels() != 1)
+            formatedMask.expandTo(expandedMask, dst.channels());
+        else
+            expandedMask = formatedMask;
+        // divRet = mask "Div" mask; dst is then replaced by dst "Mul" divRet.
+        // TODO call DIV before expand?
+        AclMat divRet;
+        arithm_op(expandedMask, expandedMask, divRet, noArray(), 1, -1, "Div", stream);
+        AclMat dstCopy = dst;
+        // TODO dst memory and dstCopy memory point to the same memory area, seems no harm yet.
+        arithm_op(dstCopy, divRet, dst, noArray(), 1,  -1, "Mul", stream);
+    }
+    // Optional post-scaling through the "Muls" operator with a float "value" attribute.
+    if(scale != 1)
+    {
+        AclMat dstCpy = dst;
+        AclFloatAttribute scaleOP("value", scale);
+        std::vector<AclAttribute*> attrs{&scaleOP};
+        aclOneInput(dstCpy, dst, "Muls", stream, attrs);
+    }
+
+    syncOutput(dst, _dst);
+}
+// Element-wise add: forwards to arithm_op with operator "Add" and a scale of 1.
+void add(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, int dtype,
+         AclStream& stream)
+{
+    arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream);
+}
+// Element-wise subtract: forwards to arithm_op with operator "Sub" and a scale of 1.
+void subtract(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, int dtype,
+              AclStream& stream)
+{
+    arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream);
+}
+// Element-wise multiply: arithm_op with operator "Mul", no mask and the caller's scale.
+void multiply(InputArray src1, InputArray src2, OutputArray dst, float scale, int dtype, AclStream& stream)
+{
+    arithm_op(src1, src2, dst, noArray(), scale, dtype, "Mul", stream);
+}
+// Element-wise divide: arithm_op with operator "Div", no mask and the caller's scale.
+void divide(InputArray src1, InputArray src2, OutputArray dst, float scale, int dtype, AclStream& stream)
+{
+    arithm_op(src1, src2, dst, noArray(), scale, dtype, "Div", stream);
+}
+// Bitwise AND: arithm_op with operator "BitwiseAnd", scale 1 and dtype -1 (source depth).
+void bitwise_and(InputArray src1, InputArray src2, OutputArray dst, InputArray mask,
+                 AclStream& stream)
+{
+    arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseAnd", stream);
+}
+// Bitwise OR: arithm_op with operator "BitwiseOr", scale 1 and dtype -1 (source depth).
+void bitwise_or(InputArray src1, InputArray src2, OutputArray dst, InputArray mask,
+                AclStream& stream)
+{
+    arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseOr", stream);
+}
+// Bitwise XOR: arithm_op with operator "BitwiseXor", scale 1 and dtype -1 (source depth).
+void bitwise_xor(InputArray src1, InputArray src2, OutputArray dst, InputArray mask,
+                 AclStream& stream)
+{
+    arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseXor", stream);
+}
+
+
+} // namespace cann
+} // namespace cv
diff --git a/modules/cannarithm/src/precomp.hpp b/modules/cannarithm/src/precomp.hpp
new file mode 100644
index 00000000000..1541ec80a69
--- /dev/null
+++ b/modules/cannarithm/src/precomp.hpp
@@ -0,0 +1,16 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef __OPENCV_PRECOMP_H__
+#define __OPENCV_PRECOMP_H__
+
+#include <acl/acl.h>
+#include <acl/acl_op_compiler.h>
+#include "opencv2/cann.hpp"
+#include "opencv2/cann_prepare.hpp"
+#include "opencv2/acl_stream_accessor.hpp"
+#include "opencv2/cann_call.hpp"
+#include "opencv2/cann_arithm.hpp"
+
+#endif /* __OPENCV_PRECOMP_H__ */
diff --git a/modules/cannarithm/test/test_cann.cpp b/modules/cannarithm/test/test_cann.cpp
new file mode 100644
index 00000000000..6c2e65beefe
--- /dev/null
+++ b/modules/cannarithm/test/test_cann.cpp
@@ -0,0 +1,227 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "test_precomp.hpp"
+#include <opencv2/ts/cuda_test.hpp>
+
+namespace opencv_test
+{
+namespace
+{
+
+// Allocator stub for tests: tags matrices with sentinel addresses instead of device memory.
+class DummyAllocator : public AclMat::Allocator
+{
+public:
+    // Ignores rows/cols/elemSize; sets data to 0x12345 and allocates the refcount slot.
+    bool allocate(cv::cann::AclMat* mat, int, int, size_t) CV_OVERRIDE
+    {
+        mat->refcount = static_cast<int*>(cv::fastMalloc(sizeof(int)));
+        mat->data = reinterpret_cast<uchar*>(0x12345);
+        return true;
+    }
+    // Retags data with 0x54321 and frees the refcount slot.
+    void free(cv::cann::AclMat* mat) CV_OVERRIDE
+    {
+        cv::fastFree(mat->refcount);
+        mat->data = reinterpret_cast<uchar*>(0x54321);
+    }
+};
+// Exercises every AclMat constructor form and checks the resulting geometry, type and data.
+TEST(AclMat, Construct)
+{
+    cv::cann::setDevice(0);
+    // 1 Default constructor.
+    AclMat defaultAclMat;
+    AclMat::Allocator* defaultAllocator = AclMat::defaultAllocator();
+    ASSERT_EQ(defaultAclMat.allocator, defaultAllocator);
+
+    // 2 get & set allocator.
+    DummyAllocator dummyAllocator;
+    AclMat::setDefaultAllocator(&dummyAllocator);
+    ASSERT_EQ(defaultAclMat.defaultAllocator(), &dummyAllocator);
+    AclMat::setDefaultAllocator(defaultAllocator);
+
+    // 3 constructs AclMat of the specified size and type
+    AclMat specifiedSizeAclMat1(5, 6, CV_8UC3);
+    AclMat specifiedSizeAclMat2(Size(300, 200), CV_64F);
+
+    ASSERT_EQ(specifiedSizeAclMat1.rows, 5);
+    ASSERT_EQ(specifiedSizeAclMat1.cols, 6);
+    ASSERT_EQ(specifiedSizeAclMat1.depth(), CV_8U);
+    ASSERT_EQ(specifiedSizeAclMat1.channels(), 3);
+
+    ASSERT_EQ(specifiedSizeAclMat2.cols, 300);
+    ASSERT_EQ(specifiedSizeAclMat2.rows, 200);
+    ASSERT_EQ(specifiedSizeAclMat2.depth(), CV_64F);
+    ASSERT_EQ(specifiedSizeAclMat2.channels(), 1);
+
+    // 4 constructs AclMat and fills it with the specified value s
+    srand((unsigned int)(time(NULL)));
+    Scalar sc(rand() % 256, rand() % 256, rand() % 256, rand() % 256);
+
+    Mat scalarToMat(7, 8, CV_8UC3, sc);
+    AclMat scalarToAclMat1(7, 8, CV_8UC3, sc);
+    Mat scalarToMatChecker;
+    scalarToAclMat1.download(scalarToMatChecker);
+
+    EXPECT_MAT_NEAR(scalarToMat, scalarToMatChecker, 0.0);
+
+    AclMat scalarToAclMat2(Size(123, 345), CV_32S);
+
+    ASSERT_EQ(scalarToAclMat1.rows, 7);
+    ASSERT_EQ(scalarToAclMat1.cols, 8);
+    ASSERT_EQ(scalarToAclMat1.depth(), CV_8U);
+    ASSERT_EQ(scalarToAclMat1.channels(), 3);
+
+    ASSERT_EQ(scalarToAclMat2.cols, 123);
+    ASSERT_EQ(scalarToAclMat2.rows, 345);
+    ASSERT_EQ(scalarToAclMat2.depth(), CV_32S);
+    ASSERT_EQ(scalarToAclMat2.channels(), 1);
+
+    // 5 constructor for AclMat headers pointing to user-allocated data
+    void* userAllocatedData = malloc(1);
+    AclMat userAllocatedAclMat1(9, 10, CV_16SC2, userAllocatedData);
+    AclMat userAllocatedAclMat2(Size(1024, 2048), CV_16F, userAllocatedData);
+
+    ASSERT_EQ(userAllocatedAclMat1.rows, 9);
+    ASSERT_EQ(userAllocatedAclMat1.cols, 10);
+    ASSERT_EQ(userAllocatedAclMat1.depth(), CV_16S);
+    ASSERT_EQ(userAllocatedAclMat1.channels(), 2);
+    ASSERT_EQ(userAllocatedAclMat1.data, userAllocatedData);
+
+    ASSERT_EQ(userAllocatedAclMat2.cols, 1024);
+    ASSERT_EQ(userAllocatedAclMat2.rows, 2048);
+    ASSERT_EQ(userAllocatedAclMat2.depth(), CV_16F);
+    ASSERT_EQ(userAllocatedAclMat2.channels(), 1);
+    ASSERT_EQ(userAllocatedAclMat2.data, userAllocatedData);
+
+    // 6 builds AclMat from host memory
+    Scalar sc2(rand() % 256, rand() % 256, rand() % 256, rand() % 256);
+    Mat randomMat(7, 8, CV_8UC3, sc2);
+    InputArray arr = randomMat;
+
+    AclMat fromInputArray(arr);
+    Mat randomMatChecker;
+    fromInputArray.download(randomMatChecker);
+    EXPECT_MAT_NEAR(randomMat, randomMatChecker, 0.0);
+    free(userAllocatedData); // the headers above only borrowed this buffer
+    cv::cann::resetDevice();
+}
+// Checks that copies share one reference counter and that it tracks construction/destruction.
+TEST(AclMat, RefCount)
+{
+    DummyAllocator dummyAllocator;
+    AclMat* mat = new AclMat(1, 1, CV_8U, &dummyAllocator);
+    ASSERT_EQ(*(mat->refcount), 1);
+    ASSERT_EQ(mat->data, (uchar*)0x12345);
+    // Each copy must point at the same counter and raise it by one.
+    AclMat* copy1 = new AclMat(*mat);
+    ASSERT_EQ(mat->refcount, copy1->refcount);
+    ASSERT_EQ(*(copy1->refcount), 2);
+
+    AclMat* copy2 = new AclMat(*copy1);
+    ASSERT_EQ(mat->refcount, copy2->refcount);
+    ASSERT_EQ(*(copy2->refcount), 3);
+    // Destroying a copy lowers the counter; data keeps the allocate() sentinel, not free()'s.
+    delete copy1;
+    ASSERT_EQ(mat->data, (uchar*)0x12345);
+    ASSERT_EQ(*(mat->refcount), 2);
+
+    delete copy2;
+    ASSERT_EQ(mat->data, (uchar*)0x12345);
+    ASSERT_EQ(*(mat->refcount), 1);
+
+    delete mat;
+}
+// Assigning one AclMat to another must carry over size, type and the data pointer.
+TEST(AclMat, Assignment)
+{
+    DummyAllocator dummyAllocator;
+    AclMat mat1;
+    AclMat mat2(3, 4, CV_8SC1, &dummyAllocator);
+    mat1 = mat2;
+
+    ASSERT_EQ(mat1.rows, 3);
+    ASSERT_EQ(mat1.cols, 4);
+    ASSERT_EQ(mat1.depth(), CV_8S);
+    ASSERT_EQ(mat1.channels(), 1);
+    ASSERT_EQ(mat1.data, (uchar*)0x12345);
+}
+// setTo(Scalar) on a 2x2 CV_8UC4 AclMat must download equal to a Mat built from that scalar.
+TEST(AclMat, SetTo)
+{
+    cv::cann::setDevice(0);
+
+    srand((unsigned int)(time(NULL)));
+    Scalar sc(rand() % 256, rand() % 256, rand() % 256, rand() % 256);
+
+    AclMat aclMat(2, 2, CV_8UC4);
+    aclMat.setTo(sc);
+    Mat mat(2, 2, CV_8UC4, sc);
+    Mat checker;
+    aclMat.download(checker);
+
+    EXPECT_MAT_NEAR(mat, checker, 0.0);
+
+    cv::cann::resetDevice();
+}
+// convertTo(CV_16S) of a CV_8UC4 AclMat is compared with a CV_16SC4 Mat of the same scalar.
+TEST(AclMat, ConvertTo)
+{
+    cv::cann::setDevice(0);
+
+    srand((unsigned int)(time(NULL)));
+    Scalar sc(rand() % 256, rand() % 256, rand() % 256, rand() % 256);
+
+    AclMat aclMat(2, 2, CV_8UC4, sc);
+    AclMat convertedAclMat;
+    aclMat.convertTo(convertedAclMat, CV_16S);
+    Mat mat(2, 2, CV_16SC4, sc);
+    Mat checker;
+    convertedAclMat.download(checker);
+
+    EXPECT_MAT_NEAR(mat, checker, 0.0);
+
+    cv::cann::resetDevice();
+}
+// expandTo(3) of a single-channel matrix of ones is compared with a 3-channel Mat of ones.
+TEST(AclMat, ExpandTo)
+{
+    cv::cann::setDevice(0);
+
+    Scalar sc1(1);
+    Scalar sc2(1, 1, 1);
+    AclMat aclMat(10, 10, CV_8UC1, sc1);
+    Mat mat(10, 10, CV_8UC3, sc2);
+    AclMat expandedAclMat;
+    aclMat.expandTo(expandedAclMat, 3);
+    Mat checker;
+    expandedAclMat.download(checker);
+
+    EXPECT_MAT_NEAR(mat, checker, 0.0);
+
+    cv::cann::resetDevice();
+}
+// Smoke test: queue an AclMat for deferred release on a stream, then wait for the stream.
+TEST(AclStream, AsyncProcess)
+{
+    cv::cann::setDevice(0);
+
+    DummyAllocator dummyAllocator;
+    // Stack object instead of a never-deleted `new`: the stream works on its own copy.
+    AclMat mat(&dummyAllocator);
+    AclStream stream;
+    stream.addToAsyncRelease(mat);
+    stream.waitForCompletion();
+
+    // TODO: need sync point to check:
+    // 1. mat->data is not freed after it add to async release list even mat is deleted.
+    // 2. mat->data is freed after callback is called.
+
+    cv::cann::resetDevice();
+}
+
+} // namespace
+} // namespace opencv_test
diff --git a/modules/cannarithm/test/test_element_operation.cpp b/modules/cannarithm/test/test_element_operation.cpp
new file mode 100644
index 00000000000..db20321d43f
--- /dev/null
+++ b/modules/cannarithm/test/test_element_operation.cpp
@@ -0,0 +1,137 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "test_precomp.hpp"
+#include "opencv2/ts/cuda_test.hpp"
+#include "opencv2/cann_arithm.hpp"
+
+namespace opencv_test
+{
+namespace
+{
+// Random matrix helper. NOTE(review): `w` is passed to Mat first and `h` second - confirm order.
+Mat randomMat(int w, int h, int dtype)
+{
+    Mat rnMat(w, h, dtype);
+    RNG rng;
+    rng.fill(rnMat, RNG::UNIFORM, 0.f, 1.f);
+    return rnMat;
+}
+cv::Scalar randomScalar()
+{
+    RNG rng; // NOTE(review): default-seeded per call, presumably repeating the same values
+    return Scalar(rng, rng.next(), rng.next(), rng.next());
+}
+// Scale factor helper: one draw from rng.uniform(0.3, 3.0) of a local RNG, narrowed to float.
+float randomNum()
+{
+    RNG rng;
+    return float(rng.uniform(0.3, 3.0));
+}
+Mat genMask()
+{
+    Mat mask = Mat::zeros(Size(10, 10), CV_8UC1); // 10x10 single-channel, all zero
+    rectangle(mask, cv::Rect(5, 5, 3, 3), Scalar(255), -1); // 255 in Rect(5,5,3,3); -1 thickness
+    return mask;
+}
+
+#define DEVICE_ID 0
+
+/****************TEST CASE***************/
+// Mat-op-Mat: runs cv::op on the CPU and cv::cann::op on the NPU (with and without a stream).
+#define TEST_MAT_OP_MAT(idx, op, ...)                        \
+    TEST(ELEMENTWISE_OP, MAT_##op##_MAT_##idx)               \
+    {                                                        \
+        cv::cann::setDevice(DEVICE_ID);                      \
+                                                             \
+        Mat cpuMat1 = randomMat(10, 10, CV_32SC3);           \
+        Mat cpuMat2 = randomMat(10, 10, CV_32SC3);           \
+        Mat cpuDst;                                          \
+        cv::op(cpuMat1, cpuMat2, cpuDst, __VA_ARGS__);       \
+                                                             \
+        AclMat mat1, mat2;                                   \
+        mat1.upload(cpuMat1);                                \
+        mat2.upload(cpuMat2);                                \
+        AclMat dst, dstS;                                    \
+        cv::cann::op(mat1, mat2, dst, __VA_ARGS__);          \
+        Mat npuDst, npuDstS;                                 \
+        dst.download(npuDst);                                \
+        AclStream stream;                                    \
+        cv::cann::op(mat1, mat2, dstS, __VA_ARGS__, stream); \
+        stream.waitForCompletion();                          \
+        dstS.download(npuDstS);                              \
+                                                             \
+        EXPECT_MAT_NEAR(npuDst, cpuDst, 0.0);                \
+        EXPECT_MAT_NEAR(npuDst, npuDstS, 0.0);               \
+        cv::cann::resetDevice();                             \
+    }
+// Variant 1: no mask (or a scale of 1) and the default dtype.
+TEST_MAT_OP_MAT(1, add, noArray(), -1);
+TEST_MAT_OP_MAT(1, subtract, noArray(), -1);
+TEST_MAT_OP_MAT(1, multiply, 1, -1);
+TEST_MAT_OP_MAT(1, divide, 1, -1);
+TEST_MAT_OP_MAT(1, bitwise_and, noArray());
+TEST_MAT_OP_MAT(1, bitwise_or, noArray());
+TEST_MAT_OP_MAT(1, bitwise_xor, noArray());
+// Variant 2: genMask() for the masked operators, randomNum() as scale for multiply/divide.
+TEST_MAT_OP_MAT(2, add, genMask(), CV_32SC3);
+TEST_MAT_OP_MAT(2, subtract, genMask(), CV_32SC3);
+TEST_MAT_OP_MAT(2, multiply, randomNum(), -1);
+TEST_MAT_OP_MAT(2, divide, randomNum(), -1);
+TEST_MAT_OP_MAT(2, bitwise_and, genMask());
+TEST_MAT_OP_MAT(2, bitwise_or, genMask());
+TEST_MAT_OP_MAT(2, bitwise_xor, genMask());
+
+// Scalar-op-Mat in both operand orders; the CPU reference uses matrices filled with the scalars.
+#define TEST_MAT_OP_SCALAR(idx, op, ...)                           \
+    TEST(ELEMENTWISE_OP, MAT_##op##_SCALAR_##idx)                  \
+    {                                                              \
+        Scalar cpuS1 = randomScalar();                             \
+        Scalar cpuS2 = randomScalar();                             \
+        Mat cpuMatS1(10, 10, CV_32SC3, cpuS1);                     \
+        Mat cpuMatS2(10, 10, CV_32SC3, cpuS2);                     \
+        Mat cpuDst, cpuDstC;                                       \
+        cv::op(cpuMatS1, cpuMatS2, cpuDst, __VA_ARGS__);           \
+        cv::op(cpuMatS2, cpuMatS1, cpuDstC, __VA_ARGS__);          \
+        cv::cann::setDevice(DEVICE_ID);                            \
+                                                                   \
+        AclMat mat;                                                \
+        mat.upload(cpuMatS2);                                      \
+        AclMat dst, dstS, dstC, dstCS;                             \
+        cv::cann::op(cpuS1, cpuMatS2, dst, __VA_ARGS__);           \
+        cv::cann::op(cpuMatS2, cpuS1, dstC, __VA_ARGS__);          \
+        Mat npuDst, npuDstS, npuDstC, npuDstCS;                    \
+        dst.download(npuDst);                                      \
+        dstC.download(npuDstC);                                    \
+        AclStream stream;                                          \
+        cv::cann::op(cpuS1, cpuMatS2, dstS, __VA_ARGS__, stream);  \
+        cv::cann::op(cpuMatS2, cpuS1, dstCS, __VA_ARGS__, stream); \
+        stream.waitForCompletion();                                \
+        dstS.download(npuDstS);                                    \
+        dstCS.download(npuDstCS);                                  \
+                                                                   \
+        EXPECT_MAT_NEAR(npuDst, npuDstS, 0.0);                     \
+        EXPECT_MAT_NEAR(npuDst, cpuDst, 0.0);                      \
+        EXPECT_MAT_NEAR(npuDstC, npuDstCS, 0.0);                   \
+        EXPECT_MAT_NEAR(npuDstC, cpuDstC, 0.0);                    \
+                                                                   \
+        cv::cann::resetDevice();                                   \
+    }
+TEST_MAT_OP_SCALAR(1, add, noArray(), -1);
+TEST_MAT_OP_SCALAR(1, subtract, noArray(), -1);
+TEST_MAT_OP_SCALAR(1, multiply, 1, -1);
+TEST_MAT_OP_SCALAR(1, divide, 1, -1);
+TEST_MAT_OP_SCALAR(1, bitwise_and, noArray());
+TEST_MAT_OP_SCALAR(1, bitwise_or, noArray());
+TEST_MAT_OP_SCALAR(1, bitwise_xor, noArray());
+// Variant 2: genMask() for the masked operators, randomNum() as scale for multiply/divide.
+TEST_MAT_OP_SCALAR(2, add, genMask(), CV_32SC3);
+TEST_MAT_OP_SCALAR(2, subtract, genMask(), CV_32SC3);
+TEST_MAT_OP_SCALAR(2, bitwise_and, genMask());
+TEST_MAT_OP_SCALAR(2, bitwise_or, genMask());
+TEST_MAT_OP_SCALAR(2, bitwise_xor, genMask());
+TEST_MAT_OP_SCALAR(2, multiply, randomNum(), -1);
+TEST_MAT_OP_SCALAR(2, divide, randomNum(), -1);
+} // namespace
+} // namespace opencv_test
diff --git a/modules/cannarithm/test/test_main.cpp b/modules/cannarithm/test/test_main.cpp
new file mode 100644
index 00000000000..14bd66005ec
--- /dev/null
+++ b/modules/cannarithm/test/test_main.cpp
@@ -0,0 +1,21 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "test_precomp.hpp"
+
+// gtest environment for this binary: SetUp calls initAcl(), TearDown calls finalizeAcl().
+class CannEnvironment : public ::testing::Environment
+{
+public:
+    void SetUp() CV_OVERRIDE { initAcl(); }
+    void TearDown() CV_OVERRIDE { finalizeAcl(); }
+};
+
+// Hook passed to CV_TEST_MAIN: registers a CannEnvironment as a gtest global environment.
+static void initTests()
+{
+    ::testing::AddGlobalTestEnvironment(new CannEnvironment());
+}
+
+CV_TEST_MAIN("cannarithm", initTests());
diff --git a/modules/cannarithm/test/test_precomp.hpp b/modules/cannarithm/test/test_precomp.hpp
new file mode 100644
index 00000000000..e95abb86e1c
--- /dev/null
+++ b/modules/cannarithm/test/test_precomp.hpp
@@ -0,0 +1,16 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef __OPENCV_TEST_PRECOMP_HPP__
+#define __OPENCV_TEST_PRECOMP_HPP__
+
+#include "opencv2/ts.hpp"
+#include "opencv2/cann.hpp"
+// Unqualified cv::cann names for tests; matrix checks are routed to cvtest::assertMatNear.
+using namespace cv::cann;
+#undef EXPECT_MAT_NEAR
+#define EXPECT_MAT_NEAR(m1, m2, eps) EXPECT_PRED_FORMAT3(cvtest::assertMatNear, m1, m2, eps)
+#define ASSERT_MAT_NEAR(m1, m2, eps) ASSERT_PRED_FORMAT3(cvtest::assertMatNear, m1, m2, eps)
+
+#endif