buddy-compiler · LiuYang328 · Oct 18, 2024 · Oct 28, 2024 · Oct 21, 2024 · Oct 28, 2024
diff --git a/examples/BuddyLeNet/README.md b/examples/BuddyLeNet/README.md
@@ -25,7 +25,9 @@ $ cmake -G Ninja .. \
     -DCMAKE_BUILD_TYPE=RELEASE \
     -DBUDDY_MLIR_ENABLE_PYTHON_PACKAGES=ON \
     -DPython3_EXECUTABLE=$(which python3) \
-    -DBUDDY_MLIR_ENABLE_DIP_LIB=ON
+    -DBUDDY_MLIR_ENABLE_DIP_LIB=ON \
+    -DBUDDY_ENABLE_PNG=ON
+
 $ ninja
 $ ninja check-buddy
 ```

diff --git a/examples/BuddyNext/makefile b/examples/BuddyNext/makefile
@@ -198,6 +198,39 @@ next-sigmoid-run:
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
 		-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
 
+next-eliminate-add-zero-run:
+	@${MLIR_OPT} ./next-eliminate-add-zero.mlir \
+		-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \
+	${MLIR_OPT} \
+		-arith-expand \
+		-eliminate-empty-tensors \
+		-empty-tensor-to-alloc-tensor \
+		-one-shot-bufferize \
+		-convert-linalg-to-affine-loops \
+		-affine-loop-fusion \
+		-lower-affine \
+		-func-bufferize \
+		-arith-bufferize \
+		-tensor-bufferize \
+		-buffer-deallocation \
+		-finalizing-bufferize \
+		-convert-vector-to-scf \
+		-expand-strided-metadata \
+		-convert-vector-to-llvm \
+		-memref-expand \
+		-arith-expand \
+		-convert-arith-to-llvm \
+		-finalize-memref-to-llvm \
+		-convert-scf-to-cf \
+		-convert-openmp-to-llvm \
+		-convert-arith-to-llvm \
+		-convert-math-to-llvm \
+		-convert-math-to-libm  \
+		-convert-func-to-llvm \
+		-reconcile-unrealized-casts | \
+	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+		-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
 next-rope-run:
 	@${MLIR_OPT} ./next-rope.mlir \
 		-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \
@@ -230,3 +263,37 @@ next-rope-run:
 		-reconcile-unrealized-casts | \
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
 		-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+# Lower next-eliminate-identity.mlir to the LLVM dialect and execute its main function.
+next-eliminate-identity-run:
+	@${MLIR_OPT} ./next-eliminate-identity.mlir \
+		-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \
+	${MLIR_OPT} \
+		-arith-expand \
+		-eliminate-empty-tensors \
+		-empty-tensor-to-alloc-tensor \
+		-one-shot-bufferize \
+		-convert-linalg-to-affine-loops \
+		-affine-loop-fusion \
+		-lower-affine \
+		-func-bufferize \
+		-arith-bufferize \
+		-tensor-bufferize \
+		-buffer-deallocation \
+		-finalizing-bufferize \
+		-convert-vector-to-scf \
+		-expand-strided-metadata \
+		-convert-vector-to-llvm \
+		-memref-expand \
+		-arith-expand \
+		-convert-arith-to-llvm \
+		-finalize-memref-to-llvm \
+		-convert-scf-to-cf \
+		-convert-openmp-to-llvm \
+		-convert-arith-to-llvm \
+		-convert-math-to-llvm \
+		-convert-math-to-libm  \
+		-convert-func-to-llvm \
+		-reconcile-unrealized-casts | \
+	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+		-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
diff --git a/examples/BuddyNext/next-eliminate-add-zero.mlir b/examples/BuddyNext/next-eliminate-add-zero.mlir
@@ -0,0 +1,96 @@
+// RUN: buddy-opt %s \
+// RUN:     -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \
+// RUN: | buddy-opt \
+// RUN:     -arith-expand \
+// RUN:     -eliminate-empty-tensors \
+// RUN:     -empty-tensor-to-alloc-tensor \
+// RUN:     -one-shot-bufferize \
+// RUN:     -convert-linalg-to-affine-loops \
+// RUN:     -affine-loop-fusion \
+// RUN:     -lower-affine \
+// RUN:     -func-bufferize \
+// RUN:     -arith-bufferize \
+// RUN:     -tensor-bufferize \
+// RUN:     -buffer-deallocation \
+// RUN:     -finalizing-bufferize \
+// RUN:     -convert-vector-to-scf \
+// RUN:     -expand-strided-metadata \
+// RUN:     -convert-vector-to-llvm \
+// RUN:     -memref-expand \
+// RUN:     -arith-expand \
+// RUN:     -convert-arith-to-llvm \
+// RUN:     -finalize-memref-to-llvm \
+// RUN:     -convert-scf-to-cf \
+// RUN:     -convert-openmp-to-llvm \
+// RUN:     -convert-arith-to-llvm \
+// RUN:     -convert-math-to-llvm \
+// RUN:     -convert-math-to-libm  \
+// RUN:     -convert-func-to-llvm \
+// RUN:     -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+module {
+    func.func private @printMemrefF32(tensor<*xf32>)
+    func.func private @rtclock() -> f64
+
+    func.func @uvue_original() {
+        %t0_original = call @rtclock() : () -> f64 
+        // Timed region: add an all-zero constant to %84, then reshape 4-D -> 3-D.
+        %84 = arith.constant dense<2.0> : tensor<1x32x40x128xf32>
+        %92 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
+        %93 = tosa.add %84, %92 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+        %94 = tosa.reshape %93 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
+        // Second timestamp; the cast and the printing below are not part of the measurement.
+        %t1_original = call @rtclock() : () -> f64
+        %tensor_unranked = tensor.cast %94 : tensor<32x40x128xf32> to tensor<*xf32>
+
+        // All the elements of the MemRef are the same,
+        // only check the first line to verify the correctness.
+        // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [32, 40, 128] strides = [5120, 128, 1] data = 
+        // CHECK-NEXT: [
+        // CHECK-SAME: [
+        // CHECK-SAME: [2{{(, 2)*}}],
+
+        // Print results.
+        call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> ()
+        // Print timings.
+
+        %t_original = arith.subf %t1_original, %t0_original : f64
+        vector.print str "original operation time: "
+        vector.print %t_original : f64
+        return 
+    }
+
+    func.func @uve_optimized() {
+        // Optimized variant: only the constant %84 and its reshape sit between the two timestamps.
+        %t0_optimized = call @rtclock() : () -> f64
+        %84 = arith.constant dense<2.0> : tensor<1x32x40x128xf32>
+        %94 = tosa.reshape %84 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
+        %t1_optimized = call @rtclock() : () -> f64
+        %tensor_unranked = tensor.cast %94 : tensor<32x40x128xf32> to tensor<*xf32>
+        // Verify that this variant prints the same header and the same first row of 2s.
+        // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [32, 40, 128] strides = [5120, 128, 1] data =
+        // CHECK-NEXT: [
+        // CHECK-SAME: [
+        // CHECK-SAME: [2{{(, 2)*}}],
+        // Print results.
+        call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> ()
+        // Print timings.
+        %t_optimized = arith.subf %t1_optimized, %t0_optimized : f64
+        vector.print str "optimized operation time: "
+        vector.print %t_optimized : f64
+        return
+    }
+
+
+    func.func @main() {
+        // Run the version with the add-zero first, then the version without it.
+
+        call @uvue_original() : () -> ()
+        call @uve_optimized() : () -> ()
+
+        return
+    }
+}
diff --git a/examples/BuddyNext/next-eliminate-identity.mlir b/examples/BuddyNext/next-eliminate-identity.mlir
@@ -0,0 +1,96 @@
+// RUN: buddy-opt %s \
+// RUN:     -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \
+// RUN: | buddy-opt \
+// RUN:     -arith-expand \
+// RUN:     -eliminate-empty-tensors \
+// RUN:     -empty-tensor-to-alloc-tensor \
+// RUN:     -one-shot-bufferize \
+// RUN:     -convert-linalg-to-affine-loops \
+// RUN:     -affine-loop-fusion \
+// RUN:     -lower-affine \
+// RUN:     -func-bufferize \
+// RUN:     -arith-bufferize \
+// RUN:     -tensor-bufferize \
+// RUN:     -buffer-deallocation \
+// RUN:     -finalizing-bufferize \
+// RUN:     -convert-vector-to-scf \
+// RUN:     -expand-strided-metadata \
+// RUN:     -convert-vector-to-llvm \
+// RUN:     -memref-expand \
+// RUN:     -arith-expand \
+// RUN:     -convert-arith-to-llvm \
+// RUN:     -finalize-memref-to-llvm \
+// RUN:     -convert-scf-to-cf \
+// RUN:     -convert-openmp-to-llvm \
+// RUN:     -convert-arith-to-llvm \
+// RUN:     -convert-math-to-llvm \
+// RUN:     -convert-math-to-libm  \
+// RUN:     -convert-func-to-llvm \
+// RUN:     -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+module {
+    func.func private @printMemrefF32(tensor<*xf32>)
+    func.func private @rtclock() -> f64
+
+    func.func @ie_original() {
+        %t0_original = call @rtclock() : () -> f64 
+        // Timed region: tosa.identity on the constant %119, then reshape 4-D -> 3-D.
+        %119 = arith.constant dense<1.0> : tensor<1x40x32x128xf32>
+        %120 = tosa.identity %119 : (tensor<1x40x32x128xf32>) -> tensor<1x40x32x128xf32>
+        %121 = tosa.reshape %120 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
+        %t1_original = call @rtclock() : () -> f64
+        // The cast and the printing below happen after the second timestamp.
+        %tensor_unranked = tensor.cast %121 : tensor<1x40x4096xf32> to tensor<*xf32>
+        // All the elements of the MemRef are the same,
+        // only check the first line to verify the correctness.
+        // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 4096] strides = [163840, 4096, 1] data = 
+        // CHECK-NEXT: [
+        // CHECK-SAME: [
+        // CHECK-SAME: [1{{(, 1)*}}],
+
+        // Print results.
+        call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> ()
+        // Print timings.
+
+        %t_original = arith.subf %t1_original, %t0_original : f64
+        vector.print str "original operation time: "
+        vector.print %t_original : f64
+        return 
+    }
+
+    func.func @ie_optimized() {
+        %t0_optimized = call @rtclock() : () -> f64
+        // Timed region: the reshape is applied directly to %119, with no tosa.identity in between.
+        %119 = arith.constant dense<1.0> : tensor<1x40x32x128xf32>
+        %121 = tosa.reshape %119 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
+        %t1_optimized = call @rtclock() : () -> f64
+        // The cast and the printing below happen after the second timestamp.
+        %tensor_unranked = tensor.cast %121 : tensor<1x40x4096xf32> to tensor<*xf32>
+        // All the elements of the MemRef are the same,
+        // only check the first line to verify the correctness.
+        // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 4096] strides = [163840, 4096, 1] data = 
+        // CHECK-NEXT: [
+        // CHECK-SAME: [
+        // CHECK-SAME: [1{{(, 1)*}}],
+
+        // Print results.
+        call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> ()
+        // Print timings.
+
+        %t_optimized = arith.subf %t1_optimized, %t0_optimized : f64
+        vector.print str "optimized operation time: "
+        vector.print %t_optimized : f64
+        return 
+    }
+
+    func.func @main() {
+        // Run the version with tosa.identity first, then the version without it.
+        call @ie_original() : () -> ()
+        call @ie_optimized() : () -> ()
+
+        return 
+    }
+}
diff --git a/examples/DIPDialect/README.md b/examples/DIPDialect/README.md
@@ -0,0 +1,8 @@
+To test the 4D image resize example, build and run it as follows:
+```
+$ cd buddy-mlir/build
+$ cmake -G Ninja .. -DBUDDY_EXAMPLES=ON
+$ ninja resize4D_nchw
+$ cd bin
+$ ./resize4D_nchw ../../examples/images/YuTu.png result-dip-resize.bmp
+```
diff --git a/examples/DIPDialect/resize4D_nchw.cpp b/examples/DIPDialect/resize4D_nchw.cpp
@@ -23,17 +23,17 @@
 //===----------------------------------------------------------------------===//
 #include "buddy/DIP/imgcodecs/loadsave.h"
 #include <buddy/Core/Container.h>
-#include <buddy/DIP/ImgContainer.h>
 #include <buddy/DIP/DIP.h>
 #include <buddy/DIP/ImageContainer.h>
+#include <buddy/DIP/ImgContainer.h>
 #include <iostream>
 #include <math.h>
 
 using namespace std;
 
 void testImplementation(int argc, char *argv[]) {
   // Read as colar image.
-  dip::Image<float, 4> inputBatch(argv[1], dip::DIP_RGB, true);
+  dip::Image<float, 4> inputBatch(argv[1], dip::DIP_RGB);
 
   // Note : Both values in output image dimensions and scaling ratios must be
   // positive numbers.
@@ -42,12 +42,12 @@ void testImplementation(int argc, char *argv[]) {
       {1, 3, 224, 224} /*{image_cols, image_rows}*/);
 
   // Define Img with the output of Resize4D.
-  intptr_t outSizes[3] = {output.getSizes()[2], output.getSizes()[3],
-                          output.getSizes()[1]};
+  intptr_t outSizes[4] = {output.getSizes()[0], output.getSizes()[1],
+                          output.getSizes()[2], output.getSizes()[3]};
 
-  Img<float, 3> outputImageResize4D(output.getData(), outSizes);
+  dip::Image<float, 4> outputImageResize4D(output.getData(), outSizes);
 
-  // dip::imwrite(argv[2], outputImageResize4D);
+  dip::imageWrite(argv[2], outputImageResize4D);
 
   return;
 }