Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add high-dimensional adaptation for GPU #425

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 75 additions & 1 deletion examples/BuddyGPU/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ The example program is a simple matrix multiplication kernel. The linalg definit
A transform sequence is in `transform.mlir` to optimize this kernel and prepare it for execution on the GPU.
The `matmul-cubin.mlir` provides a lowered file, in case the pipeline is not working.

Run the following command to compile and run the program:
Run the following command to compile and run the matmul program:
```
make buddy-gpu-matmul
python run-module-gpu.py --source matmul.mlir --target matmul-cubin.mlir --llvm_dir ../../llvm
Expand All @@ -33,6 +33,80 @@ MLIR equal to NumPy? True

As Tensor Cores do not support fp32 computation, the operands are converted to tf32; hence the result is not exactly the same as the PyTorch result.

## Batch_Matmul
The Batch_Matmul example is similar to the Matmul example above, but compiles and runs a batched matrix multiplication kernel on the GPU.
Run the following command to compile and run the batch_matmul program:
```
make bud-batch_matmul-gpu-lower
python run-module-gpu.py --source batch_matmul.mlir --target batch_matmul-cubin.mlir --llvm_dir ../../llvm
```
The result should be:
```
[[[262.76825 254.99942 261.9515 ... 261.36447 260.22684 262.3626 ]
[260.58057 244.43068 263.4109 ... 260.566 253.99341 260.5964 ]
[261.4656 252.95885 261.60065 ... 254.64616 252.29713 256.57367]
...
[262.47998 249.4758 258.5593 ... 257.48874 249.1685 256.86334]
[261.89804 257.19324 267.10922 ... 266.3719 261.73676 259.08725]
[254.11542 241.08536 254.38817 ... 252.5039 249.27812 250.29007]]

[[258.64035 255.68985 253.599 ... 252.50366 255.07352 254.71776]
[265.12173 266.81024 258.41632 ... 253.75903 259.09418 260.67685]
[262.44107 259.28223 251.45758 ... 254.06253 257.65988 262.2896 ]
...
[263.62085 268.88962 255.04141 ... 258.6461 263.7392 259.8988 ]
[265.2162 263.92514 258.99207 ... 259.3231 260.62183 264.48645]
[265.87585 264.66345 257.75262 ... 254.98543 258.77817 261.12802]]

[[258.97577 254.96257 256.22888 ... 260.3291 256.66476 254.08638]
[264.09952 259.17035 263.79138 ... 263.42252 258.6401 259.27136]
[256.46616 251.93016 262.46368 ... 255.35817 259.9043 256.903 ]
...
[249.51251 249.01341 251.28108 ... 254.42873 257.35614 249.16321]
[256.62653 245.54889 256.8105 ... 259.68204 250.49097 254.27005]
[253.04216 251.057 252.49603 ... 247.81493 256.3396 254.00908]]

[[270.9063 264.9813 247.92715 ... 254.21812 261.9063 248.39583]
[265.76254 261.60773 257.71106 ... 256.41833 262.47015 251.41019]
[273.32095 264.63913 252.10524 ... 257.88034 267.4981 254.9017 ]
...
[273.4149 263.96182 251.55174 ... 258.74625 264.74557 250.82188]
[277.82263 272.03714 256.53876 ... 258.11078 270.44397 258.1671 ]
[270.86002 266.3878 253.41849 ... 255.5107 268.78973 256.5687 ]]]
[[[262.94034 255.16846 262.1241 ... 261.53763 260.397 262.53165]
[260.7543 244.59506 263.58673 ... 260.74084 254.1648 260.76572]
[261.64075 253.12537 261.77618 ... 254.815 252.46964 256.74323]
...
[262.6519 249.64151 258.72855 ... 257.65933 249.33191 257.03214]
[262.07538 257.36707 267.28915 ... 266.55304 261.91498 259.25803]
[254.28532 241.24493 254.5542 ... 252.67004 249.44293 250.45274]]

[[258.81226 255.85918 253.76608 ... 252.67378 255.24254 254.88916]
[265.298 266.98398 258.58246 ... 253.92906 259.26694 260.8536 ]
[262.6151 259.45175 251.62009 ... 254.23112 257.82693 262.46008]
...
[263.79868 269.0709 255.21349 ... 258.82214 263.9164 260.07523]
[265.39304 264.09897 259.15875 ... 259.4956 260.79318 264.6643 ]
[266.05365 264.83865 257.92352 ... 255.15732 258.95 261.3032 ]]

[[259.14935 255.12946 256.3977 ... 260.5007 256.83392 254.25664]
[264.27432 259.3416 263.96405 ... 263.59418 258.8106 259.44534]
[256.63733 252.09796 262.6346 ... 255.52626 260.07693 257.0755 ]
...
[249.67671 249.17755 251.44931 ... 254.59613 257.5282 249.33191]
[256.80063 245.71855 256.98483 ... 259.85873 250.65874 254.44258]
[253.20753 251.22415 252.65993 ... 247.97794 256.5093 254.1801 ]]

[[271.0874 265.1627 248.09352 ... 254.387 262.07678 248.56134]
[265.93964 261.78238 257.88007 ... 256.58508 262.6365 251.57634]
[273.50278 264.8204 252.27304 ... 258.0516 267.67328 255.07361]
...
[273.59256 264.1399 251.7161 ... 258.91672 264.9156 250.98383]
[278.0076 272.22174 256.70898 ... 258.28174 270.62238 258.3416 ]
[271.03775 266.56964 253.5844 ... 255.67926 268.96625 256.73773]]]
MLIR equal to NumPy? True
```

### Profiling
You need to install nsight compute first.
```
Expand Down
158 changes: 158 additions & 0 deletions examples/BuddyGPU/batch_matmul-cubin.mlir

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions examples/BuddyGPU/batch_matmul.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// Type aliases: element type and the shapes of the batched operands/result.
!unit = f32
!lhs = tensor<4x768x1024x!unit>
!rhs = tensor<4x1024x768x!unit>
!res = tensor<4x768x768x!unit>

// Batched matrix multiplication over 4 batches:
// for each batch b, res[b] = lhs[b] x rhs[b], accumulated into a
// zero-initialized 768x768 output tensor.
func.func @batch_matmul(
    %arg0: !lhs, %arg1: !rhs)
      -> !res {
  // Scalar zero used to initialize the accumulator.
  %zero = arith.constant 0.000000e+00 : !unit
  // Allocate an uninitialized result tensor, then fill it with zeros.
  %empty = tensor.empty() : !res
  %acc = linalg.fill ins(%zero : !unit) outs(%empty : !res) -> !res
  // Compute the batched matmul, accumulating into the zeroed tensor.
  %out = linalg.batch_matmul
           ins(%arg0, %arg1 : !lhs, !rhs)
           outs(%acc : !res)
           -> !res
  func.return %out : !res
}
10 changes: 10 additions & 0 deletions examples/BuddyGPU/makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,13 @@ buddy-gpu-matmul:
${BUDDY_OPT} -convert-scf-to-cf -memref-expand -finalize-memref-to-llvm -convert-arith-to-llvm --convert-vector-to-llvm -convert-gpu-to-nvvm='has-redux=1' | \
${BUDDY_OPT} -llvm-request-c-wrappers -canonicalize -cse -sccp | \
${MLIR_OPT} --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" -o matmul-cubin.mlir

# Lower batch_matmul.mlir to an NVIDIA fatbin module (batch_matmul-cubin.mlir).
# Pipeline stages, one ${BUDDY_OPT}/${MLIR_OPT} invocation per stage:
#   1. Apply the transform-dialect schedule preloaded from
#      transform_batch_matmul.mlir (entry point: codegen).
#   2. Optimize shared-memory access patterns (nvgpu-optimize-shared-memory).
#   3. Bufferize tensors and lower linalg to affine loops (with fusion and
#      parallelization), then finalize bufferization.
#   4. Sink index computations into gpu.launch and outline shared-memory kernels.
#   5. Convert host memcpys to GPU copies and make GPU regions async.
#   6. Lower SCF/memref/arith/vector/gpu dialects to the LLVM/NVVM level.
#   7. Emit an sm_80 (+ptx71) fatbin via mlir-opt's test-lower-to-nvvm pipeline.
bud-batch_matmul-gpu-lower:
	@${BUDDY_OPT} batch_matmul.mlir -transform-preload-library="transform-library-paths=transform_batch_matmul.mlir" -transform-interpreter="entry-point=codegen" | \
	${BUDDY_OPT} --pass-pipeline='builtin.module(func.func(nvgpu-optimize-shared-memory))' | \
	${BUDDY_OPT} -arith-expand -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -linalg-bufferize -convert-linalg-to-affine-loops -affine-loop-fusion -affine-parallelize -lower-affine -canonicalize -func-bufferize -arith-bufferize -tensor-bufferize -buffer-deallocation -finalizing-bufferize -canonicalize | \
	${BUDDY_OPT} -gpu-launch-sink-index-computations -canonicalize -legalize-shmem-outlining -canonicalize | \
	${BUDDY_OPT} -convert-memcpy-to-gpu -gpu-async-region -canonicalize | \
	${BUDDY_OPT} -convert-scf-to-cf -memref-expand -finalize-memref-to-llvm -convert-arith-to-llvm --convert-vector-to-llvm -convert-gpu-to-nvvm='has-redux=1' | \
	${BUDDY_OPT} -llvm-request-c-wrappers -canonicalize -cse -sccp | \
	${MLIR_OPT} --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" -o batch_matmul-cubin.mlir
Loading