Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add high-dimensional adaptation for GPU #425

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 75 additions & 1 deletion examples/BuddyGPU/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ The example program is a simple matrix multiplication kernel. The linalg definit
A transform sequence is in `transform.mlir` to optimize this kernel and prepare it for execution on the GPU.
The `matmul-cubin.mlir` provides a lowered file, in case the pipeline is not working.

Run the following command to compile and run the program:
Run the following command to compile and run the matmul program:
```
make buddy-gpu-matmul
python run-module-gpu.py --source matmul.mlir --target matmul-cubin.mlir --llvm_dir ../../llvm
Expand All @@ -33,6 +33,80 @@ MLIR equal to NumPy? True

As Tensor Cores do not support fp32 computation, the operands are converted to tf32; hence the result is not exactly the same as the PyTorch result.

## Batch_Matmul
The Batch_Matmul example is similar to the Matmul example above, but compiles and runs a batched matrix multiplication kernel on the GPU.
Run the following command to compile and run the batch_matmul program:
```
make bud-batch_matmul-gpu-lower
python run-module-gpu.py --source batch_matmul.mlir --target batch_matmul-cubin.mlir --llvm_dir ../../llvm
```
The result should be:
```
[[[262.76825 254.99942 261.9515 ... 261.36447 260.22684 262.3626 ]
[260.58057 244.43068 263.4109 ... 260.566 253.99341 260.5964 ]
[261.4656 252.95885 261.60065 ... 254.64616 252.29713 256.57367]
...
[262.47998 249.4758 258.5593 ... 257.48874 249.1685 256.86334]
[261.89804 257.19324 267.10922 ... 266.3719 261.73676 259.08725]
[254.11542 241.08536 254.38817 ... 252.5039 249.27812 250.29007]]

[[258.64035 255.68985 253.599 ... 252.50366 255.07352 254.71776]
[265.12173 266.81024 258.41632 ... 253.75903 259.09418 260.67685]
[262.44107 259.28223 251.45758 ... 254.06253 257.65988 262.2896 ]
...
[263.62085 268.88962 255.04141 ... 258.6461 263.7392 259.8988 ]
[265.2162 263.92514 258.99207 ... 259.3231 260.62183 264.48645]
[265.87585 264.66345 257.75262 ... 254.98543 258.77817 261.12802]]

[[258.97577 254.96257 256.22888 ... 260.3291 256.66476 254.08638]
[264.09952 259.17035 263.79138 ... 263.42252 258.6401 259.27136]
[256.46616 251.93016 262.46368 ... 255.35817 259.9043 256.903 ]
...
[249.51251 249.01341 251.28108 ... 254.42873 257.35614 249.16321]
[256.62653 245.54889 256.8105 ... 259.68204 250.49097 254.27005]
[253.04216 251.057 252.49603 ... 247.81493 256.3396 254.00908]]

[[270.9063 264.9813 247.92715 ... 254.21812 261.9063 248.39583]
[265.76254 261.60773 257.71106 ... 256.41833 262.47015 251.41019]
[273.32095 264.63913 252.10524 ... 257.88034 267.4981 254.9017 ]
...
[273.4149 263.96182 251.55174 ... 258.74625 264.74557 250.82188]
[277.82263 272.03714 256.53876 ... 258.11078 270.44397 258.1671 ]
[270.86002 266.3878 253.41849 ... 255.5107 268.78973 256.5687 ]]]
[[[262.94034 255.16846 262.1241 ... 261.53763 260.397 262.53165]
[260.7543 244.59506 263.58673 ... 260.74084 254.1648 260.76572]
[261.64075 253.12537 261.77618 ... 254.815 252.46964 256.74323]
...
[262.6519 249.64151 258.72855 ... 257.65933 249.33191 257.03214]
[262.07538 257.36707 267.28915 ... 266.55304 261.91498 259.25803]
[254.28532 241.24493 254.5542 ... 252.67004 249.44293 250.45274]]

[[258.81226 255.85918 253.76608 ... 252.67378 255.24254 254.88916]
[265.298 266.98398 258.58246 ... 253.92906 259.26694 260.8536 ]
[262.6151 259.45175 251.62009 ... 254.23112 257.82693 262.46008]
...
[263.79868 269.0709 255.21349 ... 258.82214 263.9164 260.07523]
[265.39304 264.09897 259.15875 ... 259.4956 260.79318 264.6643 ]
[266.05365 264.83865 257.92352 ... 255.15732 258.95 261.3032 ]]

[[259.14935 255.12946 256.3977 ... 260.5007 256.83392 254.25664]
[264.27432 259.3416 263.96405 ... 263.59418 258.8106 259.44534]
[256.63733 252.09796 262.6346 ... 255.52626 260.07693 257.0755 ]
...
[249.67671 249.17755 251.44931 ... 254.59613 257.5282 249.33191]
[256.80063 245.71855 256.98483 ... 259.85873 250.65874 254.44258]
[253.20753 251.22415 252.65993 ... 247.97794 256.5093 254.1801 ]]

[[271.0874 265.1627 248.09352 ... 254.387 262.07678 248.56134]
[265.93964 261.78238 257.88007 ... 256.58508 262.6365 251.57634]
[273.50278 264.8204 252.27304 ... 258.0516 267.67328 255.07361]
...
[273.59256 264.1399 251.7161 ... 258.91672 264.9156 250.98383]
[278.0076 272.22174 256.70898 ... 258.28174 270.62238 258.3416 ]
[271.03775 266.56964 253.5844 ... 255.67926 268.96625 256.73773]]]
MLIR equal to NumPy? True
```

### Profiling
You need to install nsight compute first.
```
Expand Down
158 changes: 158 additions & 0 deletions examples/BuddyGPU/batch_matmul-cubin.mlir

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions examples/BuddyGPU/batch_matmul.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// Type aliases: element type and the shapes of the batched operands/result.
!unit = f32
!lhs = tensor<4x768x1024x!unit>
!rhs = tensor<4x1024x768x!unit>
!res = tensor<4x768x768x!unit>

// Batched matrix multiplication over 4 batches:
// for each batch b, res[b] = lhs[b] x rhs[b], accumulated into a
// zero-initialized 768x768 output tensor.
func.func @batch_matmul(
    %arg0: !lhs, %arg1: !rhs)
      -> !res {
  // Scalar zero used to initialize the accumulator.
  %zero = arith.constant 0.000000e+00 : !unit
  // Allocate an uninitialized result tensor, then fill it with zeros.
  %empty = tensor.empty() : !res
  %acc = linalg.fill ins(%zero : !unit) outs(%empty : !res) -> !res
  // Compute the batched matmul, accumulating into the zeroed tensor.
  %out = linalg.batch_matmul
           ins(%arg0, %arg1 : !lhs, !rhs)
           outs(%acc : !res)
           -> !res
  func.return %out : !res
}
10 changes: 10 additions & 0 deletions examples/BuddyGPU/makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,13 @@ buddy-gpu-matmul:
${BUDDY_OPT} -convert-scf-to-cf -memref-expand -finalize-memref-to-llvm -convert-arith-to-llvm --convert-vector-to-llvm -convert-gpu-to-nvvm='has-redux=1' | \
${BUDDY_OPT} -llvm-request-c-wrappers -canonicalize -cse -sccp | \
${MLIR_OPT} --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" -o matmul-cubin.mlir

# Lower batch_matmul.mlir to an NVIDIA fatbin module (batch_matmul-cubin.mlir).
# Pipeline stages, one ${BUDDY_OPT}/${MLIR_OPT} invocation per stage:
#   1. Apply the transform-dialect schedule preloaded from
#      transform_batch_matmul.mlir (entry point: codegen).
#   2. Optimize shared-memory access patterns (nvgpu-optimize-shared-memory).
#   3. Bufferize tensors and lower linalg to affine loops (with fusion and
#      parallelization), then finalize bufferization.
#   4. Sink index computations into gpu.launch and outline shared-memory kernels.
#   5. Convert host memcpys to GPU copies and make GPU regions async.
#   6. Lower SCF/memref/arith/vector/gpu dialects to the LLVM/NVVM level.
#   7. Emit an sm_80 (+ptx71) fatbin via mlir-opt's test-lower-to-nvvm pipeline.
bud-batch_matmul-gpu-lower:
	@${BUDDY_OPT} batch_matmul.mlir -transform-preload-library="transform-library-paths=transform_batch_matmul.mlir" -transform-interpreter="entry-point=codegen" | \
	${BUDDY_OPT} --pass-pipeline='builtin.module(func.func(nvgpu-optimize-shared-memory))' | \
	${BUDDY_OPT} -arith-expand -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -linalg-bufferize -convert-linalg-to-affine-loops -affine-loop-fusion -affine-parallelize -lower-affine -canonicalize -func-bufferize -arith-bufferize -tensor-bufferize -buffer-deallocation -finalizing-bufferize -canonicalize | \
	${BUDDY_OPT} -gpu-launch-sink-index-computations -canonicalize -legalize-shmem-outlining -canonicalize | \
	${BUDDY_OPT} -convert-memcpy-to-gpu -gpu-async-region -canonicalize | \
	${BUDDY_OPT} -convert-scf-to-cf -memref-expand -finalize-memref-to-llvm -convert-arith-to-llvm --convert-vector-to-llvm -convert-gpu-to-nvvm='has-redux=1' | \
	${BUDDY_OPT} -llvm-request-c-wrappers -canonicalize -cse -sccp | \
	${MLIR_OPT} --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" -o batch_matmul-cubin.mlir
Loading