riscv64: Add SIMD avg_round (bytecodealliance#6599)

afonso360 authored Jun 17, 2023
1 parent e19dcfa commit 728d0f5
Showing 6 changed files with 234 additions and 0 deletions.
3 changes: 3 additions & 0 deletions cranelift/codegen/src/isa/riscv64/inst/vector.rs
@@ -508,6 +508,7 @@ impl VecAluOpRRImm5 {
VecAluOpRRImm5::VorVI => 0b001010,
VecAluOpRRImm5::VxorVI => 0b001011,
VecAluOpRRImm5::VslidedownVI => 0b001111,
VecAluOpRRImm5::VssrlVI => 0b101010,
VecAluOpRRImm5::VmergeVIM => 0b010111,
VecAluOpRRImm5::VsadduVI => 0b100000,
VecAluOpRRImm5::VsaddVI => 0b100001,
@@ -526,6 +527,7 @@ impl VecAluOpRRImm5 {
| VecAluOpRRImm5::VandVI
| VecAluOpRRImm5::VorVI
| VecAluOpRRImm5::VxorVI
| VecAluOpRRImm5::VssrlVI
| VecAluOpRRImm5::VslidedownVI
| VecAluOpRRImm5::VmergeVIM
| VecAluOpRRImm5::VsadduVI
@@ -539,6 +541,7 @@ impl VecAluOpRRImm5 {
match self {
VecAluOpRRImm5::VsllVI
| VecAluOpRRImm5::VsrlVI
| VecAluOpRRImm5::VssrlVI
| VecAluOpRRImm5::VsraVI
| VecAluOpRRImm5::VslidedownVI
| VecAluOpRRImm5::VrgatherVI
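
For context (not part of the diff): the funct6 value `0b101010` added above lands in bits 31:26 of the standard RVV OP-V immediate (OPIVI) encoding. The sketch below uses a hypothetical `encode_opivi` helper rather than Cranelift's real emitter, and shows how that funct6 assembles into the `0xaa80b557` word that appears as raw `.byte` sequences in the filetests later in this commit.

// Minimal sketch of the RVV OPIVI (vector op, 5-bit immediate) encoding.
// `encode_opivi` is an illustrative helper, not Cranelift's actual emitter.
fn encode_opivi(funct6: u32, vm: u32, vs2: u32, imm5: u32, vd: u32) -> u32 {
    const OPCODE_OP_V: u32 = 0b1010111; // major opcode shared by vector ops
    const FUNCT3_OPIVI: u32 = 0b011; // selects the vector-immediate form
    (funct6 << 26) | (vm << 25) | (vs2 << 20) | (imm5 << 15)
        | (FUNCT3_OPIVI << 12) | (vd << 7) | OPCODE_OP_V
}

fn main() {
    // vssrl.vi v10, v8, 1 (unmasked, vm=1) with funct6 = 0b101010:
    let word = encode_opivi(0b101010, 1, 8, 1, 10);
    assert_eq!(word, 0xaa80_b557); // emitted as `.byte 0x57, 0xb5, 0x80, 0xaa`
}
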
10 changes: 10 additions & 0 deletions cranelift/codegen/src/isa/riscv64/inst_vector.isle
@@ -188,6 +188,7 @@
(VandVI)
(VorVI)
(VxorVI)
(VssrlVI)
(VslidedownVI)
(VmergeVIM)
(VrgatherVI)
@@ -663,6 +664,15 @@
(rule (rv_vxor_vi vs2 imm mask vstate)
(vec_alu_rr_imm5 (VecAluOpRRImm5.VxorVI) vs2 imm mask vstate))

;; Helper for emitting the `vssrl.vi` instruction.
;;
;; vd[i] = (unsigned(vs2[i]) >> imm) + r
;;
;; `r` here is the rounding increment selected by the current `vxrm` rounding mode.
(decl rv_vssrl_vi (VReg UImm5 VecOpMasking VState) VReg)
(rule (rv_vssrl_vi vs2 imm mask vstate)
(vec_alu_rr_uimm5 (VecAluOpRRImm5.VssrlVI) vs2 imm mask vstate))

;; Helper for emitting the `vnot.v` instruction.
;; This is just a mnemonic for `vxor.vi vd, vs, -1`
(decl rv_vnot_v (VReg VecOpMasking VState) VReg)
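
A scalar sketch of what the `rv_vssrl_vi` helper above relies on, assuming the RVV `rnu` rounding behavior described in its comment (add back the last bit shifted out); this is illustrative Rust, not code from this commit.

// `vssrl` under the `rnu` rounding mode, modeled on a single u64 element:
// shift right logically, then add the most significant bit that was shifted out.
fn vssrl_rnu(v: u64, sh: u32) -> u64 {
    let r = if sh == 0 { 0 } else { (v >> (sh - 1)) & 1 };
    (v >> sh) + r
}

With `sh = 1` this computes the ceiling of `v / 2` without risking overflow, which is what the `avg_round` lowering below relies on.
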
25 changes: 25 additions & 0 deletions cranelift/codegen/src/isa/riscv64/lower.isle
@@ -1817,3 +1817,28 @@
(rhs_hi VReg (rv_vcompress_vm y even_mask ty))
(rhs VReg (rv_vslideup_vvi rhs_lo rhs_hi half_size (unmasked) ty)))
(rv_vadd_vv lhs rhs (unmasked) ty)))

;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `avg_round` computes the unsigned average with rounding: a := (x + y + 1) // 2
;;
;; See Section "2–5 Average of Two Integers" of the Hacker's Delight book
;;
;; The floor average of two integers without overflow can be computed as:
;; t = (x & y) + ((x ^ y) >> 1)
;;
;; The right shift should be a logical shift if the integers are unsigned.
;;
;; We are however interested in the ceiling average ((x + y + 1) // 2). For that
;; we use a special rounding mode in the right shift instruction.
;;
;; For the right shift we use `vssrl`, a Scaling Shift Right Logical
;; instruction that uses the `vxrm` fixed-point rounding mode. The default
;; rounding mode is `rnu` (round-to-nearest-up, i.e. add +0.5 LSB), which is
;; coincidentally the rounding mode we want for `avg_round`.
(rule (lower (has_type (ty_vec_fits_in_register ty) (avg_round x y)))
(if-let one (u64_to_uimm5 1))
(let ((lhs VReg (rv_vand_vv x y (unmasked) ty))
(xor VReg (rv_vxor_vv x y (unmasked) ty))
(rhs VReg (rv_vssrl_vi xor one (unmasked) ty)))
(rv_vadd_vv lhs rhs (unmasked) ty)))
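
Not part of the diff, but a scalar sanity sketch of the identity this rule depends on: `(x & y) + ceil((x ^ y) / 2)` equals the rounded average `(x + y + 1) / 2` with no intermediate overflow. The names below (`avg_round_u8`) are illustrative only, and the check is limited to `u8`.

// Rounded unsigned average via the Hacker's Delight identity plus the
// `rnu` rounding increment; never overflows the element type.
fn avg_round_u8(x: u8, y: u8) -> u8 {
    let xor = x ^ y;
    // `vssrl.vi xor, 1` under `rnu`: add back the bit shifted out.
    let rhs = (xor >> 1) + (xor & 1);
    (x & y) + rhs
}

fn main() {
    for x in 0..=255u16 {
        for y in 0..=255u16 {
            let expected = ((x + y + 1) / 2) as u8;
            assert_eq!(avg_round_u8(x as u8, y as u8), expected);
        }
    }
}
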
194 changes: 194 additions & 0 deletions cranelift/filetests/filetests/isa/riscv64/simd-avg_round.clif
@@ -0,0 +1,194 @@
test compile precise-output
set unwind_info=false
target riscv64 has_v

function %avg_round_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = avg_round v0, v1
return v2
}

; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vand.vv v6,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma)
; vxor.vv v8,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma)
; vssrl.vi v10,v8,1 #avl=16, #vtype=(e8, m1, ta, ma)
; vadd.vv v12,v6,v10 #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v12,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; addi t6, s0, 0x20
; .byte 0x87, 0x81, 0x0f, 0x02
; .byte 0x57, 0x83, 0x11, 0x26
; .byte 0x57, 0x84, 0x11, 0x2e
; .byte 0x57, 0xb5, 0x80, 0xaa
; .byte 0x57, 0x06, 0x65, 0x02
; .byte 0x27, 0x06, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret

function %avg_round_i16x8(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = avg_round v0, v1
return v2
}

; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vand.vv v6,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma)
; vxor.vv v8,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma)
; vssrl.vi v10,v8,1 #avl=8, #vtype=(e16, m1, ta, ma)
; vadd.vv v12,v6,v10 #avl=8, #vtype=(e16, m1, ta, ma)
; vse8.v v12,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; addi t6, s0, 0x20
; .byte 0x87, 0x81, 0x0f, 0x02
; .byte 0x57, 0x70, 0x84, 0xcc
; .byte 0x57, 0x83, 0x11, 0x26
; .byte 0x57, 0x84, 0x11, 0x2e
; .byte 0x57, 0xb5, 0x80, 0xaa
; .byte 0x57, 0x06, 0x65, 0x02
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0x27, 0x06, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret

function %avg_round_i32x4(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = avg_round v0, v1
return v2
}

; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vand.vv v6,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma)
; vxor.vv v8,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma)
; vssrl.vi v10,v8,1 #avl=4, #vtype=(e32, m1, ta, ma)
; vadd.vv v12,v6,v10 #avl=4, #vtype=(e32, m1, ta, ma)
; vse8.v v12,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; addi t6, s0, 0x20
; .byte 0x87, 0x81, 0x0f, 0x02
; .byte 0x57, 0x70, 0x02, 0xcd
; .byte 0x57, 0x83, 0x11, 0x26
; .byte 0x57, 0x84, 0x11, 0x2e
; .byte 0x57, 0xb5, 0x80, 0xaa
; .byte 0x57, 0x06, 0x65, 0x02
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0x27, 0x06, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret

function %avg_round_i64x2(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = avg_round v0, v1
return v2
}

; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vand.vv v6,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma)
; vxor.vv v8,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma)
; vssrl.vi v10,v8,1 #avl=2, #vtype=(e64, m1, ta, ma)
; vadd.vv v12,v6,v10 #avl=2, #vtype=(e64, m1, ta, ma)
; vse8.v v12,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; addi t6, s0, 0x20
; .byte 0x87, 0x81, 0x0f, 0x02
; .byte 0x57, 0x70, 0x81, 0xcd
; .byte 0x57, 0x83, 0x11, 0x26
; .byte 0x57, 0x84, 0x11, 0x2e
; .byte 0x57, 0xb5, 0x80, 0xaa
; .byte 0x57, 0x06, 0x65, 0x02
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0x27, 0x06, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret

@@ -1,6 +1,7 @@
; the interpreter does not currently support SIMD `avg_round`.
test run
target aarch64
target riscv64 has_v
; x86_64 and s390x do not currently support 64-bit vectors, or
; `avg_round` on `i64x2` values.
; x86_64 also does not currently support `avg_round.i32x4`.
1 change: 1 addition & 0 deletions cranelift/filetests/filetests/runtests/simd-avg-round.clif
@@ -4,6 +4,7 @@ target s390x
set enable_simd
target x86_64
target x86_64 skylake
target riscv64 has_v

function %average_rounding_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
