riscv64: Add SIMD avg_round (bytecodealliance#6599)

afonso360 authored Jun 17, 2023
1 parent e19dcfa commit 728d0f5
Showing 6 changed files with 234 additions and 0 deletions.
3 changes: 3 additions & 0 deletions cranelift/codegen/src/isa/riscv64/inst/vector.rs
@@ -508,6 +508,7 @@ impl VecAluOpRRImm5 {
VecAluOpRRImm5::VorVI => 0b001010,
VecAluOpRRImm5::VxorVI => 0b001011,
VecAluOpRRImm5::VslidedownVI => 0b001111,
VecAluOpRRImm5::VssrlVI => 0b101010,
VecAluOpRRImm5::VmergeVIM => 0b010111,
VecAluOpRRImm5::VsadduVI => 0b100000,
VecAluOpRRImm5::VsaddVI => 0b100001,
@@ -526,6 +527,7 @@ impl VecAluOpRRImm5 {
| VecAluOpRRImm5::VandVI
| VecAluOpRRImm5::VorVI
| VecAluOpRRImm5::VxorVI
| VecAluOpRRImm5::VssrlVI
| VecAluOpRRImm5::VslidedownVI
| VecAluOpRRImm5::VmergeVIM
| VecAluOpRRImm5::VsadduVI
@@ -539,6 +541,7 @@ impl VecAluOpRRImm5 {
match self {
VecAluOpRRImm5::VsllVI
| VecAluOpRRImm5::VsrlVI
| VecAluOpRRImm5::VssrlVI
| VecAluOpRRImm5::VsraVI
| VecAluOpRRImm5::VslidedownVI
| VecAluOpRRImm5::VrgatherVI
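
For context (not part of the diff): the funct6 value `0b101010` added above lands in bits 31:26 of the standard RVV OP-V immediate (OPIVI) encoding. The sketch below uses a hypothetical `encode_opivi` helper rather than Cranelift's real emitter, and shows how that funct6 assembles into the `0xaa80b557` word that appears as raw `.byte` sequences in the filetests later in this commit.

// Minimal sketch of the RVV OPIVI (vector op, 5-bit immediate) encoding.
// `encode_opivi` is an illustrative helper, not Cranelift's actual emitter.
fn encode_opivi(funct6: u32, vm: u32, vs2: u32, imm5: u32, vd: u32) -> u32 {
    const OPCODE_OP_V: u32 = 0b1010111; // major opcode shared by vector ops
    const FUNCT3_OPIVI: u32 = 0b011; // selects the vector-immediate form
    (funct6 << 26) | (vm << 25) | (vs2 << 20) | (imm5 << 15)
        | (FUNCT3_OPIVI << 12) | (vd << 7) | OPCODE_OP_V
}

fn main() {
    // vssrl.vi v10, v8, 1 (unmasked, vm=1) with funct6 = 0b101010:
    let word = encode_opivi(0b101010, 1, 8, 1, 10);
    assert_eq!(word, 0xaa80_b557); // emitted as `.byte 0x57, 0xb5, 0x80, 0xaa`
}
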
10 changes: 10 additions & 0 deletions cranelift/codegen/src/isa/riscv64/inst_vector.isle
@@ -188,6 +188,7 @@
(VandVI)
(VorVI)
(VxorVI)
(VssrlVI)
(VslidedownVI)
(VmergeVIM)
(VrgatherVI)
@@ -663,6 +664,15 @@
(rule (rv_vxor_vi vs2 imm mask vstate)
(vec_alu_rr_imm5 (VecAluOpRRImm5.VxorVI) vs2 imm mask vstate))

;; Helper for emitting the `vssrl.vi` instruction.
;;
;; vd[i] = (unsigned(vs2[i]) >> imm) + r
;;
;; `r` here is the rounding increment selected by the current `vxrm` rounding mode.
(decl rv_vssrl_vi (VReg UImm5 VecOpMasking VState) VReg)
(rule (rv_vssrl_vi vs2 imm mask vstate)
(vec_alu_rr_uimm5 (VecAluOpRRImm5.VssrlVI) vs2 imm mask vstate))

;; Helper for emitting the `vnot.v` instruction.
;; This is just a mnemonic for `vxor.vi vd, vs, -1`
(decl rv_vnot_v (VReg VecOpMasking VState) VReg)
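
A scalar sketch of what the `rv_vssrl_vi` helper above relies on, assuming the RVV `rnu` rounding behavior described in its comment (add back the last bit shifted out); this is illustrative Rust, not code from this commit.

// `vssrl` under the `rnu` rounding mode, modeled on a single u64 element:
// shift right logically, then add the most significant bit that was shifted out.
fn vssrl_rnu(v: u64, sh: u32) -> u64 {
    let r = if sh == 0 { 0 } else { (v >> (sh - 1)) & 1 };
    (v >> sh) + r
}

With `sh = 1` this computes the ceiling of `v / 2` without risking overflow, which is what the `avg_round` lowering below relies on.
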
25 changes: 25 additions & 0 deletions cranelift/codegen/src/isa/riscv64/lower.isle
@@ -1817,3 +1817,28 @@
(rhs_hi VReg (rv_vcompress_vm y even_mask ty))
(rhs VReg (rv_vslideup_vvi rhs_lo rhs_hi half_size (unmasked) ty)))
(rv_vadd_vv lhs rhs (unmasked) ty)))

;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `avg_round` computes the unsigned average with rounding: a := (x + y + 1) // 2
;;
;; See Section "2–5 Average of Two Integers" of the Hacker's Delight book
;;
;; The floor average of two integers without overflow can be computed as:
;; t = (x & y) + ((x ^ y) >> 1)
;;
;; The right shift should be a logical shift if the integers are unsigned.
;;
;; We are however interested in the ceiling average ((x + y + 1) // 2). For that
;; we use a special rounding mode in the right shift instruction.
;;
;; For the right shift we use `vssrl`, a Scaling Shift Right Logical
;; instruction that uses the `vxrm` fixed-point rounding mode. The default
;; rounding mode is `rnu` (round-to-nearest-up, i.e. add +0.5 LSB), which is
;; coincidentally the rounding mode we want for `avg_round`.
(rule (lower (has_type (ty_vec_fits_in_register ty) (avg_round x y)))
(if-let one (u64_to_uimm5 1))
(let ((lhs VReg (rv_vand_vv x y (unmasked) ty))
(xor VReg (rv_vxor_vv x y (unmasked) ty))
(rhs VReg (rv_vssrl_vi xor one (unmasked) ty)))
(rv_vadd_vv lhs rhs (unmasked) ty)))
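
Not part of the diff, but a scalar sanity sketch of the identity this rule depends on: `(x & y) + ceil((x ^ y) / 2)` equals the rounded average `(x + y + 1) / 2` with no intermediate overflow. The names below (`avg_round_u8`) are illustrative only, and the check is limited to `u8`.

// Rounded unsigned average via the Hacker's Delight identity plus the
// `rnu` rounding increment; never overflows the element type.
fn avg_round_u8(x: u8, y: u8) -> u8 {
    let xor = x ^ y;
    // `vssrl.vi xor, 1` under `rnu`: add back the bit shifted out.
    let rhs = (xor >> 1) + (xor & 1);
    (x & y) + rhs
}

fn main() {
    for x in 0..=255u16 {
        for y in 0..=255u16 {
            let expected = ((x + y + 1) / 2) as u8;
            assert_eq!(avg_round_u8(x as u8, y as u8), expected);
        }
    }
}
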
194 changes: 194 additions & 0 deletions cranelift/filetests/filetests/isa/riscv64/simd-avg_round.clif
@@ -0,0 +1,194 @@
test compile precise-output
set unwind_info=false
target riscv64 has_v

function %avg_round_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = avg_round v0, v1
return v2
}

; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vand.vv v6,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma)
; vxor.vv v8,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma)
; vssrl.vi v10,v8,1 #avl=16, #vtype=(e8, m1, ta, ma)
; vadd.vv v12,v6,v10 #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v12,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; addi t6, s0, 0x20
; .byte 0x87, 0x81, 0x0f, 0x02
; .byte 0x57, 0x83, 0x11, 0x26
; .byte 0x57, 0x84, 0x11, 0x2e
; .byte 0x57, 0xb5, 0x80, 0xaa
; .byte 0x57, 0x06, 0x65, 0x02
; .byte 0x27, 0x06, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret

function %avg_round_i16x8(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = avg_round v0, v1
return v2
}

; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vand.vv v6,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma)
; vxor.vv v8,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma)
; vssrl.vi v10,v8,1 #avl=8, #vtype=(e16, m1, ta, ma)
; vadd.vv v12,v6,v10 #avl=8, #vtype=(e16, m1, ta, ma)
; vse8.v v12,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; addi t6, s0, 0x20
; .byte 0x87, 0x81, 0x0f, 0x02
; .byte 0x57, 0x70, 0x84, 0xcc
; .byte 0x57, 0x83, 0x11, 0x26
; .byte 0x57, 0x84, 0x11, 0x2e
; .byte 0x57, 0xb5, 0x80, 0xaa
; .byte 0x57, 0x06, 0x65, 0x02
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0x27, 0x06, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret

function %avg_round_i32x4(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = avg_round v0, v1
return v2
}

; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vand.vv v6,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma)
; vxor.vv v8,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma)
; vssrl.vi v10,v8,1 #avl=4, #vtype=(e32, m1, ta, ma)
; vadd.vv v12,v6,v10 #avl=4, #vtype=(e32, m1, ta, ma)
; vse8.v v12,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; addi t6, s0, 0x20
; .byte 0x87, 0x81, 0x0f, 0x02
; .byte 0x57, 0x70, 0x02, 0xcd
; .byte 0x57, 0x83, 0x11, 0x26
; .byte 0x57, 0x84, 0x11, 0x2e
; .byte 0x57, 0xb5, 0x80, 0xaa
; .byte 0x57, 0x06, 0x65, 0x02
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0x27, 0x06, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret

function %avg_round_i64x2(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = avg_round v0, v1
return v2
}

; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vand.vv v6,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma)
; vxor.vv v8,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma)
; vssrl.vi v10,v8,1 #avl=2, #vtype=(e64, m1, ta, ma)
; vadd.vv v12,v6,v10 #avl=2, #vtype=(e64, m1, ta, ma)
; vse8.v v12,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; addi t6, s0, 0x20
; .byte 0x87, 0x81, 0x0f, 0x02
; .byte 0x57, 0x70, 0x81, 0xcd
; .byte 0x57, 0x83, 0x11, 0x26
; .byte 0x57, 0x84, 0x11, 0x2e
; .byte 0x57, 0xb5, 0x80, 0xaa
; .byte 0x57, 0x06, 0x65, 0x02
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0x27, 0x06, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret

@@ -1,6 +1,7 @@
; the interpreter does not currently support SIMD `avg_round`.
test run
target aarch64
target riscv64 has_v
; x86_64 and s390x do not currently support 64-bit vectors, or
; `avg_round` on `i64x2` values.
; x86_64 also does not currently support `avg_round.i32x4`.
1 change: 1 addition & 0 deletions cranelift/filetests/filetests/runtests/simd-avg-round.clif
@@ -4,6 +4,7 @@ target s390x
set enable_simd
target x86_64
target x86_64 skylake
target riscv64 has_v

function %average_rounding_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
