riscv64: Implement SIMD sqmul_round_sat and splat+mul instructions (bytecodealliance#6602)

* riscv64: Add splat versions of multiplication instructions

* riscv64: Implement `sqmul_round_sat`
afonso360 authored Jun 19, 2023
1 parent 4756114 commit 0e9ce4c
Showing 10 changed files with 736 additions and 6 deletions.
3 changes: 0 additions & 3 deletions build.rs
@@ -241,14 +241,11 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
"simd_f64x2_cmp",
"simd_f64x2_pmin_pmax",
"simd_f64x2_rounding",
"simd_i16x8_arith2",
"simd_i16x8_cmp",
"simd_i16x8_q15mulr_sat_s",
"simd_i32x4_cmp",
"simd_i32x4_trunc_sat_f32x4",
"simd_i32x4_trunc_sat_f64x2",
"simd_i64x2_cmp",
"simd_i8x16_arith2",
"simd_i8x16_cmp",
"simd_load",
"simd_splat",
14 changes: 11 additions & 3 deletions cranelift/codegen/src/isa/riscv64/inst/vector.rs
@@ -318,8 +318,12 @@ impl VecAluOpRRR {
| VecAluOpRRR::VfsubVF => 0b000010,
VecAluOpRRR::VrsubVX => 0b000011,
VecAluOpRRR::VmulVV | VecAluOpRRR::VmulVX => 0b100101,
- VecAluOpRRR::VmulhVV => 0b100111,
- VecAluOpRRR::VmulhuVV | VecAluOpRRR::VfmulVV | VecAluOpRRR::VfmulVF => 0b100100,
+ VecAluOpRRR::VmulhVV | VecAluOpRRR::VmulhVX => 0b100111,
+ VecAluOpRRR::VmulhuVV
+ | VecAluOpRRR::VmulhuVX
+ | VecAluOpRRR::VfmulVV
+ | VecAluOpRRR::VfmulVF => 0b100100,
+ VecAluOpRRR::VsmulVV | VecAluOpRRR::VsmulVX => 0b100111,
VecAluOpRRR::VsllVV | VecAluOpRRR::VsllVX => 0b100101,
VecAluOpRRR::VsrlVV | VecAluOpRRR::VsrlVX => 0b101000,
VecAluOpRRR::VsraVV | VecAluOpRRR::VsraVX => 0b101001,
@@ -365,6 +369,7 @@ impl VecAluOpRRR {
| VecAluOpRRR::VsubVV
| VecAluOpRRR::VssubVV
| VecAluOpRRR::VssubuVV
+ | VecAluOpRRR::VsmulVV
| VecAluOpRRR::VsllVV
| VecAluOpRRR::VsrlVV
| VecAluOpRRR::VsraVV
@@ -399,14 +404,17 @@ impl VecAluOpRRR {
| VecAluOpRRR::VwsubuVX
| VecAluOpRRR::VwsubuWX
| VecAluOpRRR::VwsubWX
- | VecAluOpRRR::VmulVX => VecOpCategory::OPMVX,
+ | VecAluOpRRR::VmulVX
+ | VecAluOpRRR::VmulhVX
+ | VecAluOpRRR::VmulhuVX => VecOpCategory::OPMVX,
VecAluOpRRR::VaddVX
| VecAluOpRRR::VsaddVX
| VecAluOpRRR::VsadduVX
| VecAluOpRRR::VsubVX
| VecAluOpRRR::VssubVX
| VecAluOpRRR::VssubuVX
| VecAluOpRRR::VrsubVX
+ | VecAluOpRRR::VsmulVX
| VecAluOpRRR::VsllVX
| VecAluOpRRR::VsrlVX
| VecAluOpRRR::VsraVX
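Note that `funct6` values repeat across rows above — `vmulh` and `vsmul` both use `0b100111`, `vmul` and `vsll` both use `0b100101` — without conflict, because the operand category (OPM vs. OPI, assigned in the second hunk) selects a different `funct3`, so the final instruction words differ. A minimal sketch of how these fields pack into a 32-bit word under the standard V-extension layout; this mirrors, but is not, the backend's real emission path:

```rust
// RVV ALU word layout (standard V-extension encoding assumed):
//   funct6[31:26] | vm[25] | vs2[24:20] | vs1/rs1[19:15] | funct3[14:12] | vd[11:7] | 0b1010111
fn encode_vec_alu(funct6: u32, vm: u32, vs2: u32, vs1: u32, funct3: u32, vd: u32) -> u32 {
    (funct6 << 26) | (vm << 25) | (vs2 << 20) | (vs1 << 15) | (funct3 << 12) | (vd << 7) | 0x57
}

fn main() {
    // `vmul.vx v5,v1,a0` (unmasked, vm=1): funct6=0b100101, OPMVX => funct3=0b110.
    // This reproduces the `.byte 0xd7, 0x62, 0x15, 0x96` lines in the
    // disassembled filetests further down.
    let word = encode_vec_alu(0b100101, 1, 1, 10, 0b110, 5);
    assert_eq!(word.to_le_bytes(), [0xd7, 0x62, 0x15, 0x96]);
}
```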
30 changes: 30 additions & 0 deletions cranelift/codegen/src/isa/riscv64/inst_vector.isle
@@ -107,6 +107,7 @@
(VmulVV)
(VmulhVV)
(VmulhuVV)
(VsmulVV)
(VsllVV)
(VsrlVV)
(VsraVV)
@@ -145,6 +146,9 @@
(VssubVX)
(VssubuVX)
(VmulVX)
(VmulhVX)
(VmulhuVX)
(VsmulVX)
(VsllVX)
(VsrlVX)
(VsraVX)
@@ -569,11 +573,37 @@
(rule (rv_vmulh_vv vs2 vs1 mask vstate)
(vec_alu_rrr (VecAluOpRRR.VmulhVV) vs2 vs1 mask vstate))

;; Helper for emitting the `vmulh.vx` instruction.
(decl rv_vmulh_vx (VReg XReg VecOpMasking VState) VReg)
(rule (rv_vmulh_vx vs2 vs1 mask vstate)
(vec_alu_rrr (VecAluOpRRR.VmulhVX) vs2 vs1 mask vstate))

;; Helper for emitting the `vmulhu.vv` instruction.
(decl rv_vmulhu_vv (VReg VReg VecOpMasking VState) VReg)
(rule (rv_vmulhu_vv vs2 vs1 mask vstate)
(vec_alu_rrr (VecAluOpRRR.VmulhuVV) vs2 vs1 mask vstate))

;; Helper for emitting the `vmulhu.vx` instruction.
(decl rv_vmulhu_vx (VReg XReg VecOpMasking VState) VReg)
(rule (rv_vmulhu_vx vs2 vs1 mask vstate)
(vec_alu_rrr (VecAluOpRRR.VmulhuVX) vs2 vs1 mask vstate))

;; Helper for emitting the `vsmul.vv` instruction.
;;
;; Signed saturating and rounding fractional multiply
;; # vd[i] = clip(roundoff_signed(vs2[i]*vs1[i], SEW-1))
(decl rv_vsmul_vv (VReg VReg VecOpMasking VState) VReg)
(rule (rv_vsmul_vv vs2 vs1 mask vstate)
(vec_alu_rrr (VecAluOpRRR.VsmulVV) vs2 vs1 mask vstate))

;; Helper for emitting the `vsmul.vx` instruction.
;;
;; Signed saturating and rounding fractional multiply
;; # vd[i] = clip(roundoff_signed(vs2[i]*x[rs1], SEW-1))
(decl rv_vsmul_vx (VReg XReg VecOpMasking VState) VReg)
(rule (rv_vsmul_vx vs2 vs1 mask vstate)
(vec_alu_rrr (VecAluOpRRR.VsmulVX) vs2 vs1 mask vstate))

;; Helper for emitting the `sll.vv` instruction.
(decl rv_vsll_vv (VReg VReg VecOpMasking VState) VReg)
(rule (rv_vsll_vv vs2 vs1 mask vstate)
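The `clip(roundoff_signed(..., SEW-1))` pseudo-code in the `vsmul` comments above is the RVV spec's fixed-point multiply. A scalar sketch of those semantics at SEW=16 — the shape Wasm's `i16x8.q15mulr_sat_s` needs — assuming the `rnu` (round-to-nearest-up) rounding mode; this models the spec pseudo-code, it is not backend code:

```rust
// Scalar model of `vsmul` at SEW=16 (assumes vxrm = rnu, round-to-nearest-up).
fn vsmul_e16(a: i16, b: i16) -> i16 {
    let prod = a as i32 * b as i32;          // full-width product
    let rounded = (prod + (1 << 14)) >> 15;  // roundoff_signed(prod, SEW-1)
    // clip(): saturate into i16; only i16::MIN * i16::MIN actually overflows.
    rounded.clamp(i16::MIN as i32, i16::MAX as i32) as i16
}

fn main() {
    assert_eq!(vsmul_e16(i16::MIN, i16::MIN), i16::MAX); // would be +32768: saturates
    assert_eq!(vsmul_e16(0x4000, 0x4000), 0x2000);       // 0.5 * 0.5 = 0.25 in Q15
}
```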
29 changes: 29 additions & 0 deletions cranelift/codegen/src/isa/riscv64/lower.isle
@@ -420,20 +420,38 @@
(rule 3 (lower (has_type (ty_vec_fits_in_register ty) (imul x y)))
(rv_vmul_vv x y (unmasked) ty))

(rule 4 (lower (has_type (ty_vec_fits_in_register ty) (imul (splat x) y)))
(rv_vmul_vx y x (unmasked) ty))

(rule 5 (lower (has_type (ty_vec_fits_in_register ty) (imul x (splat y))))
(rv_vmul_vx x y (unmasked) ty))

;;;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (smulhi x y)))
(lower_smlhi ty (sext x ty $I64) (sext y ty $I64)))

(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (smulhi x y)))
(rv_vmulh_vv x y (unmasked) ty))

(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (smulhi (splat x) y)))
(rv_vmulh_vx y x (unmasked) ty))

(rule 3 (lower (has_type (ty_vec_fits_in_register ty) (smulhi x (splat y))))
(rv_vmulh_vx x y (unmasked) ty))

;;;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (umulhi x y)))
(lower_umlhi ty (zext x ty $I64) (zext y ty $I64)))

(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (umulhi x y)))
(rv_vmulhu_vv x y (unmasked) ty))

(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (umulhi (splat x) y)))
(rv_vmulhu_vx y x (unmasked) ty))

(rule 3 (lower (has_type (ty_vec_fits_in_register ty) (umulhi x (splat y))))
(rv_vmulhu_vx x y (unmasked) ty))

;;;; Rules for `div` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1 (lower (has_type (fits_in_32 ty) (udiv x y)))
@@ -1871,3 +1889,14 @@
(elem VReg (rv_vfmv_sf x ty))
(mask VReg (gen_vec_mask 1)))
(rv_vmerge_vvm zero elem mask ty)))

;;;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 0 (lower (has_type (ty_vec_fits_in_register ty) (sqmul_round_sat x y)))
(rv_vsmul_vv x y (unmasked) ty))

(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (sqmul_round_sat x (splat y))))
(rv_vsmul_vx x y (unmasked) ty))

(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (sqmul_round_sat (splat x) y)))
(rv_vsmul_vx y x (unmasked) ty))
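Rules 1 and 2 funnel both splat positions into the same `vsmul.vx` with the operands swapped, which is sound because the operation is commutative — the same justification behind the `imul`/`smulhi`/`umulhi` splat rules earlier in this file. A hypothetical standalone check, not part of the commit:

```rust
// Q15 rounding saturating multiply, as in the scalar model above.
fn q15(a: i16, b: i16) -> i16 {
    ((a as i32 * b as i32 + (1 << 14)) >> 15)
        .clamp(i16::MIN as i32, i16::MAX as i32) as i16
}

fn main() {
    // Operand order never changes the result, so `(splat x, y)` and
    // `(x, splat y)` may share one vector-scalar lowering.
    for &(a, b) in &[(1234, -5678), (i16::MIN, i16::MIN), (0x7fff, i16::MIN)] {
        assert_eq!(q15(a, b), q15(b, a));
    }
}
```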
158 changes: 158 additions & 0 deletions cranelift/filetests/filetests/isa/riscv64/simd-imul.clif
@@ -169,3 +169,161 @@ block0(v0: i64x2, v1: i64x2):
; addi sp, sp, 0x10
; ret

function %imul_splat_i8x16(i8x16, i8) -> i8x16 {
block0(v0: i8x16, v1: i8):
v2 = splat.i8x16 v1
v3 = imul v0, v2
return v3
}

; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vmul.vx v5,v1,a0 #avl=16, #vtype=(e8, m1, ta, ma)
; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; .byte 0xd7, 0x62, 0x15, 0x96
; .byte 0xa7, 0x82, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret

function %imul_splat_i16x8(i16x8, i16) -> i16x8 {
block0(v0: i16x8, v1: i16):
v2 = splat.i16x8 v1
v3 = imul v0, v2
return v3
}

; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vmul.vx v5,v1,a0 #avl=8, #vtype=(e16, m1, ta, ma)
; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; .byte 0x57, 0x70, 0x84, 0xcc
; .byte 0xd7, 0x62, 0x15, 0x96
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0xa7, 0x82, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret

function %imul_splat_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
v2 = splat.i32x4 v1
v3 = imul v0, v2
return v3
}

; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vmul.vx v5,v1,a0 #avl=4, #vtype=(e32, m1, ta, ma)
; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; .byte 0x57, 0x70, 0x02, 0xcd
; .byte 0xd7, 0x62, 0x15, 0x96
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0xa7, 0x82, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret

function %imul_splat_i64x2(i64x2, i64) -> i64x2 {
block0(v0: i64x2, v1: i64):
v2 = splat.i64x2 v1
v3 = imul v0, v2
return v3
}

; VCode:
; add sp,-16
; sd ra,8(sp)
; sd fp,0(sp)
; mv fp,sp
; block0:
; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
; vmul.vx v5,v1,a0 #avl=2, #vtype=(e64, m1, ta, ma)
; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma)
; ld ra,8(sp)
; ld fp,0(sp)
; add sp,+16
; ret
;
; Disassembled:
; block0: ; offset 0x0
; addi sp, sp, -0x10
; sd ra, 8(sp)
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; .byte 0x57, 0x70, 0x08, 0xcc
; addi t6, s0, 0x10
; .byte 0x87, 0x80, 0x0f, 0x02
; .byte 0x57, 0x70, 0x81, 0xcd
; .byte 0xd7, 0x62, 0x15, 0x96
; .byte 0x57, 0x70, 0x08, 0xcc
; .byte 0xa7, 0x82, 0x05, 0x02
; ld ra, 8(sp)
; ld s0, 0(sp)
; addi sp, sp, 0x10
; ret
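In these disassemblies the harness prints raw `.byte` sequences for the vector instructions (`vsetivli`, `vle8.v`, `vmul.vx`, `vse8.v`) its disassembler cannot decode. A hypothetical decoder — standard V-extension encoding assumed, not part of the test suite — showing how the `#avl`/`#vtype` annotations in the VCode correspond to the `vsetivli` words:

```rust
// Decode the fields of a `vsetivli` word (illustrative helper only).
fn decode_vsetivli(word: u32) -> (u32, u32, u32) {
    assert_eq!(word & 0x7f, 0x57);          // OP-V major opcode
    assert_eq!((word >> 12) & 0x7, 0b111);  // funct3 for vset{i}vli
    let avl = (word >> 15) & 0x1f;          // uimm[4:0]: application vector length
    let vtype = (word >> 20) & 0x3ff;       // zimm[9:0]
    let sew = 8 << ((vtype >> 3) & 0x7);    // element width in bits
    let lmul = vtype & 0x7;                 // 0 => m1
    (avl, sew, lmul)
}

fn main() {
    // `.byte 0x57, 0x70, 0x84, 0xcc` from the i16x8 test above corresponds
    // to `#avl=8, #vtype=(e16, m1, ta, ma)`.
    let word = u32::from_le_bytes([0x57, 0x70, 0x84, 0xcc]);
    assert_eq!(decode_vsetivli(word), (8, 16, 0));
}
```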

