Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: intrinsify math/bits.Bswap on riscv64
author Joel Sing <joel@sing.id.au>
Wed, 19 Mar 2025 14:09:23 +0000 (01:09 +1100)
committer Joel Sing <joel@sing.id.au>
Thu, 1 May 2025 12:57:13 +0000 (05:57 -0700)
For riscv64/rva22u64 and above, we can intrinsify math/bits.Bswap
using the REV8 machine instruction.

On a StarFive VisionFive 2 with GORISCV64=rva22u64:

                 │     rb.1     │                rb.2                 │
                 │    sec/op    │   sec/op     vs base                │
ReverseBytes-4     18.790n ± 0%   4.026n ± 0%  -78.57% (p=0.000 n=10)
ReverseBytes16-4    6.710n ± 0%   5.368n ± 0%  -20.00% (p=0.000 n=10)
ReverseBytes32-4   13.420n ± 0%   5.368n ± 0%  -60.00% (p=0.000 n=10)
ReverseBytes64-4   17.450n ± 0%   4.026n ± 0%  -76.93% (p=0.000 n=10)
geomean             13.11n        4.649n       -64.54%

Change-Id: I26eee34270b1721f7304bb1cddb0fda129b20ece
Reviewed-on: https://go-review.googlesource.com/c/go/+/660855
Reviewed-by: Mark Ryan <markdryan@rivosinc.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
src/cmd/compile/internal/riscv64/ssa.go
src/cmd/compile/internal/ssa/_gen/RISCV64.rules
src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteRISCV64.go
src/cmd/compile/internal/ssagen/intrinsics.go
src/cmd/compile/internal/ssagen/intrinsics_test.go
test/codegen/mathbits.go

index 952a2050a08e08e95743753e879ee27475b64301..4428d359a8eea594e3eb6fa7468893b95c4328ad 100644 (file)
@@ -419,7 +419,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
                ssa.OpRISCV64FMVSX, ssa.OpRISCV64FMVDX,
                ssa.OpRISCV64FCVTSW, ssa.OpRISCV64FCVTSL, ssa.OpRISCV64FCVTWS, ssa.OpRISCV64FCVTLS,
                ssa.OpRISCV64FCVTDW, ssa.OpRISCV64FCVTDL, ssa.OpRISCV64FCVTWD, ssa.OpRISCV64FCVTLD, ssa.OpRISCV64FCVTDS, ssa.OpRISCV64FCVTSD,
-               ssa.OpRISCV64NOT, ssa.OpRISCV64NEG, ssa.OpRISCV64NEGW, ssa.OpRISCV64CLZ, ssa.OpRISCV64CLZW, ssa.OpRISCV64CTZ, ssa.OpRISCV64CTZW:
+               ssa.OpRISCV64NOT, ssa.OpRISCV64NEG, ssa.OpRISCV64NEGW, ssa.OpRISCV64CLZ, ssa.OpRISCV64CLZW, ssa.OpRISCV64CTZ, ssa.OpRISCV64CTZW,
+               ssa.OpRISCV64REV8:
                p := s.Prog(v.Op.Asm())
                p.From.Type = obj.TYPE_REG
                p.From.Reg = v.Args[0].Reg()
index a5d4fb72ec687f23f0938d33e143f336aa97b531..b8b0429de28ea9cd0e1573e716ad746b35ee877a 100644 (file)
 (BitLen16 x) => (BitLen64 (ZeroExt16to64 x))
 (BitLen8  x) => (BitLen64 (ZeroExt8to64 x))
 
+// Byte swap (note that these will only be emitted for rva22u64 and above).
+(Bswap64 ...) => (REV8 ...)
+(Bswap32 <t> x) => (SRLI [32] (REV8 <t> x))
+(Bswap16 <t> x) => (SRLI [48] (REV8 <t> x))
+
 (Less64  ...) => (SLT  ...)
 (Less32  x y) => (SLT  (SignExt32to64 x) (SignExt32to64 y))
 (Less16  x y) => (SLT  (SignExt16to64 x) (SignExt16to64 y))
index cc2302ff374fa57084c93590600b3cdd70c18399..86412ce8a6769824029f9ed392596243213e097f 100644 (file)
@@ -237,6 +237,7 @@ func init() {
                {name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true},     // arg0 | arg1
                {name: "ORN", argLength: 2, reg: gp21, asm: "ORN"},                      // ^arg0 | arg1
                {name: "ORI", argLength: 1, reg: gp11, asm: "ORI", aux: "Int64"},        // arg0 | auxint
+               {name: "REV8", argLength: 1, reg: gp11, asm: "REV8"},                    // reverse bytes
                {name: "ROL", argLength: 2, reg: gp21, asm: "ROL"},                      // rotate left arg0 by (arg1 & 63)
                {name: "ROLW", argLength: 2, reg: gp21, asm: "ROLW"},                    // rotate left least significant word of arg0 by (arg1 & 31), sign extended
                {name: "ROR", argLength: 2, reg: gp21, asm: "ROR"},                      // rotate right arg0 by (arg1 & 63)
index de6ccf25f2f26ea7d93c5f25298e81149ade1073..6eeb90721b03fa29188ccca364e5b6af2c4194dd 100644 (file)
@@ -2520,6 +2520,7 @@ const (
        OpRISCV64OR
        OpRISCV64ORN
        OpRISCV64ORI
+       OpRISCV64REV8
        OpRISCV64ROL
        OpRISCV64ROLW
        OpRISCV64ROR
@@ -33968,6 +33969,19 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:   "REV8",
+               argLen: 1,
+               asm:    riscv.AREV8,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30
+                       },
+                       outputs: []outputInfo{
+                               {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30
+                       },
+               },
+       },
        {
                name:   "ROL",
                argLen: 2,
index 182ca2d3fd674fad6a4b83cb6d45a99b23eb3bbf..d0e2c909e0d58e1522bbd856f0683c5c3b66a1b1 100644 (file)
@@ -110,6 +110,13 @@ func rewriteValueRISCV64(v *Value) bool {
                return rewriteValueRISCV64_OpBitLen64(v)
        case OpBitLen8:
                return rewriteValueRISCV64_OpBitLen8(v)
+       case OpBswap16:
+               return rewriteValueRISCV64_OpBswap16(v)
+       case OpBswap32:
+               return rewriteValueRISCV64_OpBswap32(v)
+       case OpBswap64:
+               v.Op = OpRISCV64REV8
+               return true
        case OpClosureCall:
                v.Op = OpRISCV64CALLclosure
                return true
@@ -1002,6 +1009,38 @@ func rewriteValueRISCV64_OpBitLen8(v *Value) bool {
                return true
        }
 }
+func rewriteValueRISCV64_OpBswap16(v *Value) bool {
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Bswap16 <t> x)
+       // result: (SRLI [48] (REV8 <t> x))
+       for {
+               t := v.Type
+               x := v_0
+               v.reset(OpRISCV64SRLI)
+               v.AuxInt = int64ToAuxInt(48)
+               v0 := b.NewValue0(v.Pos, OpRISCV64REV8, t)
+               v0.AddArg(x)
+               v.AddArg(v0)
+               return true
+       }
+}
+func rewriteValueRISCV64_OpBswap32(v *Value) bool {
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Bswap32 <t> x)
+       // result: (SRLI [32] (REV8 <t> x))
+       for {
+               t := v.Type
+               x := v_0
+               v.reset(OpRISCV64SRLI)
+               v.AuxInt = int64ToAuxInt(32)
+               v0 := b.NewValue0(v.Pos, OpRISCV64REV8, t)
+               v0.AddArg(x)
+               v.AddArg(v0)
+               return true
+       }
+}
 func rewriteValueRISCV64_OpConst16(v *Value) bool {
        // match: (Const16 [val])
        // result: (MOVDconst [int64(val)])
index eaced0b2775e3a545faccb852374587b5da25ff3..86ab98118d8b1814cdb273916b3a5f17d5efa88d 100644 (file)
@@ -184,22 +184,44 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
                },
                all...)
 
-       brev_arch := []sys.ArchFamily{sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X}
-       if cfg.goppc64 >= 10 {
-               // Use only on Power10 as the new byte reverse instructions that Power10 provide
-               // make it worthwhile as an intrinsic
-               brev_arch = append(brev_arch, sys.PPC64)
-       }
        addF("internal/runtime/sys", "Bswap32",
                func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
                        return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
                },
-               brev_arch...)
+               sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X)
        addF("internal/runtime/sys", "Bswap64",
                func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
                        return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
                },
-               brev_arch...)
+               sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X)
+
+       if cfg.goppc64 >= 10 {
+               // Use only on Power10 as the new byte reverse instructions that Power10 provide
+               // make it worthwhile as an intrinsic
+               addF("internal/runtime/sys", "Bswap32",
+                       func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                               return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
+                       },
+                       sys.PPC64)
+               addF("internal/runtime/sys", "Bswap64",
+                       func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                               return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
+                       },
+                       sys.PPC64)
+       }
+
+       if cfg.goriscv64 >= 22 {
+               addF("internal/runtime/sys", "Bswap32",
+                       func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                               return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
+                       },
+                       sys.RISCV64)
+               addF("internal/runtime/sys", "Bswap64",
+                       func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                               return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
+                       },
+                       sys.RISCV64)
+       }
 
        /****** Prefetch ******/
        makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
@@ -924,23 +946,30 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
                        sys.RISCV64)
        }
 
+       // ReverseBytes inlines correctly, no need to intrinsify it.
        alias("math/bits", "ReverseBytes64", "internal/runtime/sys", "Bswap64", all...)
        alias("math/bits", "ReverseBytes32", "internal/runtime/sys", "Bswap32", all...)
+       // Nothing special is needed for targets where ReverseBytes16 lowers to a rotate
        addF("math/bits", "ReverseBytes16",
                func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
                        return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0])
                },
                sys.Loong64)
-       // ReverseBytes inlines correctly, no need to intrinsify it.
-       // Nothing special is needed for targets where ReverseBytes16 lowers to a rotate
-       // On Power10, 16-bit rotate is not available so use BRH instruction
        if cfg.goppc64 >= 10 {
+               // On Power10, 16-bit rotate is not available so use BRH instruction
                addF("math/bits", "ReverseBytes16",
                        func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
                                return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0])
                        },
                        sys.PPC64)
        }
+       if cfg.goriscv64 >= 22 {
+               addF("math/bits", "ReverseBytes16",
+                       func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                               return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0])
+                       },
+                       sys.RISCV64)
+       }
 
        addF("math/bits", "Len64",
                func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
index 230a7bdf67d28e05630c3c1f1ced92f6742be602..e6275734f2c57cfb422841e48a045c8611c5144a 100644 (file)
@@ -1107,6 +1107,8 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
        {"riscv64", "internal/runtime/math", "Add64"}:                      struct{}{},
        {"riscv64", "internal/runtime/math", "Mul64"}:                      struct{}{},
        {"riscv64", "internal/runtime/math", "MulUintptr"}:                 struct{}{},
+       {"riscv64", "internal/runtime/sys", "Bswap32"}:                     struct{}{},
+       {"riscv64", "internal/runtime/sys", "Bswap64"}:                     struct{}{},
        {"riscv64", "internal/runtime/sys", "GetCallerPC"}:                 struct{}{},
        {"riscv64", "internal/runtime/sys", "GetCallerSP"}:                 struct{}{},
        {"riscv64", "internal/runtime/sys", "GetClosurePtr"}:               struct{}{},
@@ -1129,6 +1131,9 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
        {"riscv64", "math/bits", "Len8"}:                                   struct{}{},
        {"riscv64", "math/bits", "Mul"}:                                    struct{}{},
        {"riscv64", "math/bits", "Mul64"}:                                  struct{}{},
+       {"riscv64", "math/bits", "ReverseBytes16"}:                         struct{}{},
+       {"riscv64", "math/bits", "ReverseBytes32"}:                         struct{}{},
+       {"riscv64", "math/bits", "ReverseBytes64"}:                         struct{}{},
        {"riscv64", "math/bits", "RotateLeft"}:                             struct{}{},
        {"riscv64", "math/bits", "RotateLeft16"}:                           struct{}{},
        {"riscv64", "math/bits", "RotateLeft32"}:                           struct{}{},
index 873354b838ec5724c1d2f77b110b2f7d00789af5..e9dfbb1443e2e360088d35080d7d02f83ccce094 100644 (file)
@@ -261,42 +261,46 @@ func Reverse8(n uint8) uint8 {
 // ----------------------- //
 
 func ReverseBytes(n uint) uint {
-       // amd64:"BSWAPQ"
        // 386:"BSWAPL"
-       // s390x:"MOVDBR"
+       // amd64:"BSWAPQ"
        // arm64:"REV"
        // loong64:"REVBV"
+       // riscv64/rva22u64,riscv64/rva23u64:"REV8"
+       // s390x:"MOVDBR"
        return bits.ReverseBytes(n)
 }
 
 func ReverseBytes64(n uint64) uint64 {
-       // amd64:"BSWAPQ"
        // 386:"BSWAPL"
-       // s390x:"MOVDBR"
+       // amd64:"BSWAPQ"
        // arm64:"REV"
-       // ppc64x/power10: "BRD"
        // loong64:"REVBV"
+       // ppc64x/power10: "BRD"
+       // riscv64/rva22u64,riscv64/rva23u64:"REV8"
+       // s390x:"MOVDBR"
        return bits.ReverseBytes64(n)
 }
 
 func ReverseBytes32(n uint32) uint32 {
-       // amd64:"BSWAPL"
        // 386:"BSWAPL"
-       // s390x:"MOVWBR"
+       // amd64:"BSWAPL"
        // arm64:"REVW"
        // loong64:"REVB2W"
        // ppc64x/power10: "BRW"
+       // riscv64/rva22u64,riscv64/rva23u64:"REV8","SRLI\t\\$32"
+       // s390x:"MOVWBR"
        return bits.ReverseBytes32(n)
 }
 
 func ReverseBytes16(n uint16) uint16 {
        // amd64:"ROLW"
-       // arm64:"REV16W",-"UBFX",-"ORR"
        // arm/5:"SLL","SRL","ORR"
        // arm/6:"REV16"
        // arm/7:"REV16"
+       // arm64:"REV16W",-"UBFX",-"ORR"
        // loong64:"REVB2H"
        // ppc64x/power10: "BRH"
+       // riscv64/rva22u64,riscv64/rva23u64:"REV8","SRLI\t\\$48"
        return bits.ReverseBytes16(n)
 }