Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: intrinsify math/bits/ReverseBytes{16|32|64} for ppc64/power10
author Archana R <aravind5@in.ibm.com>
Mon, 31 Oct 2022 16:47:17 +0000 (11:47 -0500)
committer Lynn Boger <laboger@linux.vnet.ibm.com>
Fri, 3 Feb 2023 19:01:06 +0000 (19:01 +0000)
This change intrinsifies ReverseBytes{16|32|64} by generating the
corresponding new instructions in Power10: brh, brd and brw and
adds a verification test for the same.
On Power 9 and 8, the .go code performs optimally as it is.

Performance improvement seen on Power10:
ReverseBytes32  1.38ns ± 0%  1.18ns ± 0%  -14.2%
ReverseBytes64  1.52ns ± 0%  1.11ns ± 0%  -26.87%
ReverseBytes16  1.41ns ± 1%  1.18ns ± 0%  -16.47%

Change-Id: I88f127f3ab9ba24a772becc21ad90acfba324b37
Reviewed-on: https://go-review.googlesource.com/c/go/+/446675
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
src/cmd/compile/internal/ppc64/ssa.go
src/cmd/compile/internal/ssa/_gen/PPC64.rules
src/cmd/compile/internal/ssa/_gen/PPC64Ops.go
src/cmd/compile/internal/ssa/_gen/genericOps.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewritePPC64.go
src/cmd/compile/internal/ssagen/ssa.go
test/codegen/mathbits.go
test/run.go

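diff --git a/src/cmd/compile/internal/ppc64/ssa.go b/src/cmd/compile/internal/ppc64/ssa.go
index d567a12b01c596e44ed6c0169abf089a1b54bd75..08a2a0cfa2f997bf3cccbf958ab5988adaa322b7 100644
--- a/src/cmd/compile/internal/ppc64/ssa.go
+++ b/src/cmd/compile/internal/ppc64/ssa.go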
@@ -670,7 +670,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
        case ssa.OpPPC64NEG, ssa.OpPPC64FNEG, ssa.OpPPC64FSQRT, ssa.OpPPC64FSQRTS, ssa.OpPPC64FFLOOR, ssa.OpPPC64FTRUNC, ssa.OpPPC64FCEIL,
                ssa.OpPPC64FCTIDZ, ssa.OpPPC64FCTIWZ, ssa.OpPPC64FCFID, ssa.OpPPC64FCFIDS, ssa.OpPPC64FRSP, ssa.OpPPC64CNTLZD, ssa.OpPPC64CNTLZW,
                ssa.OpPPC64POPCNTD, ssa.OpPPC64POPCNTW, ssa.OpPPC64POPCNTB, ssa.OpPPC64MFVSRD, ssa.OpPPC64MTVSRD, ssa.OpPPC64FABS, ssa.OpPPC64FNABS,
-               ssa.OpPPC64FROUND, ssa.OpPPC64CNTTZW, ssa.OpPPC64CNTTZD:
+               ssa.OpPPC64FROUND, ssa.OpPPC64CNTTZW, ssa.OpPPC64CNTTZD, ssa.OpPPC64BRH, ssa.OpPPC64BRW, ssa.OpPPC64BRD:
                r := v.Reg()
                p := s.Prog(v.Op.Asm())
                p.To.Type = obj.TYPE_REG
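diff --git a/src/cmd/compile/internal/ssa/_gen/PPC64.rules b/src/cmd/compile/internal/ssa/_gen/PPC64.rules
index 2eda1af3bfc943f172b003fffb4ef83fb639977f..795312525525d302eddd55d989371739a555008b 100644
--- a/src/cmd/compile/internal/ssa/_gen/PPC64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/PPC64.rules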
 (PrefetchCache ptr mem)          => (DCBT ptr mem [0])
 (PrefetchCacheStreamed ptr mem)  => (DCBT ptr mem [16])
 
+// Use byte reverse instructions on Power10
+(Bswap(16|32|64) x) && buildcfg.GOPPC64>=10 => (BR(H|W|D) x)
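diff --git a/src/cmd/compile/internal/ssa/_gen/PPC64Ops.go b/src/cmd/compile/internal/ssa/_gen/PPC64Ops.go
index 88d85ed946d1ada68efa4589d7e11dfcccbaac99..10e8f1d97fdba88aabb96f835a56682e0c0aa7e2 100644
--- a/src/cmd/compile/internal/ssa/_gen/PPC64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/PPC64Ops.go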
@@ -295,6 +295,9 @@ func init() {
                {name: "XORCC", argLength: 2, reg: gp21, asm: "XORCC", commutative: true, clobberFlags: true, typ: "(Int,Flags)"},   // arg0^arg1 sets CC
                {name: "EQV", argLength: 2, reg: gp21, asm: "EQV", typ: "Int64", commutative: true},                                 // arg0^^arg1
                {name: "NEG", argLength: 1, reg: gp11, asm: "NEG"},                                                                  // -arg0 (integer)
+               {name: "BRD", argLength: 1, reg: gp11, asm: "BRD"},                                                                  // reversebytes64(arg0)
+               {name: "BRW", argLength: 1, reg: gp11, asm: "BRW"},                                                                  // reversebytes32(arg0)
+               {name: "BRH", argLength: 1, reg: gp11, asm: "BRH"},                                                                  // reversebytes16(arg0)
                {name: "FNEG", argLength: 1, reg: fp11, asm: "FNEG"},                                                                // -arg0 (floating point)
                {name: "FSQRT", argLength: 1, reg: fp11, asm: "FSQRT"},                                                              // sqrt(arg0) (floating point)
                {name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS"},                                                            // sqrt(arg0) (floating point, single precision)
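diff --git a/src/cmd/compile/internal/ssa/_gen/genericOps.go b/src/cmd/compile/internal/ssa/_gen/genericOps.go
index cb4470a99be7024bfe850652030845dc68d23a5c..6ecccc3e92b0622905fc8a3c6bc795368cc93b83 100644
--- a/src/cmd/compile/internal/ssa/_gen/genericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/genericOps.go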
@@ -238,6 +238,7 @@ var genericOps = []opData{
        {name: "BitLen32", argLength: 1},     // Number of bits in arg[0] (returns 0-32)
        {name: "BitLen64", argLength: 1},     // Number of bits in arg[0] (returns 0-64)
 
+       {name: "Bswap16", argLength: 1}, // Swap bytes
        {name: "Bswap32", argLength: 1}, // Swap bytes
        {name: "Bswap64", argLength: 1}, // Swap bytes
 
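diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 6c26213eac184b302b20c63d887f724317c06d68..59e1a5eb7621cced1ac92f8146308cecd3c66d4a 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go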
@@ -2161,6 +2161,9 @@ const (
        OpPPC64XORCC
        OpPPC64EQV
        OpPPC64NEG
+       OpPPC64BRD
+       OpPPC64BRW
+       OpPPC64BRH
        OpPPC64FNEG
        OpPPC64FSQRT
        OpPPC64FSQRTS
@@ -2962,6 +2965,7 @@ const (
        OpBitLen16
        OpBitLen32
        OpBitLen64
+       OpBswap16
        OpBswap32
        OpBswap64
        OpBitRev8
@@ -29013,6 +29017,45 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:   "BRD",
+               argLen: 1,
+               asm:    ppc64.ABRD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+                       },
+                       outputs: []outputInfo{
+                               {0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+                       },
+               },
+       },
+       {
+               name:   "BRW",
+               argLen: 1,
+               asm:    ppc64.ABRW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+                       },
+                       outputs: []outputInfo{
+                               {0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+                       },
+               },
+       },
+       {
+               name:   "BRH",
+               argLen: 1,
+               asm:    ppc64.ABRH,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+                       },
+                       outputs: []outputInfo{
+                               {0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+                       },
+               },
+       },
        {
                name:   "FNEG",
                argLen: 1,
@@ -38564,6 +38607,11 @@ var opcodeTable = [...]opInfo{
                argLen:  1,
                generic: true,
        },
+       {
+               name:    "Bswap16",
+               argLen:  1,
+               generic: true,
+       },
        {
                name:    "Bswap32",
                argLen:  1,
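diff --git a/src/cmd/compile/internal/ssa/rewritePPC64.go b/src/cmd/compile/internal/ssa/rewritePPC64.go
index aee570df7a68fa66e0489035c462f6d451078f0b..7b6e3beb71f93d3acf6fff0312476ce2cbcc62bb 100644
--- a/src/cmd/compile/internal/ssa/rewritePPC64.go
+++ b/src/cmd/compile/internal/ssa/rewritePPC64.go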
@@ -107,6 +107,12 @@ func rewriteValuePPC64(v *Value) bool {
                return rewriteValuePPC64_OpBitLen32(v)
        case OpBitLen64:
                return rewriteValuePPC64_OpBitLen64(v)
+       case OpBswap16:
+               return rewriteValuePPC64_OpBswap16(v)
+       case OpBswap32:
+               return rewriteValuePPC64_OpBswap32(v)
+       case OpBswap64:
+               return rewriteValuePPC64_OpBswap64(v)
        case OpCeil:
                v.Op = OpPPC64FCEIL
                return true
@@ -1122,6 +1128,54 @@ func rewriteValuePPC64_OpBitLen64(v *Value) bool {
                return true
        }
 }
+func rewriteValuePPC64_OpBswap16(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (Bswap16 x)
+       // cond: buildcfg.GOPPC64>=10
+       // result: (BRH x)
+       for {
+               x := v_0
+               if !(buildcfg.GOPPC64 >= 10) {
+                       break
+               }
+               v.reset(OpPPC64BRH)
+               v.AddArg(x)
+               return true
+       }
+       return false
+}
+func rewriteValuePPC64_OpBswap32(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (Bswap32 x)
+       // cond: buildcfg.GOPPC64>=10
+       // result: (BRW x)
+       for {
+               x := v_0
+               if !(buildcfg.GOPPC64 >= 10) {
+                       break
+               }
+               v.reset(OpPPC64BRW)
+               v.AddArg(x)
+               return true
+       }
+       return false
+}
+func rewriteValuePPC64_OpBswap64(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (Bswap64 x)
+       // cond: buildcfg.GOPPC64>=10
+       // result: (BRD x)
+       for {
+               x := v_0
+               if !(buildcfg.GOPPC64 >= 10) {
+                       break
+               }
+               v.reset(OpPPC64BRD)
+               v.AddArg(x)
+               return true
+       }
+       return false
+}
 func rewriteValuePPC64_OpCom16(v *Value) bool {
        v_0 := v.Args[0]
        // match: (Com16 x)
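diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go
index 654db8f317f511e9352bb40693d2b64cc758f2a7..48f813a48fee579ee098c19de95ac4e5750d76c6 100644
--- a/src/cmd/compile/internal/ssagen/ssa.go
+++ b/src/cmd/compile/internal/ssagen/ssa.go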
@@ -4000,17 +4000,23 @@ func InitTables() {
                },
                sys.ARM64, sys.PPC64)
 
+       /* Use only on Power10 as the new byte reverse instructions that Power10 provide
+          make it worthwhile as an intrinsic */
+       brev_arch := []sys.ArchFamily{sys.AMD64, sys.ARM64, sys.ARM, sys.S390X}
+       if buildcfg.GOPPC64 >= 10 {
+               brev_arch = append(brev_arch, sys.PPC64)
+       }
        /******** runtime/internal/sys ********/
        addF("runtime/internal/sys", "Bswap32",
                func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
                        return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
                },
-               sys.AMD64, sys.ARM64, sys.ARM, sys.S390X)
+               brev_arch...)
        addF("runtime/internal/sys", "Bswap64",
                func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
                        return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
                },
-               sys.AMD64, sys.ARM64, sys.ARM, sys.S390X)
+               brev_arch...)
 
        /****** Prefetch ******/
        makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
@@ -4537,7 +4543,16 @@ func InitTables() {
        alias("math/bits", "ReverseBytes64", "runtime/internal/sys", "Bswap64", all...)
        alias("math/bits", "ReverseBytes32", "runtime/internal/sys", "Bswap32", all...)
        // ReverseBytes inlines correctly, no need to intrinsify it.
-       // ReverseBytes16 lowers to a rotate, no need for anything special here.
+       // Nothing special is needed for targets where ReverseBytes16 lowers to a rotate
+       // On Power10, 16-bit rotate is not available so use BRH instruction
+       if buildcfg.GOPPC64 >= 10 {
+               addF("math/bits", "ReverseBytes16",
+                       func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                               return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0])
+                       },
+                       sys.PPC64)
+       }
+
        addF("math/bits", "Len64",
                func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
                        return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
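diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go
index 09aa5a136eb9f2f1eef72a5801a623776f79619d..0eed27a61948b6f83b93eaf3e5f2ede147bf6d38 100644
--- a/test/codegen/mathbits.go
+++ b/test/codegen/mathbits.go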
@@ -198,6 +198,7 @@ func ReverseBytes64(n uint64) uint64 {
        // amd64:"BSWAPQ"
        // s390x:"MOVDBR"
        // arm64:"REV"
+       // ppc64x/power10: "BRD"
        return bits.ReverseBytes64(n)
 }
 
@@ -205,6 +206,7 @@ func ReverseBytes32(n uint32) uint32 {
        // amd64:"BSWAPL"
        // s390x:"MOVWBR"
        // arm64:"REVW"
+       // ppc64x/power10: "BRW"
        return bits.ReverseBytes32(n)
 }
 
@@ -214,6 +216,7 @@ func ReverseBytes16(n uint16) uint16 {
        // arm/5:"SLL","SRL","ORR"
        // arm/6:"REV16"
        // arm/7:"REV16"
+       // ppc64x/power10: "BRH"
        return bits.ReverseBytes16(n)
 }
 
diff --git a/test/run.go b/test/run.go
index 8eff84d92de9e2eb1116eaccd2198c3c1d4d5717..611fb02d72989cdc2bea259187960553d0298c99 100644
--- a/test/run.go
+++ b/test/run.go
@@ -1649,8 +1649,8 @@ var (
                "loong64": {},
                "mips":    {"GOMIPS", "hardfloat", "softfloat"},
                "mips64":  {"GOMIPS64", "hardfloat", "softfloat"},
-               "ppc64":   {"GOPPC64", "power8", "power9"},
-               "ppc64le": {"GOPPC64", "power8", "power9"},
+               "ppc64":   {"GOPPC64", "power8", "power9", "power10"},
+               "ppc64le": {"GOPPC64", "power8", "power9", "power10"},
                "ppc64x":  {}, // A pseudo-arch representing both ppc64 and ppc64le
                "s390x":   {},
                "wasm":    {},