Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: optimize Ctz64 on 386
author: Youlin Feng <fengyoulin@live.com>
Tue, 22 Oct 2024 09:18:11 +0000 (17:18 +0800)
committer: Gopher Robot <gobot@golang.org>
Tue, 5 Nov 2024 15:30:57 +0000 (15:30 +0000)
Compared with the version generated by dec64.rules based on Ctz32,
the number of assembly instructions is reduced by half.

SwissMap uses TrailingZeros64 to find the first match in its control
group and may benefit from this CL on 386 architectures.

goos: linux
goarch: 386
cpu: 13th Gen Intel(R) Core(TM) i7-13700H
                   │   old.txt    │               new.txt                │
                   │    sec/op    │    sec/op     vs base                │
TrailingZeros64-20   0.8828n ± 1%   0.6299n ± 1%  -28.65% (p=0.000 n=20)

Change-Id: Iba08a3f4e13efd3349715dfb7fcd5fd470286cd3
Reviewed-on: https://go-review.googlesource.com/c/go/+/624376
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Keith Randall <khr@golang.org>

src/cmd/compile/internal/ssa/_gen/386.rules
src/cmd/compile/internal/ssa/_gen/386Ops.go
src/cmd/compile/internal/ssa/_gen/genericOps.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewrite386.go
src/cmd/compile/internal/ssagen/intrinsics.go
src/cmd/compile/internal/x86/ssa.go

index 433981222468a0b20dce23213c86c0ba16088157..67cfa3460aa3e179912eccc910228ac8cbd452f7 100644 (file)
@@ -63,6 +63,7 @@
 (Ctz16NonZero ...) => (BSFL ...)
 (Ctz32 ...) => (LoweredCtz32 ...)
 (Ctz32NonZero ...) => (BSFL ...)
+(Ctz64On32 ...) => (LoweredCtz64 ...)
 
 // Lowering extension
 (SignExt8to16  ...) => (MOVBLSX ...)
index 52044ff5d61a493ff0e865a3a5b7d94d09d0531f..a976a91fb847ffcacad995bb9a2cc24644300853 100644 (file)
@@ -300,9 +300,10 @@ func init() {
 
                {name: "NOTL", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true}, // ^arg0
 
-               {name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
-               {name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
-               {name: "LoweredCtz32", argLength: 1, reg: gp11, clobberFlags: true},      // arg0 # of low-order zeroes
+               {name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", clobberFlags: true},                   // arg0 # of low-order zeroes ; undef if zero
+               {name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW", clobberFlags: true},                   // arg0 # of low-order zeroes ; undef if zero
+               {name: "LoweredCtz32", argLength: 1, reg: gp11, clobberFlags: true},                        // arg0 # of low-order zeroes
+               {name: "LoweredCtz64", argLength: 2, reg: gp21, resultNotInArgs: true, clobberFlags: true}, // arg1<<32+arg0 # of low-order zeroes
 
                {name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero
                {name: "BSRW", argLength: 1, reg: gp11, asm: "BSRW", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero
index 7f6e386499f49ff36a1dc6dec3aaf61dd2556f79..82f91320b3f1751c970d666f9cc136e6b850d81b 100644 (file)
@@ -229,6 +229,7 @@ var genericOps = []opData{
        {name: "Ctz16", argLength: 1},        // Count trailing (low order) zeroes (returns 0-16)
        {name: "Ctz32", argLength: 1},        // Count trailing (low order) zeroes (returns 0-32)
        {name: "Ctz64", argLength: 1},        // Count trailing (low order) zeroes (returns 0-64)
+       {name: "Ctz64On32", argLength: 2},    // Count trailing (low order) zeroes (returns 0-64) in arg[1]<<32+arg[0]
        {name: "Ctz8NonZero", argLength: 1},  // same as above, but arg[0] known to be non-zero, returns 0-7
        {name: "Ctz16NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-15
        {name: "Ctz32NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-31
index 06528a907681517e18475d69ffdc95a33e1d7ae7..1ca50bdf9e04861af40bb2246e885cfc204411ec 100644 (file)
@@ -469,6 +469,7 @@ const (
        Op386BSFL
        Op386BSFW
        Op386LoweredCtz32
+       Op386LoweredCtz64
        Op386BSRL
        Op386BSRW
        Op386BSWAPL
@@ -3093,6 +3094,7 @@ const (
        OpCtz16
        OpCtz32
        OpCtz64
+       OpCtz64On32
        OpCtz8NonZero
        OpCtz16NonZero
        OpCtz32NonZero
@@ -5195,6 +5197,21 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:            "LoweredCtz64",
+               argLen:          2,
+               resultNotInArgs: true,
+               clobberFlags:    true,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 239}, // AX CX DX BX BP SI DI
+                               {1, 239}, // AX CX DX BX BP SI DI
+                       },
+                       outputs: []outputInfo{
+                               {0, 239}, // AX CX DX BX BP SI DI
+                       },
+               },
+       },
        {
                name:         "BSRL",
                argLen:       1,
@@ -40458,6 +40475,11 @@ var opcodeTable = [...]opInfo{
                argLen:  1,
                generic: true,
        },
+       {
+               name:    "Ctz64On32",
+               argLen:  2,
+               generic: true,
+       },
        {
                name:    "Ctz8NonZero",
                argLen:  1,
index ce74d75158d274c3d82d087305ff31da413e60ba..9f1645f8c33dccada460dead6dd09c8b2cb290f0 100644 (file)
@@ -323,6 +323,9 @@ func rewriteValue386(v *Value) bool {
        case OpCtz32NonZero:
                v.Op = Op386BSFL
                return true
+       case OpCtz64On32:
+               v.Op = Op386LoweredCtz64
+               return true
        case OpCtz8:
                return rewriteValue386_OpCtz8(v)
        case OpCtz8NonZero:
index df5862f718eda40743c492cc807de449a8903d97..b13999b82e5891cc2b3fb44446ff16214b63a955 100644 (file)
@@ -747,7 +747,14 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
                func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
                        return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
                },
-               sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
+               sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
+       addF("math/bits", "TrailingZeros64",
+               func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+                       lo := s.newValue1(ssa.OpInt64Lo, types.Types[types.TUINT32], args[0])
+                       hi := s.newValue1(ssa.OpInt64Hi, types.Types[types.TUINT32], args[0])
+                       return s.newValue2(ssa.OpCtz64On32, types.Types[types.TINT], lo, hi)
+               },
+               sys.I386)
        addF("math/bits", "TrailingZeros32",
                func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
                        return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
index 42ec44a51151d775a2fbc62fafcc1cf248b7c8c6..35ad2d90e664107da956ee267f73271b0264db88 100644 (file)
@@ -850,6 +850,54 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
                p2.To.Type = obj.TYPE_REG
                p2.To.Reg = v.Reg()
 
+               // NOP (so the JNZ has somewhere to land)
+               nop := s.Prog(obj.ANOP)
+               p1.To.SetTarget(nop)
+       case ssa.Op386LoweredCtz64:
+               if v.Args[0].Reg() == v.Reg() {
+                       v.Fatalf("input[0] and output in the same register %s", v.LongString())
+               }
+               if v.Args[1].Reg() == v.Reg() {
+                       v.Fatalf("input[1] and output in the same register %s", v.LongString())
+               }
+
+               // BSFL arg0, out
+               p := s.Prog(x86.ABSFL)
+               p.From.Type = obj.TYPE_REG
+               p.From.Reg = v.Args[0].Reg()
+               p.To.Type = obj.TYPE_REG
+               p.To.Reg = v.Reg()
+
+               // JNZ 5(PC)
+               p1 := s.Prog(x86.AJNE)
+               p1.To.Type = obj.TYPE_BRANCH
+
+               // BSFL arg1, out
+               p2 := s.Prog(x86.ABSFL)
+               p2.From.Type = obj.TYPE_REG
+               p2.From.Reg = v.Args[1].Reg()
+               p2.To.Type = obj.TYPE_REG
+               p2.To.Reg = v.Reg()
+
+               // JNZ 2(PC)
+               p3 := s.Prog(x86.AJNE)
+               p3.To.Type = obj.TYPE_BRANCH
+
+               // MOVL $32, out
+               p4 := s.Prog(x86.AMOVL)
+               p4.From.Type = obj.TYPE_CONST
+               p4.From.Offset = 32
+               p4.To.Type = obj.TYPE_REG
+               p4.To.Reg = v.Reg()
+
+               // ADDL $32, out
+               p5 := s.Prog(x86.AADDL)
+               p5.From.Type = obj.TYPE_CONST
+               p5.From.Offset = 32
+               p5.To.Type = obj.TYPE_REG
+               p5.To.Reg = v.Reg()
+               p3.To.SetTarget(p5)
+
                // NOP (so the JNZ has somewhere to land)
                nop := s.Prog(obj.ANOP)
                p1.To.SetTarget(nop)