From 0140aae6d0009372cb5b781df7079aa62c97ddff Mon Sep 17 00:00:00 2001 From: Youlin Feng Date: Tue, 22 Oct 2024 17:18:11 +0800 Subject: [PATCH] cmd/compile: optimize Ctz64 on 386 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Compared with the version generated by dec64.rules based on Ctz32, the number of assembly instructions is reduced by half. SwissMap uses TrailingZeros64 to find the first match in its control group and may benefit from this CL on 386 architectures. goos: linux goarch: 386 cpu: 13th Gen Intel(R) Core(TM) i7-13700H │ old.txt │ new.txt │ │ sec/op │ sec/op vs base │ TrailingZeros64-20 0.8828n ± 1% 0.6299n ± 1% -28.65% (p=0.000 n=20) Change-Id: Iba08a3f4e13efd3349715dfb7fcd5fd470286cd3 Reviewed-on: https://go-review.googlesource.com/c/go/+/624376 Reviewed-by: David Chase Reviewed-by: Keith Randall Reviewed-by: Keith Randall LUCI-TryBot-Result: Go LUCI Auto-Submit: Keith Randall --- src/cmd/compile/internal/ssa/_gen/386.rules | 1 + src/cmd/compile/internal/ssa/_gen/386Ops.go | 7 +-- .../compile/internal/ssa/_gen/genericOps.go | 1 + src/cmd/compile/internal/ssa/opGen.go | 22 +++++++++ src/cmd/compile/internal/ssa/rewrite386.go | 3 ++ src/cmd/compile/internal/ssagen/intrinsics.go | 9 +++- src/cmd/compile/internal/x86/ssa.go | 48 +++++++++++++++++++ 7 files changed, 87 insertions(+), 4 deletions(-) diff --git a/src/cmd/compile/internal/ssa/_gen/386.rules b/src/cmd/compile/internal/ssa/_gen/386.rules index 4339812224..67cfa3460a 100644 --- a/src/cmd/compile/internal/ssa/_gen/386.rules +++ b/src/cmd/compile/internal/ssa/_gen/386.rules @@ -63,6 +63,7 @@ (Ctz16NonZero ...) => (BSFL ...) (Ctz32 ...) => (LoweredCtz32 ...) (Ctz32NonZero ...) => (BSFL ...) +(Ctz64On32 ...) => (LoweredCtz64 ...) // Lowering extension (SignExt8to16 ...) => (MOVBLSX ...) diff --git a/src/cmd/compile/internal/ssa/_gen/386Ops.go b/src/cmd/compile/internal/ssa/_gen/386Ops.go index 52044ff5d6..a976a91fb8 100644 --- a/src/cmd/compile/internal/ssa/_gen/386Ops.go +++ b/src/cmd/compile/internal/ssa/_gen/386Ops.go @@ -300,9 +300,10 @@ func init() { {name: "NOTL", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true}, // ^arg0 - {name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero - {name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero - {name: "LoweredCtz32", argLength: 1, reg: gp11, clobberFlags: true}, // arg0 # of low-order zeroes + {name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero + {name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero + {name: "LoweredCtz32", argLength: 1, reg: gp11, clobberFlags: true}, // arg0 # of low-order zeroes + {name: "LoweredCtz64", argLength: 2, reg: gp21, resultNotInArgs: true, clobberFlags: true}, // arg1<<32+arg0 # of low-order zeroes {name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero {name: "BSRW", argLength: 1, reg: gp11, asm: "BSRW", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero diff --git a/src/cmd/compile/internal/ssa/_gen/genericOps.go b/src/cmd/compile/internal/ssa/_gen/genericOps.go index 7f6e386499..82f91320b3 100644 --- a/src/cmd/compile/internal/ssa/_gen/genericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/genericOps.go @@ -229,6 +229,7 @@ var genericOps = []opData{ {name: "Ctz16", argLength: 1}, // Count trailing (low order) zeroes (returns 0-16) {name: "Ctz32", argLength: 1}, // Count trailing (low order) zeroes (returns 0-32) {name: "Ctz64", argLength: 1}, // Count trailing (low order) zeroes (returns 0-64) + {name: "Ctz64On32", argLength: 2}, // Count trailing (low order) zeroes (returns 0-64) in arg[1]<<32+arg[0] {name: "Ctz8NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-7 {name: "Ctz16NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-15 {name: "Ctz32NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-31 diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 06528a9076..1ca50bdf9e 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -469,6 +469,7 @@ const ( Op386BSFL Op386BSFW Op386LoweredCtz32 + Op386LoweredCtz64 Op386BSRL Op386BSRW Op386BSWAPL @@ -3093,6 +3094,7 @@ const ( OpCtz16 OpCtz32 OpCtz64 + OpCtz64On32 OpCtz8NonZero OpCtz16NonZero OpCtz32NonZero @@ -5195,6 +5197,21 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "LoweredCtz64", + argLen: 2, + resultNotInArgs: true, + clobberFlags: true, + reg: regInfo{ + inputs: []inputInfo{ + {0, 239}, // AX CX DX BX BP SI DI + {1, 239}, // AX CX DX BX BP SI DI + }, + outputs: []outputInfo{ + {0, 239}, // AX CX DX BX BP SI DI + }, + }, + }, { name: "BSRL", argLen: 1, @@ -40458,6 +40475,11 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "Ctz64On32", + argLen: 2, + generic: true, + }, { name: "Ctz8NonZero", argLen: 1, diff --git a/src/cmd/compile/internal/ssa/rewrite386.go b/src/cmd/compile/internal/ssa/rewrite386.go index ce74d75158..9f1645f8c3 100644 --- a/src/cmd/compile/internal/ssa/rewrite386.go +++ b/src/cmd/compile/internal/ssa/rewrite386.go @@ -323,6 +323,9 @@ func rewriteValue386(v *Value) bool { case OpCtz32NonZero: v.Op = Op386BSFL return true + case OpCtz64On32: + v.Op = Op386LoweredCtz64 + return true case OpCtz8: return rewriteValue386_OpCtz8(v) case OpCtz8NonZero: diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go index df5862f718..b13999b82e 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics.go +++ b/src/cmd/compile/internal/ssagen/intrinsics.go @@ -747,7 +747,14 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0]) }, - sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) + sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) + addF("math/bits", "TrailingZeros64", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + lo := s.newValue1(ssa.OpInt64Lo, types.Types[types.TUINT32], args[0]) + hi := s.newValue1(ssa.OpInt64Hi, types.Types[types.TUINT32], args[0]) + return s.newValue2(ssa.OpCtz64On32, types.Types[types.TINT], lo, hi) + }, + sys.I386) addF("math/bits", "TrailingZeros32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0]) diff --git a/src/cmd/compile/internal/x86/ssa.go b/src/cmd/compile/internal/x86/ssa.go index 42ec44a511..35ad2d90e6 100644 --- a/src/cmd/compile/internal/x86/ssa.go +++ b/src/cmd/compile/internal/x86/ssa.go @@ -850,6 +850,54 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { p2.To.Type = obj.TYPE_REG p2.To.Reg = v.Reg() + // NOP (so the JNZ has somewhere to land) + nop := s.Prog(obj.ANOP) + p1.To.SetTarget(nop) + case ssa.Op386LoweredCtz64: + if v.Args[0].Reg() == v.Reg() { + v.Fatalf("input[0] and output in the same register %s", v.LongString()) + } + if v.Args[1].Reg() == v.Reg() { + v.Fatalf("input[1] and output in the same register %s", v.LongString()) + } + + // BSFL arg0, out + p := s.Prog(x86.ABSFL) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + + // JNZ 5(PC) + p1 := s.Prog(x86.AJNE) + p1.To.Type = obj.TYPE_BRANCH + + // BSFL arg1, out + p2 := s.Prog(x86.ABSFL) + p2.From.Type = obj.TYPE_REG + p2.From.Reg = v.Args[1].Reg() + p2.To.Type = obj.TYPE_REG + p2.To.Reg = v.Reg() + + // JNZ 2(PC) + p3 := s.Prog(x86.AJNE) + p3.To.Type = obj.TYPE_BRANCH + + // MOVL $32, out + p4 := s.Prog(x86.AMOVL) + p4.From.Type = obj.TYPE_CONST + p4.From.Offset = 32 + p4.To.Type = obj.TYPE_REG + p4.To.Reg = v.Reg() + + // ADDL $32, out + p5 := s.Prog(x86.AADDL) + p5.From.Type = obj.TYPE_CONST + p5.From.Offset = 32 + p5.To.Type = obj.TYPE_REG + p5.To.Reg = v.Reg() + p3.To.SetTarget(p5) + // NOP (so the JNZ has somewhere to land) nop := s.Prog(obj.ANOP) p1.To.SetTarget(nop) -- 2.48.1