From 495b167919b1473e02843135f57d310cdd4f5789 Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Thu, 16 Mar 2017 14:08:31 -0700 Subject: [PATCH] cmd/compile: intrinsics for math/bits.{Len,LeadingZeros} MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit name old time/op new time/op delta LeadingZeros-4 2.00ns ± 0% 1.34ns ± 1% -33.02% (p=0.000 n=8+10) LeadingZeros16-4 1.62ns ± 0% 1.57ns ± 0% -3.09% (p=0.001 n=8+9) LeadingZeros32-4 2.14ns ± 0% 1.48ns ± 0% -30.84% (p=0.002 n=8+10) LeadingZeros64-4 2.06ns ± 1% 1.33ns ± 0% -35.08% (p=0.000 n=8+8) 8-bit args is a special case - the Go code is really fast because it is just a single table lookup. So I've disabled that for now. Intrinsics were actually slower: LeadingZeros8-4 1.22ns ± 3% 1.58ns ± 1% +29.56% (p=0.000 n=10+10) Update #18616 Change-Id: Ia9c289b9ba59c583ea64060470315fd637e814cf Reviewed-on: https://go-review.googlesource.com/38311 Run-TryBot: Keith Randall TryBot-Result: Gobot Gobot Reviewed-by: Robert Griesemer --- src/cmd/compile/internal/amd64/ssa.go | 2 +- src/cmd/compile/internal/gc/asm_test.go | 421 +++++++++++++++++- src/cmd/compile/internal/gc/inl.go | 8 +- src/cmd/compile/internal/gc/ssa.go | 45 ++ src/cmd/compile/internal/ssa/gen/AMD64.rules | 18 + src/cmd/compile/internal/ssa/gen/AMD64Ops.go | 2 + src/cmd/compile/internal/ssa/gen/ARM.rules | 4 + src/cmd/compile/internal/ssa/gen/ARM64.rules | 4 + src/cmd/compile/internal/ssa/gen/MIPS.rules | 3 + src/cmd/compile/internal/ssa/gen/S390X.rules | 3 + src/cmd/compile/internal/ssa/gen/dec64.rules | 8 + .../compile/internal/ssa/gen/genericOps.go | 6 +- src/cmd/compile/internal/ssa/opGen.go | 42 ++ src/cmd/compile/internal/ssa/rewriteAMD64.go | 226 ++++++++++ src/cmd/compile/internal/ssa/rewriteARM.go | 44 ++ src/cmd/compile/internal/ssa/rewriteARM64.go | 58 +++ src/cmd/compile/internal/ssa/rewriteMIPS.go | 21 + src/cmd/compile/internal/ssa/rewriteS390X.go | 42 ++ src/cmd/compile/internal/ssa/rewritedec64.go | 32 ++ 19 files changed, 979 insertions(+), 10 deletions(-) diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go index c0de90b7a7..7e39d9784c 100644 --- a/src/cmd/compile/internal/amd64/ssa.go +++ b/src/cmd/compile/internal/amd64/ssa.go @@ -755,7 +755,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p := gc.Prog(v.Op.Asm()) p.To.Type = obj.TYPE_REG p.To.Reg = r - case ssa.OpAMD64BSFQ, ssa.OpAMD64BSFL: + case ssa.OpAMD64BSFQ, ssa.OpAMD64BSFL, ssa.OpAMD64BSRQ, ssa.OpAMD64BSRL: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG p.From.Reg = v.Args[0].Reg() diff --git a/src/cmd/compile/internal/gc/asm_test.go b/src/cmd/compile/internal/gc/asm_test.go index 6d56cf6066..7db3908c0f 100644 --- a/src/cmd/compile/internal/gc/asm_test.go +++ b/src/cmd/compile/internal/gc/asm_test.go @@ -148,6 +148,7 @@ func (ats *asmTests) runGo(t *testing.T, args ...string) string { cmd.Stderr = &stderr if err := cmd.Run(); err != nil { + fmt.Printf(stdout.String()) t.Fatalf("error running cmd: %v", err) } @@ -178,9 +179,10 @@ var allAsmTests = []*asmTests{ tests: linuxS390XTests, }, { - arch: "arm", - os: "linux", - tests: linuxARMTests, + arch: "arm", + os: "linux", + imports: []string{"math/bits"}, + tests: linuxARMTests, }, { arch: "arm64", @@ -188,6 +190,12 @@ var allAsmTests = []*asmTests{ imports: []string{"math/bits"}, tests: linuxARM64Tests, }, + { + arch: "mips", + os: "linux", + imports: []string{"math/bits"}, + tests: linuxMIPSTests, + }, } var linuxAMD64Tests = []*asmTest{ @@ -601,6 +609,90 @@ var 
linuxAMD64Tests = []*asmTest{ `, []string{"\tROLW\t\\$8,"}, }, + { + ` + func f48(a uint64) int { + return bits.Len64(a) + } + `, + []string{"\tBSRQ\t"}, + }, + { + ` + func f49(a uint32) int { + return bits.Len32(a) + } + `, + []string{"\tBSRQ\t"}, + }, + { + ` + func f50(a uint16) int { + return bits.Len16(a) + } + `, + []string{"\tBSRQ\t"}, + }, + /* see ssa.go + { + ` + func f51(a uint8) int { + return bits.Len8(a) + } + `, + []string{"\tBSRQ\t"}, + }, + */ + { + ` + func f52(a uint) int { + return bits.Len(a) + } + `, + []string{"\tBSRQ\t"}, + }, + { + ` + func f53(a uint64) int { + return bits.LeadingZeros64(a) + } + `, + []string{"\tBSRQ\t"}, + }, + { + ` + func f54(a uint32) int { + return bits.LeadingZeros32(a) + } + `, + []string{"\tBSRQ\t"}, + }, + { + ` + func f55(a uint16) int { + return bits.LeadingZeros16(a) + } + `, + []string{"\tBSRQ\t"}, + }, + /* see ssa.go + { + ` + func f56(a uint8) int { + return bits.LeadingZeros8(a) + } + `, + []string{"\tBSRQ\t"}, + }, + */ + { + ` + func f57(a uint) int { + return bits.LeadingZeros(a) + } + `, + []string{"\tBSRQ\t"}, + }, } var linux386Tests = []*asmTest{ @@ -818,6 +910,86 @@ var linuxS390XTests = []*asmTest{ `, []string{"\tMOVWBR\t"}, }, + { + ` + func f24(a uint64) int { + return bits.Len64(a) + } + `, + []string{"\tFLOGR\t"}, + }, + { + ` + func f25(a uint32) int { + return bits.Len32(a) + } + `, + []string{"\tFLOGR\t"}, + }, + { + ` + func f26(a uint16) int { + return bits.Len16(a) + } + `, + []string{"\tFLOGR\t"}, + }, + { + ` + func f27(a uint8) int { + return bits.Len8(a) + } + `, + []string{"\tFLOGR\t"}, + }, + { + ` + func f28(a uint) int { + return bits.Len(a) + } + `, + []string{"\tFLOGR\t"}, + }, + { + ` + func f29(a uint64) int { + return bits.LeadingZeros64(a) + } + `, + []string{"\tFLOGR\t"}, + }, + { + ` + func f30(a uint32) int { + return bits.LeadingZeros32(a) + } + `, + []string{"\tFLOGR\t"}, + }, + { + ` + func f31(a uint16) int { + return bits.LeadingZeros16(a) + } + `, + []string{"\tFLOGR\t"}, + }, + { + ` + func f32(a uint8) int { + return bits.LeadingZeros8(a) + } + `, + []string{"\tFLOGR\t"}, + }, + { + ` + func f33(a uint) int { + return bits.LeadingZeros(a) + } + `, + []string{"\tFLOGR\t"}, + }, } var linuxARMTests = []*asmTest{ @@ -845,6 +1017,86 @@ var linuxARMTests = []*asmTest{ `, []string{"\tMOVW\tR[0-9]+@>25,"}, }, + { + ` + func f3(a uint64) int { + return bits.Len64(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f4(a uint32) int { + return bits.Len32(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f5(a uint16) int { + return bits.Len16(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f6(a uint8) int { + return bits.Len8(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f7(a uint) int { + return bits.Len(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f8(a uint64) int { + return bits.LeadingZeros64(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f9(a uint32) int { + return bits.LeadingZeros32(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f10(a uint16) int { + return bits.LeadingZeros16(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f11(a uint8) int { + return bits.LeadingZeros8(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f12(a uint) int { + return bits.LeadingZeros(a) + } + `, + []string{"\tCLZ\t"}, + }, } var linuxARM64Tests = []*asmTest{ @@ -912,6 +1164,169 @@ var linuxARM64Tests = []*asmTest{ `, []string{"\tREVW\t"}, }, + { + ` + func f24(a uint64) int { + return bits.Len64(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func 
f25(a uint32) int { + return bits.Len32(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f26(a uint16) int { + return bits.Len16(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f27(a uint8) int { + return bits.Len8(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f28(a uint) int { + return bits.Len(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f29(a uint64) int { + return bits.LeadingZeros64(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f30(a uint32) int { + return bits.LeadingZeros32(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f31(a uint16) int { + return bits.LeadingZeros16(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f32(a uint8) int { + return bits.LeadingZeros8(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f33(a uint) int { + return bits.LeadingZeros(a) + } + `, + []string{"\tCLZ\t"}, + }, +} + +var linuxMIPSTests = []*asmTest{ + { + ` + func f0(a uint64) int { + return bits.Len64(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f1(a uint32) int { + return bits.Len32(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f2(a uint16) int { + return bits.Len16(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f3(a uint8) int { + return bits.Len8(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f4(a uint) int { + return bits.Len(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f5(a uint64) int { + return bits.LeadingZeros64(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f6(a uint32) int { + return bits.LeadingZeros32(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f7(a uint16) int { + return bits.LeadingZeros16(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f8(a uint8) int { + return bits.LeadingZeros8(a) + } + `, + []string{"\tCLZ\t"}, + }, + { + ` + func f9(a uint) int { + return bits.LeadingZeros(a) + } + `, + []string{"\tCLZ\t"}, + }, } // TestLineNumber checks to make sure the generated assembly has line numbers diff --git a/src/cmd/compile/internal/gc/inl.go b/src/cmd/compile/internal/gc/inl.go index 22bd7ce743..5faf494876 100644 --- a/src/cmd/compile/internal/gc/inl.go +++ b/src/cmd/compile/internal/gc/inl.go @@ -200,14 +200,14 @@ func ishairy(n *Node, budget *int32, reason *string) bool { switch n.Op { // Call is okay if inlinable and we have the budget for the body. case OCALLFUNC: - if fn := n.Left.Func; fn != nil && fn.Inl.Len() != 0 { - *budget -= fn.InlCost - break - } if isIntrinsicCall(n) { *budget-- break } + if fn := n.Left.Func; fn != nil && fn.Inl.Len() != 0 { + *budget -= fn.InlCost + break + } if n.isMethodCalledAsFunction() { if d := n.Left.Sym.Def; d != nil && d.Func.Inl.Len() != 0 { diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go index bd766a5870..418056a81c 100644 --- a/src/cmd/compile/internal/gc/ssa.go +++ b/src/cmd/compile/internal/gc/ssa.go @@ -2706,6 +2706,51 @@ func init() { alias("math/bits", "ReverseBytes32", "runtime/internal/sys", "Bswap32", all...) // ReverseBytes inlines correctly, no need to intrinsify it. // ReverseBytes16 lowers to a rotate, no need for anything special here. 
+ addF("math/bits", "Len64", + func(s *state, n *Node, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpBitLen64, Types[TINT], args[0]) + }, + sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS) + addF("math/bits", "Len32", + func(s *state, n *Node, args []*ssa.Value) *ssa.Value { + if s.config.IntSize == 4 { + return s.newValue1(ssa.OpBitLen32, Types[TINT], args[0]) + } + x := s.newValue1(ssa.OpZeroExt32to64, Types[TUINT64], args[0]) + return s.newValue1(ssa.OpBitLen64, Types[TINT], x) + }, + sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS) + addF("math/bits", "Len16", + func(s *state, n *Node, args []*ssa.Value) *ssa.Value { + if s.config.IntSize == 4 { + x := s.newValue1(ssa.OpZeroExt16to32, Types[TUINT32], args[0]) + return s.newValue1(ssa.OpBitLen32, Types[TINT], x) + } + x := s.newValue1(ssa.OpZeroExt16to64, Types[TUINT64], args[0]) + return s.newValue1(ssa.OpBitLen64, Types[TINT], x) + }, + sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS) + // Note: disabled on AMD64 because the Go code is faster! + addF("math/bits", "Len8", + func(s *state, n *Node, args []*ssa.Value) *ssa.Value { + if s.config.IntSize == 4 { + x := s.newValue1(ssa.OpZeroExt8to32, Types[TUINT32], args[0]) + return s.newValue1(ssa.OpBitLen32, Types[TINT], x) + } + x := s.newValue1(ssa.OpZeroExt8to64, Types[TUINT64], args[0]) + return s.newValue1(ssa.OpBitLen64, Types[TINT], x) + }, + sys.ARM64, sys.ARM, sys.S390X, sys.MIPS) + + addF("math/bits", "Len", + func(s *state, n *Node, args []*ssa.Value) *ssa.Value { + if s.config.IntSize == 4 { + return s.newValue1(ssa.OpBitLen32, Types[TINT], args[0]) + } + return s.newValue1(ssa.OpBitLen64, Types[TINT], args[0]) + }, + sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS) + // LeadingZeros is handled because it trivially calls Len. /******** sync/atomic ********/ diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules index adfdfe9333..e68a7783f5 100644 --- a/src/cmd/compile/internal/ssa/gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules @@ -100,6 +100,9 @@ (Ctz64 x) -> (CMOVQEQ (Select0 (BSFQ x)) (MOVQconst [64]) (Select1 (BSFQ x))) (Ctz32 x) -> (Select0 (BSFQ (ORQ (MOVQconst [1<<32]) x))) +(BitLen64 x) -> (ADDQconst [1] (CMOVQEQ (Select0 (BSRQ x)) (MOVQconst [-1]) (Select1 (BSRQ x)))) +(BitLen32 x) -> (BitLen64 (MOVLQZX x)) + (Bswap64 x) -> (BSWAPQ x) (Bswap32 x) -> (BSWAPL x) @@ -1394,6 +1397,7 @@ (ORL x x) -> x (XORQ x x) -> (MOVQconst [0]) (XORL x x) -> (MOVLconst [0]) +(NEGQ (ADDQconst [c] (NEGQ x))) && c != -(1<<31) -> (ADDQconst [-c] x) // checking AND against 0. (CMPQconst (ANDQ x y) [0]) -> (TESTQ x y) @@ -2089,3 +2093,17 @@ // Extension is unnecessary for trailing zeros. 
(BSFQ (ORQconst [1<<8] (MOVBQZX x))) -> (BSFQ (ORQconst [1<<8] x)) (BSFQ (ORQconst [1<<16] (MOVWQZX x))) -> (BSFQ (ORQconst [1<<16] x)) + +// Redundant sign/zero extensions +(MOVLQSX x:(MOVLQSX _)) -> x +(MOVLQSX x:(MOVWQSX _)) -> x +(MOVLQSX x:(MOVBQSX _)) -> x +(MOVWQSX x:(MOVWQSX _)) -> x +(MOVWQSX x:(MOVBQSX _)) -> x +(MOVBQSX x:(MOVBQSX _)) -> x +(MOVLQZX x:(MOVLQZX _)) -> x +(MOVLQZX x:(MOVWQZX _)) -> x +(MOVLQZX x:(MOVBQZX _)) -> x +(MOVWQZX x:(MOVWQZX _)) -> x +(MOVWQZX x:(MOVBQZX _)) -> x +(MOVBQZX x:(MOVBQZX _)) -> x diff --git a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go index 8feee02105..f9731047e7 100644 --- a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go +++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go @@ -312,6 +312,8 @@ func init() { // flags are set to "equal" if the input is zero, "not equal" otherwise. {name: "BSFQ", argLength: 1, reg: gp11flags, asm: "BSFQ", typ: "(UInt64,Flags)"}, // # of low-order zeroes in 64-bit arg {name: "BSFL", argLength: 1, reg: gp11flags, asm: "BSFL", typ: "(UInt32,Flags)"}, // # of low-order zeroes in 32-bit arg + {name: "BSRQ", argLength: 1, reg: gp11flags, asm: "BSRQ", typ: "(UInt64,Flags)"}, // # of high-order zeroes in 64-bit arg + {name: "BSRL", argLength: 1, reg: gp11flags, asm: "BSRL", typ: "(UInt32,Flags)"}, // # of high-order zeroes in 32-bit arg // Note ASM for ops moves whole register // diff --git a/src/cmd/compile/internal/ssa/gen/ARM.rules b/src/cmd/compile/internal/ssa/gen/ARM.rules index b2b4bf6ff9..9413bd4d31 100644 --- a/src/cmd/compile/internal/ssa/gen/ARM.rules +++ b/src/cmd/compile/internal/ssa/gen/ARM.rules @@ -91,6 +91,9 @@ // 32 - CLZ(x&-x - 1) (Ctz32 x) -> (RSBconst [32] (CLZ (SUBconst (AND x (RSBconst [0] x)) [1]))) +// bit length +(BitLen32 x) -> (RSBconst [32] (CLZ x)) + // byte swap // let (a, b, c, d) be the bytes of x from high to low // t1 = x right rotate 16 bits -- (c, d, a, b ) @@ -1205,6 +1208,7 @@ // generic simplifications (ADD x (RSBconst [0] y)) -> (SUB x y) (ADD (RSBconst [0] y) x) -> (SUB x y) +(ADD (RSBconst [c] x) (RSBconst [d] y)) -> (RSBconst [c+d] (ADD x y)) (SUB x x) -> (MOVWconst [0]) (RSB x x) -> (MOVWconst [0]) (AND x x) -> x diff --git a/src/cmd/compile/internal/ssa/gen/ARM64.rules b/src/cmd/compile/internal/ssa/gen/ARM64.rules index 8dde996df8..9331ab154b 100644 --- a/src/cmd/compile/internal/ssa/gen/ARM64.rules +++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules @@ -86,6 +86,8 @@ (Ctz64 x) -> (CLZ (RBIT x)) (Ctz32 x) -> (CLZW (RBITW x)) +(BitLen64 x) -> (SUB (MOVDconst [64]) (CLZ x)) + (Bswap64 x) -> (REV x) (Bswap32 x) -> (REVW x) @@ -831,6 +833,8 @@ (BIC x x) -> (MOVDconst [0]) (AND x (MVN y)) -> (BIC x y) (CSELULT x (MOVDconst [0]) flag) -> (CSELULT0 x flag) +(SUB x (SUB y z)) -> (SUB (ADD x z) y) +(SUB (SUB x y) z) -> (SUB x (ADD y z)) // remove redundant *const ops (ADDconst [0] x) -> x diff --git a/src/cmd/compile/internal/ssa/gen/MIPS.rules b/src/cmd/compile/internal/ssa/gen/MIPS.rules index 93428d5b75..766c25fa53 100644 --- a/src/cmd/compile/internal/ssa/gen/MIPS.rules +++ b/src/cmd/compile/internal/ssa/gen/MIPS.rules @@ -147,6 +147,9 @@ // 32 - CLZ(x&-x - 1) (Ctz32 x) -> (SUB (MOVWconst [32]) (CLZ (SUBconst [1] (AND x (NEG x))))) +// bit length +(BitLen32 x) -> (SUB (MOVWconst [32]) (CLZ x)) + // boolean ops -- booleans are represented with 0=false, 1=true (AndB x y) -> (AND x y) (OrB x y) -> (OR x y) diff --git a/src/cmd/compile/internal/ssa/gen/S390X.rules b/src/cmd/compile/internal/ssa/gen/S390X.rules index 0a3dccfb54..c1e821a7ee 
100644 --- a/src/cmd/compile/internal/ssa/gen/S390X.rules +++ b/src/cmd/compile/internal/ssa/gen/S390X.rules @@ -102,6 +102,8 @@ (Ctz64 x) -> (SUB (MOVDconst [64]) (FLOGR (AND (SUBconst [1] x) (NOT x)))) (Ctz32 x) -> (SUB (MOVDconst [64]) (FLOGR (MOVWZreg (ANDW (SUBWconst [1] x) (NOTW x))))) +(BitLen64 x) -> (SUB (MOVDconst [64]) (FLOGR x)) + (Bswap64 x) -> (MOVDBR x) (Bswap32 x) -> (MOVWBR x) @@ -1022,6 +1024,7 @@ (ORW x x) -> x (XOR x x) -> (MOVDconst [0]) (XORW x x) -> (MOVDconst [0]) +(NEG (ADDconst [c] (NEG x))) && c != -(1<<31) -> (ADDconst [-c] x) // fused multiply-add (FADD x (FMUL y z)) -> (FMADD x y z) diff --git a/src/cmd/compile/internal/ssa/gen/dec64.rules b/src/cmd/compile/internal/ssa/gen/dec64.rules index 6469e59d81..18adb96a73 100644 --- a/src/cmd/compile/internal/ssa/gen/dec64.rules +++ b/src/cmd/compile/internal/ssa/gen/dec64.rules @@ -114,6 +114,14 @@ (Com32 (Zeromask (Int64Lo x))) (Ctz32 (Int64Hi x)))) +(BitLen64 x) -> + (Add32 + (BitLen32 (Int64Hi x)) + (BitLen32 + (Or32 + (Int64Lo x) + (Zeromask (Int64Hi x))))) + (Bswap64 x) -> (Int64Make (Bswap32 (Int64Lo x)) diff --git a/src/cmd/compile/internal/ssa/gen/genericOps.go b/src/cmd/compile/internal/ssa/gen/genericOps.go index 074b3be05a..b4c2a015e1 100644 --- a/src/cmd/compile/internal/ssa/gen/genericOps.go +++ b/src/cmd/compile/internal/ssa/gen/genericOps.go @@ -236,8 +236,10 @@ var genericOps = []opData{ {name: "Com32", argLength: 1}, {name: "Com64", argLength: 1}, - {name: "Ctz32", argLength: 1}, // Count trailing (low order) zeroes (returns 0-32) - {name: "Ctz64", argLength: 1}, // Count trailing zeroes (returns 0-64) + {name: "Ctz32", argLength: 1}, // Count trailing (low order) zeroes (returns 0-32) + {name: "Ctz64", argLength: 1}, // Count trailing zeroes (returns 0-64) + {name: "BitLen32", argLength: 1}, // Number of bits in arg[0] (returns 0-32) + {name: "BitLen64", argLength: 1}, // Number of bits in arg[0] (returns 0-64) {name: "Bswap32", argLength: 1}, // Swap bytes {name: "Bswap64", argLength: 1}, // Swap bytes diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index e8f2b1c98c..146c52ed9f 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -533,6 +533,8 @@ const ( OpAMD64NOTL OpAMD64BSFQ OpAMD64BSFL + OpAMD64BSRQ + OpAMD64BSRL OpAMD64CMOVQEQ OpAMD64CMOVLEQ OpAMD64BSWAPQ @@ -1761,6 +1763,8 @@ const ( OpCom64 OpCtz32 OpCtz64 + OpBitLen32 + OpBitLen64 OpBswap32 OpBswap64 OpSqrt @@ -6243,6 +6247,34 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "BSRQ", + argLen: 1, + asm: x86.ABSRQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 + }, + outputs: []outputInfo{ + {1, 0}, + {0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 + }, + }, + }, + { + name: "BSRL", + argLen: 1, + asm: x86.ABSRL, + reg: regInfo{ + inputs: []inputInfo{ + {0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 + }, + outputs: []outputInfo{ + {1, 0}, + {0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 + }, + }, + }, { name: "CMOVQEQ", argLen: 3, @@ -21429,6 +21461,16 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "BitLen32", + argLen: 1, + generic: true, + }, + { + name: "BitLen64", + argLen: 1, + generic: true, + }, { name: "Bswap32", argLen: 1, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 37741123a3..76e37c29e7 100644 --- 
a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -374,6 +374,10 @@ func rewriteValueAMD64(v *Value, config *Config) bool { return rewriteValueAMD64_OpAtomicStorePtrNoWB(v, config) case OpAvg64u: return rewriteValueAMD64_OpAvg64u(v, config) + case OpBitLen32: + return rewriteValueAMD64_OpBitLen32(v, config) + case OpBitLen64: + return rewriteValueAMD64_OpBitLen64(v, config) case OpBswap32: return rewriteValueAMD64_OpBswap32(v, config) case OpBswap64: @@ -3995,6 +3999,19 @@ func rewriteValueAMD64_OpAMD64MOVBQSX(v *Value, config *Config) bool { v.AddArg(x) return true } + // match: (MOVBQSX x:(MOVBQSX _)) + // cond: + // result: x + for { + x := v.Args[0] + if x.Op != OpAMD64MOVBQSX { + break + } + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } return false } func rewriteValueAMD64_OpAMD64MOVBQSXload(v *Value, config *Config) bool { @@ -4171,6 +4188,19 @@ func rewriteValueAMD64_OpAMD64MOVBQZX(v *Value, config *Config) bool { v.AddArg(x) return true } + // match: (MOVBQZX x:(MOVBQZX _)) + // cond: + // result: x + for { + x := v.Args[0] + if x.Op != OpAMD64MOVBQZX { + break + } + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } return false } func rewriteValueAMD64_OpAMD64MOVBload(v *Value, config *Config) bool { @@ -5873,6 +5903,45 @@ func rewriteValueAMD64_OpAMD64MOVLQSX(v *Value, config *Config) bool { v.AddArg(x) return true } + // match: (MOVLQSX x:(MOVLQSX _)) + // cond: + // result: x + for { + x := v.Args[0] + if x.Op != OpAMD64MOVLQSX { + break + } + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } + // match: (MOVLQSX x:(MOVWQSX _)) + // cond: + // result: x + for { + x := v.Args[0] + if x.Op != OpAMD64MOVWQSX { + break + } + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } + // match: (MOVLQSX x:(MOVBQSX _)) + // cond: + // result: x + for { + x := v.Args[0] + if x.Op != OpAMD64MOVBQSX { + break + } + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } return false } func rewriteValueAMD64_OpAMD64MOVLQSXload(v *Value, config *Config) bool { @@ -6026,6 +6095,45 @@ func rewriteValueAMD64_OpAMD64MOVLQZX(v *Value, config *Config) bool { v.AddArg(x) return true } + // match: (MOVLQZX x:(MOVLQZX _)) + // cond: + // result: x + for { + x := v.Args[0] + if x.Op != OpAMD64MOVLQZX { + break + } + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } + // match: (MOVLQZX x:(MOVWQZX _)) + // cond: + // result: x + for { + x := v.Args[0] + if x.Op != OpAMD64MOVWQZX { + break + } + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } + // match: (MOVLQZX x:(MOVBQZX _)) + // cond: + // result: x + for { + x := v.Args[0] + if x.Op != OpAMD64MOVBQZX { + break + } + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } return false } func rewriteValueAMD64_OpAMD64MOVLatomicload(v *Value, config *Config) bool { @@ -9742,6 +9850,32 @@ func rewriteValueAMD64_OpAMD64MOVWQSX(v *Value, config *Config) bool { v.AddArg(x) return true } + // match: (MOVWQSX x:(MOVWQSX _)) + // cond: + // result: x + for { + x := v.Args[0] + if x.Op != OpAMD64MOVWQSX { + break + } + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } + // match: (MOVWQSX x:(MOVBQSX _)) + // cond: + // result: x + for { + x := v.Args[0] + if x.Op != OpAMD64MOVBQSX { + break + } + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } return false } func rewriteValueAMD64_OpAMD64MOVWQSXload(v *Value, config *Config) bool { @@ -9920,6 +10054,32 @@ func 
rewriteValueAMD64_OpAMD64MOVWQZX(v *Value, config *Config) bool { v.AddArg(x) return true } + // match: (MOVWQZX x:(MOVWQZX _)) + // cond: + // result: x + for { + x := v.Args[0] + if x.Op != OpAMD64MOVWQZX { + break + } + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } + // match: (MOVWQZX x:(MOVBQZX _)) + // cond: + // result: x + for { + x := v.Args[0] + if x.Op != OpAMD64MOVBQZX { + break + } + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } return false } func rewriteValueAMD64_OpAMD64MOVWload(v *Value, config *Config) bool { @@ -11962,6 +12122,28 @@ func rewriteValueAMD64_OpAMD64NEGQ(v *Value, config *Config) bool { v.AuxInt = -c return true } + // match: (NEGQ (ADDQconst [c] (NEGQ x))) + // cond: c != -(1<<31) + // result: (ADDQconst [-c] x) + for { + v_0 := v.Args[0] + if v_0.Op != OpAMD64ADDQconst { + break + } + c := v_0.AuxInt + v_0_0 := v_0.Args[0] + if v_0_0.Op != OpAMD64NEGQ { + break + } + x := v_0_0.Args[0] + if !(c != -(1 << 31)) { + break + } + v.reset(OpAMD64ADDQconst) + v.AuxInt = -c + v.AddArg(x) + return true + } return false } func rewriteValueAMD64_OpAMD64NOTL(v *Value, config *Config) bool { @@ -17735,6 +17917,50 @@ func rewriteValueAMD64_OpAvg64u(v *Value, config *Config) bool { return true } } +func rewriteValueAMD64_OpBitLen32(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (BitLen32 x) + // cond: + // result: (BitLen64 (MOVLQZX x)) + for { + x := v.Args[0] + v.reset(OpBitLen64) + v0 := b.NewValue0(v.Pos, OpAMD64MOVLQZX, config.Frontend().TypeUInt64()) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueAMD64_OpBitLen64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (BitLen64 x) + // cond: + // result: (ADDQconst [1] (CMOVQEQ (Select0 (BSRQ x)) (MOVQconst [-1]) (Select1 (BSRQ x)))) + for { + t := v.Type + x := v.Args[0] + v.reset(OpAMD64ADDQconst) + v.AuxInt = 1 + v0 := b.NewValue0(v.Pos, OpAMD64CMOVQEQ, t) + v1 := b.NewValue0(v.Pos, OpSelect0, t) + v2 := b.NewValue0(v.Pos, OpAMD64BSRQ, MakeTuple(config.fe.TypeUInt64(), TypeFlags)) + v2.AddArg(x) + v1.AddArg(v2) + v0.AddArg(v1) + v3 := b.NewValue0(v.Pos, OpAMD64MOVQconst, t) + v3.AuxInt = -1 + v0.AddArg(v3) + v4 := b.NewValue0(v.Pos, OpSelect1, TypeFlags) + v5 := b.NewValue0(v.Pos, OpAMD64BSRQ, MakeTuple(config.fe.TypeUInt64(), TypeFlags)) + v5.AddArg(x) + v4.AddArg(v5) + v0.AddArg(v4) + v.AddArg(v0) + return true + } +} func rewriteValueAMD64_OpBswap32(v *Value, config *Config) bool { b := v.Block _ = b diff --git a/src/cmd/compile/internal/ssa/rewriteARM.go b/src/cmd/compile/internal/ssa/rewriteARM.go index fa77deabfd..83bdde4c17 100644 --- a/src/cmd/compile/internal/ssa/rewriteARM.go +++ b/src/cmd/compile/internal/ssa/rewriteARM.go @@ -362,6 +362,8 @@ func rewriteValueARM(v *Value, config *Config) bool { return rewriteValueARM_OpAndB(v, config) case OpAvg32u: return rewriteValueARM_OpAvg32u(v, config) + case OpBitLen32: + return rewriteValueARM_OpBitLen32(v, config) case OpBswap32: return rewriteValueARM_OpBswap32(v, config) case OpClosureCall: @@ -1569,6 +1571,31 @@ func rewriteValueARM_OpARMADD(v *Value, config *Config) bool { v.AddArg(y) return true } + // match: (ADD (RSBconst [c] x) (RSBconst [d] y)) + // cond: + // result: (RSBconst [c+d] (ADD x y)) + for { + t := v.Type + v_0 := v.Args[0] + if v_0.Op != OpARMRSBconst { + break + } + c := v_0.AuxInt + x := v_0.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARMRSBconst { + break + } + d := v_1.AuxInt + y := v_1.Args[0] + v.reset(OpARMRSBconst) + v.AuxInt = c + d + 
v0 := b.NewValue0(v.Pos, OpARMADD, t) + v0.AddArg(x) + v0.AddArg(y) + v.AddArg(v0) + return true + } // match: (ADD (MUL x y) a) // cond: // result: (MULA x y a) @@ -13034,6 +13061,23 @@ func rewriteValueARM_OpAvg32u(v *Value, config *Config) bool { return true } } +func rewriteValueARM_OpBitLen32(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (BitLen32 x) + // cond: + // result: (RSBconst [32] (CLZ x)) + for { + t := v.Type + x := v.Args[0] + v.reset(OpARMRSBconst) + v.AuxInt = 32 + v0 := b.NewValue0(v.Pos, OpARMCLZ, t) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} func rewriteValueARM_OpBswap32(v *Value, config *Config) bool { b := v.Block _ = b diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go index f3a7bffcff..10a3589598 100644 --- a/src/cmd/compile/internal/ssa/rewriteARM64.go +++ b/src/cmd/compile/internal/ssa/rewriteARM64.go @@ -250,6 +250,8 @@ func rewriteValueARM64(v *Value, config *Config) bool { return rewriteValueARM64_OpAtomicStorePtrNoWB(v, config) case OpAvg64u: return rewriteValueARM64_OpAvg64u(v, config) + case OpBitLen64: + return rewriteValueARM64_OpBitLen64(v, config) case OpBswap32: return rewriteValueARM64_OpBswap32(v, config) case OpBswap64: @@ -8238,6 +8240,44 @@ func rewriteValueARM64_OpARM64SUB(v *Value, config *Config) bool { v.AuxInt = 0 return true } + // match: (SUB x (SUB y z)) + // cond: + // result: (SUB (ADD x z) y) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARM64SUB { + break + } + y := v_1.Args[0] + z := v_1.Args[1] + v.reset(OpARM64SUB) + v0 := b.NewValue0(v.Pos, OpARM64ADD, v.Type) + v0.AddArg(x) + v0.AddArg(z) + v.AddArg(v0) + v.AddArg(y) + return true + } + // match: (SUB (SUB x y) z) + // cond: + // result: (SUB x (ADD y z)) + for { + v_0 := v.Args[0] + if v_0.Op != OpARM64SUB { + break + } + x := v_0.Args[0] + y := v_0.Args[1] + z := v.Args[1] + v.reset(OpARM64SUB) + v.AddArg(x) + v0 := b.NewValue0(v.Pos, OpARM64ADD, y.Type) + v0.AddArg(y) + v0.AddArg(z) + v.AddArg(v0) + return true + } // match: (SUB x (SLLconst [c] y)) // cond: // result: (SUBshiftLL x y [c]) @@ -9656,6 +9696,24 @@ func rewriteValueARM64_OpAvg64u(v *Value, config *Config) bool { return true } } +func rewriteValueARM64_OpBitLen64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (BitLen64 x) + // cond: + // result: (SUB (MOVDconst [64]) (CLZ x)) + for { + x := v.Args[0] + v.reset(OpARM64SUB) + v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, config.fe.TypeUInt64()) + v0.AuxInt = 64 + v.AddArg(v0) + v1 := b.NewValue0(v.Pos, OpARM64CLZ, config.fe.TypeInt()) + v1.AddArg(x) + v.AddArg(v1) + return true + } +} func rewriteValueARM64_OpBswap32(v *Value, config *Config) bool { b := v.Block _ = b diff --git a/src/cmd/compile/internal/ssa/rewriteMIPS.go b/src/cmd/compile/internal/ssa/rewriteMIPS.go index 1816282a9d..88789d93ee 100644 --- a/src/cmd/compile/internal/ssa/rewriteMIPS.go +++ b/src/cmd/compile/internal/ssa/rewriteMIPS.go @@ -52,6 +52,8 @@ func rewriteValueMIPS(v *Value, config *Config) bool { return rewriteValueMIPS_OpAtomicStorePtrNoWB(v, config) case OpAvg32u: return rewriteValueMIPS_OpAvg32u(v, config) + case OpBitLen32: + return rewriteValueMIPS_OpBitLen32(v, config) case OpClosureCall: return rewriteValueMIPS_OpClosureCall(v, config) case OpCom16: @@ -1007,6 +1009,25 @@ func rewriteValueMIPS_OpAvg32u(v *Value, config *Config) bool { return true } } +func rewriteValueMIPS_OpBitLen32(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (BitLen32 x) + 
// cond: + // result: (SUB (MOVWconst [32]) (CLZ x)) + for { + t := v.Type + x := v.Args[0] + v.reset(OpMIPSSUB) + v0 := b.NewValue0(v.Pos, OpMIPSMOVWconst, config.fe.TypeUInt32()) + v0.AuxInt = 32 + v.AddArg(v0) + v1 := b.NewValue0(v.Pos, OpMIPSCLZ, t) + v1.AddArg(x) + v.AddArg(v1) + return true + } +} func rewriteValueMIPS_OpClosureCall(v *Value, config *Config) bool { b := v.Block _ = b diff --git a/src/cmd/compile/internal/ssa/rewriteS390X.go b/src/cmd/compile/internal/ssa/rewriteS390X.go index cef736515f..a43b98dc74 100644 --- a/src/cmd/compile/internal/ssa/rewriteS390X.go +++ b/src/cmd/compile/internal/ssa/rewriteS390X.go @@ -60,6 +60,8 @@ func rewriteValueS390X(v *Value, config *Config) bool { return rewriteValueS390X_OpAtomicStorePtrNoWB(v, config) case OpAvg64u: return rewriteValueS390X_OpAvg64u(v, config) + case OpBitLen64: + return rewriteValueS390X_OpBitLen64(v, config) case OpBswap32: return rewriteValueS390X_OpBswap32(v, config) case OpBswap64: @@ -1138,6 +1140,24 @@ func rewriteValueS390X_OpAvg64u(v *Value, config *Config) bool { return true } } +func rewriteValueS390X_OpBitLen64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (BitLen64 x) + // cond: + // result: (SUB (MOVDconst [64]) (FLOGR x)) + for { + x := v.Args[0] + v.reset(OpS390XSUB) + v0 := b.NewValue0(v.Pos, OpS390XMOVDconst, config.fe.TypeUInt64()) + v0.AuxInt = 64 + v.AddArg(v0) + v1 := b.NewValue0(v.Pos, OpS390XFLOGR, config.fe.TypeUInt64()) + v1.AddArg(x) + v.AddArg(v1) + return true + } +} func rewriteValueS390X_OpBswap32(v *Value, config *Config) bool { b := v.Block _ = b @@ -14732,6 +14752,28 @@ func rewriteValueS390X_OpS390XNEG(v *Value, config *Config) bool { v.AuxInt = -c return true } + // match: (NEG (ADDconst [c] (NEG x))) + // cond: c != -(1<<31) + // result: (ADDconst [-c] x) + for { + v_0 := v.Args[0] + if v_0.Op != OpS390XADDconst { + break + } + c := v_0.AuxInt + v_0_0 := v_0.Args[0] + if v_0_0.Op != OpS390XNEG { + break + } + x := v_0_0.Args[0] + if !(c != -(1 << 31)) { + break + } + v.reset(OpS390XADDconst) + v.AuxInt = -c + v.AddArg(x) + return true + } return false } func rewriteValueS390X_OpS390XNEGW(v *Value, config *Config) bool { diff --git a/src/cmd/compile/internal/ssa/rewritedec64.go b/src/cmd/compile/internal/ssa/rewritedec64.go index db4a3c02bb..7a4ba21b6a 100644 --- a/src/cmd/compile/internal/ssa/rewritedec64.go +++ b/src/cmd/compile/internal/ssa/rewritedec64.go @@ -14,6 +14,8 @@ func rewriteValuedec64(v *Value, config *Config) bool { return rewriteValuedec64_OpAnd64(v, config) case OpArg: return rewriteValuedec64_OpArg(v, config) + case OpBitLen64: + return rewriteValuedec64_OpBitLen64(v, config) case OpBswap64: return rewriteValuedec64_OpBswap64(v, config) case OpCom64: @@ -278,6 +280,36 @@ func rewriteValuedec64_OpArg(v *Value, config *Config) bool { } return false } +func rewriteValuedec64_OpBitLen64(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (BitLen64 x) + // cond: + // result: (Add32 (BitLen32 (Int64Hi x)) (BitLen32 (Or32 (Int64Lo x) (Zeromask (Int64Hi x))))) + for { + x := v.Args[0] + v.reset(OpAdd32) + v.Type = config.fe.TypeInt() + v0 := b.NewValue0(v.Pos, OpBitLen32, config.fe.TypeInt()) + v1 := b.NewValue0(v.Pos, OpInt64Hi, config.fe.TypeUInt32()) + v1.AddArg(x) + v0.AddArg(v1) + v.AddArg(v0) + v2 := b.NewValue0(v.Pos, OpBitLen32, config.fe.TypeInt()) + v3 := b.NewValue0(v.Pos, OpOr32, config.fe.TypeUInt32()) + v4 := b.NewValue0(v.Pos, OpInt64Lo, config.fe.TypeUInt32()) + v4.AddArg(x) + v3.AddArg(v4) + v5 := b.NewValue0(v.Pos, 
OpZeromask, config.fe.TypeUInt32()) + v6 := b.NewValue0(v.Pos, OpInt64Hi, config.fe.TypeUInt32()) + v6.AddArg(x) + v5.AddArg(v6) + v3.AddArg(v5) + v2.AddArg(v3) + v.AddArg(v2) + return true + } +} func rewriteValuedec64_OpBswap64(v *Value, config *Config) bool { b := v.Block _ = b -- 2.48.1
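
For readers following the new rules, here is what the lowerings compute, sketched as one small plain-Go program (split across the three snippets below). This is an illustrative reference only, not compiler code: bsrq, clz32, and zeromask are hypothetical helpers that model the BSRQ/CLZ instructions and the SSA Zeromask op.

package main

import "fmt"

// bitLen64 spells out the generic BitLen64 semantics: the minimum
// number of bits needed to represent x, with bitLen64(0) == 0. Note
// that LeadingZeros64(x) is just 64 - Len64(x), which is why the
// LeadingZeros functions need no dedicated rules: they inline to a
// subtraction around the Len intrinsic.
func bitLen64(x uint64) int {
	n := 0
	for x != 0 {
		x >>= 1
		n++
	}
	return n
}

// bsrq models the BSRQ instruction: it yields the index of the highest
// set bit plus a flags result; the index is undefined when the input
// is zero, in which case only the flag is meaningful.
func bsrq(x uint64) (idx int64, zero bool) {
	if x == 0 {
		return 0, true
	}
	idx = 63
	for x&(1<<63) == 0 {
		x <<= 1
		idx--
	}
	return idx, false
}

// amd64BitLen64 mirrors the AMD64 rule
//   (BitLen64 x) -> (ADDQconst [1] (CMOVQEQ (Select0 (BSRQ x)) (MOVQconst [-1]) (Select1 (BSRQ x))))
// CMOVQEQ swaps the undefined BSRQ result for -1 when the zero flag is
// set, so the final ADDQconst [1] yields the required 0 for x == 0.
func amd64BitLen64(x uint64) int {
	idx, zero := bsrq(x) // Select0/Select1 of (BSRQ x)
	if zero {            // CMOVQEQ on the flags half of the tuple
		idx = -1 // MOVQconst [-1]
	}
	return int(idx + 1) // ADDQconst [1]
}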
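
The CLZ-based targets (ARM, ARM64, MIPS, and S390X's FLOGR) all rely on the same identity, BitLen(x) = width - CLZ(x), which holds uniformly because those instructions define the count for a zero input as the full word width. Continuing the sketch:

// clz32 models a count-leading-zeros instruction such as ARM/MIPS CLZ,
// with clz32(0) == 32.
func clz32(x uint32) int {
	n := 0
	for mask := uint32(1 << 31); mask != 0 && x&mask == 0; mask >>= 1 {
		n++
	}
	return n
}

// bitLen32 mirrors (BitLen32 x) -> (RSBconst [32] (CLZ x)) on ARM and
// (SUB (MOVWconst [32]) (CLZ x)) on MIPS; ARM64 and S390X apply the
// same identity at width 64 using CLZ and FLOGR respectively.
func bitLen32(x uint32) int {
	return 32 - clz32(x)
}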
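
On 32-bit targets the dec64 rule decomposes the 64-bit operand into words. When the high word is nonzero, lo|zeromask(hi) is all ones, its BitLen32 saturates at 32, and the sum is 32 + BitLen32(hi); when the high word is zero the sum degenerates to BitLen32(lo). The closing snippet cross-checks the three sketches against each other on a few inputs:

// zeromask models the SSA Zeromask op: 0 if v == 0, all ones otherwise.
func zeromask(v uint32) uint32 {
	if v == 0 {
		return 0
	}
	return 0xFFFFFFFF
}

// dec64BitLen64 mirrors the dec64 rule:
//   (BitLen64 x) -> (Add32 (BitLen32 (Int64Hi x))
//                          (BitLen32 (Or32 (Int64Lo x) (Zeromask (Int64Hi x)))))
func dec64BitLen64(x uint64) int {
	hi := uint32(x >> 32) // (Int64Hi x)
	lo := uint32(x)       // (Int64Lo x)
	return bitLen32(hi) + bitLen32(lo|zeromask(hi))
}

func main() {
	for _, x := range []uint64{0, 1, 5, 1 << 16, 1 << 40, ^uint64(0)} {
		a, b, c := bitLen64(x), amd64BitLen64(x), dec64BitLen64(x)
		if a != b || a != c {
			fmt.Printf("mismatch at %#x: %d %d %d\n", x, a, b, c)
		}
		fmt.Printf("Len64(%#x) = %d\n", x, a)
	}
}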