From: Tobias Klauser
Date: Fri, 15 Mar 2019 07:49:38 +0000 (+0100)
Subject: cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm
X-Git-Tag: go1.13beta1~1015
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=156c830bea6795d57ef9eb9bfe66197413c00fce;p=gostls13.git

cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm

This follows CL 156999 which did the same for arm64.

name               old time/op  new time/op  delta
TrailingZeros-4    7.30ns ± 1%  7.30ns ± 0%     ~     (p=0.413 n=9+9)
TrailingZeros8-4   8.32ns ± 0%  7.17ns ± 0%  -13.77%  (p=0.000 n=10+9)
TrailingZeros16-4  8.30ns ± 0%  7.18ns ± 0%  -13.50%  (p=0.000 n=9+10)
TrailingZeros32-4  6.46ns ± 1%  6.47ns ± 1%     ~     (p=0.325 n=10+10)
TrailingZeros64-4  16.3ns ± 0%  16.2ns ± 0%   -0.61%  (p=0.000 n=7+10)

Change-Id: I7e9e1abf7e30d811aa474d272b2824ec7cbbaa98
Reviewed-on: https://go-review.googlesource.com/c/go/+/167797
Run-TryBot: Tobias Klauser
TryBot-Result: Gobot Gobot
Reviewed-by: Cherry Zhang
---

diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go index 62301642f5..3ccb59e105 100644 --- a/src/cmd/compile/internal/gc/ssa.go +++ b/src/cmd/compile/internal/gc/ssa.go @@ -3288,12 +3288,12 @@ func init() { y := s.newValue2(ssa.OpOr32, types.Types[TUINT32], x, c) return s.newValue1(ssa.OpCtz32, types.Types[TINT], y) }, - sys.ARM, sys.MIPS) + sys.MIPS) addF("math/bits", "TrailingZeros16", func(s *state, n *Node, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpCtz16, types.Types[TINT], args[0]) }, - sys.AMD64, sys.ARM64, sys.Wasm) + sys.AMD64, sys.ARM, sys.ARM64, sys.Wasm) addF("math/bits", "TrailingZeros16", func(s *state, n *Node, args []*ssa.Value) *ssa.Value { x := s.newValue1(ssa.OpZeroExt16to64, types.Types[TUINT64], args[0]) @@ -3309,12 +3309,12 @@ func init() { y := s.newValue2(ssa.OpOr32, types.Types[TUINT32], x, c) return s.newValue1(ssa.OpCtz32, types.Types[TINT], y) }, - sys.ARM, sys.MIPS) + sys.MIPS) addF("math/bits", "TrailingZeros8", 
func(s *state, n *Node, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpCtz8, types.Types[TINT], args[0]) }, - sys.AMD64, sys.ARM64, sys.Wasm) + sys.AMD64, sys.ARM, sys.ARM64, sys.Wasm) addF("math/bits", "TrailingZeros8", func(s *state, n *Node, args []*ssa.Value) *ssa.Value { x := s.newValue1(ssa.OpZeroExt8to64, types.Types[TUINT64], args[0]) diff --git a/src/cmd/compile/internal/ssa/gen/ARM.rules b/src/cmd/compile/internal/ssa/gen/ARM.rules index db418b76a6..a3f36d3009 100644 --- a/src/cmd/compile/internal/ssa/gen/ARM.rules +++ b/src/cmd/compile/internal/ssa/gen/ARM.rules @@ -59,13 +59,22 @@ // TODO: optimize this for ARMv5 and ARMv6 (Ctz32NonZero x) -> (Ctz32 x) +(Ctz16NonZero x) -> (Ctz32 x) +(Ctz8NonZero x) -> (Ctz32 x) // count trailing zero for ARMv5 and ARMv6 // 32 - CLZ(x&-x - 1) -(Ctz32 x) && objabi.GOARM<=6 -> (RSBconst [32] (CLZ (SUBconst (AND x (RSBconst [0] x)) [1]))) +(Ctz32 x) && objabi.GOARM<=6 -> + (RSBconst [32] (CLZ (SUBconst (AND x (RSBconst [0] x)) [1]))) +(Ctz16 x) && objabi.GOARM<=6 -> + (RSBconst [32] (CLZ (SUBconst (AND (ORconst [0x10000] x) (RSBconst [0] (ORconst [0x10000] x))) [1]))) +(Ctz8 x) && objabi.GOARM<=6 -> + (RSBconst [32] (CLZ (SUBconst (AND (ORconst [0x100] x) (RSBconst [0] (ORconst [0x100] x))) [1]))) // count trailing zero for ARMv7 (Ctz32 x) && objabi.GOARM==7 -> (CLZ (RBIT x)) +(Ctz16 x) && objabi.GOARM==7 -> (CLZ (RBIT (ORconst [0x10000] x))) +(Ctz8 x) && objabi.GOARM==7 -> (CLZ (RBIT (ORconst [0x100] x))) // bit length (BitLen32 x) -> (RSBconst [32] (CLZ x)) diff --git a/src/cmd/compile/internal/ssa/rewriteARM.go b/src/cmd/compile/internal/ssa/rewriteARM.go index e6635ad6b5..37a34a9977 100644 --- a/src/cmd/compile/internal/ssa/rewriteARM.go +++ b/src/cmd/compile/internal/ssa/rewriteARM.go @@ -483,10 +483,18 @@ func rewriteValueARM(v *Value) bool { return rewriteValueARM_OpConstBool_0(v) case OpConstNil: return rewriteValueARM_OpConstNil_0(v) + case OpCtz16: + return rewriteValueARM_OpCtz16_0(v) + case 
OpCtz16NonZero: + return rewriteValueARM_OpCtz16NonZero_0(v) case OpCtz32: return rewriteValueARM_OpCtz32_0(v) case OpCtz32NonZero: return rewriteValueARM_OpCtz32NonZero_0(v) + case OpCtz8: + return rewriteValueARM_OpCtz8_0(v) + case OpCtz8NonZero: + return rewriteValueARM_OpCtz8NonZero_0(v) case OpCvt32Fto32: return rewriteValueARM_OpCvt32Fto32_0(v) case OpCvt32Fto32U: @@ -17550,6 +17558,72 @@ func rewriteValueARM_OpConstNil_0(v *Value) bool { return true } } +func rewriteValueARM_OpCtz16_0(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Ctz16 x) + // cond: objabi.GOARM<=6 + // result: (RSBconst [32] (CLZ (SUBconst (AND (ORconst [0x10000] x) (RSBconst [0] (ORconst [0x10000] x))) [1]))) + for { + t := v.Type + x := v.Args[0] + if !(objabi.GOARM <= 6) { + break + } + v.reset(OpARMRSBconst) + v.AuxInt = 32 + v0 := b.NewValue0(v.Pos, OpARMCLZ, t) + v1 := b.NewValue0(v.Pos, OpARMSUBconst, typ.UInt32) + v1.AuxInt = 1 + v2 := b.NewValue0(v.Pos, OpARMAND, typ.UInt32) + v3 := b.NewValue0(v.Pos, OpARMORconst, typ.UInt32) + v3.AuxInt = 0x10000 + v3.AddArg(x) + v2.AddArg(v3) + v4 := b.NewValue0(v.Pos, OpARMRSBconst, typ.UInt32) + v4.AuxInt = 0 + v5 := b.NewValue0(v.Pos, OpARMORconst, typ.UInt32) + v5.AuxInt = 0x10000 + v5.AddArg(x) + v4.AddArg(v5) + v2.AddArg(v4) + v1.AddArg(v2) + v0.AddArg(v1) + v.AddArg(v0) + return true + } + // match: (Ctz16 x) + // cond: objabi.GOARM==7 + // result: (CLZ (RBIT (ORconst [0x10000] x))) + for { + t := v.Type + x := v.Args[0] + if !(objabi.GOARM == 7) { + break + } + v.reset(OpARMCLZ) + v.Type = t + v0 := b.NewValue0(v.Pos, OpARMRBIT, typ.UInt32) + v1 := b.NewValue0(v.Pos, OpARMORconst, typ.UInt32) + v1.AuxInt = 0x10000 + v1.AddArg(x) + v0.AddArg(v1) + v.AddArg(v0) + return true + } + return false +} +func rewriteValueARM_OpCtz16NonZero_0(v *Value) bool { + // match: (Ctz16NonZero x) + // cond: + // result: (Ctz32 x) + for { + x := v.Args[0] + v.reset(OpCtz32) + v.AddArg(x) + return true + } +} func 
rewriteValueARM_OpCtz32_0(v *Value) bool { b := v.Block // match: (Ctz32 x) @@ -17606,6 +17680,72 @@ func rewriteValueARM_OpCtz32NonZero_0(v *Value) bool { return true } } +func rewriteValueARM_OpCtz8_0(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Ctz8 x) + // cond: objabi.GOARM<=6 + // result: (RSBconst [32] (CLZ (SUBconst (AND (ORconst [0x100] x) (RSBconst [0] (ORconst [0x100] x))) [1]))) + for { + t := v.Type + x := v.Args[0] + if !(objabi.GOARM <= 6) { + break + } + v.reset(OpARMRSBconst) + v.AuxInt = 32 + v0 := b.NewValue0(v.Pos, OpARMCLZ, t) + v1 := b.NewValue0(v.Pos, OpARMSUBconst, typ.UInt32) + v1.AuxInt = 1 + v2 := b.NewValue0(v.Pos, OpARMAND, typ.UInt32) + v3 := b.NewValue0(v.Pos, OpARMORconst, typ.UInt32) + v3.AuxInt = 0x100 + v3.AddArg(x) + v2.AddArg(v3) + v4 := b.NewValue0(v.Pos, OpARMRSBconst, typ.UInt32) + v4.AuxInt = 0 + v5 := b.NewValue0(v.Pos, OpARMORconst, typ.UInt32) + v5.AuxInt = 0x100 + v5.AddArg(x) + v4.AddArg(v5) + v2.AddArg(v4) + v1.AddArg(v2) + v0.AddArg(v1) + v.AddArg(v0) + return true + } + // match: (Ctz8 x) + // cond: objabi.GOARM==7 + // result: (CLZ (RBIT (ORconst [0x100] x))) + for { + t := v.Type + x := v.Args[0] + if !(objabi.GOARM == 7) { + break + } + v.reset(OpARMCLZ) + v.Type = t + v0 := b.NewValue0(v.Pos, OpARMRBIT, typ.UInt32) + v1 := b.NewValue0(v.Pos, OpARMORconst, typ.UInt32) + v1.AuxInt = 0x100 + v1.AddArg(x) + v0.AddArg(v1) + v.AddArg(v0) + return true + } + return false +} +func rewriteValueARM_OpCtz8NonZero_0(v *Value) bool { + // match: (Ctz8NonZero x) + // cond: + // result: (Ctz32 x) + for { + x := v.Args[0] + v.reset(OpCtz32) + v.AddArg(x) + return true + } +} func rewriteValueARM_OpCvt32Fto32_0(v *Value) bool { // match: (Cvt32Fto32 x) // cond: diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go index 9a4051a0ce..3d5f1f64c8 100644 --- a/test/codegen/mathbits.go +++ b/test/codegen/mathbits.go @@ -258,6 +258,7 @@ func RotateLeftVariable32(n uint32, m int) uint32 { func 
TrailingZeros(n uint) int { // amd64:"BSFQ","MOVL\t\\$64","CMOVQEQ" + // arm:"CLZ" // arm64:"RBIT","CLZ" // s390x:"FLOGR" // ppc64:"ANDN","POPCNTD" @@ -278,6 +279,7 @@ func TrailingZeros64(n uint64) int { func TrailingZeros32(n uint32) int { // amd64:"BTSQ\\t\\$32","BSFQ" + // arm:"CLZ" // arm64:"RBITW","CLZW" // s390x:"FLOGR","MOVWZ" // ppc64:"ANDN","POPCNTW" @@ -288,6 +290,7 @@ func TrailingZeros32(n uint32) int { func TrailingZeros16(n uint16) int { // amd64:"BSFL","BTSL\\t\\$16" + // arm:"ORR\t\\$65536","CLZ",-"MOVHU\tR" // arm64:"ORR\t\\$65536","RBITW","CLZW",-"MOVHU\tR",-"RBIT\t",-"CLZ\t" // s390x:"FLOGR","OR\t\\$65536" // ppc64:"POPCNTD","OR\\t\\$65536" @@ -298,6 +301,7 @@ func TrailingZeros16(n uint16) int { func TrailingZeros8(n uint8) int { // amd64:"BSFL","BTSL\\t\\$8" + // arm:"ORR\t\\$256","CLZ",-"MOVBU\tR" // arm64:"ORR\t\\$256","RBITW","CLZW",-"MOVBU\tR",-"RBIT\t",-"CLZ\t" // s390x:"FLOGR","OR\t\\$256" // wasm:"I64Ctz"