This follows CL 156999, which did the same for arm64.
name old time/op new time/op delta
TrailingZeros-4 7.30ns ± 1% 7.30ns ± 0% ~ (p=0.413 n=9+9)
TrailingZeros8-4 8.32ns ± 0% 7.17ns ± 0% -13.77% (p=0.000 n=10+9)
TrailingZeros16-4 8.30ns ± 0% 7.18ns ± 0% -13.50% (p=0.000 n=9+10)
TrailingZeros32-4 6.46ns ± 1% 6.47ns ± 1% ~ (p=0.325 n=10+10)
TrailingZeros64-4 16.3ns ± 0% 16.2ns ± 0% -0.61% (p=0.000 n=7+10)
Change-Id: I7e9e1abf7e30d811aa474d272b2824ec7cbbaa98
Reviewed-on: https://go-review.googlesource.com/c/go/+/167797
Run-TryBot: Tobias Klauser <tobias.klauser@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
y := s.newValue2(ssa.OpOr32, types.Types[TUINT32], x, c)
return s.newValue1(ssa.OpCtz32, types.Types[TINT], y)
},
- sys.ARM, sys.MIPS)
+ sys.MIPS)
addF("math/bits", "TrailingZeros16",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz16, types.Types[TINT], args[0])
},
- sys.AMD64, sys.ARM64, sys.Wasm)
+ sys.AMD64, sys.ARM, sys.ARM64, sys.Wasm)
addF("math/bits", "TrailingZeros16",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
x := s.newValue1(ssa.OpZeroExt16to64, types.Types[TUINT64], args[0])
y := s.newValue2(ssa.OpOr32, types.Types[TUINT32], x, c)
return s.newValue1(ssa.OpCtz32, types.Types[TINT], y)
},
- sys.ARM, sys.MIPS)
+ sys.MIPS)
addF("math/bits", "TrailingZeros8",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz8, types.Types[TINT], args[0])
},
- sys.AMD64, sys.ARM64, sys.Wasm)
+ sys.AMD64, sys.ARM, sys.ARM64, sys.Wasm)
addF("math/bits", "TrailingZeros8",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
x := s.newValue1(ssa.OpZeroExt8to64, types.Types[TUINT64], args[0])
// TODO: optimize this for ARMv5 and ARMv6
(Ctz32NonZero x) -> (Ctz32 x)
+(Ctz16NonZero x) -> (Ctz32 x)
+(Ctz8NonZero x) -> (Ctz32 x)
// count trailing zero for ARMv5 and ARMv6
// 32 - CLZ(x&-x - 1)
-(Ctz32 <t> x) && objabi.GOARM<=6 -> (RSBconst [32] (CLZ <t> (SUBconst <t> (AND <t> x (RSBconst <t> [0] x)) [1])))
+(Ctz32 <t> x) && objabi.GOARM<=6 ->
+ (RSBconst [32] (CLZ <t> (SUBconst <t> (AND <t> x (RSBconst <t> [0] x)) [1])))
+(Ctz16 <t> x) && objabi.GOARM<=6 ->
+ (RSBconst [32] (CLZ <t> (SUBconst <typ.UInt32> (AND <typ.UInt32> (ORconst <typ.UInt32> [0x10000] x) (RSBconst <typ.UInt32> [0] (ORconst <typ.UInt32> [0x10000] x))) [1])))
+(Ctz8 <t> x) && objabi.GOARM<=6 ->
+ (RSBconst [32] (CLZ <t> (SUBconst <typ.UInt32> (AND <typ.UInt32> (ORconst <typ.UInt32> [0x100] x) (RSBconst <typ.UInt32> [0] (ORconst <typ.UInt32> [0x100] x))) [1])))
// count trailing zero for ARMv7
(Ctz32 <t> x) && objabi.GOARM==7 -> (CLZ <t> (RBIT <t> x))
+(Ctz16 <t> x) && objabi.GOARM==7 -> (CLZ <t> (RBIT <typ.UInt32> (ORconst <typ.UInt32> [0x10000] x)))
+(Ctz8 <t> x) && objabi.GOARM==7 -> (CLZ <t> (RBIT <typ.UInt32> (ORconst <typ.UInt32> [0x100] x)))
// bit length
(BitLen32 <t> x) -> (RSBconst [32] (CLZ <t> x))
return rewriteValueARM_OpConstBool_0(v)
case OpConstNil:
return rewriteValueARM_OpConstNil_0(v)
+ case OpCtz16:
+ return rewriteValueARM_OpCtz16_0(v)
+ case OpCtz16NonZero:
+ return rewriteValueARM_OpCtz16NonZero_0(v)
case OpCtz32:
return rewriteValueARM_OpCtz32_0(v)
case OpCtz32NonZero:
return rewriteValueARM_OpCtz32NonZero_0(v)
+ case OpCtz8:
+ return rewriteValueARM_OpCtz8_0(v)
+ case OpCtz8NonZero:
+ return rewriteValueARM_OpCtz8NonZero_0(v)
case OpCvt32Fto32:
return rewriteValueARM_OpCvt32Fto32_0(v)
case OpCvt32Fto32U:
return true
}
}
+func rewriteValueARM_OpCtz16_0(v *Value) bool { // Rewrites a generic Ctz16 value v in place into ARM ops; reports whether a rule applied.
+ b := v.Block
+ typ := &b.Func.Config.Types // supplies typ.UInt32 for the intermediate values built below
+ // match: (Ctz16 <t> x)
+ // cond: objabi.GOARM<=6
+ // result: (RSBconst [32] (CLZ <t> (SUBconst <typ.UInt32> (AND <typ.UInt32> (ORconst <typ.UInt32> [0x10000] x) (RSBconst <typ.UInt32> [0] (ORconst <typ.UInt32> [0x10000] x))) [1])))
+ for { // single-pass block: every path either breaks to the next rule or returns
+ t := v.Type
+ x := v.Args[0]
+ if !(objabi.GOARM <= 6) {
+ break // this rule only covers GOARM<=6; try the GOARM==7 rule below
+ }
+ v.reset(OpARMRSBconst) // v becomes the outermost op: 32 - CLZ(...)
+ v.AuxInt = 32
+ v0 := b.NewValue0(v.Pos, OpARMCLZ, t) // CLZ keeps the original result type t
+ v1 := b.NewValue0(v.Pos, OpARMSUBconst, typ.UInt32) // (y & -y) - 1, where y = x | 0x10000
+ v1.AuxInt = 1
+ v2 := b.NewValue0(v.Pos, OpARMAND, typ.UInt32) // y & -y
+ v3 := b.NewValue0(v.Pos, OpARMORconst, typ.UInt32) // y = x | 0x10000: bit 16 is always set, so the count cannot exceed 16
+ v3.AuxInt = 0x10000
+ v3.AddArg(x)
+ v2.AddArg(v3)
+ v4 := b.NewValue0(v.Pos, OpARMRSBconst, typ.UInt32) // 0 - y, i.e. -y
+ v4.AuxInt = 0
+ v5 := b.NewValue0(v.Pos, OpARMORconst, typ.UInt32) // second, separately built copy of x | 0x10000 feeding the negation
+ v5.AuxInt = 0x10000
+ v5.AddArg(x)
+ v4.AddArg(v5)
+ v2.AddArg(v4)
+ v1.AddArg(v2)
+ v0.AddArg(v1)
+ v.AddArg(v0)
+ return true
+ }
+ // match: (Ctz16 <t> x)
+ // cond: objabi.GOARM==7
+ // result: (CLZ <t> (RBIT <typ.UInt32> (ORconst <typ.UInt32> [0x10000] x)))
+ for { // single-pass block, same shape as above
+ t := v.Type
+ x := v.Args[0]
+ if !(objabi.GOARM == 7) {
+ break // neither rule applies; fall out to return false
+ }
+ v.reset(OpARMCLZ) // v becomes CLZ of the RBIT value built below
+ v.Type = t
+ v0 := b.NewValue0(v.Pos, OpARMRBIT, typ.UInt32) // RBIT applied to x | 0x10000
+ v1 := b.NewValue0(v.Pos, OpARMORconst, typ.UInt32) // x | 0x10000: bit 16 is always set, so the count cannot exceed 16
+ v1.AuxInt = 0x10000
+ v1.AddArg(x)
+ v0.AddArg(v1)
+ v.AddArg(v0)
+ return true
+ }
+ return false // GOARM matched neither condition; v is left unchanged
+}
+func rewriteValueARM_OpCtz16NonZero_0(v *Value) bool { // Rewrites Ctz16NonZero into Ctz32 on the same argument; always reports true.
+ // match: (Ctz16NonZero x)
+ // cond:
+ // result: (Ctz32 x)
+ for { // the rule has no condition, so the block returns on its only pass
+ x := v.Args[0]
+ v.reset(OpCtz32) // still the generic op, not an ARM one; presumably lowered by the Ctz32 rules on a later pass (confirm)
+ v.AddArg(x) // x is passed through unchanged; no 0x10000 bit is ORed in here
+ return true
+ }
+}
func rewriteValueARM_OpCtz32_0(v *Value) bool {
b := v.Block
// match: (Ctz32 <t> x)
return true
}
}
+func rewriteValueARM_OpCtz8_0(v *Value) bool { // Rewrites a generic Ctz8 value v in place into ARM ops; reports whether a rule applied.
+ b := v.Block
+ typ := &b.Func.Config.Types // supplies typ.UInt32 for the intermediate values built below
+ // match: (Ctz8 <t> x)
+ // cond: objabi.GOARM<=6
+ // result: (RSBconst [32] (CLZ <t> (SUBconst <typ.UInt32> (AND <typ.UInt32> (ORconst <typ.UInt32> [0x100] x) (RSBconst <typ.UInt32> [0] (ORconst <typ.UInt32> [0x100] x))) [1])))
+ for { // single-pass block: every path either breaks to the next rule or returns
+ t := v.Type
+ x := v.Args[0]
+ if !(objabi.GOARM <= 6) {
+ break // this rule only covers GOARM<=6; try the GOARM==7 rule below
+ }
+ v.reset(OpARMRSBconst) // v becomes the outermost op: 32 - CLZ(...)
+ v.AuxInt = 32
+ v0 := b.NewValue0(v.Pos, OpARMCLZ, t) // CLZ keeps the original result type t
+ v1 := b.NewValue0(v.Pos, OpARMSUBconst, typ.UInt32) // (y & -y) - 1, where y = x | 0x100
+ v1.AuxInt = 1
+ v2 := b.NewValue0(v.Pos, OpARMAND, typ.UInt32) // y & -y
+ v3 := b.NewValue0(v.Pos, OpARMORconst, typ.UInt32) // y = x | 0x100: bit 8 is always set, so the count cannot exceed 8
+ v3.AuxInt = 0x100
+ v3.AddArg(x)
+ v2.AddArg(v3)
+ v4 := b.NewValue0(v.Pos, OpARMRSBconst, typ.UInt32) // 0 - y, i.e. -y
+ v4.AuxInt = 0
+ v5 := b.NewValue0(v.Pos, OpARMORconst, typ.UInt32) // second, separately built copy of x | 0x100 feeding the negation
+ v5.AuxInt = 0x100
+ v5.AddArg(x)
+ v4.AddArg(v5)
+ v2.AddArg(v4)
+ v1.AddArg(v2)
+ v0.AddArg(v1)
+ v.AddArg(v0)
+ return true
+ }
+ // match: (Ctz8 <t> x)
+ // cond: objabi.GOARM==7
+ // result: (CLZ <t> (RBIT <typ.UInt32> (ORconst <typ.UInt32> [0x100] x)))
+ for { // single-pass block, same shape as above
+ t := v.Type
+ x := v.Args[0]
+ if !(objabi.GOARM == 7) {
+ break // neither rule applies; fall out to return false
+ }
+ v.reset(OpARMCLZ) // v becomes CLZ of the RBIT value built below
+ v.Type = t
+ v0 := b.NewValue0(v.Pos, OpARMRBIT, typ.UInt32) // RBIT applied to x | 0x100
+ v1 := b.NewValue0(v.Pos, OpARMORconst, typ.UInt32) // x | 0x100: bit 8 is always set, so the count cannot exceed 8
+ v1.AuxInt = 0x100
+ v1.AddArg(x)
+ v0.AddArg(v1)
+ v.AddArg(v0)
+ return true
+ }
+ return false // GOARM matched neither condition; v is left unchanged
+}
+func rewriteValueARM_OpCtz8NonZero_0(v *Value) bool { // Rewrites Ctz8NonZero into Ctz32 on the same argument; always reports true.
+ // match: (Ctz8NonZero x)
+ // cond:
+ // result: (Ctz32 x)
+ for { // the rule has no condition, so the block returns on its only pass
+ x := v.Args[0]
+ v.reset(OpCtz32) // still the generic op, not an ARM one; presumably lowered by the Ctz32 rules on a later pass (confirm)
+ v.AddArg(x) // x is passed through unchanged; no 0x100 bit is ORed in here
+ return true
+ }
+}
func rewriteValueARM_OpCvt32Fto32_0(v *Value) bool {
// match: (Cvt32Fto32 x)
// cond:
func TrailingZeros(n uint) int {
// amd64:"BSFQ","MOVL\t\\$64","CMOVQEQ"
+ // arm:"CLZ"
// arm64:"RBIT","CLZ"
// s390x:"FLOGR"
// ppc64:"ANDN","POPCNTD"
func TrailingZeros32(n uint32) int {
// amd64:"BTSQ\\t\\$32","BSFQ"
+ // arm:"CLZ"
// arm64:"RBITW","CLZW"
// s390x:"FLOGR","MOVWZ"
// ppc64:"ANDN","POPCNTW"
func TrailingZeros16(n uint16) int {
// amd64:"BSFL","BTSL\\t\\$16"
+ // arm:"ORR\t\\$65536","CLZ",-"MOVHU\tR"
// arm64:"ORR\t\\$65536","RBITW","CLZW",-"MOVHU\tR",-"RBIT\t",-"CLZ\t"
// s390x:"FLOGR","OR\t\\$65536"
// ppc64:"POPCNTD","OR\\t\\$65536"
func TrailingZeros8(n uint8) int {
// amd64:"BSFL","BTSL\\t\\$8"
+ // arm:"ORR\t\\$256","CLZ",-"MOVBU\tR"
// arm64:"ORR\t\\$256","RBITW","CLZW",-"MOVBU\tR",-"RBIT\t",-"CLZ\t"
// s390x:"FLOGR","OR\t\\$256"
// wasm:"I64Ctz"