From dd91269b7c470d07ba0efe1abab85011f41e38bc Mon Sep 17 00:00:00 2001 From: erifan01 Date: Wed, 2 Jan 2019 09:14:26 +0000 Subject: [PATCH] cmd/compile: optimize math/bits Len32 intrinsic on arm64 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Arm64 has a 32-bit CLZ instruction CLZW, which can be used for intrinsic Len32. Function LeadingZeros32 calls Len32, with this change, the assembly code of LeadingZeros32 becomes more concise. Go code: func f32(x uint32) { z = bits.LeadingZeros32(x) } Before: "".f32 STEXT size=32 args=0x8 locals=0x0 leaf 0x0000 00000 (test.go:7) TEXT "".f32(SB), LEAF|NOFRAME|ABIInternal, $0-8 0x0004 00004 (test.go:7) MOVWU "".x(FP), R0 0x0008 00008 ($GOROOT/src/math/bits/bits.go:30) CLZ R0, R0 0x000c 00012 ($GOROOT/src/math/bits/bits.go:30) SUB $32, R0, R0 0x0010 00016 (test.go:7) MOVD R0, "".z(SB) 0x001c 00028 (test.go:7) RET (R30) After: "".f32 STEXT size=32 args=0x8 locals=0x0 leaf 0x0000 00000 (test.go:7) TEXT "".f32(SB), LEAF|NOFRAME|ABIInternal, $0-8 0x0004 00004 (test.go:7) MOVWU "".x(FP), R0 0x0008 00008 ($GOROOT/src/math/bits/bits.go:30) CLZW R0, R0 0x000c 00012 (test.go:7) MOVD R0, "".z(SB) 0x0018 00024 (test.go:7) RET (R30) Benchmarks: name old time/op new time/op delta LeadingZeros-8 2.53ns ± 0% 2.55ns ± 0% +0.67% (p=0.000 n=10+10) LeadingZeros8-8 3.56ns ± 0% 3.56ns ± 0% ~ (all equal) LeadingZeros16-8 3.55ns ± 0% 3.56ns ± 0% ~ (p=0.465 n=10+10) LeadingZeros32-8 3.55ns ± 0% 2.96ns ± 0% -16.71% (p=0.000 n=10+7) LeadingZeros64-8 2.53ns ± 0% 2.54ns ± 0% ~ (p=0.059 n=8+10) Change-Id: Ie5666bb82909e341060e02ffd4e86c0e5d67e90a Reviewed-on: https://go-review.googlesource.com/c/157000 Run-TryBot: Cherry Zhang TryBot-Result: Gobot Gobot Reviewed-by: Cherry Zhang --- src/cmd/compile/internal/gc/ssa.go | 4 ++-- src/cmd/compile/internal/ssa/gen/ARM64.rules | 1 + src/cmd/compile/internal/ssa/rewriteARM64.go | 22 ++++++++++++++++++++ test/codegen/mathbits.go | 2 +- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go index c8befa40cd..95904edd6a 100644 --- a/src/cmd/compile/internal/gc/ssa.go +++ b/src/cmd/compile/internal/gc/ssa.go @@ -3327,7 +3327,7 @@ func init() { func(s *state, n *Node, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpBitLen32, types.Types[TINT], args[0]) }, - sys.AMD64) + sys.AMD64, sys.ARM64) addF("math/bits", "Len32", func(s *state, n *Node, args []*ssa.Value) *ssa.Value { if s.config.PtrSize == 4 { @@ -3336,7 +3336,7 @@ func init() { x := s.newValue1(ssa.OpZeroExt32to64, types.Types[TUINT64], args[0]) return s.newValue1(ssa.OpBitLen64, types.Types[TINT], x) }, - sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64) + sys.ARM, sys.S390X, sys.MIPS, sys.PPC64) addF("math/bits", "Len16", func(s *state, n *Node, args []*ssa.Value) *ssa.Value { if s.config.PtrSize == 4 { diff --git a/src/cmd/compile/internal/ssa/gen/ARM64.rules b/src/cmd/compile/internal/ssa/gen/ARM64.rules index 1efce66016..fc806f75a0 100644 --- a/src/cmd/compile/internal/ssa/gen/ARM64.rules +++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules @@ -123,6 +123,7 @@ (FMOVSload [off] {sym} ptr (MOVWstore [off] {sym} ptr val _)) -> (FMOVSgpfp val) (BitLen64 x) -> (SUB (MOVDconst [64]) (CLZ x)) +(BitLen32 x) -> (SUB (MOVDconst [32]) (CLZW x)) (Bswap64 x) -> (REV x) (Bswap32 x) -> (REVW x) diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go index 2afd0f335e..05b8b9c697 100644 --- a/src/cmd/compile/internal/ssa/rewriteARM64.go +++ b/src/cmd/compile/internal/ssa/rewriteARM64.go @@ -427,6 +427,8 @@ func rewriteValueARM64(v *Value) bool { return rewriteValueARM64_OpAtomicStorePtrNoWB_0(v) case OpAvg64u: return rewriteValueARM64_OpAvg64u_0(v) + case OpBitLen32: + return rewriteValueARM64_OpBitLen32_0(v) case OpBitLen64: return rewriteValueARM64_OpBitLen64_0(v) case OpBitRev16: @@ -32715,6 +32717,26 @@ func rewriteValueARM64_OpAvg64u_0(v *Value) bool { return true } } +func rewriteValueARM64_OpBitLen32_0(v *Value) bool { + b := v.Block + _ = b + typ := &b.Func.Config.Types + _ = typ + // match: (BitLen32 x) + // cond: + // result: (SUB (MOVDconst [32]) (CLZW x)) + for { + x := v.Args[0] + v.reset(OpARM64SUB) + v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64) + v0.AuxInt = 32 + v.AddArg(v0) + v1 := b.NewValue0(v.Pos, OpARM64CLZW, typ.Int) + v1.AddArg(x) + v.AddArg(v1) + return true + } +} func rewriteValueARM64_OpBitLen64_0(v *Value) bool { b := v.Block _ = b diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go index 44ab2c02b7..d8b1775b0f 100644 --- a/test/codegen/mathbits.go +++ b/test/codegen/mathbits.go @@ -31,7 +31,7 @@ func LeadingZeros64(n uint64) int { func LeadingZeros32(n uint32) int { // amd64:"BSRQ","LEAQ",-"CMOVQEQ" // s390x:"FLOGR" - // arm:"CLZ" arm64:"CLZ" + // arm:"CLZ" arm64:"CLZW" // mips:"CLZ" return bits.LeadingZeros32(n) } -- 2.50.0