Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: eliminate unnecessary type conversions in TrailingZeros(16|8) for arm64
author: erifan01 <eric.fang@arm.com>
Thu, 3 Jan 2019 09:25:06 +0000 (09:25 +0000)
committer: Cherry Zhang <cherryyz@google.com>
Thu, 7 Mar 2019 14:24:56 +0000 (14:24 +0000)
This CL eliminates unnecessary type conversion operations: OpZeroExt16to64 and OpZeroExt8to64.
If the input argument is known to be nonzero, then the ORconst operation can also be eliminated.

Benchmarks:

name               old time/op  new time/op  delta
TrailingZeros-8    2.75ns ± 0%  2.75ns ± 0%     ~     (all equal)
TrailingZeros8-8   3.49ns ± 1%  2.93ns ± 0%  -16.00%  (p=0.000 n=10+10)
TrailingZeros16-8  3.49ns ± 1%  2.93ns ± 0%  -16.05%  (p=0.000 n=9+10)
TrailingZeros32-8  2.67ns ± 1%  2.68ns ± 1%     ~     (p=0.468 n=10+10)
TrailingZeros64-8  2.67ns ± 1%  2.65ns ± 0%   -0.62%  (p=0.022 n=10+9)

code:

func f16(x uint) { z = bits.TrailingZeros16(uint16(x)) }

Before:

"".f16 STEXT size=48 args=0x8 locals=0x0 leaf
        0x0000 00000 (test.go:7)        TEXT    "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
        0x0000 00000 (test.go:7)        FUNCDATA        ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        PCDATA  $2, ZR
        0x0000 00000 (test.go:7)        PCDATA  ZR, ZR
        0x0000 00000 (test.go:7)        MOVD    "".x(FP), R0
        0x0004 00004 (test.go:7)        MOVHU   R0, R0
        0x0008 00008 (test.go:7)        ORR     $65536, R0, R0
        0x000c 00012 (test.go:7)        RBIT    R0, R0
        0x0010 00016 (test.go:7)        CLZ     R0, R0
        0x0014 00020 (test.go:7)        MOVD    R0, "".z(SB)
        0x0020 00032 (test.go:7)        RET     (R30)

This line of code is unnecessary:
        0x0004 00004 (test.go:7)        MOVHU   R0, R0

After:

"".f16 STEXT size=32 args=0x8 locals=0x0 leaf
        0x0000 00000 (test.go:7)        TEXT    "".f16(SB), LEAF|NOFRAME|ABIInternal, $0-8
        0x0000 00000 (test.go:7)        FUNCDATA        ZR, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        FUNCDATA        $3, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
        0x0000 00000 (test.go:7)        PCDATA  $2, ZR
        0x0000 00000 (test.go:7)        PCDATA  ZR, ZR
        0x0000 00000 (test.go:7)        MOVD    "".x(FP), R0
        0x0004 00004 (test.go:7)        ORR     $65536, R0, R0
        0x0008 00008 (test.go:7)        RBITW   R0, R0
        0x000c 00012 (test.go:7)        CLZW    R0, R0
        0x0010 00016 (test.go:7)        MOVD    R0, "".z(SB)
        0x001c 00028 (test.go:7)        RET     (R30)

TrailingZeros8 is handled in the same way as TrailingZeros16.

Change-Id: I473bdca06be8460a0be87abbae6fe640017e4c9d
Reviewed-on: https://go-review.googlesource.com/c/go/+/156999
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>

src/cmd/compile/internal/gc/ssa.go
src/cmd/compile/internal/ssa/gen/ARM64.rules
src/cmd/compile/internal/ssa/rewriteARM64.go
test/codegen/mathbits.go

index e03988dac26de2e925596e808831fd01cfb189d5..3f4355c387cc9790821f098048b29afca735eed4 100644 (file)
@@ -3290,7 +3290,7 @@ func init() {
                func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
                        return s.newValue1(ssa.OpCtz16, types.Types[TINT], args[0])
                },
-               sys.AMD64)
+               sys.AMD64, sys.ARM64)
        addF("math/bits", "TrailingZeros16",
                func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
                        x := s.newValue1(ssa.OpZeroExt16to64, types.Types[TUINT64], args[0])
@@ -3298,7 +3298,7 @@ func init() {
                        y := s.newValue2(ssa.OpOr64, types.Types[TUINT64], x, c)
                        return s.newValue1(ssa.OpCtz64, types.Types[TINT], y)
                },
-               sys.ARM64, sys.S390X, sys.PPC64)
+               sys.S390X, sys.PPC64)
        addF("math/bits", "TrailingZeros8",
                func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
                        x := s.newValue1(ssa.OpZeroExt8to32, types.Types[TUINT32], args[0])
@@ -3311,7 +3311,7 @@ func init() {
                func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
                        return s.newValue1(ssa.OpCtz8, types.Types[TINT], args[0])
                },
-               sys.AMD64)
+               sys.AMD64, sys.ARM64)
        addF("math/bits", "TrailingZeros8",
                func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
                        x := s.newValue1(ssa.OpZeroExt8to64, types.Types[TUINT64], args[0])
@@ -3319,7 +3319,7 @@ func init() {
                        y := s.newValue2(ssa.OpOr64, types.Types[TUINT64], x, c)
                        return s.newValue1(ssa.OpCtz64, types.Types[TINT], y)
                },
-               sys.ARM64, sys.S390X)
+               sys.S390X)
        alias("math/bits", "ReverseBytes64", "runtime/internal/sys", "Bswap64", all...)
        alias("math/bits", "ReverseBytes32", "runtime/internal/sys", "Bswap32", all...)
        // ReverseBytes inlines correctly, no need to intrinsify it.
index ca123d7375095a4b9a9c20c4b93ec088d1aae2a4..6e0420983acef2fe30ed6d1a54f68693165395be 100644 (file)
 
 (Ctz64NonZero x) -> (Ctz64 x)
 (Ctz32NonZero x) -> (Ctz32 x)
+(Ctz16NonZero x) -> (Ctz32 x)
+(Ctz8NonZero x) -> (Ctz32 x)
 
 (Ctz64 <t> x) -> (CLZ (RBIT <t> x))
 (Ctz32 <t> x) -> (CLZW (RBITW <t> x))
+(Ctz16 <t> x) -> (CLZW <t> (RBITW <typ.UInt32> (ORconst <typ.UInt32> [0x10000] x)))
+(Ctz8 <t> x) -> (CLZW <t> (RBITW <typ.UInt32> (ORconst <typ.UInt32> [0x100] x)))
 
 (PopCount64 <t> x) -> (FMOVDfpgp <t> (VUADDLV <typ.Float64> (VCNT <typ.Float64> (FMOVDgpfp <typ.Float64> x))))
 (PopCount32 <t> x) -> (FMOVDfpgp <t> (VUADDLV <typ.Float64> (VCNT <typ.Float64> (FMOVDgpfp <typ.Float64> (ZeroExt32to64 x)))))
index 25246ce5e560de1ec24eee5394000c8a06a7e3b3..24f392a43eeafd28e13daf0c0fd551448e1e1ec4 100644 (file)
@@ -473,6 +473,10 @@ func rewriteValueARM64(v *Value) bool {
                return rewriteValueARM64_OpConstBool_0(v)
        case OpConstNil:
                return rewriteValueARM64_OpConstNil_0(v)
+       case OpCtz16:
+               return rewriteValueARM64_OpCtz16_0(v)
+       case OpCtz16NonZero:
+               return rewriteValueARM64_OpCtz16NonZero_0(v)
        case OpCtz32:
                return rewriteValueARM64_OpCtz32_0(v)
        case OpCtz32NonZero:
@@ -481,6 +485,10 @@ func rewriteValueARM64(v *Value) bool {
                return rewriteValueARM64_OpCtz64_0(v)
        case OpCtz64NonZero:
                return rewriteValueARM64_OpCtz64NonZero_0(v)
+       case OpCtz8:
+               return rewriteValueARM64_OpCtz8_0(v)
+       case OpCtz8NonZero:
+               return rewriteValueARM64_OpCtz8NonZero_0(v)
        case OpCvt32Fto32:
                return rewriteValueARM64_OpCvt32Fto32_0(v)
        case OpCvt32Fto32U:
@@ -33182,6 +33190,39 @@ func rewriteValueARM64_OpConstNil_0(v *Value) bool {
                return true
        }
 }
+func rewriteValueARM64_OpCtz16_0(v *Value) bool {
+       b := v.Block
+       _ = b
+       typ := &b.Func.Config.Types
+       _ = typ
+       // match: (Ctz16 <t> x)
+       // cond:
+       // result: (CLZW <t> (RBITW <typ.UInt32> (ORconst <typ.UInt32> [0x10000] x)))
+       for {
+               t := v.Type
+               x := v.Args[0]
+               v.reset(OpARM64CLZW)
+               v.Type = t
+               v0 := b.NewValue0(v.Pos, OpARM64RBITW, typ.UInt32)
+               v1 := b.NewValue0(v.Pos, OpARM64ORconst, typ.UInt32)
+               v1.AuxInt = 0x10000
+               v1.AddArg(x)
+               v0.AddArg(v1)
+               v.AddArg(v0)
+               return true
+       }
+}
+func rewriteValueARM64_OpCtz16NonZero_0(v *Value) bool {
+       // match: (Ctz16NonZero x)
+       // cond:
+       // result: (Ctz32 x)
+       for {
+               x := v.Args[0]
+               v.reset(OpCtz32)
+               v.AddArg(x)
+               return true
+       }
+}
 func rewriteValueARM64_OpCtz32_0(v *Value) bool {
        b := v.Block
        _ = b
@@ -33236,6 +33277,39 @@ func rewriteValueARM64_OpCtz64NonZero_0(v *Value) bool {
                return true
        }
 }
+func rewriteValueARM64_OpCtz8_0(v *Value) bool {
+       b := v.Block
+       _ = b
+       typ := &b.Func.Config.Types
+       _ = typ
+       // match: (Ctz8 <t> x)
+       // cond:
+       // result: (CLZW <t> (RBITW <typ.UInt32> (ORconst <typ.UInt32> [0x100] x)))
+       for {
+               t := v.Type
+               x := v.Args[0]
+               v.reset(OpARM64CLZW)
+               v.Type = t
+               v0 := b.NewValue0(v.Pos, OpARM64RBITW, typ.UInt32)
+               v1 := b.NewValue0(v.Pos, OpARM64ORconst, typ.UInt32)
+               v1.AuxInt = 0x100
+               v1.AddArg(x)
+               v0.AddArg(v1)
+               v.AddArg(v0)
+               return true
+       }
+}
+func rewriteValueARM64_OpCtz8NonZero_0(v *Value) bool {
+       // match: (Ctz8NonZero x)
+       // cond:
+       // result: (Ctz32 x)
+       for {
+               x := v.Args[0]
+               v.reset(OpCtz32)
+               v.AddArg(x)
+               return true
+       }
+}
 func rewriteValueARM64_OpCvt32Fto32_0(v *Value) bool {
        // match: (Cvt32Fto32 x)
        // cond:
index 09939bb6bea0102641a5cf9d9f8a5e3daa9b1a5b..c77b66c3f7eda2887a1f97727d53386254b5be04 100644 (file)
@@ -242,6 +242,7 @@ func RotateLeftVariable32(n uint32, m int) uint32 {
 
 func TrailingZeros(n uint) int {
        // amd64:"BSFQ","MOVL\t\\$64","CMOVQEQ"
+       // arm64:"RBIT","CLZ"
        // s390x:"FLOGR"
        // ppc64:"ANDN","POPCNTD"
        // ppc64le:"ANDN","POPCNTD"
@@ -250,6 +251,7 @@ func TrailingZeros(n uint) int {
 
 func TrailingZeros64(n uint64) int {
        // amd64:"BSFQ","MOVL\t\\$64","CMOVQEQ"
+       // arm64:"RBIT","CLZ"
        // s390x:"FLOGR"
        // ppc64:"ANDN","POPCNTD"
        // ppc64le:"ANDN","POPCNTD"
@@ -258,6 +260,7 @@ func TrailingZeros64(n uint64) int {
 
 func TrailingZeros32(n uint32) int {
        // amd64:"BTSQ\\t\\$32","BSFQ"
+       // arm64:"RBITW","CLZW"
        // s390x:"FLOGR","MOVWZ"
        // ppc64:"ANDN","POPCNTW"
        // ppc64le:"ANDN","POPCNTW"
@@ -266,6 +269,7 @@ func TrailingZeros32(n uint32) int {
 
 func TrailingZeros16(n uint16) int {
        // amd64:"BSFL","BTSL\\t\\$16"
+       // arm64:"ORR\t\\$65536","RBITW","CLZW",-"MOVHU\tR",-"RBIT\t",-"CLZ\t"
        // s390x:"FLOGR","OR\t\\$65536"
        // ppc64:"POPCNTD","OR\\t\\$65536"
        // ppc64le:"POPCNTD","OR\\t\\$65536"
@@ -274,6 +278,7 @@ func TrailingZeros16(n uint16) int {
 
 func TrailingZeros8(n uint8) int {
        // amd64:"BSFL","BTSL\\t\\$8"
+       // arm64:"ORR\t\\$256","RBITW","CLZW",-"MOVBU\tR",-"RBIT\t",-"CLZ\t"
        // s390x:"FLOGR","OR\t\\$256"
        return bits.TrailingZeros8(n)
 }
@@ -314,6 +319,7 @@ func IterateBits16(n uint16) int {
        i := 0
        for n != 0 {
                // amd64:"BSFL",-"BTSL"
+               // arm64:"RBITW","CLZW",-"ORR"
                i += bits.TrailingZeros16(n)
                n &= n - 1
        }
@@ -324,6 +330,7 @@ func IterateBits8(n uint8) int {
        i := 0
        for n != 0 {
                // amd64:"BSFL",-"BTSL"
+               // arm64:"RBITW","CLZW",-"ORR"
                i += bits.TrailingZeros8(n)
                n &= n - 1
        }