Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: intrinsics for math/bits.OnesCount
author Keith Randall <keithr@alum.mit.edu>
Fri, 17 Mar 2017 04:33:03 +0000 (21:33 -0700)
committer Keith Randall <khr@golang.org>
Tue, 4 Apr 2017 02:40:11 +0000 (02:40 +0000)
Popcount instructions on amd64 are not guaranteed to be
present, so we must guard their call.  Rewrite rules can't
generate control flow at the moment, so the intrinsifier
needs to generate that code.

name           old time/op  new time/op  delta
OnesCount-8    2.47ns ± 5%  1.04ns ± 2%  -57.70%  (p=0.000 n=10+10)
OnesCount16-8  1.05ns ± 1%  0.78ns ± 0%  -25.56%    (p=0.000 n=9+8)
OnesCount32-8  1.63ns ± 5%  1.04ns ± 2%  -35.96%  (p=0.000 n=10+10)
OnesCount64-8  2.45ns ± 0%  1.04ns ± 1%  -57.55%   (p=0.000 n=6+10)

Update #18616

Change-Id: I4aff2cc9aa93787898d7b22055fe272a7cf95673
Reviewed-on: https://go-review.googlesource.com/38320
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Robert Griesemer <gri@golang.org>
src/cmd/compile/internal/amd64/ssa.go
src/cmd/compile/internal/gc/asm_test.go
src/cmd/compile/internal/gc/builtin.go
src/cmd/compile/internal/gc/builtin/runtime.go
src/cmd/compile/internal/gc/ssa.go
src/cmd/compile/internal/ssa/gen/AMD64.rules
src/cmd/compile/internal/ssa/gen/AMD64Ops.go
src/cmd/compile/internal/ssa/gen/genericOps.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteAMD64.go

index 20fc49c926ff8c24289ea817c928e9f1ed2a5b37..4faad77a65240ae0dc0f9eda021eca357dd482da 100644 (file)
@@ -767,6 +767,21 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                p.From.Reg = v.Args[0].Reg()
                p.To.Type = obj.TYPE_REG
                p.To.Reg = v.Reg()
+       case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL:
+               if v.Args[0].Reg() != v.Reg() {
+                       // POPCNT on Intel has a false dependency on the destination register.
+                       // Zero the destination to break the dependency.
+                       p := s.Prog(x86.AMOVQ)
+                       p.From.Type = obj.TYPE_CONST
+                       p.From.Offset = 0
+                       p.To.Type = obj.TYPE_REG
+                       p.To.Reg = v.Reg()
+               }
+               p := s.Prog(v.Op.Asm())
+               p.From.Type = obj.TYPE_REG
+               p.From.Reg = v.Args[0].Reg()
+               p.To.Type = obj.TYPE_REG
+               p.To.Reg = v.Reg()
        case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE,
                ssa.OpAMD64SETL, ssa.OpAMD64SETLE,
                ssa.OpAMD64SETG, ssa.OpAMD64SETGE,
index b904c44fe6a3eb53e82d3c8f18fe5d48fa4c7e90..dd96bec2828a883d8b54322eeb5e0c55a0f2b338 100644 (file)
@@ -699,6 +699,34 @@ var linuxAMD64Tests = []*asmTest{
                `,
                []string{"\tBSRQ\t"},
        },
+       {
+               `
+               func pop1(x uint64) int {
+                       return bits.OnesCount64(x)
+               }`,
+               []string{"\tPOPCNTQ\t", "support_popcnt"},
+       },
+       {
+               `
+               func pop2(x uint32) int {
+                       return bits.OnesCount32(x)
+               }`,
+               []string{"\tPOPCNTL\t", "support_popcnt"},
+       },
+       {
+               `
+               func pop3(x uint16) int {
+                       return bits.OnesCount16(x)
+               }`,
+               []string{"\tPOPCNTL\t", "support_popcnt"},
+       },
+       {
+               `
+               func pop4(x uint) int {
+                       return bits.OnesCount(x)
+               }`,
+               []string{"\tPOPCNTQ\t", "support_popcnt"},
+       },
        // see issue 19595.
        // We want to merge load+op in f58, but not in f59.
        {
index 294fc4fc16558ac8891e1793ac4954a29baf7cc8..eae6f20d80edc3be120baf03bb484692beb55c39 100644 (file)
@@ -142,6 +142,7 @@ var runtimeDecls = [...]struct {
        {"racewriterange", funcTag, 111},
        {"msanread", funcTag, 111},
        {"msanwrite", funcTag, 111},
+       {"support_popcnt", varTag, 11},
 }
 
 func runtimeTypes() []*Type {
index b89f0a3c0220264b3236659c56dbb0b11e588626..7f4846db9dbbdaffff08855b6046abd583908a7e 100644 (file)
@@ -187,3 +187,6 @@ func racewriterange(addr, size uintptr)
 // memory sanitizer
 func msanread(addr, size uintptr)
 func msanwrite(addr, size uintptr)
+
+// architecture variants
+var support_popcnt bool
index ad81858186defd2b11e6100ef0caaa34ea50af0c..a0cc83d82fd2b30d3d4ed7b399658d6e73705f8b 100644 (file)
@@ -2823,6 +2823,54 @@ func init() {
                        return s.newValue1(ssa.OpBitRev64, Types[TINT], args[0])
                },
                sys.ARM64)
+       makeOnesCount := func(op64 ssa.Op, op32 ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+               return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+                       aux := s.lookupSymbol(n, &ssa.ExternSymbol{Typ: Types[TBOOL], Sym: Linksym(syslook("support_popcnt").Sym)})
+                       addr := s.entryNewValue1A(ssa.OpAddr, Types[TBOOL].PtrTo(), aux, s.sb)
+                       v := s.newValue2(ssa.OpLoad, Types[TBOOL], addr, s.mem())
+                       b := s.endBlock()
+                       b.Kind = ssa.BlockIf
+                       b.SetControl(v)
+                       bTrue := s.f.NewBlock(ssa.BlockPlain)
+                       bFalse := s.f.NewBlock(ssa.BlockPlain)
+                       bEnd := s.f.NewBlock(ssa.BlockPlain)
+                       b.AddEdgeTo(bTrue)
+                       b.AddEdgeTo(bFalse)
+                       b.Likely = ssa.BranchLikely // most machines have popcnt nowadays
+
+                       // We have the intrinsic - use it directly.
+                       s.startBlock(bTrue)
+                       op := op64
+                       if s.config.IntSize == 4 {
+                               op = op32
+                       }
+                       s.vars[n] = s.newValue1(op, Types[TINT], args[0])
+                       s.endBlock().AddEdgeTo(bEnd)
+
+                       // Call the pure Go version.
+                       s.startBlock(bFalse)
+                       a := s.call(n, callNormal)
+                       s.vars[n] = s.newValue2(ssa.OpLoad, Types[TINT], a, s.mem())
+                       s.endBlock().AddEdgeTo(bEnd)
+
+                       // Merge results.
+                       s.startBlock(bEnd)
+                       return s.variable(n, Types[TINT])
+               }
+       }
+       addF("math/bits", "OnesCount64",
+               makeOnesCount(ssa.OpPopCount64, ssa.OpPopCount64),
+               sys.AMD64)
+       addF("math/bits", "OnesCount32",
+               makeOnesCount(ssa.OpPopCount32, ssa.OpPopCount32),
+               sys.AMD64)
+       addF("math/bits", "OnesCount16",
+               makeOnesCount(ssa.OpPopCount16, ssa.OpPopCount16),
+               sys.AMD64)
+       // Note: no OnesCount8, the Go implementation is faster - just a table load.
+       addF("math/bits", "OnesCount",
+               makeOnesCount(ssa.OpPopCount64, ssa.OpPopCount32),
+               sys.AMD64)
 
        /******** sync/atomic ********/
 
index ac45cd71e51b4403a976a76497ee02fd56ebaa51..b7cbe37472d55e3abbbaa58f5597e92053c7bc9d 100644 (file)
 (Bswap64 x) -> (BSWAPQ x)
 (Bswap32 x) -> (BSWAPL x)
 
+(PopCount64 x) -> (POPCNTQ x)
+(PopCount32 x) -> (POPCNTL x)
+(PopCount16 x) -> (POPCNTL (MOVWQZX <types.UInt32> x))
+(PopCount8 x) -> (POPCNTL (MOVBQZX <types.UInt32> x))
+
 (Sqrt x) -> (SQRTSD x)
 
 // Lowering extension
index a859c63aa48b673dad3d8a7666c9f97b62db053a..d9e5fd5b7a5453169e8b12ae0db956afea41d75f 100644 (file)
@@ -323,6 +323,11 @@ func init() {
                {name: "BSWAPQ", argLength: 1, reg: gp11, asm: "BSWAPQ", resultInArg0: true, clobberFlags: true}, // arg0 swap bytes
                {name: "BSWAPL", argLength: 1, reg: gp11, asm: "BSWAPL", resultInArg0: true, clobberFlags: true}, // arg0 swap bytes
 
+               // POPCNT instructions aren't guaranteed to be on the target platform (they are SSE4).
+               // Any use must be preceded by a successful check of runtime.support_popcnt.
+               {name: "POPCNTQ", argLength: 1, reg: gp11, asm: "POPCNTQ", clobberFlags: true}, // count number of set bits in arg0
+               {name: "POPCNTL", argLength: 1, reg: gp11, asm: "POPCNTL", clobberFlags: true}, // count number of set bits in arg0
+
                {name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"}, // sqrt(arg0)
 
                {name: "SBBQcarrymask", argLength: 1, reg: flagsgp, asm: "SBBQ"}, // (int64)(-1) if carry is set, 0 if carry is clear.
index 7991f32679c35512b42d1c9763a387c57672b7a9..300a54524b01394ec2507ee5c0b2e679934cff18 100644 (file)
@@ -250,6 +250,11 @@ var genericOps = []opData{
        {name: "BitRev32", argLength: 1}, // Reverse the bits in arg[0]
        {name: "BitRev64", argLength: 1}, // Reverse the bits in arg[0]
 
+       {name: "PopCount8", argLength: 1},  // Count bits in arg[0]
+       {name: "PopCount16", argLength: 1}, // Count bits in arg[0]
+       {name: "PopCount32", argLength: 1}, // Count bits in arg[0]
+       {name: "PopCount64", argLength: 1}, // Count bits in arg[0]
+
        {name: "Sqrt", argLength: 1}, // sqrt(arg0), float64 only
 
        // Data movement, max argument length for Phi is indefinite so just pick
index daeaf64c621a03515b46a3b7e1e57297ff59605a..48bc15773e4250c18ad4a2d8cd7246e3721223ac 100644 (file)
@@ -538,6 +538,8 @@ const (
        OpAMD64CMOVLEQ
        OpAMD64BSWAPQ
        OpAMD64BSWAPL
+       OpAMD64POPCNTQ
+       OpAMD64POPCNTL
        OpAMD64SQRTSD
        OpAMD64SBBQcarrymask
        OpAMD64SBBLcarrymask
@@ -1778,6 +1780,10 @@ const (
        OpBitRev16
        OpBitRev32
        OpBitRev64
+       OpPopCount8
+       OpPopCount16
+       OpPopCount32
+       OpPopCount64
        OpSqrt
        OpPhi
        OpCopy
@@ -6368,6 +6374,34 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "POPCNTQ",
+               argLen:       1,
+               clobberFlags: true,
+               asm:          x86.APOPCNTQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+                       },
+                       outputs: []outputInfo{
+                               {0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+                       },
+               },
+       },
+       {
+               name:         "POPCNTL",
+               argLen:       1,
+               clobberFlags: true,
+               asm:          x86.APOPCNTL,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+                       },
+                       outputs: []outputInfo{
+                               {0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+                       },
+               },
+       },
        {
                name:   "SQRTSD",
                argLen: 1,
@@ -21680,6 +21714,26 @@ var opcodeTable = [...]opInfo{
                argLen:  1,
                generic: true,
        },
+       {
+               name:    "PopCount8",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PopCount16",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PopCount32",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PopCount64",
+               argLen:  1,
+               generic: true,
+       },
        {
                name:    "Sqrt",
                argLen:  1,
index 91e05456e0db051a5e07d6a323b9cb6eb216700a..df72064b7686098fe3f48bfd8065f7c61d2d36fa 100644 (file)
@@ -686,6 +686,14 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpOr8(v)
        case OpOrB:
                return rewriteValueAMD64_OpOrB(v)
+       case OpPopCount16:
+               return rewriteValueAMD64_OpPopCount16(v)
+       case OpPopCount32:
+               return rewriteValueAMD64_OpPopCount32(v)
+       case OpPopCount64:
+               return rewriteValueAMD64_OpPopCount64(v)
+       case OpPopCount8:
+               return rewriteValueAMD64_OpPopCount8(v)
        case OpRound32F:
                return rewriteValueAMD64_OpRound32F(v)
        case OpRound64F:
@@ -33467,6 +33475,62 @@ func rewriteValueAMD64_OpOrB(v *Value) bool {
                return true
        }
 }
+func rewriteValueAMD64_OpPopCount16(v *Value) bool {
+       b := v.Block
+       _ = b
+       types := &b.Func.Config.Types
+       _ = types
+       // match: (PopCount16 x)
+       // cond:
+       // result: (POPCNTL (MOVWQZX <types.UInt32> x))
+       for {
+               x := v.Args[0]
+               v.reset(OpAMD64POPCNTL)
+               v0 := b.NewValue0(v.Pos, OpAMD64MOVWQZX, types.UInt32)
+               v0.AddArg(x)
+               v.AddArg(v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPopCount32(v *Value) bool {
+       // match: (PopCount32 x)
+       // cond:
+       // result: (POPCNTL x)
+       for {
+               x := v.Args[0]
+               v.reset(OpAMD64POPCNTL)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPopCount64(v *Value) bool {
+       // match: (PopCount64 x)
+       // cond:
+       // result: (POPCNTQ x)
+       for {
+               x := v.Args[0]
+               v.reset(OpAMD64POPCNTQ)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPopCount8(v *Value) bool {
+       b := v.Block
+       _ = b
+       types := &b.Func.Config.Types
+       _ = types
+       // match: (PopCount8 x)
+       // cond:
+       // result: (POPCNTL (MOVBQZX <types.UInt32> x))
+       for {
+               x := v.Args[0]
+               v.reset(OpAMD64POPCNTL)
+               v0 := b.NewValue0(v.Pos, OpAMD64MOVBQZX, types.UInt32)
+               v0.AddArg(x)
+               v.AddArg(v0)
+               return true
+       }
+}
 func rewriteValueAMD64_OpRound32F(v *Value) bool {
        // match: (Round32F x)
        // cond: