From: Keith Randall
Date: Fri, 17 Mar 2017 04:33:03 +0000 (-0700)
Subject: cmd/compile: intrinsics for math/bits.OnesCount
X-Git-Tag: go1.9beta1~836
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=5cadc91b3ced9614b1055c448f7784a15907fff5;p=gostls13.git

cmd/compile: intrinsics for math/bits.OnesCount

Popcount instructions on amd64 are not guaranteed to be present,
so we must guard their call. Rewrite rules can't generate control
flow at the moment, so the intrinsifier needs to generate that code.

name            old time/op  new time/op  delta
OnesCount-8     2.47ns ± 5%  1.04ns ± 2%  -57.70%  (p=0.000 n=10+10)
OnesCount16-8   1.05ns ± 1%  0.78ns ± 0%  -25.56%  (p=0.000 n=9+8)
OnesCount32-8   1.63ns ± 5%  1.04ns ± 2%  -35.96%  (p=0.000 n=10+10)
OnesCount64-8   2.45ns ± 0%  1.04ns ± 1%  -57.55%  (p=0.000 n=6+10)

Update #18616

Change-Id: I4aff2cc9aa93787898d7b22055fe272a7cf95673
Reviewed-on: https://go-review.googlesource.com/38320
Run-TryBot: Keith Randall
TryBot-Result: Gobot Gobot
Reviewed-by: Robert Griesemer
---

diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index 20fc49c926..4faad77a65 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -767,6 +767,21 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.From.Reg = v.Args[0].Reg()
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = v.Reg()
+	case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL:
+		if v.Args[0].Reg() != v.Reg() {
+			// POPCNT on Intel has a false dependency on the destination register.
+			// Zero the destination to break the dependency.
+			p := s.Prog(x86.AMOVQ)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = 0
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = v.Reg()
+		}
+		p := s.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = v.Args[0].Reg()
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = v.Reg()
 	case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE,
 		ssa.OpAMD64SETL, ssa.OpAMD64SETLE,
 		ssa.OpAMD64SETG, ssa.OpAMD64SETGE,
diff --git a/src/cmd/compile/internal/gc/asm_test.go b/src/cmd/compile/internal/gc/asm_test.go
index b904c44fe6..dd96bec282 100644
--- a/src/cmd/compile/internal/gc/asm_test.go
+++ b/src/cmd/compile/internal/gc/asm_test.go
@@ -699,6 +699,34 @@ var linuxAMD64Tests = []*asmTest{
 		`,
 		[]string{"\tBSRQ\t"},
 	},
+	{
+		`
+		func pop1(x uint64) int {
+			return bits.OnesCount64(x)
+		}`,
+		[]string{"\tPOPCNTQ\t", "support_popcnt"},
+	},
+	{
+		`
+		func pop2(x uint32) int {
+			return bits.OnesCount32(x)
+		}`,
+		[]string{"\tPOPCNTL\t", "support_popcnt"},
+	},
+	{
+		`
+		func pop3(x uint16) int {
+			return bits.OnesCount16(x)
+		}`,
+		[]string{"\tPOPCNTL\t", "support_popcnt"},
+	},
+	{
+		`
+		func pop4(x uint) int {
+			return bits.OnesCount(x)
+		}`,
+		[]string{"\tPOPCNTQ\t", "support_popcnt"},
+	},
 	// see issue 19595.
 	// We want to merge load+op in f58, but not in f59.
 	{
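A note on the false-dependency workaround in the amd64/ssa.go hunk above: on
several Intel microarchitectures, POPCNT behaves as if it also read its
destination register, so back-to-back POPCNTs writing the same register can
serialize even when their inputs are independent. Zeroing the destination
first breaks that chain. The emitted sequence looks roughly like this
(illustrative registers, not taken from the CL):

	MOVQ    $0, AX      // break the false dependency on AX
	POPCNTQ BX, AX      // AX = number of set bits in BX

When the source and destination registers coincide, the instruction has to
wait for its input anyway, which is why the zeroing is skipped when
v.Args[0].Reg() == v.Reg().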
diff --git a/src/cmd/compile/internal/gc/builtin.go b/src/cmd/compile/internal/gc/builtin.go
index 294fc4fc16..eae6f20d80 100644
--- a/src/cmd/compile/internal/gc/builtin.go
+++ b/src/cmd/compile/internal/gc/builtin.go
@@ -142,6 +142,7 @@ var runtimeDecls = [...]struct {
 	{"racewriterange", funcTag, 111},
 	{"msanread", funcTag, 111},
 	{"msanwrite", funcTag, 111},
+	{"support_popcnt", varTag, 11},
 }
 
 func runtimeTypes() []*Type {
diff --git a/src/cmd/compile/internal/gc/builtin/runtime.go b/src/cmd/compile/internal/gc/builtin/runtime.go
index b89f0a3c02..7f4846db9d 100644
--- a/src/cmd/compile/internal/gc/builtin/runtime.go
+++ b/src/cmd/compile/internal/gc/builtin/runtime.go
@@ -187,3 +187,6 @@ func racewriterange(addr, size uintptr)
 // memory sanitizer
 func msanread(addr, size uintptr)
 func msanwrite(addr, size uintptr)
+
+// architecture variants
+var support_popcnt bool
diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go
index ad81858186..a0cc83d82f 100644
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -2823,6 +2823,54 @@ func init() {
 			return s.newValue1(ssa.OpBitRev64, Types[TINT], args[0])
 		},
 		sys.ARM64)
+	makeOnesCount := func(op64 ssa.Op, op32 ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+		return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+			aux := s.lookupSymbol(n, &ssa.ExternSymbol{Typ: Types[TBOOL], Sym: Linksym(syslook("support_popcnt").Sym)})
+			addr := s.entryNewValue1A(ssa.OpAddr, Types[TBOOL].PtrTo(), aux, s.sb)
+			v := s.newValue2(ssa.OpLoad, Types[TBOOL], addr, s.mem())
+			b := s.endBlock()
+			b.Kind = ssa.BlockIf
+			b.SetControl(v)
+			bTrue := s.f.NewBlock(ssa.BlockPlain)
+			bFalse := s.f.NewBlock(ssa.BlockPlain)
+			bEnd := s.f.NewBlock(ssa.BlockPlain)
+			b.AddEdgeTo(bTrue)
+			b.AddEdgeTo(bFalse)
+			b.Likely = ssa.BranchLikely // most machines have popcnt nowadays
+
+			// We have the intrinsic - use it directly.
+			s.startBlock(bTrue)
+			op := op64
+			if s.config.IntSize == 4 {
+				op = op32
+			}
+			s.vars[n] = s.newValue1(op, Types[TINT], args[0])
+			s.endBlock().AddEdgeTo(bEnd)
+
+			// Call the pure Go version.
+			s.startBlock(bFalse)
+			a := s.call(n, callNormal)
+			s.vars[n] = s.newValue2(ssa.OpLoad, Types[TINT], a, s.mem())
+			s.endBlock().AddEdgeTo(bEnd)
+
+			// Merge results.
+			s.startBlock(bEnd)
+			return s.variable(n, Types[TINT])
+		}
+	}
+	addF("math/bits", "OnesCount64",
+		makeOnesCount(ssa.OpPopCount64, ssa.OpPopCount64),
+		sys.AMD64)
+	addF("math/bits", "OnesCount32",
+		makeOnesCount(ssa.OpPopCount32, ssa.OpPopCount32),
+		sys.AMD64)
+	addF("math/bits", "OnesCount16",
+		makeOnesCount(ssa.OpPopCount16, ssa.OpPopCount16),
+		sys.AMD64)
+	// Note: no OnesCount8, the Go implementation is faster - just a table load.
+	addF("math/bits", "OnesCount",
+		makeOnesCount(ssa.OpPopCount64, ssa.OpPopCount32),
+		sys.AMD64)
 
 	/******** sync/atomic ********/
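The makeOnesCount helper above has to build the guard out of raw SSA blocks
because, as the commit message notes, rewrite rules cannot introduce control
flow. In ordinary Go, the code it generates for bits.OnesCount64 is roughly
equivalent to the following sketch; popcnt64 and onesCount64Fallback are
illustrative names only, standing in for the POPCNTQ intrinsic op and for the
ordinary call to the pure-Go math/bits implementation:

	// Rough Go equivalent of the SSA built by makeOnesCount.
	func onesCount64(x uint64) int {
		if support_popcnt { // BranchLikely: most machines have popcnt nowadays
			return popcnt64(x) // bTrue: a single POPCNTQ instruction
		}
		return onesCount64Fallback(x) // bFalse: normal function call
	}

Note also that plain bits.OnesCount dispatches on the target's int size:
makeOnesCount(ssa.OpPopCount64, ssa.OpPopCount32) selects the 32-bit op when
s.config.IntSize == 4.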
+ addF("math/bits", "OnesCount", + makeOnesCount(ssa.OpPopCount64, ssa.OpPopCount32), + sys.AMD64) /******** sync/atomic ********/ diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules index ac45cd71e5..b7cbe37472 100644 --- a/src/cmd/compile/internal/ssa/gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules @@ -106,6 +106,11 @@ (Bswap64 x) -> (BSWAPQ x) (Bswap32 x) -> (BSWAPL x) +(PopCount64 x) -> (POPCNTQ x) +(PopCount32 x) -> (POPCNTL x) +(PopCount16 x) -> (POPCNTL (MOVWQZX x)) +(PopCount8 x) -> (POPCNTL (MOVBQZX x)) + (Sqrt x) -> (SQRTSD x) // Lowering extension diff --git a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go index a859c63aa4..d9e5fd5b7a 100644 --- a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go +++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go @@ -323,6 +323,11 @@ func init() { {name: "BSWAPQ", argLength: 1, reg: gp11, asm: "BSWAPQ", resultInArg0: true, clobberFlags: true}, // arg0 swap bytes {name: "BSWAPL", argLength: 1, reg: gp11, asm: "BSWAPL", resultInArg0: true, clobberFlags: true}, // arg0 swap bytes + // POPCNT instructions aren't guaranteed to be on the target platform (they are SSE4). + // Any use must be preceded by a successful check of runtime.support_popcnt. + {name: "POPCNTQ", argLength: 1, reg: gp11, asm: "POPCNTQ", clobberFlags: true}, // count number of set bits in arg0 + {name: "POPCNTL", argLength: 1, reg: gp11, asm: "POPCNTL", clobberFlags: true}, // count number of set bits in arg0 + {name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"}, // sqrt(arg0) {name: "SBBQcarrymask", argLength: 1, reg: flagsgp, asm: "SBBQ"}, // (int64)(-1) if carry is set, 0 if carry is clear. diff --git a/src/cmd/compile/internal/ssa/gen/genericOps.go b/src/cmd/compile/internal/ssa/gen/genericOps.go index 7991f32679..300a54524b 100644 --- a/src/cmd/compile/internal/ssa/gen/genericOps.go +++ b/src/cmd/compile/internal/ssa/gen/genericOps.go @@ -250,6 +250,11 @@ var genericOps = []opData{ {name: "BitRev32", argLength: 1}, // Reverse the bits in arg[0] {name: "BitRev64", argLength: 1}, // Reverse the bits in arg[0] + {name: "PopCount8", argLength: 1}, // Count bits in arg[0] + {name: "PopCount16", argLength: 1}, // Count bits in arg[0] + {name: "PopCount32", argLength: 1}, // Count bits in arg[0] + {name: "PopCount64", argLength: 1}, // Count bits in arg[0] + {name: "Sqrt", argLength: 1}, // sqrt(arg0), float64 only // Data movement, max argument length for Phi is indefinite so just pick diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index daeaf64c62..48bc15773e 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -538,6 +538,8 @@ const ( OpAMD64CMOVLEQ OpAMD64BSWAPQ OpAMD64BSWAPL + OpAMD64POPCNTQ + OpAMD64POPCNTL OpAMD64SQRTSD OpAMD64SBBQcarrymask OpAMD64SBBLcarrymask @@ -1778,6 +1780,10 @@ const ( OpBitRev16 OpBitRev32 OpBitRev64 + OpPopCount8 + OpPopCount16 + OpPopCount32 + OpPopCount64 OpSqrt OpPhi OpCopy @@ -6368,6 +6374,34 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "POPCNTQ", + argLen: 1, + clobberFlags: true, + asm: x86.APOPCNTQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 + }, + outputs: []outputInfo{ + {0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 + }, + }, + }, + { + name: "POPCNTL", + argLen: 1, + clobberFlags: true, + asm: x86.APOPCNTL, + reg: regInfo{ + inputs: []inputInfo{ + 
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index daeaf64c62..48bc15773e 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -538,6 +538,8 @@ const (
 	OpAMD64CMOVLEQ
 	OpAMD64BSWAPQ
 	OpAMD64BSWAPL
+	OpAMD64POPCNTQ
+	OpAMD64POPCNTL
 	OpAMD64SQRTSD
 	OpAMD64SBBQcarrymask
 	OpAMD64SBBLcarrymask
@@ -1778,6 +1780,10 @@ const (
 	OpBitRev16
 	OpBitRev32
 	OpBitRev64
+	OpPopCount8
+	OpPopCount16
+	OpPopCount32
+	OpPopCount64
 	OpSqrt
 	OpPhi
 	OpCopy
@@ -6368,6 +6374,34 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:         "POPCNTQ",
+		argLen:       1,
+		clobberFlags: true,
+		asm:          x86.APOPCNTQ,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+			outputs: []outputInfo{
+				{0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+		},
+	},
+	{
+		name:         "POPCNTL",
+		argLen:       1,
+		clobberFlags: true,
+		asm:          x86.APOPCNTL,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+			outputs: []outputInfo{
+				{0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+			},
+		},
+	},
 	{
 		name:   "SQRTSD",
 		argLen: 1,
@@ -21680,6 +21714,26 @@ var opcodeTable = [...]opInfo{
 		argLen:  1,
 		generic: true,
 	},
+	{
+		name:    "PopCount8",
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "PopCount16",
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "PopCount32",
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "PopCount64",
+		argLen:  1,
+		generic: true,
+	},
 	{
 		name:    "Sqrt",
 		argLen:  1,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 91e05456e0..df72064b76 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -686,6 +686,14 @@ func rewriteValueAMD64(v *Value) bool {
 		return rewriteValueAMD64_OpOr8(v)
 	case OpOrB:
 		return rewriteValueAMD64_OpOrB(v)
+	case OpPopCount16:
+		return rewriteValueAMD64_OpPopCount16(v)
+	case OpPopCount32:
+		return rewriteValueAMD64_OpPopCount32(v)
+	case OpPopCount64:
+		return rewriteValueAMD64_OpPopCount64(v)
+	case OpPopCount8:
+		return rewriteValueAMD64_OpPopCount8(v)
 	case OpRound32F:
 		return rewriteValueAMD64_OpRound32F(v)
 	case OpRound64F:
@@ -33467,6 +33475,62 @@ func rewriteValueAMD64_OpOrB(v *Value) bool {
 		return true
 	}
 }
+func rewriteValueAMD64_OpPopCount16(v *Value) bool {
+	b := v.Block
+	_ = b
+	types := &b.Func.Config.Types
+	_ = types
+	// match: (PopCount16 x)
+	// cond:
+	// result: (POPCNTL (MOVWQZX x))
+	for {
+		x := v.Args[0]
+		v.reset(OpAMD64POPCNTL)
+		v0 := b.NewValue0(v.Pos, OpAMD64MOVWQZX, types.UInt32)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPopCount32(v *Value) bool {
+	// match: (PopCount32 x)
+	// cond:
+	// result: (POPCNTL x)
+	for {
+		x := v.Args[0]
+		v.reset(OpAMD64POPCNTL)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPopCount64(v *Value) bool {
+	// match: (PopCount64 x)
+	// cond:
+	// result: (POPCNTQ x)
+	for {
+		x := v.Args[0]
+		v.reset(OpAMD64POPCNTQ)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPopCount8(v *Value) bool {
+	b := v.Block
+	_ = b
+	types := &b.Func.Config.Types
+	_ = types
+	// match: (PopCount8 x)
+	// cond:
+	// result: (POPCNTL (MOVBQZX x))
+	for {
+		x := v.Args[0]
+		v.reset(OpAMD64POPCNTL)
+		v0 := b.NewValue0(v.Pos, OpAMD64MOVBQZX, types.UInt32)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+}
 func rewriteValueAMD64_OpRound32F(v *Value) bool {
 	// match: (Round32F x)
 	// cond:
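For completeness: the support_popcnt flag declared in builtin/runtime.go is
the compiler's view of a variable the runtime sets during startup from the
CPUID feature bits (POPCNT support is reported in ECX bit 23 of CPUID leaf 1).
A rough Go sketch of that check, assuming a hypothetical cpuid helper that
wraps the instruction; the real detection lives in the runtime's assembly
startup code:

	// Sketch only; cpuid is an assumed helper, not a runtime API.
	func detectPopcnt() bool {
		_, _, ecx, _ := cpuid(1)    // hypothetical wrapper around the CPUID instruction
		return ecx&(1<<23) != 0     // CPUID.01H:ECX bit 23 signals POPCNT support
	}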