p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
+ case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL:
+ if v.Args[0].Reg() != v.Reg() {
+ // POPCNT on Intel has a false dependency on the destination register.
+ // Zero the destination to break the dependency.
+ p := s.Prog(x86.AMOVQ)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = 0
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = v.Reg()
+ }
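+ // Emit the POPCNT itself. When the registers differ, the generated
+ // sequence is, e.g., MOVQ $0, DX; POPCNTQ AX, DX.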
+ p := s.Prog(v.Op.Asm())
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = v.Args[0].Reg()
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = v.Reg()
case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE,
ssa.OpAMD64SETL, ssa.OpAMD64SETLE,
ssa.OpAMD64SETG, ssa.OpAMD64SETGE,
`,
[]string{"\tBSRQ\t"},
},
+ {
+ `
+ func pop1(x uint64) int {
+ return bits.OnesCount64(x)
+ }`,
+ []string{"\tPOPCNTQ\t", "support_popcnt"},
+ },
+ {
+ `
+ func pop2(x uint32) int {
+ return bits.OnesCount32(x)
+ }`,
+ []string{"\tPOPCNTL\t", "support_popcnt"},
+ },
+ {
+ `
+ func pop3(x uint16) int {
+ return bits.OnesCount16(x)
+ }`,
+ []string{"\tPOPCNTL\t", "support_popcnt"},
+ },
+ {
+ `
+ func pop4(x uint) int {
+ return bits.OnesCount(x)
+ }`,
+ []string{"\tPOPCNTQ\t", "support_popcnt"},
+ },
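+ // Note: no test for bits.OnesCount8, which is not intrinsified (see ssa.go).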
// see issue 19595.
// We want to merge load+op in f58, but not in f59.
{
{"racewriterange", funcTag, 111},
{"msanread", funcTag, 111},
{"msanwrite", funcTag, 111},
+ {"support_popcnt", varTag, 11},
}
func runtimeTypes() []*Type {
// memory sanitizer
func msanread(addr, size uintptr)
func msanwrite(addr, size uintptr)
+
+// architecture variants
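+// support_popcnt is set during runtime startup from CPUID feature
+// detection (POPCNT is CPUID.01H:ECX bit 23 on x86).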
+var support_popcnt bool
return s.newValue1(ssa.OpBitRev64, Types[TINT], args[0])
},
sys.ARM64)
+ makeOnesCount := func(op64 ssa.Op, op32 ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+ return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
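+ // Branch on runtime.support_popcnt: use the hardware POPCNT
+ // instruction when available, otherwise call the pure Go fallback.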
+ aux := s.lookupSymbol(n, &ssa.ExternSymbol{Typ: Types[TBOOL], Sym: Linksym(syslook("support_popcnt").Sym)})
+ addr := s.entryNewValue1A(ssa.OpAddr, Types[TBOOL].PtrTo(), aux, s.sb)
+ v := s.newValue2(ssa.OpLoad, Types[TBOOL], addr, s.mem())
+ b := s.endBlock()
+ b.Kind = ssa.BlockIf
+ b.SetControl(v)
+ bTrue := s.f.NewBlock(ssa.BlockPlain)
+ bFalse := s.f.NewBlock(ssa.BlockPlain)
+ bEnd := s.f.NewBlock(ssa.BlockPlain)
+ b.AddEdgeTo(bTrue)
+ b.AddEdgeTo(bFalse)
+ b.Likely = ssa.BranchLikely // most machines have popcnt nowadays
+
+ // We have the intrinsic - use it directly.
+ s.startBlock(bTrue)
+ op := op64
+ if s.config.IntSize == 4 {
+ op = op32
+ }
+ s.vars[n] = s.newValue1(op, Types[TINT], args[0])
+ s.endBlock().AddEdgeTo(bEnd)
+
+ // Call the pure Go version.
+ s.startBlock(bFalse)
+ a := s.call(n, callNormal)
+ s.vars[n] = s.newValue2(ssa.OpLoad, Types[TINT], a, s.mem())
+ s.endBlock().AddEdgeTo(bEnd)
+
+ // Merge results.
+ s.startBlock(bEnd)
+ return s.variable(n, Types[TINT])
+ }
+ }
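+ // A rough sketch of the code makeOnesCount generates:
+ //
+ //	if support_popcnt { // likely
+ //		r = <machine popcount of x>
+ //	} else {
+ //		r = bits.OnesCountN(x) // ordinary call
+ //	}
+ //	return r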
+ addF("math/bits", "OnesCount64",
+ makeOnesCount(ssa.OpPopCount64, ssa.OpPopCount64),
+ sys.AMD64)
+ addF("math/bits", "OnesCount32",
+ makeOnesCount(ssa.OpPopCount32, ssa.OpPopCount32),
+ sys.AMD64)
+ addF("math/bits", "OnesCount16",
+ makeOnesCount(ssa.OpPopCount16, ssa.OpPopCount16),
+ sys.AMD64)
+ // Note: no OnesCount8; the pure Go implementation (a single table load) is faster.
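+ // OnesCount counts an int-sized word: the closure selects the 32-bit
+ // op when s.config.IntSize == 4 (e.g. amd64p32).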
+ addF("math/bits", "OnesCount",
+ makeOnesCount(ssa.OpPopCount64, ssa.OpPopCount32),
+ sys.AMD64)
/******** sync/atomic ********/
(Bswap64 x) -> (BSWAPQ x)
(Bswap32 x) -> (BSWAPL x)
+(PopCount64 x) -> (POPCNTQ x)
+(PopCount32 x) -> (POPCNTL x)
+(PopCount16 x) -> (POPCNTL (MOVWQZX <types.UInt32> x))
+(PopCount8 x) -> (POPCNTL (MOVBQZX <types.UInt32> x))
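+// There is no 8-bit POPCNT instruction; the 8- and 16-bit counts
+// zero-extend into a 32-bit register and use POPCNTL.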
+
(Sqrt x) -> (SQRTSD x)
// Lowering extension
{name: "BSWAPQ", argLength: 1, reg: gp11, asm: "BSWAPQ", resultInArg0: true, clobberFlags: true}, // arg0 swap bytes
{name: "BSWAPL", argLength: 1, reg: gp11, asm: "BSWAPL", resultInArg0: true, clobberFlags: true}, // arg0 swap bytes
+ // POPCNT instructions aren't guaranteed to be on the target platform (they arrived with SSE4.2).
+ // Any use must be preceded by a successful check of runtime.support_popcnt.
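+ // They also write the arithmetic flags, hence clobberFlags.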
+ {name: "POPCNTQ", argLength: 1, reg: gp11, asm: "POPCNTQ", clobberFlags: true}, // count number of set bits in arg0
+ {name: "POPCNTL", argLength: 1, reg: gp11, asm: "POPCNTL", clobberFlags: true}, // count number of set bits in arg0
+
{name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"}, // sqrt(arg0)
{name: "SBBQcarrymask", argLength: 1, reg: flagsgp, asm: "SBBQ"}, // (int64)(-1) if carry is set, 0 if carry is clear.
{name: "BitRev32", argLength: 1}, // Reverse the bits in arg[0]
{name: "BitRev64", argLength: 1}, // Reverse the bits in arg[0]
+ {name: "PopCount8", argLength: 1}, // Count bits in arg[0]
+ {name: "PopCount16", argLength: 1}, // Count bits in arg[0]
+ {name: "PopCount32", argLength: 1}, // Count bits in arg[0]
+ {name: "PopCount64", argLength: 1}, // Count bits in arg[0]
+
{name: "Sqrt", argLength: 1}, // sqrt(arg0), float64 only
// Data movement, max argument length for Phi is indefinite so just pick
OpAMD64CMOVLEQ
OpAMD64BSWAPQ
OpAMD64BSWAPL
+ OpAMD64POPCNTQ
+ OpAMD64POPCNTL
OpAMD64SQRTSD
OpAMD64SBBQcarrymask
OpAMD64SBBLcarrymask
OpBitRev16
OpBitRev32
OpBitRev64
+ OpPopCount8
+ OpPopCount16
+ OpPopCount32
+ OpPopCount64
OpSqrt
OpPhi
OpCopy
},
},
},
+ {
+ name: "POPCNTQ",
+ argLen: 1,
+ clobberFlags: true,
+ asm: x86.APOPCNTQ,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+ },
+ outputs: []outputInfo{
+ {0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+ },
+ },
+ },
+ {
+ name: "POPCNTL",
+ argLen: 1,
+ clobberFlags: true,
+ asm: x86.APOPCNTL,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+ },
+ outputs: []outputInfo{
+ {0, 65519}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R14 R15
+ },
+ },
+ },
{
name: "SQRTSD",
argLen: 1,
argLen: 1,
generic: true,
},
+ {
+ name: "PopCount8",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "PopCount16",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "PopCount32",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "PopCount64",
+ argLen: 1,
+ generic: true,
+ },
{
name: "Sqrt",
argLen: 1,
return rewriteValueAMD64_OpOr8(v)
case OpOrB:
return rewriteValueAMD64_OpOrB(v)
+ case OpPopCount16:
+ return rewriteValueAMD64_OpPopCount16(v)
+ case OpPopCount32:
+ return rewriteValueAMD64_OpPopCount32(v)
+ case OpPopCount64:
+ return rewriteValueAMD64_OpPopCount64(v)
+ case OpPopCount8:
+ return rewriteValueAMD64_OpPopCount8(v)
case OpRound32F:
return rewriteValueAMD64_OpRound32F(v)
case OpRound64F:
return true
}
}
+func rewriteValueAMD64_OpPopCount16(v *Value) bool {
+ b := v.Block
+ _ = b
+ types := &b.Func.Config.Types
+ _ = types
+ // match: (PopCount16 x)
+ // cond:
+ // result: (POPCNTL (MOVWQZX <types.UInt32> x))
+ for {
+ x := v.Args[0]
+ v.reset(OpAMD64POPCNTL)
+ v0 := b.NewValue0(v.Pos, OpAMD64MOVWQZX, types.UInt32)
+ v0.AddArg(x)
+ v.AddArg(v0)
+ return true
+ }
+}
+func rewriteValueAMD64_OpPopCount32(v *Value) bool {
+ // match: (PopCount32 x)
+ // cond:
+ // result: (POPCNTL x)
+ for {
+ x := v.Args[0]
+ v.reset(OpAMD64POPCNTL)
+ v.AddArg(x)
+ return true
+ }
+}
+func rewriteValueAMD64_OpPopCount64(v *Value) bool {
+ // match: (PopCount64 x)
+ // cond:
+ // result: (POPCNTQ x)
+ for {
+ x := v.Args[0]
+ v.reset(OpAMD64POPCNTQ)
+ v.AddArg(x)
+ return true
+ }
+}
+func rewriteValueAMD64_OpPopCount8(v *Value) bool {
+ b := v.Block
+ _ = b
+ types := &b.Func.Config.Types
+ _ = types
+ // match: (PopCount8 x)
+ // cond:
+ // result: (POPCNTL (MOVBQZX <types.UInt32> x))
+ for {
+ x := v.Args[0]
+ v.reset(OpAMD64POPCNTL)
+ v0 := b.NewValue0(v.Pos, OpAMD64MOVBQZX, types.UInt32)
+ v0.AddArg(x)
+ v.AddArg(v0)
+ return true
+ }
+}
func rewriteValueAMD64_OpRound32F(v *Value) bool {
// match: (Round32F x)
// cond: