From: Michael Munday Date: Fri, 25 May 2018 16:54:58 +0000 (+0100) Subject: cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x X-Git-Tag: go1.12beta1~1181 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=6f9b94ab6658bbebe4c89791dc3e5ebe53be3d82;p=gostls13.git cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x This CL implements the math/bits.OnesCount{8,16,32,64} functions as intrinsics on s390x using the 'population count' (popcnt) instruction. This instruction was released as the 'population-count' facility which uses the same facility bit (45) as the 'distinct-operands' facility which is a pre-requisite for Go on s390x. We can therefore use it without a feature check. The s390x popcnt instruction treats a 64 bit register as a vector of 8 bytes, summing the number of ones in each byte individually. It then writes the results to the corresponding bytes in the output register. Therefore to implement OnesCount{16,32,64} we need to sum the individual byte counts using some extra instructions. To do this efficiently I've added some additional pseudo operations to the s390x SSA backend. Unlike other architectures the new instruction sequence is faster for OnesCount8, so that is implemented using the intrinsic. name old time/op new time/op delta OnesCount 3.21ns ± 1% 1.35ns ± 0% -58.00% (p=0.000 n=20+20) OnesCount8 0.91ns ± 1% 0.81ns ± 0% -11.43% (p=0.000 n=20+20) OnesCount16 1.51ns ± 3% 1.21ns ± 0% -19.71% (p=0.000 n=20+17) OnesCount32 1.91ns ± 0% 1.12ns ± 1% -41.60% (p=0.000 n=19+20) OnesCount64 3.18ns ± 4% 1.35ns ± 0% -57.52% (p=0.000 n=20+20) Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0 Reviewed-on: https://go-review.googlesource.com/114675 Run-TryBot: Michael Munday TryBot-Result: Gobot Gobot Reviewed-by: Cherry Zhang --- diff --git a/src/cmd/asm/internal/asm/testdata/s390x.s b/src/cmd/asm/internal/asm/testdata/s390x.s index fce855ee30..ad70d2af44 100644 --- a/src/cmd/asm/internal/asm/testdata/s390x.s +++ b/src/cmd/asm/internal/asm/testdata/s390x.s @@ -115,6 +115,7 @@ TEXT main·foo(SB),DUPOK|NOSPLIT,$16-0 // TEXT main.foo(SB), DUPOK|NOSPLIT, $16- NEGW R1 // b9130011 NEGW R1, R2 // b9130021 FLOGR R2, R2 // b9830022 + POPCNT R3, R4 // b9e10043 AND R1, R2 // b9800021 AND R1, R2, R3 // b9e42031 diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go index 4c2f0098ce..3aef7e6b6d 100644 --- a/src/cmd/compile/internal/gc/ssa.go +++ b/src/cmd/compile/internal/gc/ssa.go @@ -3410,7 +3410,7 @@ func init() { func(s *state, n *Node, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpPopCount64, types.Types[TINT], args[0]) }, - sys.PPC64, sys.ARM64) + sys.PPC64, sys.ARM64, sys.S390X) addF("math/bits", "OnesCount32", makeOnesCountAMD64(ssa.OpPopCount32, ssa.OpPopCount32), sys.AMD64) @@ -3418,7 +3418,7 @@ func init() { func(s *state, n *Node, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpPopCount32, types.Types[TINT], args[0]) }, - sys.PPC64, sys.ARM64) + sys.PPC64, sys.ARM64, sys.S390X) addF("math/bits", "OnesCount16", makeOnesCountAMD64(ssa.OpPopCount16, ssa.OpPopCount16), sys.AMD64) @@ -3426,8 +3426,12 @@ func init() { func(s *state, n *Node, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpPopCount16, types.Types[TINT], args[0]) }, - sys.ARM64) - // Note: no OnesCount8, the Go implementation is faster - just a table load. + sys.ARM64, sys.S390X) + addF("math/bits", "OnesCount8", + func(s *state, n *Node, args []*ssa.Value) *ssa.Value { + return s.newValue1(ssa.OpPopCount8, types.Types[TINT], args[0]) + }, + sys.S390X) addF("math/bits", "OnesCount", makeOnesCountAMD64(ssa.OpPopCount64, ssa.OpPopCount32), sys.AMD64) diff --git a/src/cmd/compile/internal/s390x/ssa.go b/src/cmd/compile/internal/s390x/ssa.go index fe206f74e8..90e61c34fd 100644 --- a/src/cmd/compile/internal/s390x/ssa.go +++ b/src/cmd/compile/internal/s390x/ssa.go @@ -513,7 +513,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.To.Type = obj.TYPE_MEM p.To.Name = obj.NAME_EXTERN p.To.Sym = v.Aux.(*obj.LSym) - case ssa.OpS390XFLOGR, ssa.OpS390XNEG, ssa.OpS390XNEGW, + case ssa.OpS390XFLOGR, ssa.OpS390XPOPCNT, + ssa.OpS390XNEG, ssa.OpS390XNEGW, ssa.OpS390XMOVWBR, ssa.OpS390XMOVDBR: p := s.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG @@ -522,6 +523,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.To.Reg = v.Reg() case ssa.OpS390XNOT, ssa.OpS390XNOTW: v.Fatalf("NOT/NOTW generated %s", v.LongString()) + case ssa.OpS390XSumBytes2, ssa.OpS390XSumBytes4, ssa.OpS390XSumBytes8: + v.Fatalf("SumBytes generated %s", v.LongString()) case ssa.OpS390XMOVDEQ, ssa.OpS390XMOVDNE, ssa.OpS390XMOVDLT, ssa.OpS390XMOVDLE, ssa.OpS390XMOVDGT, ssa.OpS390XMOVDGE, diff --git a/src/cmd/compile/internal/ssa/gen/S390X.rules b/src/cmd/compile/internal/ssa/gen/S390X.rules index 960d68845f..4fbdef38e7 100644 --- a/src/cmd/compile/internal/ssa/gen/S390X.rules +++ b/src/cmd/compile/internal/ssa/gen/S390X.rules @@ -88,6 +88,34 @@ (BitLen64 x) -> (SUB (MOVDconst [64]) (FLOGR x)) +// POPCNT treats the input register as a vector of 8 bytes, producing +// a population count for each individual byte. For inputs larger than +// a single byte we therefore need to sum the individual bytes produced +// by the POPCNT instruction. For example, the following instruction +// sequence could be used to calculate the population count of a 4-byte +// value: +// +// MOVD $0x12345678, R1 // R1=0x12345678 <-- input +// POPCNT R1, R2 // R2=0x02030404 +// SRW $16, R2, R3 // R3=0x00000203 +// ADDW R2, R3, R4 // R4=0x02030607 +// SRW $8, R4, R5 // R5=0x00020306 +// ADDW R4, R5, R6 // R6=0x0205090d +// MOVBZ R6, R7 // R7=0x0000000d <-- result is 13 +// +(PopCount8 x) -> (POPCNT (MOVBZreg x)) +(PopCount16 x) -> (MOVBZreg (SumBytes2 (POPCNT x))) +(PopCount32 x) -> (MOVBZreg (SumBytes4 (POPCNT x))) +(PopCount64 x) -> (MOVBZreg (SumBytes8 (POPCNT x))) + +// SumBytes{2,4,8} pseudo operations sum the values of the rightmost +// 2, 4 or 8 bytes respectively. The result is a single byte however +// other bytes might contain junk so a zero extension is required if +// the desired output type is larger than 1 byte. +(SumBytes2 x) -> (ADDW (SRWconst x [8]) x) +(SumBytes4 x) -> (SumBytes2 (ADDW (SRWconst x [16]) x)) +(SumBytes8 x) -> (SumBytes4 (ADDW (SRDconst x [32]) x)) + (Bswap64 x) -> (MOVDBR x) (Bswap32 x) -> (MOVWBR x) diff --git a/src/cmd/compile/internal/ssa/gen/S390XOps.go b/src/cmd/compile/internal/ssa/gen/S390XOps.go index ae01375473..9b5f525531 100644 --- a/src/cmd/compile/internal/ssa/gen/S390XOps.go +++ b/src/cmd/compile/internal/ssa/gen/S390XOps.go @@ -530,6 +530,25 @@ func init() { clobberFlags: true, }, + // population count + // + // Counts the number of ones in each byte of arg0 + // and places the result into the corresponding byte + // of the result. + { + name: "POPCNT", + argLength: 1, + reg: gp11, + asm: "POPCNT", + typ: "UInt64", + clobberFlags: true, + }, + + // pseudo operations to sum the output of the POPCNT instruction + {name: "SumBytes2", argLength: 1, typ: "UInt8"}, // sum the rightmost 2 bytes in arg0 ignoring overflow + {name: "SumBytes4", argLength: 1, typ: "UInt8"}, // sum the rightmost 4 bytes in arg0 ignoring overflow + {name: "SumBytes8", argLength: 1, typ: "UInt8"}, // sum all the bytes in arg0 ignoring overflow + // store multiple { name: "STMG2", diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 0689c0ef32..1c9d263deb 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1898,6 +1898,10 @@ const ( OpS390XLoweredAtomicExchange32 OpS390XLoweredAtomicExchange64 OpS390XFLOGR + OpS390XPOPCNT + OpS390XSumBytes2 + OpS390XSumBytes4 + OpS390XSumBytes8 OpS390XSTMG2 OpS390XSTMG3 OpS390XSTMG4 @@ -25473,6 +25477,35 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "POPCNT", + argLen: 1, + clobberFlags: true, + asm: s390x.APOPCNT, + reg: regInfo{ + inputs: []inputInfo{ + {0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14 + }, + outputs: []outputInfo{ + {0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14 + }, + }, + }, + { + name: "SumBytes2", + argLen: 1, + reg: regInfo{}, + }, + { + name: "SumBytes4", + argLen: 1, + reg: regInfo{}, + }, + { + name: "SumBytes8", + argLen: 1, + reg: regInfo{}, + }, { name: "STMG2", auxType: auxSymOff, diff --git a/src/cmd/compile/internal/ssa/rewriteS390X.go b/src/cmd/compile/internal/ssa/rewriteS390X.go index 7125b888bd..768b802ec1 100644 --- a/src/cmd/compile/internal/ssa/rewriteS390X.go +++ b/src/cmd/compile/internal/ssa/rewriteS390X.go @@ -383,6 +383,14 @@ func rewriteValueS390X(v *Value) bool { return rewriteValueS390X_OpOr8_0(v) case OpOrB: return rewriteValueS390X_OpOrB_0(v) + case OpPopCount16: + return rewriteValueS390X_OpPopCount16_0(v) + case OpPopCount32: + return rewriteValueS390X_OpPopCount32_0(v) + case OpPopCount64: + return rewriteValueS390X_OpPopCount64_0(v) + case OpPopCount8: + return rewriteValueS390X_OpPopCount8_0(v) case OpRound: return rewriteValueS390X_OpRound_0(v) case OpRound32F: @@ -691,6 +699,12 @@ func rewriteValueS390X(v *Value) bool { return rewriteValueS390X_OpS390XSUBconst_0(v) case OpS390XSUBload: return rewriteValueS390X_OpS390XSUBload_0(v) + case OpS390XSumBytes2: + return rewriteValueS390X_OpS390XSumBytes2_0(v) + case OpS390XSumBytes4: + return rewriteValueS390X_OpS390XSumBytes4_0(v) + case OpS390XSumBytes8: + return rewriteValueS390X_OpS390XSumBytes8_0(v) case OpS390XXOR: return rewriteValueS390X_OpS390XXOR_0(v) || rewriteValueS390X_OpS390XXOR_10(v) case OpS390XXORW: @@ -5311,6 +5325,80 @@ func rewriteValueS390X_OpOrB_0(v *Value) bool { return true } } +func rewriteValueS390X_OpPopCount16_0(v *Value) bool { + b := v.Block + _ = b + typ := &b.Func.Config.Types + _ = typ + // match: (PopCount16 x) + // cond: + // result: (MOVBZreg (SumBytes2 (POPCNT x))) + for { + x := v.Args[0] + v.reset(OpS390XMOVBZreg) + v0 := b.NewValue0(v.Pos, OpS390XSumBytes2, typ.UInt8) + v1 := b.NewValue0(v.Pos, OpS390XPOPCNT, typ.UInt16) + v1.AddArg(x) + v0.AddArg(v1) + v.AddArg(v0) + return true + } +} +func rewriteValueS390X_OpPopCount32_0(v *Value) bool { + b := v.Block + _ = b + typ := &b.Func.Config.Types + _ = typ + // match: (PopCount32 x) + // cond: + // result: (MOVBZreg (SumBytes4 (POPCNT x))) + for { + x := v.Args[0] + v.reset(OpS390XMOVBZreg) + v0 := b.NewValue0(v.Pos, OpS390XSumBytes4, typ.UInt8) + v1 := b.NewValue0(v.Pos, OpS390XPOPCNT, typ.UInt32) + v1.AddArg(x) + v0.AddArg(v1) + v.AddArg(v0) + return true + } +} +func rewriteValueS390X_OpPopCount64_0(v *Value) bool { + b := v.Block + _ = b + typ := &b.Func.Config.Types + _ = typ + // match: (PopCount64 x) + // cond: + // result: (MOVBZreg (SumBytes8 (POPCNT x))) + for { + x := v.Args[0] + v.reset(OpS390XMOVBZreg) + v0 := b.NewValue0(v.Pos, OpS390XSumBytes8, typ.UInt8) + v1 := b.NewValue0(v.Pos, OpS390XPOPCNT, typ.UInt64) + v1.AddArg(x) + v0.AddArg(v1) + v.AddArg(v0) + return true + } +} +func rewriteValueS390X_OpPopCount8_0(v *Value) bool { + b := v.Block + _ = b + typ := &b.Func.Config.Types + _ = typ + // match: (PopCount8 x) + // cond: + // result: (POPCNT (MOVBZreg x)) + for { + x := v.Args[0] + v.reset(OpS390XPOPCNT) + v0 := b.NewValue0(v.Pos, OpS390XMOVBZreg, typ.UInt64) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} func rewriteValueS390X_OpRound_0(v *Value) bool { // match: (Round x) // cond: @@ -40417,6 +40505,67 @@ func rewriteValueS390X_OpS390XSUBload_0(v *Value) bool { } return false } +func rewriteValueS390X_OpS390XSumBytes2_0(v *Value) bool { + b := v.Block + _ = b + typ := &b.Func.Config.Types + _ = typ + // match: (SumBytes2 x) + // cond: + // result: (ADDW (SRWconst x [8]) x) + for { + x := v.Args[0] + v.reset(OpS390XADDW) + v0 := b.NewValue0(v.Pos, OpS390XSRWconst, typ.UInt8) + v0.AuxInt = 8 + v0.AddArg(x) + v.AddArg(v0) + v.AddArg(x) + return true + } +} +func rewriteValueS390X_OpS390XSumBytes4_0(v *Value) bool { + b := v.Block + _ = b + typ := &b.Func.Config.Types + _ = typ + // match: (SumBytes4 x) + // cond: + // result: (SumBytes2 (ADDW (SRWconst x [16]) x)) + for { + x := v.Args[0] + v.reset(OpS390XSumBytes2) + v0 := b.NewValue0(v.Pos, OpS390XADDW, typ.UInt16) + v1 := b.NewValue0(v.Pos, OpS390XSRWconst, typ.UInt16) + v1.AuxInt = 16 + v1.AddArg(x) + v0.AddArg(v1) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} +func rewriteValueS390X_OpS390XSumBytes8_0(v *Value) bool { + b := v.Block + _ = b + typ := &b.Func.Config.Types + _ = typ + // match: (SumBytes8 x) + // cond: + // result: (SumBytes4 (ADDW (SRDconst x [32]) x)) + for { + x := v.Args[0] + v.reset(OpS390XSumBytes4) + v0 := b.NewValue0(v.Pos, OpS390XADDW, typ.UInt32) + v1 := b.NewValue0(v.Pos, OpS390XSRDconst, typ.UInt32) + v1.AuxInt = 32 + v1.AddArg(x) + v0.AddArg(v1) + v0.AddArg(x) + v.AddArg(v0) + return true + } +} func rewriteValueS390X_OpS390XXOR_0(v *Value) bool { // match: (XOR x (MOVDconst [c])) // cond: isU32Bit(c) diff --git a/src/cmd/internal/obj/s390x/a.out.go b/src/cmd/internal/obj/s390x/a.out.go index babcd2af01..9ee02a2d0d 100644 --- a/src/cmd/internal/obj/s390x/a.out.go +++ b/src/cmd/internal/obj/s390x/a.out.go @@ -271,6 +271,9 @@ const ( // find leftmost one AFLOGR + // population count + APOPCNT + // integer bitwise AAND AANDW diff --git a/src/cmd/internal/obj/s390x/anames.go b/src/cmd/internal/obj/s390x/anames.go index 7edbdd68df..2d6ea5abb4 100644 --- a/src/cmd/internal/obj/s390x/anames.go +++ b/src/cmd/internal/obj/s390x/anames.go @@ -45,6 +45,7 @@ var Anames = []string{ "MOVDLT", "MOVDNE", "FLOGR", + "POPCNT", "AND", "ANDW", "OR", diff --git a/src/cmd/internal/obj/s390x/asmz.go b/src/cmd/internal/obj/s390x/asmz.go index ce3fe6af73..359610c41d 100644 --- a/src/cmd/internal/obj/s390x/asmz.go +++ b/src/cmd/internal/obj/s390x/asmz.go @@ -246,6 +246,9 @@ var optab = []Optab{ // find leftmost one Optab{AFLOGR, C_REG, C_NONE, C_NONE, C_REG, 8, 0}, + // population count + Optab{APOPCNT, C_REG, C_NONE, C_NONE, C_REG, 9, 0}, + // compare Optab{ACMP, C_REG, C_NONE, C_NONE, C_REG, 70, 0}, Optab{ACMP, C_REG, C_NONE, C_NONE, C_LCON, 71, 0}, @@ -2849,6 +2852,9 @@ func (c *ctxtz) asmout(p *obj.Prog, asm *[]byte) { // FLOGR also writes a mask to p.To.Reg+1. zRRE(op_FLOGR, uint32(p.To.Reg), uint32(p.From.Reg), asm) + case 9: // population count + zRRE(op_POPCNT, uint32(p.To.Reg), uint32(p.From.Reg), asm) + case 10: // subtract reg [reg] reg r := int(p.Reg) diff --git a/test/codegen/mathbits.go b/test/codegen/mathbits.go index 85c54ea61b..ad2c5abb02 100644 --- a/test/codegen/mathbits.go +++ b/test/codegen/mathbits.go @@ -103,27 +103,36 @@ func Len8(n uint8) int { func OnesCount(n uint) int { // amd64:"POPCNTQ",".*support_popcnt" // arm64:"VCNT","VUADDLV" + // s390x:"POPCNT" return bits.OnesCount(n) } func OnesCount64(n uint64) int { // amd64:"POPCNTQ",".*support_popcnt" // arm64:"VCNT","VUADDLV" + // s390x:"POPCNT" return bits.OnesCount64(n) } func OnesCount32(n uint32) int { // amd64:"POPCNTL",".*support_popcnt" // arm64:"VCNT","VUADDLV" + // s390x:"POPCNT" return bits.OnesCount32(n) } func OnesCount16(n uint16) int { // amd64:"POPCNTL",".*support_popcnt" // arm64:"VCNT","VUADDLV" + // s390x:"POPCNT" return bits.OnesCount16(n) } +func OnesCount8(n uint8) int { + // s390x:"POPCNT" + return bits.OnesCount8(n) +} + // ----------------------- // // bits.ReverseBytes // // ----------------------- //