]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x
authorMichael Munday <mike.munday@ibm.com>
Fri, 25 May 2018 16:54:58 +0000 (17:54 +0100)
committerMichael Munday <mike.munday@ibm.com>
Mon, 3 Sep 2018 14:35:38 +0000 (14:35 +0000)
This CL implements the math/bits.OnesCount{8,16,32,64} functions
as intrinsics on s390x using the 'population count' (popcnt)
instruction. This instruction was released as the 'population-count'
facility which uses the same facility bit (45) as the
'distinct-operands' facility which is a pre-requisite for Go on
s390x. We can therefore use it without a feature check.

The s390x popcnt instruction treats a 64 bit register as a vector
of 8 bytes, summing the number of ones in each byte individually.
It then writes the results to the corresponding bytes in the
output register. Therefore to implement OnesCount{16,32,64} we
need to sum the individual byte counts using some extra
instructions. To do this efficiently I've added some additional
pseudo operations to the s390x SSA backend.

Unlike other architectures the new instruction sequence is faster
for OnesCount8, so that is implemented using the intrinsic.

name         old time/op  new time/op  delta
OnesCount    3.21ns ± 1%  1.35ns ± 0%  -58.00%  (p=0.000 n=20+20)
OnesCount8   0.91ns ± 1%  0.81ns ± 0%  -11.43%  (p=0.000 n=20+20)
OnesCount16  1.51ns ± 3%  1.21ns ± 0%  -19.71%  (p=0.000 n=20+17)
OnesCount32  1.91ns ± 0%  1.12ns ± 1%  -41.60%  (p=0.000 n=19+20)
OnesCount64  3.18ns ± 4%  1.35ns ± 0%  -57.52%  (p=0.000 n=20+20)

Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0
Reviewed-on: https://go-review.googlesource.com/114675
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
src/cmd/asm/internal/asm/testdata/s390x.s
src/cmd/compile/internal/gc/ssa.go
src/cmd/compile/internal/s390x/ssa.go
src/cmd/compile/internal/ssa/gen/S390X.rules
src/cmd/compile/internal/ssa/gen/S390XOps.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteS390X.go
src/cmd/internal/obj/s390x/a.out.go
src/cmd/internal/obj/s390x/anames.go
src/cmd/internal/obj/s390x/asmz.go
test/codegen/mathbits.go

index fce855ee3008eb6f7235d670ab67be9f801dddf7..ad70d2af44822671a99de6f2cd8694c704ef4a7d 100644 (file)
@@ -115,6 +115,7 @@ TEXT main·foo(SB),DUPOK|NOSPLIT,$16-0 // TEXT main.foo(SB), DUPOK|NOSPLIT, $16-
        NEGW    R1                    // b9130011
        NEGW    R1, R2                // b9130021
        FLOGR   R2, R2                // b9830022
+       POPCNT  R3, R4                // b9e10043
 
        AND     R1, R2                // b9800021
        AND     R1, R2, R3            // b9e42031
index 4c2f0098ce6aaae0b402872d1b4d1154bd40c3c1..3aef7e6b6d92725ffe428927fe70599ef993f2ed 100644 (file)
@@ -3410,7 +3410,7 @@ func init() {
                func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
                        return s.newValue1(ssa.OpPopCount64, types.Types[TINT], args[0])
                },
-               sys.PPC64, sys.ARM64)
+               sys.PPC64, sys.ARM64, sys.S390X)
        addF("math/bits", "OnesCount32",
                makeOnesCountAMD64(ssa.OpPopCount32, ssa.OpPopCount32),
                sys.AMD64)
@@ -3418,7 +3418,7 @@ func init() {
                func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
                        return s.newValue1(ssa.OpPopCount32, types.Types[TINT], args[0])
                },
-               sys.PPC64, sys.ARM64)
+               sys.PPC64, sys.ARM64, sys.S390X)
        addF("math/bits", "OnesCount16",
                makeOnesCountAMD64(ssa.OpPopCount16, ssa.OpPopCount16),
                sys.AMD64)
@@ -3426,8 +3426,12 @@ func init() {
                func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
                        return s.newValue1(ssa.OpPopCount16, types.Types[TINT], args[0])
                },
-               sys.ARM64)
-       // Note: no OnesCount8, the Go implementation is faster - just a table load.
+               sys.ARM64, sys.S390X)
+       addF("math/bits", "OnesCount8",
+               func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+                       return s.newValue1(ssa.OpPopCount8, types.Types[TINT], args[0])
+               },
+               sys.S390X)
        addF("math/bits", "OnesCount",
                makeOnesCountAMD64(ssa.OpPopCount64, ssa.OpPopCount32),
                sys.AMD64)
index fe206f74e865b32cac9c81370c220feeb20db78b..90e61c34fd35cda8c7b9337bddf54ed387609b8f 100644 (file)
@@ -513,7 +513,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                p.To.Type = obj.TYPE_MEM
                p.To.Name = obj.NAME_EXTERN
                p.To.Sym = v.Aux.(*obj.LSym)
-       case ssa.OpS390XFLOGR, ssa.OpS390XNEG, ssa.OpS390XNEGW,
+       case ssa.OpS390XFLOGR, ssa.OpS390XPOPCNT,
+               ssa.OpS390XNEG, ssa.OpS390XNEGW,
                ssa.OpS390XMOVWBR, ssa.OpS390XMOVDBR:
                p := s.Prog(v.Op.Asm())
                p.From.Type = obj.TYPE_REG
@@ -522,6 +523,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                p.To.Reg = v.Reg()
        case ssa.OpS390XNOT, ssa.OpS390XNOTW:
                v.Fatalf("NOT/NOTW generated %s", v.LongString())
+       case ssa.OpS390XSumBytes2, ssa.OpS390XSumBytes4, ssa.OpS390XSumBytes8:
+               v.Fatalf("SumBytes generated %s", v.LongString())
        case ssa.OpS390XMOVDEQ, ssa.OpS390XMOVDNE,
                ssa.OpS390XMOVDLT, ssa.OpS390XMOVDLE,
                ssa.OpS390XMOVDGT, ssa.OpS390XMOVDGE,
index 960d68845f1c32c8e9922dcfd643bf8c67ccfb3c..4fbdef38e7d108e2fcaa9e9b1d088cb644c4a55c 100644 (file)
 
 (BitLen64 x) -> (SUB (MOVDconst [64]) (FLOGR x))
 
+// POPCNT treats the input register as a vector of 8 bytes, producing
+// a population count for each individual byte. For inputs larger than
+// a single byte we therefore need to sum the individual bytes produced
+// by the POPCNT instruction. For example, the following instruction
+// sequence could be used to calculate the population count of a 4-byte
+// value:
+//
+//     MOVD   $0x12345678, R1 // R1=0x12345678 <-- input
+//     POPCNT R1, R2          // R2=0x02030404
+//     SRW    $16, R2, R3     // R3=0x00000203
+//     ADDW   R2, R3, R4      // R4=0x02030607
+//     SRW    $8, R4, R5      // R5=0x00020306
+//     ADDW   R4, R5, R6      // R6=0x0205090d
+//     MOVBZ  R6, R7          // R7=0x0000000d <-- result is 13
+//
+(PopCount8  x) -> (POPCNT (MOVBZreg x))
+(PopCount16 x) -> (MOVBZreg (SumBytes2 (POPCNT <typ.UInt16> x)))
+(PopCount32 x) -> (MOVBZreg (SumBytes4 (POPCNT <typ.UInt32> x)))
+(PopCount64 x) -> (MOVBZreg (SumBytes8 (POPCNT <typ.UInt64> x)))
+
+// SumBytes{2,4,8} pseudo operations sum the values of the rightmost
+// 2, 4 or 8 bytes respectively. The result is a single byte however
+// other bytes might contain junk so a zero extension is required if
+// the desired output type is larger than 1 byte.
+(SumBytes2 x) -> (ADDW (SRWconst <typ.UInt8> x [8]) x)
+(SumBytes4 x) -> (SumBytes2 (ADDW <typ.UInt16> (SRWconst <typ.UInt16> x [16]) x))
+(SumBytes8 x) -> (SumBytes4 (ADDW <typ.UInt32> (SRDconst <typ.UInt32> x [32]) x))
+
 (Bswap64 x) -> (MOVDBR x)
 (Bswap32 x) -> (MOVWBR x)
 
index ae01375473396f5efa3f46a7d1c65b6da442f4e4..9b5f525531a6e600cf67dae232242f652bf46f0f 100644 (file)
@@ -530,6 +530,25 @@ func init() {
                        clobberFlags: true,
                },
 
+               // population count
+               //
+               // Counts the number of ones in each byte of arg0
+               // and places the result into the corresponding byte
+               // of the result.
+               {
+                       name:         "POPCNT",
+                       argLength:    1,
+                       reg:          gp11,
+                       asm:          "POPCNT",
+                       typ:          "UInt64",
+                       clobberFlags: true,
+               },
+
+               // pseudo operations to sum the output of the POPCNT instruction
+               {name: "SumBytes2", argLength: 1, typ: "UInt8"}, // sum the rightmost 2 bytes in arg0 ignoring overflow
+               {name: "SumBytes4", argLength: 1, typ: "UInt8"}, // sum the rightmost 4 bytes in arg0 ignoring overflow
+               {name: "SumBytes8", argLength: 1, typ: "UInt8"}, // sum all the bytes in arg0 ignoring overflow
+
                // store multiple
                {
                        name:           "STMG2",
index 0689c0ef323dc63c847d997b68a854932a63b855..1c9d263debd0cc99c03f4e2aa025b1d4108089af 100644 (file)
@@ -1898,6 +1898,10 @@ const (
        OpS390XLoweredAtomicExchange32
        OpS390XLoweredAtomicExchange64
        OpS390XFLOGR
+       OpS390XPOPCNT
+       OpS390XSumBytes2
+       OpS390XSumBytes4
+       OpS390XSumBytes8
        OpS390XSTMG2
        OpS390XSTMG3
        OpS390XSTMG4
@@ -25473,6 +25477,35 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "POPCNT",
+               argLen:       1,
+               clobberFlags: true,
+               asm:          s390x.APOPCNT,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
+                       },
+                       outputs: []outputInfo{
+                               {0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
+                       },
+               },
+       },
+       {
+               name:   "SumBytes2",
+               argLen: 1,
+               reg:    regInfo{},
+       },
+       {
+               name:   "SumBytes4",
+               argLen: 1,
+               reg:    regInfo{},
+       },
+       {
+               name:   "SumBytes8",
+               argLen: 1,
+               reg:    regInfo{},
+       },
        {
                name:           "STMG2",
                auxType:        auxSymOff,
index 7125b888bdbfdc5d57b9490fc7a5e87eae1cd9f5..768b802ec1f22c6c2aa31121e95676f77422001c 100644 (file)
@@ -383,6 +383,14 @@ func rewriteValueS390X(v *Value) bool {
                return rewriteValueS390X_OpOr8_0(v)
        case OpOrB:
                return rewriteValueS390X_OpOrB_0(v)
+       case OpPopCount16:
+               return rewriteValueS390X_OpPopCount16_0(v)
+       case OpPopCount32:
+               return rewriteValueS390X_OpPopCount32_0(v)
+       case OpPopCount64:
+               return rewriteValueS390X_OpPopCount64_0(v)
+       case OpPopCount8:
+               return rewriteValueS390X_OpPopCount8_0(v)
        case OpRound:
                return rewriteValueS390X_OpRound_0(v)
        case OpRound32F:
@@ -691,6 +699,12 @@ func rewriteValueS390X(v *Value) bool {
                return rewriteValueS390X_OpS390XSUBconst_0(v)
        case OpS390XSUBload:
                return rewriteValueS390X_OpS390XSUBload_0(v)
+       case OpS390XSumBytes2:
+               return rewriteValueS390X_OpS390XSumBytes2_0(v)
+       case OpS390XSumBytes4:
+               return rewriteValueS390X_OpS390XSumBytes4_0(v)
+       case OpS390XSumBytes8:
+               return rewriteValueS390X_OpS390XSumBytes8_0(v)
        case OpS390XXOR:
                return rewriteValueS390X_OpS390XXOR_0(v) || rewriteValueS390X_OpS390XXOR_10(v)
        case OpS390XXORW:
@@ -5311,6 +5325,80 @@ func rewriteValueS390X_OpOrB_0(v *Value) bool {
                return true
        }
 }
+func rewriteValueS390X_OpPopCount16_0(v *Value) bool {
+       b := v.Block
+       _ = b
+       typ := &b.Func.Config.Types
+       _ = typ
+       // match: (PopCount16 x)
+       // cond:
+       // result: (MOVBZreg (SumBytes2 (POPCNT <typ.UInt16> x)))
+       for {
+               x := v.Args[0]
+               v.reset(OpS390XMOVBZreg)
+               v0 := b.NewValue0(v.Pos, OpS390XSumBytes2, typ.UInt8)
+               v1 := b.NewValue0(v.Pos, OpS390XPOPCNT, typ.UInt16)
+               v1.AddArg(x)
+               v0.AddArg(v1)
+               v.AddArg(v0)
+               return true
+       }
+}
+func rewriteValueS390X_OpPopCount32_0(v *Value) bool {
+       b := v.Block
+       _ = b
+       typ := &b.Func.Config.Types
+       _ = typ
+       // match: (PopCount32 x)
+       // cond:
+       // result: (MOVBZreg (SumBytes4 (POPCNT <typ.UInt32> x)))
+       for {
+               x := v.Args[0]
+               v.reset(OpS390XMOVBZreg)
+               v0 := b.NewValue0(v.Pos, OpS390XSumBytes4, typ.UInt8)
+               v1 := b.NewValue0(v.Pos, OpS390XPOPCNT, typ.UInt32)
+               v1.AddArg(x)
+               v0.AddArg(v1)
+               v.AddArg(v0)
+               return true
+       }
+}
+func rewriteValueS390X_OpPopCount64_0(v *Value) bool {
+       b := v.Block
+       _ = b
+       typ := &b.Func.Config.Types
+       _ = typ
+       // match: (PopCount64 x)
+       // cond:
+       // result: (MOVBZreg (SumBytes8 (POPCNT <typ.UInt64> x)))
+       for {
+               x := v.Args[0]
+               v.reset(OpS390XMOVBZreg)
+               v0 := b.NewValue0(v.Pos, OpS390XSumBytes8, typ.UInt8)
+               v1 := b.NewValue0(v.Pos, OpS390XPOPCNT, typ.UInt64)
+               v1.AddArg(x)
+               v0.AddArg(v1)
+               v.AddArg(v0)
+               return true
+       }
+}
+func rewriteValueS390X_OpPopCount8_0(v *Value) bool {
+       b := v.Block
+       _ = b
+       typ := &b.Func.Config.Types
+       _ = typ
+       // match: (PopCount8 x)
+       // cond:
+       // result: (POPCNT (MOVBZreg x))
+       for {
+               x := v.Args[0]
+               v.reset(OpS390XPOPCNT)
+               v0 := b.NewValue0(v.Pos, OpS390XMOVBZreg, typ.UInt64)
+               v0.AddArg(x)
+               v.AddArg(v0)
+               return true
+       }
+}
 func rewriteValueS390X_OpRound_0(v *Value) bool {
        // match: (Round x)
        // cond:
@@ -40417,6 +40505,67 @@ func rewriteValueS390X_OpS390XSUBload_0(v *Value) bool {
        }
        return false
 }
+func rewriteValueS390X_OpS390XSumBytes2_0(v *Value) bool {
+       b := v.Block
+       _ = b
+       typ := &b.Func.Config.Types
+       _ = typ
+       // match: (SumBytes2 x)
+       // cond:
+       // result: (ADDW (SRWconst <typ.UInt8> x [8]) x)
+       for {
+               x := v.Args[0]
+               v.reset(OpS390XADDW)
+               v0 := b.NewValue0(v.Pos, OpS390XSRWconst, typ.UInt8)
+               v0.AuxInt = 8
+               v0.AddArg(x)
+               v.AddArg(v0)
+               v.AddArg(x)
+               return true
+       }
+}
+func rewriteValueS390X_OpS390XSumBytes4_0(v *Value) bool {
+       b := v.Block
+       _ = b
+       typ := &b.Func.Config.Types
+       _ = typ
+       // match: (SumBytes4 x)
+       // cond:
+       // result: (SumBytes2 (ADDW <typ.UInt16> (SRWconst <typ.UInt16> x [16]) x))
+       for {
+               x := v.Args[0]
+               v.reset(OpS390XSumBytes2)
+               v0 := b.NewValue0(v.Pos, OpS390XADDW, typ.UInt16)
+               v1 := b.NewValue0(v.Pos, OpS390XSRWconst, typ.UInt16)
+               v1.AuxInt = 16
+               v1.AddArg(x)
+               v0.AddArg(v1)
+               v0.AddArg(x)
+               v.AddArg(v0)
+               return true
+       }
+}
+func rewriteValueS390X_OpS390XSumBytes8_0(v *Value) bool {
+       b := v.Block
+       _ = b
+       typ := &b.Func.Config.Types
+       _ = typ
+       // match: (SumBytes8 x)
+       // cond:
+       // result: (SumBytes4 (ADDW <typ.UInt32> (SRDconst <typ.UInt32> x [32]) x))
+       for {
+               x := v.Args[0]
+               v.reset(OpS390XSumBytes4)
+               v0 := b.NewValue0(v.Pos, OpS390XADDW, typ.UInt32)
+               v1 := b.NewValue0(v.Pos, OpS390XSRDconst, typ.UInt32)
+               v1.AuxInt = 32
+               v1.AddArg(x)
+               v0.AddArg(v1)
+               v0.AddArg(x)
+               v.AddArg(v0)
+               return true
+       }
+}
 func rewriteValueS390X_OpS390XXOR_0(v *Value) bool {
        // match: (XOR x (MOVDconst [c]))
        // cond: isU32Bit(c)
index babcd2af010b25cbec4a3cfa26e0c49a53875880..9ee02a2d0d72e219df459da24d3815620609496a 100644 (file)
@@ -271,6 +271,9 @@ const (
        // find leftmost one
        AFLOGR
 
+       // population count
+       APOPCNT
+
        // integer bitwise
        AAND
        AANDW
index 7edbdd68dfd023037e587d3507c831769f37a4ae..2d6ea5abb4431444d3f5435d4aca591812f80cf6 100644 (file)
@@ -45,6 +45,7 @@ var Anames = []string{
        "MOVDLT",
        "MOVDNE",
        "FLOGR",
+       "POPCNT",
        "AND",
        "ANDW",
        "OR",
index ce3fe6af73cf0f9ddc46575f7b603d193ca7d128..359610c41df15371ef64d391dbf0d30aae2ba47b 100644 (file)
@@ -246,6 +246,9 @@ var optab = []Optab{
        // find leftmost one
        Optab{AFLOGR, C_REG, C_NONE, C_NONE, C_REG, 8, 0},
 
+       // population count
+       Optab{APOPCNT, C_REG, C_NONE, C_NONE, C_REG, 9, 0},
+
        // compare
        Optab{ACMP, C_REG, C_NONE, C_NONE, C_REG, 70, 0},
        Optab{ACMP, C_REG, C_NONE, C_NONE, C_LCON, 71, 0},
@@ -2849,6 +2852,9 @@ func (c *ctxtz) asmout(p *obj.Prog, asm *[]byte) {
                // FLOGR also writes a mask to p.To.Reg+1.
                zRRE(op_FLOGR, uint32(p.To.Reg), uint32(p.From.Reg), asm)
 
+       case 9: // population count
+               zRRE(op_POPCNT, uint32(p.To.Reg), uint32(p.From.Reg), asm)
+
        case 10: // subtract reg [reg] reg
                r := int(p.Reg)
 
index 85c54ea61b78526ee6d7640692e0f6cff224e827..ad2c5abb029f6272c2c53e45c4a607c4c3505742 100644 (file)
@@ -103,27 +103,36 @@ func Len8(n uint8) int {
 func OnesCount(n uint) int {
        // amd64:"POPCNTQ",".*support_popcnt"
        // arm64:"VCNT","VUADDLV"
+       // s390x:"POPCNT"
        return bits.OnesCount(n)
 }
 
 func OnesCount64(n uint64) int {
        // amd64:"POPCNTQ",".*support_popcnt"
        // arm64:"VCNT","VUADDLV"
+       // s390x:"POPCNT"
        return bits.OnesCount64(n)
 }
 
 func OnesCount32(n uint32) int {
        // amd64:"POPCNTL",".*support_popcnt"
        // arm64:"VCNT","VUADDLV"
+       // s390x:"POPCNT"
        return bits.OnesCount32(n)
 }
 
 func OnesCount16(n uint16) int {
        // amd64:"POPCNTL",".*support_popcnt"
        // arm64:"VCNT","VUADDLV"
+       // s390x:"POPCNT"
        return bits.OnesCount16(n)
 }
 
+func OnesCount8(n uint8) int {
+       // s390x:"POPCNT"
+       return bits.OnesCount8(n)
+}
+
 // ----------------------- //
 //    bits.ReverseBytes    //
 // ----------------------- //