]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: use vsx loads and stores for LoweredMove, LoweredZero on ppc64x
authorLynn Boger <laboger@linux.vnet.ibm.com>
Fri, 4 Oct 2019 20:03:36 +0000 (16:03 -0400)
committerLynn Boger <laboger@linux.vnet.ibm.com>
Tue, 8 Oct 2019 16:41:02 +0000 (16:41 +0000)
This improves the code generated for LoweredMove and LoweredZero by
using LXVD2X and STXVD2X to move 16 bytes at a time. These instructions
are now used if the size to be moved or zeroed is >= 64. These same
instructions have already been used in the asm implementations for
memmove and memclr.

Some examples where this shows an improvement on power8:

MakeSlice/Byte                                  27.3ns ± 1%     25.2ns ± 0%    -7.69%
MakeSlice/Int16                                 40.2ns ± 0%     35.2ns ± 0%   -12.39%
MakeSlice/Int                                   94.9ns ± 1%     77.9ns ± 0%   -17.92%
MakeSlice/Ptr                                    129ns ± 1%      103ns ± 0%   -20.16%
MakeSlice/Struct/24                              176ns ± 1%      131ns ± 0%   -25.67%
MakeSlice/Struct/32                              200ns ± 1%      142ns ± 0%   -29.09%
MakeSlice/Struct/40                              220ns ± 2%      156ns ± 0%   -28.82%
GrowSlice/Byte                                  81.4ns ± 0%     73.4ns ± 0%    -9.88%
GrowSlice/Int16                                  118ns ± 1%       98ns ± 0%   -17.03%
GrowSlice/Int                                    178ns ± 1%      134ns ± 1%   -24.65%
GrowSlice/Ptr                                    249ns ± 4%      212ns ± 0%   -14.94%
GrowSlice/Struct/24                              294ns ± 5%      215ns ± 0%   -27.08%
GrowSlice/Struct/32                              315ns ± 1%      248ns ± 0%   -21.49%
GrowSlice/Struct/40                              382ns ± 4%      289ns ± 1%   -24.38%
ExtendSlice/IntSlice                             109ns ± 1%       90ns ± 1%   -17.51%
ExtendSlice/PointerSlice                         142ns ± 2%      118ns ± 0%   -16.75%
ExtendSlice/NoGrow                              6.02ns ± 0%     5.88ns ± 0%    -2.33%
Append                                          27.2ns ± 0%     27.6ns ± 0%    +1.38%
AppendGrowByte                                  4.20ms ± 3%     2.60ms ± 0%   -38.18%
AppendGrowString                                 134ms ± 3%      102ms ± 2%   -23.62%
AppendSlice/1Bytes                              5.65ns ± 0%     5.67ns ± 0%    +0.35%
AppendSlice/4Bytes                              6.40ns ± 0%     6.55ns ± 0%    +2.34%
AppendSlice/7Bytes                              8.74ns ± 0%     8.84ns ± 0%    +1.14%
AppendSlice/8Bytes                              5.68ns ± 0%     5.70ns ± 0%    +0.40%
AppendSlice/15Bytes                             9.31ns ± 0%     9.39ns ± 0%    +0.86%
AppendSlice/16Bytes                             14.0ns ± 0%      5.8ns ± 0%   -58.32%
AppendSlice/32Bytes                             5.72ns ± 0%     5.68ns ± 0%    -0.66%
AppendSliceLarge/1024Bytes                       918ns ± 8%      615ns ± 1%   -33.00%
AppendSliceLarge/4096Bytes                      3.25µs ± 1%     1.92µs ± 1%   -40.84%
AppendSliceLarge/16384Bytes                     8.70µs ± 2%     4.69µs ± 0%   -46.08%
AppendSliceLarge/65536Bytes                     18.1µs ± 3%      7.9µs ± 0%   -56.30%
AppendSliceLarge/262144Bytes                    69.8µs ± 2%     25.9µs ± 0%   -62.91%
AppendSliceLarge/1048576Bytes                    258µs ± 1%       93µs ± 0%   -63.96%

Change-Id: I21625dbe231a2029ddb9f7d73f5a6417b35c1e49
Reviewed-on: https://go-review.googlesource.com/c/go/+/199639
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
src/cmd/compile/internal/ppc64/ssa.go
src/cmd/compile/internal/ssa/gen/PPC64Ops.go
src/cmd/compile/internal/ssa/opGen.go

index 69847c38d2021f3fdb3d1ece955b806c9108b506..4f852b883a616a93e870b4593fa692c56df12f5c 100644 (file)
@@ -855,13 +855,13 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                // for sizes >= 64 generate a loop as follows:
 
                // set up loop counter in CTR, used by BC
+               //       XXLXOR VS32,VS32,VS32
                //       MOVD len/32,REG_TMP
                //       MOVD REG_TMP,CTR
+               //       MOVD $16,REG_TMP
                //       loop:
-               //       MOVD R0,(R3)
-               //       MOVD R0,8(R3)
-               //       MOVD R0,16(R3)
-               //       MOVD R0,24(R3)
+               //       STXVD2X VS32,(R0)(R3)
+               //       STXVD2X VS32,(R31)(R3)
                //       ADD  $32,R3
                //       BC   16, 0, loop
                //
@@ -895,8 +895,16 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                // only generate a loop if there is more
                // than 1 iteration.
                if ctr > 1 {
+                       // Set up VS32 (V0) to hold 0s
+                       p := s.Prog(ppc64.AXXLXOR)
+                       p.From.Type = obj.TYPE_REG
+                       p.From.Reg = ppc64.REG_VS32
+                       p.To.Type = obj.TYPE_REG
+                       p.To.Reg = ppc64.REG_VS32
+                       p.Reg = ppc64.REG_VS32
+
                        // Set up CTR loop counter
-                       p := s.Prog(ppc64.AMOVD)
+                       p = s.Prog(ppc64.AMOVD)
                        p.From.Type = obj.TYPE_CONST
                        p.From.Offset = ctr
                        p.To.Type = obj.TYPE_REG
@@ -908,23 +916,35 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                        p.To.Type = obj.TYPE_REG
                        p.To.Reg = ppc64.REG_CTR
 
-                       // generate 4 MOVDs
+                       // Set up R31 to hold index value 16
+                       p = s.Prog(ppc64.AMOVD)
+                       p.From.Type = obj.TYPE_CONST
+                       p.From.Offset = 16
+                       p.To.Type = obj.TYPE_REG
+                       p.To.Reg = ppc64.REGTMP
+
+                       // generate 2 STXVD2Xs to store 16 bytes
                        // when this is a loop then the top must be saved
                        var top *obj.Prog
-                       for offset := int64(0); offset < 32; offset += 8 {
-                               // This is the top of loop
-                               p := s.Prog(ppc64.AMOVD)
-                               p.From.Type = obj.TYPE_REG
-                               p.From.Reg = ppc64.REG_R0
-                               p.To.Type = obj.TYPE_MEM
-                               p.To.Reg = v.Args[0].Reg()
-                               p.To.Offset = offset
-                               // Save the top of loop
-                               if top == nil {
-                                       top = p
-                               }
+                       // This is the top of loop
+                       p = s.Prog(ppc64.ASTXVD2X)
+                       p.From.Type = obj.TYPE_REG
+                       p.From.Reg = ppc64.REG_VS32
+                       p.To.Type = obj.TYPE_MEM
+                       p.To.Reg = v.Args[0].Reg()
+                       p.To.Index = ppc64.REGZERO
+                       // Save the top of loop
+                       if top == nil {
+                               top = p
                        }
 
+                       p = s.Prog(ppc64.ASTXVD2X)
+                       p.From.Type = obj.TYPE_REG
+                       p.From.Reg = ppc64.REG_VS32
+                       p.To.Type = obj.TYPE_MEM
+                       p.To.Reg = v.Args[0].Reg()
+                       p.To.Index = ppc64.REGTMP
+
                        // Increment address for the
                        // 4 doublewords just zeroed.
                        p = s.Prog(ppc64.AADD)
@@ -994,30 +1014,27 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                // When moving >= 64 bytes a loop is used
                //      MOVD len/32,REG_TMP
                //      MOVD REG_TMP,CTR
+               //      MOVD $16,REG_TMP
                // top:
-               //      MOVD (R4),R7
-               //      MOVD 8(R4),R8
-               //      MOVD 16(R4),R9
-               //      MOVD 24(R4),R10
-               //      ADD  R4,$32
-               //      MOVD R7,(R3)
-               //      MOVD R8,8(R3)
-               //      MOVD R9,16(R3)
-               //      MOVD R10,24(R3)
-               //      ADD  R3,$32
+               //      LXVD2X (R0)(R4),VS32
+               //      LXVD2X (R31)(R4),VS33
+               //      ADD $32,R4
+               //      STXVD2X VS32,(R0)(R3)
+               //      STXVD2X VS33,(R31)(R4)
+               //      ADD $32,R3
                //      BC 16,0,top
                // Bytes not moved by this loop are moved
                // with a combination of the following instructions,
                // starting with the largest sizes and generating as
                // many as needed, using the appropriate offset value.
-               //      MOVD  n(R4),R7
-               //      MOVD  R7,n(R3)
-               //      MOVW  n1(R4),R7
-               //      MOVW  R7,n1(R3)
-               //      MOVH  n2(R4),R7
-               //      MOVH  R7,n2(R3)
-               //      MOVB  n3(R4),R7
-               //      MOVB  R7,n3(R3)
+               //      MOVD  n(R4),R14
+               //      MOVD  R14,n(R3)
+               //      MOVW  n1(R4),R14
+               //      MOVW  R14,n1(R3)
+               //      MOVH  n2(R4),R14
+               //      MOVH  R14,n2(R3)
+               //      MOVB  n3(R4),R14
+               //      MOVB  R14,n3(R3)
 
                // Each loop iteration moves 32 bytes
                ctr := v.AuxInt / 32
@@ -1030,7 +1047,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 
                // The set of registers used here, must match the clobbered reg list
                // in PPC64Ops.go.
-               useregs := []int16{ppc64.REG_R7, ppc64.REG_R8, ppc64.REG_R9, ppc64.REG_R10}
                offset := int64(0)
 
                // top of the loop
@@ -1050,22 +1066,35 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                        p.To.Type = obj.TYPE_REG
                        p.To.Reg = ppc64.REG_CTR
 
-                       // Generate all the MOVDs for loads
-                       // based off the same register, increasing
-                       // the offset by 8 for each instruction
-                       for _, rg := range useregs {
-                               p := s.Prog(ppc64.AMOVD)
-                               p.From.Type = obj.TYPE_MEM
-                               p.From.Reg = src_reg
-                               p.From.Offset = offset
-                               p.To.Type = obj.TYPE_REG
-                               p.To.Reg = rg
-                               if top == nil {
-                                       top = p
-                               }
-                               offset += 8
+                       // Use REGTMP as index reg
+                       p = s.Prog(ppc64.AMOVD)
+                       p.From.Type = obj.TYPE_CONST
+                       p.From.Offset = 16
+                       p.To.Type = obj.TYPE_REG
+                       p.To.Reg = ppc64.REGTMP
+
+                       // Generate 16 byte loads and stores.
+                       // Use temp register for index (16)
+                       // on the second one.
+                       p = s.Prog(ppc64.ALXVD2X)
+                       p.From.Type = obj.TYPE_MEM
+                       p.From.Reg = src_reg
+                       p.From.Index = ppc64.REGZERO
+                       p.To.Type = obj.TYPE_REG
+                       p.To.Reg = ppc64.REG_VS32
+
+                       if top == nil {
+                               top = p
                        }
-                       // increment the src_reg for next iteration
+
+                       p = s.Prog(ppc64.ALXVD2X)
+                       p.From.Type = obj.TYPE_MEM
+                       p.From.Reg = src_reg
+                       p.From.Index = ppc64.REGTMP
+                       p.To.Type = obj.TYPE_REG
+                       p.To.Reg = ppc64.REG_VS33
+
+                       // increment the src reg for next iteration
                        p = s.Prog(ppc64.AADD)
                        p.Reg = src_reg
                        p.From.Type = obj.TYPE_CONST
@@ -1073,20 +1102,22 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                        p.To.Type = obj.TYPE_REG
                        p.To.Reg = src_reg
 
-                       // generate the MOVDs for stores, based
-                       // off the same register, using the same
-                       // offsets as in the loads.
-                       offset = int64(0)
-                       for _, rg := range useregs {
-                               p := s.Prog(ppc64.AMOVD)
-                               p.From.Type = obj.TYPE_REG
-                               p.From.Reg = rg
-                               p.To.Type = obj.TYPE_MEM
-                               p.To.Reg = dst_reg
-                               p.To.Offset = offset
-                               offset += 8
-                       }
-                       // increment the dst_reg for next iteration
+                       // generate 16 byte stores
+                       p = s.Prog(ppc64.ASTXVD2X)
+                       p.From.Type = obj.TYPE_REG
+                       p.From.Reg = ppc64.REG_VS32
+                       p.To.Type = obj.TYPE_MEM
+                       p.To.Reg = dst_reg
+                       p.To.Index = ppc64.REGZERO
+
+                       p = s.Prog(ppc64.ASTXVD2X)
+                       p.From.Type = obj.TYPE_REG
+                       p.From.Reg = ppc64.REG_VS33
+                       p.To.Type = obj.TYPE_MEM
+                       p.To.Reg = dst_reg
+                       p.To.Index = ppc64.REGTMP
+
+                       // increment the dst reg for next iteration
                        p = s.Prog(ppc64.AADD)
                        p.Reg = dst_reg
                        p.From.Type = obj.TYPE_CONST
@@ -1114,6 +1145,57 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                        rem += 32
                }
 
+               if rem >= 16 {
+                       // Generate 16 byte loads and stores.
+                       // Use temp register for index (value 16)
+                       // on the second one.
+                       p := s.Prog(ppc64.ALXVD2X)
+                       p.From.Type = obj.TYPE_MEM
+                       p.From.Reg = src_reg
+                       p.From.Index = ppc64.REGZERO
+                       p.To.Type = obj.TYPE_REG
+                       p.To.Reg = ppc64.REG_VS32
+
+                       p = s.Prog(ppc64.ASTXVD2X)
+                       p.From.Type = obj.TYPE_REG
+                       p.From.Reg = ppc64.REG_VS32
+                       p.To.Type = obj.TYPE_MEM
+                       p.To.Reg = dst_reg
+                       p.To.Index = ppc64.REGZERO
+
+                       offset = 16
+                       rem -= 16
+
+                       if rem >= 16 {
+                               // Use REGTMP as index reg
+                               p = s.Prog(ppc64.AMOVD)
+                               p.From.Type = obj.TYPE_CONST
+                               p.From.Offset = 16
+                               p.To.Type = obj.TYPE_REG
+                               p.To.Reg = ppc64.REGTMP
+
+                               // Generate 16 byte loads and stores.
+                               // Use temp register for index (16)
+                               // on the second one.
+                               p = s.Prog(ppc64.ALXVD2X)
+                               p.From.Type = obj.TYPE_MEM
+                               p.From.Reg = src_reg
+                               p.From.Index = ppc64.REGTMP
+                               p.To.Type = obj.TYPE_REG
+                               p.To.Reg = ppc64.REG_VS32
+
+                               p = s.Prog(ppc64.ASTXVD2X)
+                               p.From.Type = obj.TYPE_REG
+                               p.From.Reg = ppc64.REG_VS32
+                               p.To.Type = obj.TYPE_MEM
+                               p.To.Reg = dst_reg
+                               p.To.Index = ppc64.REGTMP
+
+                               offset = 32
+                               rem -= 16
+                       }
+               }
+
                // Generate all the remaining load and store pairs, starting with
                // as many 8 byte moves as possible, then 4, 2, 1.
                for rem > 0 {
@@ -1129,7 +1211,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                        // Load
                        p := s.Prog(op)
                        p.To.Type = obj.TYPE_REG
-                       p.To.Reg = ppc64.REG_R7
+                       p.To.Reg = ppc64.REG_R14
                        p.From.Type = obj.TYPE_MEM
                        p.From.Reg = src_reg
                        p.From.Offset = offset
@@ -1137,7 +1219,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                        // Store
                        p = s.Prog(op)
                        p.From.Type = obj.TYPE_REG
-                       p.From.Reg = ppc64.REG_R7
+                       p.From.Reg = ppc64.REG_R14
                        p.To.Type = obj.TYPE_MEM
                        p.To.Reg = dst_reg
                        p.To.Offset = offset
index 5505db522251c31111facebf548e679b9f272eef..a6bcc265432e8f87826e6fc958d3a6e501708246 100644 (file)
@@ -416,13 +416,13 @@ func init() {
                // a loop is generated when there is more than one iteration
                // needed to clear 4 doublewords
                //
+               //      XXLXOR  VS32,VS32,VS32
                //      MOVD    $len/32,R31
                //      MOVD    R31,CTR
+               //      MOVD    $16,R31
                //      loop:
-               //      MOVD    R0,(R3)
-               //      MOVD    R0,8(R3)
-               //      MOVD    R0,16(R3)
-               //      MOVD    R0,24(R3)
+               //      STXVD2X VS32,(R0)(R3)
+               //      STXVD2X VS32,(R31),R3)
                //      ADD     R3,32
                //      BC      loop
 
@@ -448,33 +448,38 @@ func init() {
                        typ:            "Mem",
                        faultOnNilArg0: true,
                },
+               // R31 is temp register
                // Loop code:
-               //      MOVD len/32,REG_TMP  only for loop
-               //      MOVD REG_TMP,CTR     only for loop
+               //      MOVD len/32,R31         set up loop ctr
+               //      MOVD R31,CTR
+               //      MOVD $16,R31            index register
                // loop:
-               //      MOVD (R4),R7
-               //      MOVD 8(R4),R8
-               //      MOVD 16(R4),R9
-               //      MOVD 24(R4),R10
-               //      ADD  R4,$32          only with loop
-               //      MOVD R7,(R3)
-               //      MOVD R8,8(R3)
-               //      MOVD R9,16(R3)
-               //      MOVD R10,24(R3)
-               //      ADD  R3,$32          only with loop
-               //      BC 16,0,loop         only with loop
+               //      LXVD2X (R0)(R4),VS32
+               //      LXVD2X (R31)(R4),VS33
+               //      ADD  R4,$32          increment src
+               //      STXVD2X VS32,(R0)(R3)
+               //      STXVD2X VS33,(R31)(R3)
+               //      ADD  R3,$32          increment dst
+               //      BC 16,0,loop         branch ctr
+               // For this purpose, VS32 and VS33 are treated as
+               // scratch registers. Since regalloc does not
+               // track vector registers, even if it could be marked
+               // as clobbered it would have no effect.
+               // TODO: If vector registers are managed by regalloc
+               // mark these as clobbered.
+               //
                // Bytes not moved by this loop are moved
                // with a combination of the following instructions,
                // starting with the largest sizes and generating as
                // many as needed, using the appropriate offset value.
-               //      MOVD  n(R4),R7
-               //      MOVD  R7,n(R3)
-               //      MOVW  n1(R4),R7
-               //      MOVW  R7,n1(R3)
-               //      MOVH  n2(R4),R7
-               //      MOVH  R7,n2(R3)
-               //      MOVB  n3(R4),R7
-               //      MOVB  R7,n3(R3)
+               //      MOVD  n(R4),R14
+               //      MOVD  R14,n(R3)
+               //      MOVW  n1(R4),R14
+               //      MOVW  R14,n1(R3)
+               //      MOVH  n2(R4),R14
+               //      MOVH  R14,n2(R3)
+               //      MOVB  n3(R4),R14
+               //      MOVB  R14,n3(R3)
 
                {
                        name:      "LoweredMove",
@@ -482,7 +487,7 @@ func init() {
                        argLength: 3,
                        reg: regInfo{
                                inputs:   []regMask{buildReg("R3"), buildReg("R4")},
-                               clobbers: buildReg("R3 R4 R7 R8 R9 R10"),
+                               clobbers: buildReg("R3 R4 R14"),
                        },
                        clobberFlags:   true,
                        typ:            "Mem",
index 795c6bbdf7d8b4d39267e5983283e650bc979c78..c30654dda7006f19308f67733790e5a25b9843c2 100644 (file)
@@ -24486,7 +24486,7 @@ var opcodeTable = [...]opInfo{
                                {0, 8},  // R3
                                {1, 16}, // R4
                        },
-                       clobbers: 1944, // R3 R4 R7 R8 R9 R10
+                       clobbers: 16408, // R3 R4 R14
                },
        },
        {