Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: improve LoweredZero performance for ppc64x
author: Lynn Boger <laboger@linux.vnet.ibm.com>
Mon, 13 Mar 2017 14:16:30 +0000 (10:16 -0400)
committer: Lynn Boger <laboger@linux.vnet.ibm.com>
Tue, 21 Mar 2017 15:08:02 +0000 (15:08 +0000)
This change improves the performance of the LoweredZero rule
on ppc64x.

The improvement can be seen in the runtime ClearFat
benchmarks (columns: old, new, percent change):

BenchmarkClearFat12-16       2.40          0.69          -71.25%
BenchmarkClearFat16-16       9.98          0.93          -90.68%
BenchmarkClearFat24-16       4.75          0.93          -80.42%
BenchmarkClearFat32-16       6.02          0.93          -84.55%
BenchmarkClearFat40-16       7.19          1.16          -83.87%
BenchmarkClearFat48-16       15.0          1.39          -90.73%
BenchmarkClearFat56-16       9.95          1.62          -83.72%
BenchmarkClearFat64-16       18.0          1.86          -89.67%
BenchmarkClearFat128-16      30.0          8.08          -73.07%
BenchmarkClearFat256-16      52.5          11.3          -78.48%
BenchmarkClearFat512-16      97.0          19.0          -80.41%
BenchmarkClearFat1024-16     244           34.2          -85.98%

Fixes: #19532
Change-Id: If493e28bc1d8e61bc79978498be9f5336a36cd3f
Reviewed-on: https://go-review.googlesource.com/38096
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Michael Munday <munday@ca.ibm.com>
src/cmd/compile/internal/ppc64/ssa.go
src/cmd/compile/internal/ssa/gen/PPC64.rules
src/cmd/compile/internal/ssa/gen/PPC64Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewritePPC64.go

index eba99f8720fa032cc69666ae6a258ace933acb17..f79d26275f7c21312ddc194d35aefbd0ea2bca39 100644 (file)
@@ -831,62 +831,135 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                ssaGenISEL(v, ppc64.C_COND_EQ, iselRegs[1], v.Reg())
 
        case ssa.OpPPC64LoweredZero:
-               // Similar to how this is done on ARM,
-               // except that PPC MOVDU x,off(y) is *(y+off) = x; y=y+off
-               // not store-and-increment.
-               // Therefore R3 should be dest-align
-               // and arg1 should be dest+size-align
-               // HOWEVER, the input dest address cannot be dest-align because
-               // that does not necessarily address valid memory and it's not
-               // known how that might be optimized.  Therefore, correct it in
-               // in the expansion:
+
+               // unaligned data doesn't hurt performance
+               // for these instructions on power8 or later
+
+               // for sizes >= 64 generate a loop as follows:
+
+               // set up loop counter in CTR, used by BC
+               //       MOVD len/32,REG_TMP
+               //       MOVD REG_TMP,CTR
+               //       loop:
+               //       MOVD R0,(R3)
+               //       MOVD R0,8(R3)
+               //       MOVD R0,16(R3)
+               //       MOVD R0,24(R3)
+               //       ADD  $32,R3
+               //       BC   16, 0, loop
                //
-               // ADD    -8,R3,R3
-               // MOVDU  R0, 8(R3)
-               // CMP    R3, Rarg1
-               // BL     -2(PC)
-               // arg1 is the address of the last element to zero
-               // auxint is alignment
-               var sz int64
-               var movu obj.As
-               switch {
-               case v.AuxInt%8 == 0:
-                       sz = 8
-                       movu = ppc64.AMOVDU
-               case v.AuxInt%4 == 0:
-                       sz = 4
-                       movu = ppc64.AMOVWZU // MOVWU instruction not implemented
-               case v.AuxInt%2 == 0:
-                       sz = 2
-                       movu = ppc64.AMOVHU
-               default:
-                       sz = 1
-                       movu = ppc64.AMOVBU
-               }
+               // any remainder is done as described below
 
-               p := gc.Prog(ppc64.AADD)
-               p.Reg = v.Args[0].Reg()
-               p.From.Type = obj.TYPE_CONST
-               p.From.Offset = -sz
-               p.To.Type = obj.TYPE_REG
-               p.To.Reg = v.Args[0].Reg()
+               // for sizes < 64 bytes, first clear as many doublewords as possible,
+               // then handle the remainder
+               //      MOVD R0,(R3)
+               //      MOVD R0,8(R3)
+               // .... etc.
+               //
+               // the remainder bytes are cleared using one or more
+               // of the following instructions with the appropriate
+               // offsets depending which instructions are needed
+               //
+               //      MOVW R0,n1(R3)  4 bytes
+               //      MOVH R0,n2(R3)  2 bytes
+               //      MOVB R0,n3(R3)  1 byte
+               //
+               // 7 bytes: MOVW, MOVH, MOVB
+               // 6 bytes: MOVW, MOVH
+               // 5 bytes: MOVW, MOVB
+               // 3 bytes: MOVH, MOVB
 
-               p = gc.Prog(movu)
-               p.From.Type = obj.TYPE_REG
-               p.From.Reg = ppc64.REG_R0
-               p.To.Type = obj.TYPE_MEM
-               p.To.Reg = v.Args[0].Reg()
-               p.To.Offset = sz
+               // each loop iteration does 32 bytes
+               ctr := v.AuxInt / 32
 
-               p2 := gc.Prog(ppc64.ACMPU)
-               p2.From.Type = obj.TYPE_REG
-               p2.From.Reg = v.Args[0].Reg()
-               p2.To.Reg = v.Args[1].Reg()
-               p2.To.Type = obj.TYPE_REG
+               // remainder bytes
+               rem := v.AuxInt % 32
 
-               p3 := gc.Prog(ppc64.ABLT)
-               p3.To.Type = obj.TYPE_BRANCH
-               gc.Patch(p3, p)
+               // only generate a loop if there is more
+               // than 1 iteration.
+               if ctr > 1 {
+                       // Set up CTR loop counter
+                       p := gc.Prog(ppc64.AMOVD)
+                       p.From.Type = obj.TYPE_CONST
+                       p.From.Offset = ctr
+                       p.To.Type = obj.TYPE_REG
+                       p.To.Reg = ppc64.REGTMP
+
+                       p = gc.Prog(ppc64.AMOVD)
+                       p.From.Type = obj.TYPE_REG
+                       p.From.Reg = ppc64.REGTMP
+                       p.To.Type = obj.TYPE_REG
+                       p.To.Reg = ppc64.REG_CTR
+
+                       // generate 4 MOVDs
+                       // when this is a loop then the top must be saved
+                       var top *obj.Prog
+                       for offset := int64(0); offset < 32; offset += 8 {
+                               // This is the top of loop
+                               p := gc.Prog(ppc64.AMOVD)
+                               p.From.Type = obj.TYPE_REG
+                               p.From.Reg = ppc64.REG_R0
+                               p.To.Type = obj.TYPE_MEM
+                               p.To.Reg = v.Args[0].Reg()
+                               p.To.Offset = offset
+                               // Save the top of loop
+                               if top == nil {
+                                       top = p
+                               }
+                       }
+
+                       // Increment address for the
+                       // 4 doublewords just zeroed.
+                       p = gc.Prog(ppc64.AADD)
+                       p.Reg = v.Args[0].Reg()
+                       p.From.Type = obj.TYPE_CONST
+                       p.From.Offset = 32
+                       p.To.Type = obj.TYPE_REG
+                       p.To.Reg = v.Args[0].Reg()
+
+                       // Branch back to top of loop
+                       // based on CTR
+                       // BC with BO_BCTR generates bdnz
+                       p = gc.Prog(ppc64.ABC)
+                       p.From.Type = obj.TYPE_CONST
+                       p.From.Offset = ppc64.BO_BCTR
+                       p.Reg = ppc64.REG_R0
+                       p.To.Type = obj.TYPE_BRANCH
+                       gc.Patch(p, top)
+               }
+
+               // when ctr == 1 the loop was not generated but
+               // there are at least 32 bytes to clear, so add
+               // that to the remainder to generate the code
+               // to clear those doublewords
+               if ctr == 1 {
+                       rem += 32
+               }
+
+               // clear the remainder starting at offset zero
+               offset := int64(0)
+
+               // first clear as many doublewords as possible
+               // then clear remaining sizes as available
+               for rem > 0 {
+                       op, size := ppc64.AMOVB, int64(1)
+                       switch {
+                       case rem >= 8:
+                               op, size = ppc64.AMOVD, 8
+                       case rem >= 4:
+                               op, size = ppc64.AMOVW, 4
+                       case rem >= 2:
+                               op, size = ppc64.AMOVH, 2
+                       }
+                       p := gc.Prog(op)
+                       p.From.Type = obj.TYPE_REG
+                       p.From.Reg = ppc64.REG_R0
+                       p.To.Type = obj.TYPE_MEM
+                       p.To.Reg = v.Args[0].Reg()
+                       p.To.Offset = offset
+                       rem -= size
+                       offset += size
+               }
 
        case ssa.OpPPC64LoweredMove:
                // Similar to how this is done on ARM,
index 48d7de569b7d0d3575ebbe5ddc0dcb3ec1a80d7b..a44e50629de572021ff6cf5105b2ff0ee6aef97c 100644 (file)
 (Store {t} ptr val mem) && t.(Type).Size() == 2 -> (MOVHstore ptr val mem)
 (Store {t} ptr val mem) && t.(Type).Size() == 1 -> (MOVBstore ptr val mem)
 
+// Using Zero instead of LoweredZero allows the
+// target address to be folded where possible.
 (Zero [0] _ mem) -> mem
 (Zero [1] destptr mem) -> (MOVBstorezero destptr mem)
-(Zero [2] {t} destptr mem) && t.(Type).Alignment()%2 == 0 ->
-       (MOVHstorezero destptr mem)
 (Zero [2] destptr mem) ->
-       (MOVBstorezero [1] destptr
-               (MOVBstorezero [0] destptr mem))
-(Zero [4] {t} destptr mem) && t.(Type).Alignment()%4 == 0 ->
-       (MOVWstorezero destptr mem)
-(Zero [4] {t} destptr mem) && t.(Type).Alignment()%2 == 0 ->
-       (MOVHstorezero [2] destptr
-               (MOVHstorezero [0] destptr mem))
-(Zero [4] destptr mem) ->
-       (MOVBstorezero [3] destptr
-               (MOVBstorezero [2] destptr
-                       (MOVBstorezero [1] destptr
-                               (MOVBstorezero [0] destptr mem))))
-(Zero [8] {t} destptr mem) && t.(Type).Alignment()%8 == 0 ->
-       (MOVDstorezero [0] destptr mem)
-(Zero [8] {t} destptr mem) && t.(Type).Alignment()%4 == 0 ->
-       (MOVWstorezero [4] destptr
-               (MOVWstorezero [0] destptr mem))
-(Zero [8] {t} destptr mem) && t.(Type).Alignment()%2 == 0 ->
-       (MOVHstorezero [6] destptr
-               (MOVHstorezero [4] destptr
-                       (MOVHstorezero [2] destptr
-                               (MOVHstorezero [0] destptr mem))))
-
+       (MOVHstorezero destptr mem)
 (Zero [3] destptr mem) ->
        (MOVBstorezero [2] destptr
-               (MOVBstorezero [1] destptr
-                       (MOVBstorezero [0] destptr mem)))
+               (MOVHstorezero destptr mem))
+(Zero [4] destptr mem) ->
+       (MOVWstorezero destptr mem)
+(Zero [5] destptr mem) ->
+       (MOVBstorezero [4] destptr
+               (MOVWstorezero destptr mem))
+(Zero [6] destptr mem) ->
+       (MOVHstorezero [4] destptr
+               (MOVWstorezero destptr mem))
+(Zero [7] destptr mem) ->
+       (MOVBstorezero [6] destptr
+               (MOVHstorezero [4] destptr
+                       (MOVWstorezero destptr mem)))
+(Zero [8] destptr mem) ->
+       (MOVDstorezero destptr mem)
 
 // Zero small numbers of words directly.
-(Zero [16] {t} destptr mem) && t.(Type).Alignment()%8 == 0 ->
+(Zero [12] destptr mem) ->
+        (MOVWstorezero [8] destptr
+                (MOVDstorezero [0] destptr mem))
+(Zero [16] destptr mem) ->
        (MOVDstorezero [8] destptr
                 (MOVDstorezero [0] destptr mem))
-(Zero [24] {t} destptr mem) && t.(Type).Alignment()%8 == 0 ->
+(Zero [24] destptr mem) ->
        (MOVDstorezero [16] destptr
                (MOVDstorezero [8] destptr
                        (MOVDstorezero [0] destptr mem)))
-(Zero [32] {t} destptr mem) && t.(Type).Alignment()%8 == 0 ->
+(Zero [32] destptr mem) ->
        (MOVDstorezero [24] destptr
                (MOVDstorezero [16] destptr
                        (MOVDstorezero [8] destptr
                                (MOVDstorezero [0] destptr mem))))
 
-// Large zeroing uses a loop
-(Zero [s] {t} ptr mem)
-       && (s > 512 || config.noDuffDevice) || t.(Type).Alignment()%8 != 0 ->
-       (LoweredZero [t.(Type).Alignment()]
-               ptr
-               (ADDconst <ptr.Type> ptr [s-moveSize(t.(Type).Alignment(), config)])
-               mem)
+(Zero [40] destptr mem) ->
+       (MOVDstorezero [32] destptr
+               (MOVDstorezero [24] destptr
+                       (MOVDstorezero [16] destptr
+                               (MOVDstorezero [8] destptr
+                                       (MOVDstorezero [0] destptr mem)))))
+
+(Zero [48] destptr mem) ->
+       (MOVDstorezero [40] destptr
+               (MOVDstorezero [32] destptr
+                       (MOVDstorezero [24] destptr
+                               (MOVDstorezero [16] destptr
+                                       (MOVDstorezero [8] destptr
+                                               (MOVDstorezero [0] destptr mem))))))
+
+(Zero [56] destptr mem) ->
+       (MOVDstorezero [48] destptr
+               (MOVDstorezero [40] destptr
+                       (MOVDstorezero [32] destptr
+                               (MOVDstorezero [24] destptr
+                                       (MOVDstorezero [16] destptr
+                                               (MOVDstorezero [8] destptr
+                                                       (MOVDstorezero [0] destptr mem)))))))
+
+// Handle cases not handled above
+(Zero [s] ptr mem) -> (LoweredZero [s] ptr mem)
 
 // moves
 (Move [0] _ _ mem) -> mem
index 10010459097d8583e8758fbdefcb89521ece1b1a..387584dbdaaa14f00002da5c4531ae2b766836d0 100644 (file)
@@ -312,19 +312,37 @@ func init() {
 
                // large or unaligned zeroing
                // arg0 = address of memory to zero (in R3, changed as side effect)
-               // arg1 = address of the last element to zero
-               // arg2 = mem
                // returns mem
-               //  ADD -8,R3,R3 // intermediate value not valid GC ptr, cannot expose to opt+GC
-               //      MOVDU   R0, 8(R3)
-               //      CMP     R3, Rarg1
-               //      BLE     -2(PC)
+               //
+               // a loop is generated when there is more than one iteration
+               // needed to clear 4 doublewords
+               //
+               //      MOVD    $len/32,R31
+               //      MOVD    R31,CTR
+               //      loop:
+               //      MOVD    R0,(R3)
+               //      MOVD    R0,8(R3)
+               //      MOVD    R0,16(R3)
+               //      MOVD    R0,24(R3)
+               //      ADD     $32,R3
+               //      BC      16, 0, loop
+
+               // remaining doubleword clears generated as needed
+               //      MOVD    R0,(R3)
+               //      MOVD    R0,8(R3)
+               //      MOVD    R0,16(R3)
+               //      MOVD    R0,24(R3)
+
+               // one or more of these to clear remainder < 8 bytes
+               //      MOVW    R0,n1(R3)
+               //      MOVH    R0,n2(R3)
+               //      MOVB    R0,n3(R3)
                {
                        name:      "LoweredZero",
                        aux:       "Int64",
-                       argLength: 3,
+                       argLength: 2,
                        reg: regInfo{
-                               inputs:   []regMask{buildReg("R3"), gp},
+                               inputs:   []regMask{buildReg("R3")},
                                clobbers: buildReg("R3"),
                        },
                        clobberFlags:   true,
index 4361b2fa45853ee2d4eb61b53ca6443727c89e1d..ce6988e0142afb12e63190371ff60988d2381550 100644 (file)
@@ -17368,13 +17368,12 @@ var opcodeTable = [...]opInfo{
        {
                name:           "LoweredZero",
                auxType:        auxInt64,
-               argLen:         3,
+               argLen:         2,
                clobberFlags:   true,
                faultOnNilArg0: true,
                reg: regInfo{
                        inputs: []inputInfo{
-                               {0, 8},          // R3
-                               {1, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+                               {0, 8}, // R3
                        },
                        clobbers: 8, // R3
                },
index 0943dfa18bcc4528f5f3deda86f508588368de8b..785fbd211ff1666dddfaa3fadbef80aa3efee0f2 100644 (file)
@@ -9656,8 +9656,6 @@ func rewriteValuePPC64_OpXor8(v *Value) bool {
 func rewriteValuePPC64_OpZero(v *Value) bool {
        b := v.Block
        _ = b
-       config := b.Func.Config
-       _ = config
        // match: (Zero [0] _ mem)
        // cond:
        // result: mem
@@ -9685,200 +9683,178 @@ func rewriteValuePPC64_OpZero(v *Value) bool {
                v.AddArg(mem)
                return true
        }
-       // match: (Zero [2] {t} destptr mem)
-       // cond: t.(Type).Alignment()%2 == 0
+       // match: (Zero [2] destptr mem)
+       // cond:
        // result: (MOVHstorezero destptr mem)
        for {
                if v.AuxInt != 2 {
                        break
                }
-               t := v.Aux
                destptr := v.Args[0]
                mem := v.Args[1]
-               if !(t.(Type).Alignment()%2 == 0) {
-                       break
-               }
                v.reset(OpPPC64MOVHstorezero)
                v.AddArg(destptr)
                v.AddArg(mem)
                return true
        }
-       // match: (Zero [2] destptr mem)
+       // match: (Zero [3] destptr mem)
        // cond:
-       // result: (MOVBstorezero [1] destptr           (MOVBstorezero [0] destptr mem))
+       // result: (MOVBstorezero [2] destptr           (MOVHstorezero destptr mem))
        for {
-               if v.AuxInt != 2 {
+               if v.AuxInt != 3 {
                        break
                }
                destptr := v.Args[0]
                mem := v.Args[1]
                v.reset(OpPPC64MOVBstorezero)
-               v.AuxInt = 1
+               v.AuxInt = 2
                v.AddArg(destptr)
-               v0 := b.NewValue0(v.Pos, OpPPC64MOVBstorezero, TypeMem)
-               v0.AuxInt = 0
+               v0 := b.NewValue0(v.Pos, OpPPC64MOVHstorezero, TypeMem)
                v0.AddArg(destptr)
                v0.AddArg(mem)
                v.AddArg(v0)
                return true
        }
-       // match: (Zero [4] {t} destptr mem)
-       // cond: t.(Type).Alignment()%4 == 0
+       // match: (Zero [4] destptr mem)
+       // cond:
        // result: (MOVWstorezero destptr mem)
        for {
                if v.AuxInt != 4 {
                        break
                }
-               t := v.Aux
                destptr := v.Args[0]
                mem := v.Args[1]
-               if !(t.(Type).Alignment()%4 == 0) {
-                       break
-               }
                v.reset(OpPPC64MOVWstorezero)
                v.AddArg(destptr)
                v.AddArg(mem)
                return true
        }
-       // match: (Zero [4] {t} destptr mem)
-       // cond: t.(Type).Alignment()%2 == 0
-       // result: (MOVHstorezero [2] destptr           (MOVHstorezero [0] destptr mem))
+       // match: (Zero [5] destptr mem)
+       // cond:
+       // result: (MOVBstorezero [4] destptr           (MOVWstorezero destptr mem))
        for {
-               if v.AuxInt != 4 {
+               if v.AuxInt != 5 {
                        break
                }
-               t := v.Aux
                destptr := v.Args[0]
                mem := v.Args[1]
-               if !(t.(Type).Alignment()%2 == 0) {
+               v.reset(OpPPC64MOVBstorezero)
+               v.AuxInt = 4
+               v.AddArg(destptr)
+               v0 := b.NewValue0(v.Pos, OpPPC64MOVWstorezero, TypeMem)
+               v0.AddArg(destptr)
+               v0.AddArg(mem)
+               v.AddArg(v0)
+               return true
+       }
+       // match: (Zero [6] destptr mem)
+       // cond:
+       // result: (MOVHstorezero [4] destptr           (MOVWstorezero destptr mem))
+       for {
+               if v.AuxInt != 6 {
                        break
                }
+               destptr := v.Args[0]
+               mem := v.Args[1]
                v.reset(OpPPC64MOVHstorezero)
-               v.AuxInt = 2
+               v.AuxInt = 4
                v.AddArg(destptr)
-               v0 := b.NewValue0(v.Pos, OpPPC64MOVHstorezero, TypeMem)
-               v0.AuxInt = 0
+               v0 := b.NewValue0(v.Pos, OpPPC64MOVWstorezero, TypeMem)
                v0.AddArg(destptr)
                v0.AddArg(mem)
                v.AddArg(v0)
                return true
        }
-       // match: (Zero [4] destptr mem)
+       // match: (Zero [7] destptr mem)
        // cond:
-       // result: (MOVBstorezero [3] destptr           (MOVBstorezero [2] destptr                      (MOVBstorezero [1] destptr                              (MOVBstorezero [0] destptr mem))))
+       // result: (MOVBstorezero [6] destptr           (MOVHstorezero [4] destptr                      (MOVWstorezero destptr mem)))
        for {
-               if v.AuxInt != 4 {
+               if v.AuxInt != 7 {
                        break
                }
                destptr := v.Args[0]
                mem := v.Args[1]
                v.reset(OpPPC64MOVBstorezero)
-               v.AuxInt = 3
+               v.AuxInt = 6
                v.AddArg(destptr)
-               v0 := b.NewValue0(v.Pos, OpPPC64MOVBstorezero, TypeMem)
-               v0.AuxInt = 2
+               v0 := b.NewValue0(v.Pos, OpPPC64MOVHstorezero, TypeMem)
+               v0.AuxInt = 4
                v0.AddArg(destptr)
-               v1 := b.NewValue0(v.Pos, OpPPC64MOVBstorezero, TypeMem)
-               v1.AuxInt = 1
+               v1 := b.NewValue0(v.Pos, OpPPC64MOVWstorezero, TypeMem)
                v1.AddArg(destptr)
-               v2 := b.NewValue0(v.Pos, OpPPC64MOVBstorezero, TypeMem)
-               v2.AuxInt = 0
-               v2.AddArg(destptr)
-               v2.AddArg(mem)
-               v1.AddArg(v2)
+               v1.AddArg(mem)
                v0.AddArg(v1)
                v.AddArg(v0)
                return true
        }
-       // match: (Zero [8] {t} destptr mem)
-       // cond: t.(Type).Alignment()%8 == 0
-       // result: (MOVDstorezero [0] destptr mem)
+       // match: (Zero [8] destptr mem)
+       // cond:
+       // result: (MOVDstorezero destptr mem)
        for {
                if v.AuxInt != 8 {
                        break
                }
-               t := v.Aux
                destptr := v.Args[0]
                mem := v.Args[1]
-               if !(t.(Type).Alignment()%8 == 0) {
-                       break
-               }
                v.reset(OpPPC64MOVDstorezero)
-               v.AuxInt = 0
                v.AddArg(destptr)
                v.AddArg(mem)
                return true
        }
-       // match: (Zero [8] {t} destptr mem)
-       // cond: t.(Type).Alignment()%4 == 0
-       // result: (MOVWstorezero [4] destptr           (MOVWstorezero [0] destptr mem))
+       // match: (Zero [12] destptr mem)
+       // cond:
+       // result: (MOVWstorezero [8] destptr                 (MOVDstorezero [0] destptr mem))
        for {
-               if v.AuxInt != 8 {
+               if v.AuxInt != 12 {
                        break
                }
-               t := v.Aux
                destptr := v.Args[0]
                mem := v.Args[1]
-               if !(t.(Type).Alignment()%4 == 0) {
-                       break
-               }
                v.reset(OpPPC64MOVWstorezero)
-               v.AuxInt = 4
+               v.AuxInt = 8
                v.AddArg(destptr)
-               v0 := b.NewValue0(v.Pos, OpPPC64MOVWstorezero, TypeMem)
+               v0 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
                v0.AuxInt = 0
                v0.AddArg(destptr)
                v0.AddArg(mem)
                v.AddArg(v0)
                return true
        }
-       // match: (Zero [8] {t} destptr mem)
-       // cond: t.(Type).Alignment()%2 == 0
-       // result: (MOVHstorezero [6] destptr           (MOVHstorezero [4] destptr                      (MOVHstorezero [2] destptr                              (MOVHstorezero [0] destptr mem))))
+       // match: (Zero [16] destptr mem)
+       // cond:
+       // result: (MOVDstorezero [8] destptr                 (MOVDstorezero [0] destptr mem))
        for {
-               if v.AuxInt != 8 {
+               if v.AuxInt != 16 {
                        break
                }
-               t := v.Aux
                destptr := v.Args[0]
                mem := v.Args[1]
-               if !(t.(Type).Alignment()%2 == 0) {
-                       break
-               }
-               v.reset(OpPPC64MOVHstorezero)
-               v.AuxInt = 6
+               v.reset(OpPPC64MOVDstorezero)
+               v.AuxInt = 8
                v.AddArg(destptr)
-               v0 := b.NewValue0(v.Pos, OpPPC64MOVHstorezero, TypeMem)
-               v0.AuxInt = 4
+               v0 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
+               v0.AuxInt = 0
                v0.AddArg(destptr)
-               v1 := b.NewValue0(v.Pos, OpPPC64MOVHstorezero, TypeMem)
-               v1.AuxInt = 2
-               v1.AddArg(destptr)
-               v2 := b.NewValue0(v.Pos, OpPPC64MOVHstorezero, TypeMem)
-               v2.AuxInt = 0
-               v2.AddArg(destptr)
-               v2.AddArg(mem)
-               v1.AddArg(v2)
-               v0.AddArg(v1)
+               v0.AddArg(mem)
                v.AddArg(v0)
                return true
        }
-       // match: (Zero [3] destptr mem)
+       // match: (Zero [24] destptr mem)
        // cond:
-       // result: (MOVBstorezero [2] destptr           (MOVBstorezero [1] destptr                      (MOVBstorezero [0] destptr mem)))
+       // result: (MOVDstorezero [16] destptr          (MOVDstorezero [8] destptr                      (MOVDstorezero [0] destptr mem)))
        for {
-               if v.AuxInt != 3 {
+               if v.AuxInt != 24 {
                        break
                }
                destptr := v.Args[0]
                mem := v.Args[1]
-               v.reset(OpPPC64MOVBstorezero)
-               v.AuxInt = 2
+               v.reset(OpPPC64MOVDstorezero)
+               v.AuxInt = 16
                v.AddArg(destptr)
-               v0 := b.NewValue0(v.Pos, OpPPC64MOVBstorezero, TypeMem)
-               v0.AuxInt = 1
+               v0 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
+               v0.AuxInt = 8
                v0.AddArg(destptr)
-               v1 := b.NewValue0(v.Pos, OpPPC64MOVBstorezero, TypeMem)
+               v1 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
                v1.AuxInt = 0
                v1.AddArg(destptr)
                v1.AddArg(mem)
@@ -9886,109 +9862,151 @@ func rewriteValuePPC64_OpZero(v *Value) bool {
                v.AddArg(v0)
                return true
        }
-       // match: (Zero [16] {t} destptr mem)
-       // cond: t.(Type).Alignment()%8 == 0
-       // result: (MOVDstorezero [8] destptr                 (MOVDstorezero [0] destptr mem))
+       // match: (Zero [32] destptr mem)
+       // cond:
+       // result: (MOVDstorezero [24] destptr          (MOVDstorezero [16] destptr                     (MOVDstorezero [8] destptr                              (MOVDstorezero [0] destptr mem))))
        for {
-               if v.AuxInt != 16 {
+               if v.AuxInt != 32 {
                        break
                }
-               t := v.Aux
                destptr := v.Args[0]
                mem := v.Args[1]
-               if !(t.(Type).Alignment()%8 == 0) {
-                       break
-               }
                v.reset(OpPPC64MOVDstorezero)
-               v.AuxInt = 8
+               v.AuxInt = 24
                v.AddArg(destptr)
                v0 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
-               v0.AuxInt = 0
+               v0.AuxInt = 16
                v0.AddArg(destptr)
-               v0.AddArg(mem)
+               v1 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
+               v1.AuxInt = 8
+               v1.AddArg(destptr)
+               v2 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
+               v2.AuxInt = 0
+               v2.AddArg(destptr)
+               v2.AddArg(mem)
+               v1.AddArg(v2)
+               v0.AddArg(v1)
                v.AddArg(v0)
                return true
        }
-       // match: (Zero [24] {t} destptr mem)
-       // cond: t.(Type).Alignment()%8 == 0
-       // result: (MOVDstorezero [16] destptr          (MOVDstorezero [8] destptr                      (MOVDstorezero [0] destptr mem)))
+       // match: (Zero [40] destptr mem)
+       // cond:
+       // result: (MOVDstorezero [32] destptr          (MOVDstorezero [24] destptr                     (MOVDstorezero [16] destptr                             (MOVDstorezero [8] destptr                                      (MOVDstorezero [0] destptr mem)))))
        for {
-               if v.AuxInt != 24 {
+               if v.AuxInt != 40 {
                        break
                }
-               t := v.Aux
                destptr := v.Args[0]
                mem := v.Args[1]
-               if !(t.(Type).Alignment()%8 == 0) {
-                       break
-               }
                v.reset(OpPPC64MOVDstorezero)
-               v.AuxInt = 16
+               v.AuxInt = 32
                v.AddArg(destptr)
                v0 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
-               v0.AuxInt = 8
+               v0.AuxInt = 24
                v0.AddArg(destptr)
                v1 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
-               v1.AuxInt = 0
+               v1.AuxInt = 16
                v1.AddArg(destptr)
-               v1.AddArg(mem)
+               v2 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
+               v2.AuxInt = 8
+               v2.AddArg(destptr)
+               v3 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
+               v3.AuxInt = 0
+               v3.AddArg(destptr)
+               v3.AddArg(mem)
+               v2.AddArg(v3)
+               v1.AddArg(v2)
                v0.AddArg(v1)
                v.AddArg(v0)
                return true
        }
-       // match: (Zero [32] {t} destptr mem)
-       // cond: t.(Type).Alignment()%8 == 0
-       // result: (MOVDstorezero [24] destptr          (MOVDstorezero [16] destptr                     (MOVDstorezero [8] destptr                              (MOVDstorezero [0] destptr mem))))
+       // match: (Zero [48] destptr mem)
+       // cond:
+       // result: (MOVDstorezero [40] destptr          (MOVDstorezero [32] destptr                     (MOVDstorezero [24] destptr                             (MOVDstorezero [16] destptr                                     (MOVDstorezero [8] destptr                                              (MOVDstorezero [0] destptr mem))))))
        for {
-               if v.AuxInt != 32 {
+               if v.AuxInt != 48 {
                        break
                }
-               t := v.Aux
                destptr := v.Args[0]
                mem := v.Args[1]
-               if !(t.(Type).Alignment()%8 == 0) {
+               v.reset(OpPPC64MOVDstorezero)
+               v.AuxInt = 40
+               v.AddArg(destptr)
+               v0 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
+               v0.AuxInt = 32
+               v0.AddArg(destptr)
+               v1 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
+               v1.AuxInt = 24
+               v1.AddArg(destptr)
+               v2 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
+               v2.AuxInt = 16
+               v2.AddArg(destptr)
+               v3 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
+               v3.AuxInt = 8
+               v3.AddArg(destptr)
+               v4 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
+               v4.AuxInt = 0
+               v4.AddArg(destptr)
+               v4.AddArg(mem)
+               v3.AddArg(v4)
+               v2.AddArg(v3)
+               v1.AddArg(v2)
+               v0.AddArg(v1)
+               v.AddArg(v0)
+               return true
+       }
+       // match: (Zero [56] destptr mem)
+       // cond:
+       // result: (MOVDstorezero [48] destptr          (MOVDstorezero [40] destptr                     (MOVDstorezero [32] destptr                             (MOVDstorezero [24] destptr                                     (MOVDstorezero [16] destptr                                             (MOVDstorezero [8] destptr                                                      (MOVDstorezero [0] destptr mem)))))))
+       for {
+               if v.AuxInt != 56 {
                        break
                }
+               destptr := v.Args[0]
+               mem := v.Args[1]
                v.reset(OpPPC64MOVDstorezero)
-               v.AuxInt = 24
+               v.AuxInt = 48
                v.AddArg(destptr)
                v0 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
-               v0.AuxInt = 16
+               v0.AuxInt = 40
                v0.AddArg(destptr)
                v1 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
-               v1.AuxInt = 8
+               v1.AuxInt = 32
                v1.AddArg(destptr)
                v2 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
-               v2.AuxInt = 0
+               v2.AuxInt = 24
                v2.AddArg(destptr)
-               v2.AddArg(mem)
+               v3 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
+               v3.AuxInt = 16
+               v3.AddArg(destptr)
+               v4 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
+               v4.AuxInt = 8
+               v4.AddArg(destptr)
+               v5 := b.NewValue0(v.Pos, OpPPC64MOVDstorezero, TypeMem)
+               v5.AuxInt = 0
+               v5.AddArg(destptr)
+               v5.AddArg(mem)
+               v4.AddArg(v5)
+               v3.AddArg(v4)
+               v2.AddArg(v3)
                v1.AddArg(v2)
                v0.AddArg(v1)
                v.AddArg(v0)
                return true
        }
-       // match: (Zero [s] {t} ptr mem)
-       // cond: (s > 512 || config.noDuffDevice) || t.(Type).Alignment()%8 != 0
-       // result: (LoweredZero [t.(Type).Alignment()]          ptr             (ADDconst <ptr.Type> ptr [s-moveSize(t.(Type).Alignment(), config)])            mem)
+       // match: (Zero [s] ptr mem)
+       // cond:
+       // result: (LoweredZero [s] ptr mem)
        for {
                s := v.AuxInt
-               t := v.Aux
                ptr := v.Args[0]
                mem := v.Args[1]
-               if !((s > 512 || config.noDuffDevice) || t.(Type).Alignment()%8 != 0) {
-                       break
-               }
                v.reset(OpPPC64LoweredZero)
-               v.AuxInt = t.(Type).Alignment()
+               v.AuxInt = s
                v.AddArg(ptr)
-               v0 := b.NewValue0(v.Pos, OpPPC64ADDconst, ptr.Type)
-               v0.AuxInt = s - moveSize(t.(Type).Alignment(), config)
-               v0.AddArg(ptr)
-               v.AddArg(v0)
                v.AddArg(mem)
                return true
        }
-       return false
 }
 func rewriteValuePPC64_OpZeroExt16to32(v *Value) bool {
        // match: (ZeroExt16to32 x)