Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: improve LoweredMove performance on ppc64x
authorLynn Boger <laboger@linux.vnet.ibm.com>
Thu, 30 Mar 2017 15:07:36 +0000 (11:07 -0400)
committerLynn Boger <laboger@linux.vnet.ibm.com>
Fri, 31 Mar 2017 21:24:09 +0000 (21:24 +0000)
This change improves the performance for LoweredMove on ppc64le
and ppc64.

benchmark                   old ns/op     new ns/op     delta
BenchmarkCopyFat8-16        0.93          0.69          -25.81%
BenchmarkCopyFat12-16       2.61          1.85          -29.12%
BenchmarkCopyFat16-16       9.68          1.89          -80.48%
BenchmarkCopyFat24-16       4.48          1.85          -58.71%
BenchmarkCopyFat32-16       6.12          1.82          -70.26%
BenchmarkCopyFat64-16       21.2          2.70          -87.26%
BenchmarkCopyFat128-16      29.6          3.97          -86.59%
BenchmarkCopyFat256-16      52.6          13.4          -74.52%
BenchmarkCopyFat512-16      97.1          18.7          -80.74%
BenchmarkCopyFat1024-16     186           35.3          -81.02%

BenchmarkAssertE2TLarge-16      14.2          5.06          -64.37%

Fixes #19785

Change-Id: I7d5e0052712b75811c02c7d86c5112e5649ad782
Reviewed-on: https://go-review.googlesource.com/38950
Reviewed-by: Keith Randall <khr@golang.org>
src/cmd/compile/internal/ppc64/ssa.go
src/cmd/compile/internal/ssa/gen/PPC64.rules
src/cmd/compile/internal/ssa/gen/PPC64Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewritePPC64.go

index 5f7b168ef76cf8fd0ba42b05e2784b936f4b3207..041ff3abede643a59fab83a01bb2c0ea2b83c320 100644 (file)
@@ -917,75 +917,171 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                }
 
        case ssa.OpPPC64LoweredMove:
-               // Similar to how this is done on ARM,
-               // except that PPC MOVDU x,off(y) is *(y+off) = x; y=y+off,
-               // not store-and-increment.
-               // Inputs must be valid pointers to memory,
-               // so adjust arg0 and arg1 as part of the expansion.
-               // arg2 should be src+size-align,
-               //
-               // ADD    -8,R3,R3
-               // ADD    -8,R4,R4
-               // MOVDU        8(R4), Rtmp
-               // MOVDU        Rtmp, 8(R3)
-               // CMP  R4, Rarg2
-               // BL   -3(PC)
-               // arg2 is the address of the last element of src
-               // auxint is alignment
-               var sz int64
-               var movu obj.As
-               switch {
-               case v.AuxInt%8 == 0:
-                       sz = 8
-                       movu = ppc64.AMOVDU
-               case v.AuxInt%4 == 0:
-                       sz = 4
-                       movu = ppc64.AMOVWZU // MOVWU instruction not implemented
-               case v.AuxInt%2 == 0:
-                       sz = 2
-                       movu = ppc64.AMOVHU
-               default:
-                       sz = 1
-                       movu = ppc64.AMOVBU
-               }
 
-               p := s.Prog(ppc64.AADD)
-               p.Reg = v.Args[0].Reg()
-               p.From.Type = obj.TYPE_CONST
-               p.From.Offset = -sz
-               p.To.Type = obj.TYPE_REG
-               p.To.Reg = v.Args[0].Reg()
+               // This will be used when moving more
+               // than 8 bytes.  Moves start with as
+               // many 8 byte moves as possible, then
+               // 4, 2, or 1 byte(s) as remaining.  This will
+               // work and be efficient for power8 or later.
+               // If there are 64 or more bytes, then a
+               // loop is generated to move 32 bytes and
+               // update the src and dst addresses on each
+               // iteration. When < 64 bytes, the appropriate
+               // number of moves are generated based on the
+               // size.
+               // When moving >= 64 bytes a loop is used
+               //      MOVD len/32,REG_TMP
+               //      MOVD REG_TMP,CTR
+               // top:
+               //      MOVD (R4),R7
+               //      MOVD 8(R4),R8
+               //      MOVD 16(R4),R9
+               //      MOVD 24(R4),R10
+               //      ADD  R4,$32
+               //      MOVD R7,(R3)
+               //      MOVD R8,8(R3)
+               //      MOVD R9,16(R3)
+               //      MOVD R10,24(R3)
+               //      ADD  R3,$32
+               //      BC 16,0,top
+               // Bytes not moved by this loop are moved
+               // with a combination of the following instructions,
+               // starting with the largest sizes and generating as
+               // many as needed, using the appropriate offset value.
+               //      MOVD  n(R4),R7
+               //      MOVD  R7,n(R3)
+               //      MOVW  n1(R4),R7
+               //      MOVW  R7,n1(R3)
+               //      MOVH  n2(R4),R7
+               //      MOVH  R7,n2(R3)
+               //      MOVB  n3(R4),R7
+               //      MOVB  R7,n3(R3)
+
+               // Each loop iteration moves 32 bytes
+               ctr := v.AuxInt / 32
 
-               p = s.Prog(ppc64.AADD)
-               p.Reg = v.Args[1].Reg()
-               p.From.Type = obj.TYPE_CONST
-               p.From.Offset = -sz
-               p.To.Type = obj.TYPE_REG
-               p.To.Reg = v.Args[1].Reg()
+               // Remainder after the loop
+               rem := v.AuxInt % 32
 
-               p = s.Prog(movu)
-               p.From.Type = obj.TYPE_MEM
-               p.From.Reg = v.Args[1].Reg()
-               p.From.Offset = sz
-               p.To.Type = obj.TYPE_REG
-               p.To.Reg = ppc64.REGTMP
+               dst_reg := v.Args[0].Reg()
+               src_reg := v.Args[1].Reg()
 
-               p2 := s.Prog(movu)
-               p2.From.Type = obj.TYPE_REG
-               p2.From.Reg = ppc64.REGTMP
-               p2.To.Type = obj.TYPE_MEM
-               p2.To.Reg = v.Args[0].Reg()
-               p2.To.Offset = sz
+               // The set of registers used here must match the clobbered reg list
+               // in PPC64Ops.go.
+               useregs := []int16{ppc64.REG_R7, ppc64.REG_R8, ppc64.REG_R9, ppc64.REG_R10}
+               offset := int64(0)
 
-               p3 := s.Prog(ppc64.ACMPU)
-               p3.From.Reg = v.Args[1].Reg()
-               p3.From.Type = obj.TYPE_REG
-               p3.To.Reg = v.Args[2].Reg()
-               p3.To.Type = obj.TYPE_REG
+               // top of the loop
+               var top *obj.Prog
+               // Only generate looping code when loop counter is > 1 for >= 64 bytes
+               if ctr > 1 {
+                       // Set up the CTR
+                       p := s.Prog(ppc64.AMOVD)
+                       p.From.Type = obj.TYPE_CONST
+                       p.From.Offset = ctr
+                       p.To.Type = obj.TYPE_REG
+                       p.To.Reg = ppc64.REGTMP
 
-               p4 := s.Prog(ppc64.ABLT)
-               p4.To.Type = obj.TYPE_BRANCH
-               gc.Patch(p4, p)
+                       p = s.Prog(ppc64.AMOVD)
+                       p.From.Type = obj.TYPE_REG
+                       p.From.Reg = ppc64.REGTMP
+                       p.To.Type = obj.TYPE_REG
+                       p.To.Reg = ppc64.REG_CTR
+
+                       // Generate all the MOVDs for loads
+                       // based off the same register, increasing
+                       // the offset by 8 for each instruction
+                       for _, rg := range useregs {
+                               p := s.Prog(ppc64.AMOVD)
+                               p.From.Type = obj.TYPE_MEM
+                               p.From.Reg = src_reg
+                               p.From.Offset = offset
+                               p.To.Type = obj.TYPE_REG
+                               p.To.Reg = rg
+                               if top == nil {
+                                       top = p
+                               }
+                               offset += 8
+                       }
+                       // increment the src_reg for next iteration
+                       p = s.Prog(ppc64.AADD)
+                       p.Reg = src_reg
+                       p.From.Type = obj.TYPE_CONST
+                       p.From.Offset = 32
+                       p.To.Type = obj.TYPE_REG
+                       p.To.Reg = src_reg
+
+                       // generate the MOVDs for stores, based
+                       // off the same register, using the same
+                       // offsets as in the loads.
+                       offset = int64(0)
+                       for _, rg := range useregs {
+                               p := s.Prog(ppc64.AMOVD)
+                               p.From.Type = obj.TYPE_REG
+                               p.From.Reg = rg
+                               p.To.Type = obj.TYPE_MEM
+                               p.To.Reg = dst_reg
+                               p.To.Offset = offset
+                               offset += 8
+                       }
+                       // increment the dst_reg for next iteration
+                       p = s.Prog(ppc64.AADD)
+                       p.Reg = dst_reg
+                       p.From.Type = obj.TYPE_CONST
+                       p.From.Offset = 32
+                       p.To.Type = obj.TYPE_REG
+                       p.To.Reg = dst_reg
+
+                       // BC with BO_BCTR generates bdnz to branch on nonzero CTR
+                       // to loop top.
+                       p = s.Prog(ppc64.ABC)
+                       p.From.Type = obj.TYPE_CONST
+                       p.From.Offset = ppc64.BO_BCTR
+                       p.Reg = ppc64.REG_R0
+                       p.To.Type = obj.TYPE_BRANCH
+                       gc.Patch(p, top)
+
+                       // src_reg and dst_reg were incremented in the loop, so
+                       // later instructions start with offset 0.
+                       offset = int64(0)
+               }
+
+               // No loop was generated for one iteration, so
+               // add 32 bytes to the remainder to move those bytes.
+               if ctr == 1 {
+                       rem += 32
+               }
+
+               // Generate all the remaining load and store pairs, starting with
+               // as many 8 byte moves as possible, then 4, 2, 1.
+               for rem > 0 {
+                       op, size := ppc64.AMOVB, int64(1)
+                       switch {
+                       case rem >= 8:
+                               op, size = ppc64.AMOVD, 8
+                       case rem >= 4:
+                               op, size = ppc64.AMOVW, 4
+                       case rem >= 2:
+                               op, size = ppc64.AMOVH, 2
+                       }
+                       // Load
+                       p := s.Prog(op)
+                       p.To.Type = obj.TYPE_REG
+                       p.To.Reg = ppc64.REG_R7
+                       p.From.Type = obj.TYPE_MEM
+                       p.From.Reg = src_reg
+                       p.From.Offset = offset
+
+                       // Store
+                       p = s.Prog(op)
+                       p.From.Type = obj.TYPE_REG
+                       p.From.Reg = ppc64.REG_R7
+                       p.To.Type = obj.TYPE_MEM
+                       p.To.Reg = dst_reg
+                       p.To.Offset = offset
+                       rem -= size
+                       offset += size
+               }
 
        case ssa.OpPPC64CALLstatic:
                s.Call(v)
index a44e50629de572021ff6cf5105b2ff0ee6aef97c..4b96d9fc521796df8b8fdbcb4cb2a90b9153d152 100644 (file)
 // moves
 (Move [0] _ _ mem) -> mem
 (Move [1] dst src mem) -> (MOVBstore dst (MOVBZload src mem) mem)
-(Move [2] {t} dst src mem) && t.(Type).Alignment()%2 == 0 ->
-       (MOVHstore dst (MOVHZload src mem) mem)
 (Move [2] dst src mem) ->
-       (MOVBstore [1] dst (MOVBZload [1] src mem)
-               (MOVBstore dst (MOVBZload src mem) mem))
-(Move [4] {t} dst src mem) && t.(Type).Alignment()%4 == 0 ->
-       (MOVWstore dst (MOVWload src mem) mem)
-(Move [4] {t} dst src mem) && t.(Type).Alignment()%2 == 0 ->
-       (MOVHstore [2] dst (MOVHZload [2] src mem)
-               (MOVHstore dst (MOVHZload src mem) mem))
-(Move [4] dst src mem) ->
-       (MOVBstore [3] dst (MOVBZload [3] src mem)
-               (MOVBstore [2] dst (MOVBZload [2] src mem)
-                       (MOVBstore [1] dst (MOVBZload [1] src mem)
-                               (MOVBstore dst (MOVBZload src mem) mem))))
-
-(Move [8] {t} dst src mem) && t.(Type).Alignment()%8 == 0 ->
-       (MOVDstore dst (MOVDload src mem) mem)
-(Move [8] {t} dst src mem) && t.(Type).Alignment()%4 == 0 ->
-       (MOVWstore [4] dst (MOVWZload [4] src mem)
-               (MOVWstore dst (MOVWZload src mem) mem))
-(Move [8] {t} dst src mem) && t.(Type).Alignment()%2 == 0->
-       (MOVHstore [6] dst (MOVHZload [6] src mem)
-               (MOVHstore [4] dst (MOVHZload [4] src mem)
-                       (MOVHstore [2] dst (MOVHZload [2] src mem)
-                               (MOVHstore dst (MOVHZload src mem) mem))))
-
+        (MOVHstore dst (MOVHZload src mem) mem)
 (Move [3] dst src mem) ->
-       (MOVBstore [2] dst (MOVBZload [2] src mem)
-               (MOVBstore [1] dst (MOVBZload [1] src mem)
-                       (MOVBstore dst (MOVBZload src mem) mem)))
+        (MOVBstore [2] dst (MOVBZload [2] src mem)
+                (MOVHstore dst (MOVHload src mem) mem))
+(Move [4] dst src mem) ->
+        (MOVWstore dst (MOVWload src mem) mem)
+(Move [5] dst src mem) ->
+        (MOVBstore [4] dst (MOVBZload [4] src mem)
+                (MOVWstore dst (MOVWload src mem) mem))
+(Move [6] dst src mem) ->
+        (MOVHstore [4] dst (MOVHZload [4] src mem)
+                (MOVWstore dst (MOVWload src mem) mem))
+(Move [7] dst src mem) ->
+        (MOVBstore [6] dst (MOVBZload [6] src mem)
+                (MOVHstore [4] dst (MOVHZload [4] src mem)
+                        (MOVWstore dst (MOVWload src mem) mem)))
+(Move [8] dst src mem) ->
+        (MOVDstore dst (MOVDload src mem) mem)
 
 // Large move uses a loop
-(Move [s] {t} dst src mem)
-       && (s > 512 || config.noDuffDevice) || t.(Type).Alignment()%8 != 0 ->
-       (LoweredMove [t.(Type).Alignment()]
-               dst
-               src
-               (ADDconst <src.Type> src [s-moveSize(t.(Type).Alignment(), config)])
-               mem)
+(Move [s] dst src mem) && s > 8 ->
+        (LoweredMove [s] dst src mem)
 
 // Calls
 // Lowering calls
index 387584dbdaaa14f00002da5c4531ae2b766836d0..04810e2c7d9157d454b3b609fd256d1728d64ca2 100644 (file)
@@ -349,26 +349,41 @@ func init() {
                        typ:            "Mem",
                        faultOnNilArg0: true,
                },
+               // Loop code:
+               //      MOVD len/32,REG_TMP  only for loop
+               //      MOVD REG_TMP,CTR     only for loop
+               // loop:
+               //      MOVD (R4),R7
+               //      MOVD 8(R4),R8
+               //      MOVD 16(R4),R9
+               //      MOVD 24(R4),R10
+               //      ADD  R4,$32          only with loop
+               //      MOVD R7,(R3)
+               //      MOVD R8,8(R3)
+               //      MOVD R9,16(R3)
+               //      MOVD R10,24(R3)
+               //      ADD  R3,$32          only with loop
+               //      BC 16,0,loop         only with loop
+               // Bytes not moved by this loop are moved
+               // with a combination of the following instructions,
+               // starting with the largest sizes and generating as
+               // many as needed, using the appropriate offset value.
+               //      MOVD  n(R4),R7
+               //      MOVD  R7,n(R3)
+               //      MOVW  n1(R4),R7
+               //      MOVW  R7,n1(R3)
+               //      MOVH  n2(R4),R7
+               //      MOVH  R7,n2(R3)
+               //      MOVB  n3(R4),R7
+               //      MOVB  R7,n3(R3)
 
-               // large or unaligned move
-               // arg0 = address of dst memory (in R3, changed as side effect)
-               // arg1 = address of src memory (in R4, changed as side effect)
-               // arg2 = address of the last element of src
-               // arg3 = mem
-               // returns mem
-               //  ADD -8,R3,R3 // intermediate value not valid GC ptr, cannot expose to opt+GC
-               //  ADD -8,R4,R4 // intermediate value not valid GC ptr, cannot expose to opt+GC
-               //      MOVDU   8(R4), Rtmp
-               //      MOVDU   Rtmp, 8(R3)
-               //      CMP     R4, Rarg2
-               //      BLT     -3(PC)
                {
                        name:      "LoweredMove",
                        aux:       "Int64",
-                       argLength: 4,
+                       argLength: 3,
                        reg: regInfo{
-                               inputs:   []regMask{buildReg("R3"), buildReg("R4"), gp},
-                               clobbers: buildReg("R3 R4"),
+                               inputs:   []regMask{buildReg("R3"), buildReg("R4")},
+                               clobbers: buildReg("R3 R4 R7 R8 R9 R10"),
                        },
                        clobberFlags:   true,
                        typ:            "Mem",
index 74ad2d4eb16f38578ae879116d7f8d4f808941d5..4b7a6b87b193ba96579b7d07d9cdb57ae7b8800a 100644 (file)
@@ -17409,17 +17409,16 @@ var opcodeTable = [...]opInfo{
        {
                name:           "LoweredMove",
                auxType:        auxInt64,
-               argLen:         4,
+               argLen:         3,
                clobberFlags:   true,
                faultOnNilArg0: true,
                faultOnNilArg1: true,
                reg: regInfo{
                        inputs: []inputInfo{
-                               {0, 8},          // R3
-                               {1, 16},         // R4
-                               {2, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+                               {0, 8},  // R3
+                               {1, 16}, // R4
                        },
-                       clobbers: 24, // R3 R4
+                       clobbers: 1944, // R3 R4 R7 R8 R9 R10
                },
        },
        {
index 1c9a8f26679c05b5ee5ccd34f291c03ac7fa7e78..0c10487906b69660dd343097df0694db1cfb7c47 100644 (file)
@@ -3686,8 +3686,6 @@ func rewriteValuePPC64_OpMod8u(v *Value) bool {
 func rewriteValuePPC64_OpMove(v *Value) bool {
        b := v.Block
        _ = b
-       config := b.Func.Config
-       _ = config
        types := &b.Func.Config.Types
        _ = types
        // match: (Move [0] _ _ mem)
@@ -3722,20 +3720,16 @@ func rewriteValuePPC64_OpMove(v *Value) bool {
                v.AddArg(mem)
                return true
        }
-       // match: (Move [2] {t} dst src mem)
-       // cond: t.(Type).Alignment()%2 == 0
+       // match: (Move [2] dst src mem)
+       // cond:
        // result: (MOVHstore dst (MOVHZload src mem) mem)
        for {
                if v.AuxInt != 2 {
                        break
                }
-               t := v.Aux
                dst := v.Args[0]
                src := v.Args[1]
                mem := v.Args[2]
-               if !(t.(Type).Alignment()%2 == 0) {
-                       break
-               }
                v.reset(OpPPC64MOVHstore)
                v.AddArg(dst)
                v0 := b.NewValue0(v.Pos, OpPPC64MOVHZload, types.UInt16)
@@ -3745,27 +3739,27 @@ func rewriteValuePPC64_OpMove(v *Value) bool {
                v.AddArg(mem)
                return true
        }
-       // match: (Move [2] dst src mem)
+       // match: (Move [3] dst src mem)
        // cond:
-       // result: (MOVBstore [1] dst (MOVBZload [1] src mem)           (MOVBstore dst (MOVBZload src mem) mem))
+       // result: (MOVBstore [2] dst (MOVBZload [2] src mem)                 (MOVHstore dst (MOVHload src mem) mem))
        for {
-               if v.AuxInt != 2 {
+               if v.AuxInt != 3 {
                        break
                }
                dst := v.Args[0]
                src := v.Args[1]
                mem := v.Args[2]
                v.reset(OpPPC64MOVBstore)
-               v.AuxInt = 1
+               v.AuxInt = 2
                v.AddArg(dst)
                v0 := b.NewValue0(v.Pos, OpPPC64MOVBZload, types.UInt8)
-               v0.AuxInt = 1
+               v0.AuxInt = 2
                v0.AddArg(src)
                v0.AddArg(mem)
                v.AddArg(v0)
-               v1 := b.NewValue0(v.Pos, OpPPC64MOVBstore, TypeMem)
+               v1 := b.NewValue0(v.Pos, OpPPC64MOVHstore, TypeMem)
                v1.AddArg(dst)
-               v2 := b.NewValue0(v.Pos, OpPPC64MOVBZload, types.UInt8)
+               v2 := b.NewValue0(v.Pos, OpPPC64MOVHload, types.Int16)
                v2.AddArg(src)
                v2.AddArg(mem)
                v1.AddArg(v2)
@@ -3773,20 +3767,16 @@ func rewriteValuePPC64_OpMove(v *Value) bool {
                v.AddArg(v1)
                return true
        }
-       // match: (Move [4] {t} dst src mem)
-       // cond: t.(Type).Alignment()%4 == 0
+       // match: (Move [4] dst src mem)
+       // cond:
        // result: (MOVWstore dst (MOVWload src mem) mem)
        for {
                if v.AuxInt != 4 {
                        break
                }
-               t := v.Aux
                dst := v.Args[0]
                src := v.Args[1]
                mem := v.Args[2]
-               if !(t.(Type).Alignment()%4 == 0) {
-                       break
-               }
                v.reset(OpPPC64MOVWstore)
                v.AddArg(dst)
                v0 := b.NewValue0(v.Pos, OpPPC64MOVWload, types.Int32)
@@ -3796,132 +3786,55 @@ func rewriteValuePPC64_OpMove(v *Value) bool {
                v.AddArg(mem)
                return true
        }
-       // match: (Move [4] {t} dst src mem)
-       // cond: t.(Type).Alignment()%2 == 0
-       // result: (MOVHstore [2] dst (MOVHZload [2] src mem)           (MOVHstore dst (MOVHZload src mem) mem))
-       for {
-               if v.AuxInt != 4 {
-                       break
-               }
-               t := v.Aux
-               dst := v.Args[0]
-               src := v.Args[1]
-               mem := v.Args[2]
-               if !(t.(Type).Alignment()%2 == 0) {
-                       break
-               }
-               v.reset(OpPPC64MOVHstore)
-               v.AuxInt = 2
-               v.AddArg(dst)
-               v0 := b.NewValue0(v.Pos, OpPPC64MOVHZload, types.UInt16)
-               v0.AuxInt = 2
-               v0.AddArg(src)
-               v0.AddArg(mem)
-               v.AddArg(v0)
-               v1 := b.NewValue0(v.Pos, OpPPC64MOVHstore, TypeMem)
-               v1.AddArg(dst)
-               v2 := b.NewValue0(v.Pos, OpPPC64MOVHZload, types.UInt16)
-               v2.AddArg(src)
-               v2.AddArg(mem)
-               v1.AddArg(v2)
-               v1.AddArg(mem)
-               v.AddArg(v1)
-               return true
-       }
-       // match: (Move [4] dst src mem)
+       // match: (Move [5] dst src mem)
        // cond:
-       // result: (MOVBstore [3] dst (MOVBZload [3] src mem)           (MOVBstore [2] dst (MOVBZload [2] src mem)                      (MOVBstore [1] dst (MOVBZload [1] src mem)                              (MOVBstore dst (MOVBZload src mem) mem))))
+       // result: (MOVBstore [4] dst (MOVBZload [4] src mem)                 (MOVWstore dst (MOVWload src mem) mem))
        for {
-               if v.AuxInt != 4 {
+               if v.AuxInt != 5 {
                        break
                }
                dst := v.Args[0]
                src := v.Args[1]
                mem := v.Args[2]
                v.reset(OpPPC64MOVBstore)
-               v.AuxInt = 3
+               v.AuxInt = 4
                v.AddArg(dst)
                v0 := b.NewValue0(v.Pos, OpPPC64MOVBZload, types.UInt8)
-               v0.AuxInt = 3
+               v0.AuxInt = 4
                v0.AddArg(src)
                v0.AddArg(mem)
                v.AddArg(v0)
-               v1 := b.NewValue0(v.Pos, OpPPC64MOVBstore, TypeMem)
-               v1.AuxInt = 2
+               v1 := b.NewValue0(v.Pos, OpPPC64MOVWstore, TypeMem)
                v1.AddArg(dst)
-               v2 := b.NewValue0(v.Pos, OpPPC64MOVBZload, types.UInt8)
-               v2.AuxInt = 2
+               v2 := b.NewValue0(v.Pos, OpPPC64MOVWload, types.Int32)
                v2.AddArg(src)
                v2.AddArg(mem)
                v1.AddArg(v2)
-               v3 := b.NewValue0(v.Pos, OpPPC64MOVBstore, TypeMem)
-               v3.AuxInt = 1
-               v3.AddArg(dst)
-               v4 := b.NewValue0(v.Pos, OpPPC64MOVBZload, types.UInt8)
-               v4.AuxInt = 1
-               v4.AddArg(src)
-               v4.AddArg(mem)
-               v3.AddArg(v4)
-               v5 := b.NewValue0(v.Pos, OpPPC64MOVBstore, TypeMem)
-               v5.AddArg(dst)
-               v6 := b.NewValue0(v.Pos, OpPPC64MOVBZload, types.UInt8)
-               v6.AddArg(src)
-               v6.AddArg(mem)
-               v5.AddArg(v6)
-               v5.AddArg(mem)
-               v3.AddArg(v5)
-               v1.AddArg(v3)
+               v1.AddArg(mem)
                v.AddArg(v1)
                return true
        }
-       // match: (Move [8] {t} dst src mem)
-       // cond: t.(Type).Alignment()%8 == 0
-       // result: (MOVDstore dst (MOVDload src mem) mem)
-       for {
-               if v.AuxInt != 8 {
-                       break
-               }
-               t := v.Aux
-               dst := v.Args[0]
-               src := v.Args[1]
-               mem := v.Args[2]
-               if !(t.(Type).Alignment()%8 == 0) {
-                       break
-               }
-               v.reset(OpPPC64MOVDstore)
-               v.AddArg(dst)
-               v0 := b.NewValue0(v.Pos, OpPPC64MOVDload, types.Int64)
-               v0.AddArg(src)
-               v0.AddArg(mem)
-               v.AddArg(v0)
-               v.AddArg(mem)
-               return true
-       }
-       // match: (Move [8] {t} dst src mem)
-       // cond: t.(Type).Alignment()%4 == 0
-       // result: (MOVWstore [4] dst (MOVWZload [4] src mem)           (MOVWstore dst (MOVWZload src mem) mem))
+       // match: (Move [6] dst src mem)
+       // cond:
+       // result: (MOVHstore [4] dst (MOVHZload [4] src mem)                 (MOVWstore dst (MOVWload src mem) mem))
        for {
-               if v.AuxInt != 8 {
+               if v.AuxInt != 6 {
                        break
                }
-               t := v.Aux
                dst := v.Args[0]
                src := v.Args[1]
                mem := v.Args[2]
-               if !(t.(Type).Alignment()%4 == 0) {
-                       break
-               }
-               v.reset(OpPPC64MOVWstore)
+               v.reset(OpPPC64MOVHstore)
                v.AuxInt = 4
                v.AddArg(dst)
-               v0 := b.NewValue0(v.Pos, OpPPC64MOVWZload, types.UInt32)
+               v0 := b.NewValue0(v.Pos, OpPPC64MOVHZload, types.UInt16)
                v0.AuxInt = 4
                v0.AddArg(src)
                v0.AddArg(mem)
                v.AddArg(v0)
                v1 := b.NewValue0(v.Pos, OpPPC64MOVWstore, TypeMem)
                v1.AddArg(dst)
-               v2 := b.NewValue0(v.Pos, OpPPC64MOVWZload, types.UInt32)
+               v2 := b.NewValue0(v.Pos, OpPPC64MOVWload, types.Int32)
                v2.AddArg(src)
                v2.AddArg(mem)
                v1.AddArg(v2)
@@ -3929,24 +3842,20 @@ func rewriteValuePPC64_OpMove(v *Value) bool {
                v.AddArg(v1)
                return true
        }
-       // match: (Move [8] {t} dst src mem)
-       // cond: t.(Type).Alignment()%2 == 0
-       // result: (MOVHstore [6] dst (MOVHZload [6] src mem)           (MOVHstore [4] dst (MOVHZload [4] src mem)                      (MOVHstore [2] dst (MOVHZload [2] src mem)                              (MOVHstore dst (MOVHZload src mem) mem))))
+       // match: (Move [7] dst src mem)
+       // cond:
+       // result: (MOVBstore [6] dst (MOVBZload [6] src mem)                 (MOVHstore [4] dst (MOVHZload [4] src mem)                         (MOVWstore dst (MOVWload src mem) mem)))
        for {
-               if v.AuxInt != 8 {
+               if v.AuxInt != 7 {
                        break
                }
-               t := v.Aux
                dst := v.Args[0]
                src := v.Args[1]
                mem := v.Args[2]
-               if !(t.(Type).Alignment()%2 == 0) {
-                       break
-               }
-               v.reset(OpPPC64MOVHstore)
+               v.reset(OpPPC64MOVBstore)
                v.AuxInt = 6
                v.AddArg(dst)
-               v0 := b.NewValue0(v.Pos, OpPPC64MOVHZload, types.UInt16)
+               v0 := b.NewValue0(v.Pos, OpPPC64MOVBZload, types.UInt8)
                v0.AuxInt = 6
                v0.AddArg(src)
                v0.AddArg(mem)
@@ -3959,83 +3868,51 @@ func rewriteValuePPC64_OpMove(v *Value) bool {
                v2.AddArg(src)
                v2.AddArg(mem)
                v1.AddArg(v2)
-               v3 := b.NewValue0(v.Pos, OpPPC64MOVHstore, TypeMem)
-               v3.AuxInt = 2
+               v3 := b.NewValue0(v.Pos, OpPPC64MOVWstore, TypeMem)
                v3.AddArg(dst)
-               v4 := b.NewValue0(v.Pos, OpPPC64MOVHZload, types.UInt16)
-               v4.AuxInt = 2
+               v4 := b.NewValue0(v.Pos, OpPPC64MOVWload, types.Int32)
                v4.AddArg(src)
                v4.AddArg(mem)
                v3.AddArg(v4)
-               v5 := b.NewValue0(v.Pos, OpPPC64MOVHstore, TypeMem)
-               v5.AddArg(dst)
-               v6 := b.NewValue0(v.Pos, OpPPC64MOVHZload, types.UInt16)
-               v6.AddArg(src)
-               v6.AddArg(mem)
-               v5.AddArg(v6)
-               v5.AddArg(mem)
-               v3.AddArg(v5)
+               v3.AddArg(mem)
                v1.AddArg(v3)
                v.AddArg(v1)
                return true
        }
-       // match: (Move [3] dst src mem)
+       // match: (Move [8] dst src mem)
        // cond:
-       // result: (MOVBstore [2] dst (MOVBZload [2] src mem)           (MOVBstore [1] dst (MOVBZload [1] src mem)                      (MOVBstore dst (MOVBZload src mem) mem)))
+       // result: (MOVDstore dst (MOVDload src mem) mem)
        for {
-               if v.AuxInt != 3 {
+               if v.AuxInt != 8 {
                        break
                }
                dst := v.Args[0]
                src := v.Args[1]
                mem := v.Args[2]
-               v.reset(OpPPC64MOVBstore)
-               v.AuxInt = 2
+               v.reset(OpPPC64MOVDstore)
                v.AddArg(dst)
-               v0 := b.NewValue0(v.Pos, OpPPC64MOVBZload, types.UInt8)
-               v0.AuxInt = 2
+               v0 := b.NewValue0(v.Pos, OpPPC64MOVDload, types.Int64)
                v0.AddArg(src)
                v0.AddArg(mem)
                v.AddArg(v0)
-               v1 := b.NewValue0(v.Pos, OpPPC64MOVBstore, TypeMem)
-               v1.AuxInt = 1
-               v1.AddArg(dst)
-               v2 := b.NewValue0(v.Pos, OpPPC64MOVBZload, types.UInt8)
-               v2.AuxInt = 1
-               v2.AddArg(src)
-               v2.AddArg(mem)
-               v1.AddArg(v2)
-               v3 := b.NewValue0(v.Pos, OpPPC64MOVBstore, TypeMem)
-               v3.AddArg(dst)
-               v4 := b.NewValue0(v.Pos, OpPPC64MOVBZload, types.UInt8)
-               v4.AddArg(src)
-               v4.AddArg(mem)
-               v3.AddArg(v4)
-               v3.AddArg(mem)
-               v1.AddArg(v3)
-               v.AddArg(v1)
+               v.AddArg(mem)
                return true
        }
-       // match: (Move [s] {t} dst src mem)
-       // cond: (s > 512 || config.noDuffDevice) || t.(Type).Alignment()%8 != 0
-       // result: (LoweredMove [t.(Type).Alignment()]          dst             src             (ADDconst <src.Type> src [s-moveSize(t.(Type).Alignment(), config)])            mem)
+       // match: (Move [s] dst src mem)
+       // cond: s > 8
+       // result: (LoweredMove [s] dst src mem)
        for {
                s := v.AuxInt
-               t := v.Aux
                dst := v.Args[0]
                src := v.Args[1]
                mem := v.Args[2]
-               if !((s > 512 || config.noDuffDevice) || t.(Type).Alignment()%8 != 0) {
+               if !(s > 8) {
                        break
                }
                v.reset(OpPPC64LoweredMove)
-               v.AuxInt = t.(Type).Alignment()
+               v.AuxInt = s
                v.AddArg(dst)
                v.AddArg(src)
-               v0 := b.NewValue0(v.Pos, OpPPC64ADDconst, src.Type)
-               v0.AuxInt = s - moveSize(t.(Type).Alignment(), config)
-               v0.AddArg(src)
-               v.AddArg(v0)
                v.AddArg(mem)
                return true
        }